In [None]:
import csv
import functools
import json
import operator
import re
import sys
import time
from datetime import datetime
from operator import itemgetter

import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from scipy.sparse import csr_matrix,coo_matrix,csc_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
#Install sparse dot topn

#!pip3 install git+https://github.com/ing-bank/sparse_dot_topn.git
if sys.version_info[0] >= 3:
    from sparse_dot_topn import sparse_dot_topn as ct
    from sparse_dot_topn import sparse_dot_topn_threaded as ct_thread
else:
    import sparse_dot_topn as ct
    import sparse_dot_topn_threaded as ct_thread

In [None]:


def timeit(method):
    """
    Decorator that measures the wall-clock execution time of ``method``.

    The wrapper forwards all positional and keyword arguments and returns the
    wrapped function's result unchanged. Reporting of the elapsed time is
    currently disabled (the ``print`` below is commented out).

    :param method: the callable to wrap;
    :return: a wrapper that preserves ``method``'s name and docstring;
    """
    # functools.wraps keeps __name__/__doc__ of the decorated function, so
    # introspection and the (optional) timing message show the real name.
    @functools.wraps(method)
    def timed(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        end = time.time()

        # Uncomment to report the elapsed time of every call:
        #print(f"{method.__name__}:  {(end - start):.2f} s")
        return result
    return timed


def read_file(path: str, set_record_id_as_index: bool=False) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame, loading every column as text.

    :param path: path of the CSV file to read;
    :param set_record_id_as_index: if True, use the "record_id" column as the index;
    :return: the loaded DataFrame (backslash is treated as the escape character);
    """
    index_column = None
    if set_record_id_as_index:
        index_column = "record_id"
    return pd.read_csv(path, dtype=str, escapechar="\\", index_col=index_column)


#@timeit
def load_training_data(data_path: str, row_subset: float=1, train_split: float=0.7, shuffle: bool=False, seed=None):
    '''
    Load the training set and divide it into training and test splits.
    "linked_id" is the value that we want to predict.

    :param data_path: path to the dataset to load;
    :param row_subset: use only the specified fraction of rows in the dataset (value in (0, 1]);
        values outside that range fall back to 1 (all rows);
    :param train_split: fraction of rows placed in the training set;
    :param shuffle: if True, shuffle the rows before splitting or subsetting the data;
    :param seed: random state used for both the shuffle and the split;
    :return: the tuple (X_train, X_test, y_train, y_test);
    '''
    if row_subset <= 0 or row_subset > 1:
        row_subset = 1

    # Read the file the caller asked for (previously the global
    # `training_file` was read and `data_path` was silently ignored).
    data = read_file(data_path, set_record_id_as_index=True)
    if shuffle:
        data = data.sample(frac=1, random_state=seed)
    # Obtain the specified subset of rows;
    data = data.iloc[:int(np.ceil(len(data) * row_subset))]

    X = data.drop(columns="linked_id")
    y = data["linked_id"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_split, shuffle=shuffle, random_state=seed)

    return X_train, X_test, y_train, y_test


def recall_at_k(resultTable : pd.DataFrame, trainingData: pd.DataFrame, testingData: pd.DataFrame) -> dict:
    """
    Given a list of K predictions for each query, first retrieve the correct ID from the test data,
    then look in the training data the percentage of records that have been successfully identified.

    For example, given query "1234-M", first retrieve the correct ID "1234" from the test data,
    then obtain from the training data all records that refer to "1234",
    and finally look how many of them we have found;

    :param resultTable: one row per prediction, with columns "queried_record_id" and "predicted_record_id";
    :param trainingData: training records indexed by record ID, with a "linked_id" column;
    :param testingData: test records indexed by record ID, with a "linked_id" column;
    :return: dict with "AverageRecall" (queries without relevant training records count as recall 1),
        "AverageFilteredRecall" (only queries with at least one relevant record) and
        "perQueryResult" (per-query summary table). Averages are NaN when there are no queries;
    :raises KeyError: if a queried record ID is not present in the index of ``testingData``;
    """

    # Obtain all the predictions for each record in the test set;
    perQueryRecords = resultTable.groupby("queried_record_id")

    # Group training records by their LinkedID truth value;
    groupedTrainingRecords = trainingData.groupby("linked_id")

    totalRecall = 0.0

    allRecords = dict()

    for queriedRecordID, group in perQueryRecords:
        # A missing label makes `.loc` raise KeyError (not IndexError), so
        # check explicitly and report which ID is missing.
        if queriedRecordID not in testingData.index:
            raise KeyError(f"ID {queriedRecordID} not found in testing data!")
        queriedLinkedID = testingData.loc[queriedRecordID, "linked_id"]

        try:
            allRelevantRecords = set(groupedTrainingRecords.get_group(queriedLinkedID).index.values)
        except KeyError:
            # No training record shares this linked ID;
            allRelevantRecords = set()
        setPredictedRecords = set(group["predicted_record_id"])
        selectedRelevantRecords = setPredictedRecords.intersection(allRelevantRecords)
        # Queries with nothing to find are counted as fully recalled;
        recall = 1
        if (len(allRelevantRecords) > 0):
            recall = len(selectedRelevantRecords) / len(allRelevantRecords)

        totalRecall += recall
        allRecords[queriedRecordID] = [queriedRecordID, recall, len(selectedRelevantRecords), len(allRelevantRecords)]

    # Store the results in a summary table;
    result_table =  pd.DataFrame.from_dict(
                        allRecords,
                        orient='index',
                        columns=["QueriedRecordID", "Recall@K", "SelectedRecords", "AllRelevantRecords"]
                    )
    # Compute the filtered recall, which considers only queries with at least one relevant record in the training data;
    queries_with_relevant_records = result_table[result_table["AllRelevantRecords"] > 0]
    filtered_recall = np.mean(queries_with_relevant_records["SelectedRecords"] / queries_with_relevant_records["AllRelevantRecords"])

    numberOfQueries = len(perQueryRecords)
    return {
            # Avoid ZeroDivisionError on an empty prediction table;
            "AverageRecall" : totalRecall / numberOfQueries if numberOfQueries else float("nan"),
            "AverageFilteredRecall": filtered_recall,
            "perQueryResult" : result_table
            }
    
def precision_at_k(resultTable : pd.DataFrame, trainingData: pd.DataFrame, testingData: pd.DataFrame) -> dict:
    """
    Given a list of K predictions for each query, first retrieve the correct ID from the test data,
    then look in the training data the percentage of records that are actually relevant;

    For example, given query "1234-M", first retrieve the correct ID "1234" from the test data,
    then obtain from the training data all records that refer to "1234",
    and finally look how many of the records we have found are actually referring to "1234"

    :param resultTable: one row per prediction, with columns "queried_record_id" and "predicted_record_id";
    :param trainingData: training records indexed by record ID, with a "linked_id" column;
    :param testingData: test records indexed by record ID, with a "linked_id" column;
    :return: dict with "AveragePrecision" (queries without relevant training records count as precision 1),
        "AverageFilteredPrecision" and "perQueryResult" (per-query summary table).
        "AveragePrecision" is NaN when there are no queries;
    :raises KeyError: if a queried record ID is not present in the index of ``testingData``;
    """

    # Obtain all the predictions for each record in the test set;
    perQueryRecords = resultTable.groupby("queried_record_id")

    # Group training records by their LinkedID truth value;
    groupedTrainingRecords = trainingData.groupby("linked_id")

    totalPrecision = 0.0
    numberOfPredictionsForRelevantRecords = 0

    allRecords = dict()

    for queriedRecordID, group in perQueryRecords:
        # A missing label makes `.loc` raise KeyError (not IndexError), so
        # check explicitly and report which ID is missing.
        if queriedRecordID not in testingData.index:
            raise KeyError(f"ID {queriedRecordID} not found in testing data!")
        queriedLinkedID = testingData.loc[queriedRecordID, "linked_id"]

        try:
            allRelevantRecords = set(groupedTrainingRecords.get_group(queriedLinkedID).index.values)
        except KeyError:
            # No training record shares this linked ID;
            allRelevantRecords = set()
        setPredictedRecords = set(group["predicted_record_id"])
        selectedRelevantRecords = setPredictedRecords.intersection(allRelevantRecords)
        # Queries with nothing to find are counted as fully precise;
        precision = 1
        if (len(allRelevantRecords) > 0):
            precision = len(selectedRelevantRecords) / len(setPredictedRecords)
            numberOfPredictionsForRelevantRecords += len(setPredictedRecords)

        totalPrecision += precision
        allRecords[queriedRecordID] = [queriedRecordID, precision, len(selectedRelevantRecords), len(allRelevantRecords)]

    # Store the results in a summary table;
    result_table =  pd.DataFrame.from_dict(
                        allRecords,
                        orient='index',
                        columns=["QueriedRecordID", "Precision@K", "SelectedRecords", "AllRelevantRecords"]
                    )
    # Compute the filtered precision, which considers only queries with at least one relevant record in the training data;
    queries_with_relevant_records = result_table[result_table["AllRelevantRecords"] > 0]
    # NOTE(review): this is the mean over queries of (hits / total predictions),
    # i.e. the pooled precision divided once more by the number of queries.
    # Left unchanged so reported numbers stay comparable; confirm whether a
    # sum (micro-average) or a mean of per-query precision was intended.
    filtered_precision = np.mean(queries_with_relevant_records["SelectedRecords"] / numberOfPredictionsForRelevantRecords)

    numberOfQueries = len(perQueryRecords)
    return {
            # Avoid ZeroDivisionError on an empty prediction table;
            "AveragePrecision" : totalPrecision / numberOfQueries if numberOfQueries else float("nan"),
            "AverageFilteredPrecision": filtered_precision,
            "perQueryResult" : result_table
            }

#%%




@timeit
def prediction_dict_to_df(predictions: dict) -> pd.DataFrame:
    """
    Flatten a {query id: [predicted ids]} dict into a long table with one row
    per (query, prediction) pair.

    :param predictions: mapping from queried record ID to its list of predictions;
    :return: DataFrame with columns "queried_record_id" and "predicted_record_id";
    """
    rows = [
        [query_id, predicted_id]
        for query_id, predicted_ids in predictions.items()
        for predicted_id in predicted_ids
    ]
    return pd.DataFrame(rows, columns=["queried_record_id", "predicted_record_id"])

@timeit
def prediction_dict_to_kaggle_df(predictions: dict) -> pd.DataFrame:
    """
    Convert a {query id: [predicted ids]} dict into the submission layout:
    one row per query, with its predictions joined by single spaces.

    :param predictions: mapping from queried record ID to its list of predicted IDs (strings);
    :return: DataFrame with columns "queried_record_id" and "predicted_record_id";
    """
    rows = [
        [query_id, " ".join(predicted_ids)]
        for query_id, predicted_ids in predictions.items()
    ]
    return pd.DataFrame(rows, columns=["queried_record_id", "predicted_record_id"])


# A class for matching one list of strings to another
#@ray.remote
class StringMatch():
    """
    Match source strings against target strings with TF-IDF vectors and a
    sparse top-n dot product, independently for four fields: names,
    addresses, emails and phone numbers.

    For every field the caller supplies four parallel lists: source ids,
    target ids, source strings and target strings (ids[i] identifies
    strings[i]). Typical use: construct, call ``tokenize()``, then ``match()``.

    Per-field state is stored in attributes sharing a suffix: '' for names,
    '_ad' for addresses, '_em' for emails and '_nu' for numbers
    (``ct_vect*``, ``tfidf_vect*``, ``vocab*``, ``sprse_mtx*``).
    """

    # (attribute suffix, source-id attr, target-id attr, source-string attr, target-string attr)
    _FIELDS = (
        ("",    "source_name_id", "target_name_id", "source_names",     "target_names"),
        ("_ad", "source_ad_id",   "target_ad_id",   "source_addresses", "target_addresses"),
        ("_em", "source_em_id",   "target_em_id",   "source_emails",    "target_emails"),
        ("_nu", "source_nu_id",   "target_nu_id",   "source_numbers",   "target_numbers"),
    )

    def __init__(self,source_name_id,target_name_id,source_names,target_names,source_ad_id,target_ad_id,source_addresses,target_addresses,source_em_id,target_em_id,source_emails,target_emails,source_nu_id,target_nu_id,source_numbers,target_numbers):
        """Store the id/string lists of each field and reset the fitted state."""
        self.source_name_id = source_name_id
        self.target_name_id = target_name_id
        self.source_names = source_names
        self.target_names = target_names

        self.source_ad_id = source_ad_id
        self.target_ad_id = target_ad_id
        self.source_addresses = source_addresses
        self.target_addresses = target_addresses

        self.source_em_id = source_em_id
        self.target_em_id = target_em_id
        self.source_emails = source_emails
        self.target_emails = target_emails

        self.source_nu_id = source_nu_id
        self.target_nu_id = target_nu_id
        self.source_numbers = source_numbers
        self.target_numbers = target_numbers

        # Vectorizers, vocabularies and similarity matrices are filled in by
        # tokenize() and match().
        for suffix, *_ in self._FIELDS:
            setattr(self, "ct_vect" + suffix, None)
            setattr(self, "tfidf_vect" + suffix, None)
            setattr(self, "vocab" + suffix, None)
            setattr(self, "sprse_mtx" + suffix, None)

    def tokenize(self, analyzer='char_wb', n=3):
        '''
        Build, for every field, a vocabulary over source + target strings and a
        TF-IDF vectorizer restricted to that vocabulary.

        :param str analyzer: Type of analyzer ('char_wb', 'word').
        :param int n: Upper n-gram length; n-grams of length n-1 and n are used
                      (bigrams and trigrams with the default n=3).
        '''
        ngram_range = (n - 1, n)
        for suffix, _, _, source_attr, target_attr in self._FIELDS:
            # Fit a count vectorizer on both lists so the vocabulary is shared.
            count_vect = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
            vocab = count_vect.fit(getattr(self, source_attr) + getattr(self, target_attr)).vocabulary_
            setattr(self, "ct_vect" + suffix, count_vect)
            setattr(self, "vocab" + suffix, vocab)
            setattr(self, "tfidf_vect" + suffix,
                    TfidfVectorizer(vocabulary=vocab, analyzer=analyzer, ngram_range=ngram_range))

    def match(self, ntop=500, lower_bound=0, output_fmt='dict'):
        '''
        Main match function. With the default settings up to 500 candidates are
        kept for every source string.

        :param int ntop: The number of top-n candidates that should be returned
        :param float lower_bound: The lower-bound threshold for keeping a candidate, between 0-1.
                                   Default set to 0, so consider all candidates
        :param str output_fmt: The output format. Either dataframe ('df') or dict ('dict')
        :return: a 4-tuple (names, addresses, emails, numbers). With 'dict' each item maps a
                 source id to the list of matched target ids; with 'df' each item is a
                 DataFrame of (row, source string, column, target string, score).
        :raises ValueError: if ``output_fmt`` is neither 'df' nor 'dict'.
        '''
        # Validate before the expensive computation; previously 'df' (or any
        # unknown format) failed at the return statement with unbound locals.
        if output_fmt not in ('df', 'dict'):
            raise ValueError("output_fmt must be 'df' or 'dict', got %r" % (output_fmt,))

        self._awesome_cossim_top(ntop,lower_bound,use_threads=True,n_jobs=20)

        if output_fmt == 'df':
            return self._make_matchdfs()
        return self._make_matchdict()

    def _awesome_cossim_top(self, ntop, lower_bound,use_threads=True,n_jobs=20):
        '''
        Compute the top-n similarity matrix of every field and store it in
        ``sprse_mtx``, ``sprse_mtx_ad``, ``sprse_mtx_em`` and ``sprse_mtx_nu``.

        https://gist.github.com/ymwdalex/5c363ddc1af447a9ff0b58ba14828fd6#file-awesome_sparse_dot_top-py

        :param int ntop: maximum number of candidates kept per source row
        :param float lower_bound: minimum score for a candidate to be kept
        :param bool use_threads: use the threaded kernel (True) or the single-threaded one
        :param int n_jobs: number of threads for the threaded kernel
        :raises RuntimeError: if ``tokenize()`` has not been called yet
        '''
        for suffix, _, _, source_attr, target_attr in self._FIELDS:
            vectorizer = getattr(self, "tfidf_vect" + suffix)
            if vectorizer is None:
                raise RuntimeError("tokenize() must be called before match()")
            matrix = self._cossim_top(
                vectorizer,
                getattr(self, source_attr),
                getattr(self, target_attr),
                ntop, lower_bound, use_threads, n_jobs)
            setattr(self, "sprse_mtx" + suffix, matrix)

    @staticmethod
    def _cossim_top(vectorizer, source, target, ntop, lower_bound, use_threads, n_jobs):
        ''' Return the (len(source), len(target)) CSR matrix of top-n TF-IDF dot products. '''
        # NOTE(review): fit_transform is called on each side, so the IDF
        # weights are re-estimated separately for source and target; kept as
        # in the original implementation — confirm this is intended.
        A = vectorizer.fit_transform(source).tocsr()
        B = vectorizer.fit_transform(target).transpose().tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        # Output buffers sized for the worst case of ntop results per row.
        nnz_max = M * ntop
        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        kernel_args = (
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        if use_threads:
            ct_thread.sparse_dot_topn_threaded(*(kernel_args + (n_jobs,)))
        else:
            ct.sparse_dot_topn(*kernel_args)

        return csr_matrix((data,indices,indptr), shape=(M,N))

    @staticmethod
    def _matrix_to_df(matrix, source_strings, target_strings):
        ''' Turn one similarity matrix into a long DataFrame of scored pairs. '''
        # CSR matrix -> COO matrix -> list of tuples
        cx = matrix.tocoo()
        match_list = [
            (row, source_strings[row], col, target_strings[col], val)
            for row, col, val in zip(cx.row, cx.col, cx.data)
        ]
        colnames = ['Row Idx', 'Title', 'Candidate Idx', 'Candidate Title', 'Score']
        return pd.DataFrame(match_list, columns=colnames)

    @staticmethod
    def _matrix_to_dict(matrix, source_ids, target_ids):
        ''' Turn one similarity matrix into {source id: [matched target ids]}. '''
        cx = matrix.tocoo()
        match_dict = {}
        for row, col in zip(cx.row, cx.col):
            match_dict.setdefault(source_ids[row], []).append(target_ids[col])
        return match_dict

    def _make_matchdf(self):
        ''' Build the result dataframe for the name field '''
        return self._matrix_to_df(self.sprse_mtx, self.source_names, self.target_names)

    def _make_matchdfs(self):
        ''' Build the result dataframes for all four fields (names, addresses, emails, numbers) '''
        return tuple(
            self._matrix_to_df(
                getattr(self, "sprse_mtx" + suffix),
                getattr(self, source_attr),
                getattr(self, target_attr))
            for suffix, _, _, source_attr, target_attr in self._FIELDS
        )

    def _make_matchdict(self):
        ''' Build the result dictionaries for all four fields (names, addresses, emails, numbers) '''
        match_dict_name, match_dict_ad, match_dict_em, match_dict_nu = (
            self._matrix_to_dict(
                getattr(self, "sprse_mtx" + suffix),
                getattr(self, source_id_attr),
                getattr(self, target_id_attr))
            for suffix, source_id_attr, target_id_attr, _, _ in self._FIELDS
        )
        return match_dict_name,match_dict_ad,match_dict_em,match_dict_nu

In [None]:
#Loading dataset
train=pd.read_csv('../panama-papers-polimi/data/panama_train_expanded_2.csv')
test=pd.read_csv('../panama-papers-polimi/data/panama_test_expanded_2.csv')


# Original training file, read as text; only its "linked_id" labels are used below.
training_file = "../panama-papers-polimi/data/entity-resolution_advanced-topics-training_data.csv"
train_or = read_file(training_file, set_record_id_as_index=False)


y_train_or = train_or[["linked_id"]]

# Cast phone numbers to text but keep missing values missing: a plain
# `.apply(str)` turns NaN into the string 'nan', which then survives both
# fillna('-1') and the "too short" filter and is matched as a real number.
train["phone"] = train["phone"].astype(str).where(train["phone"].notna())
test["phone"] = test["phone"].astype(str).where(test["phone"].notna())

In [None]:
# '-1' is the sentinel for missing values used by the following cells
train = train.fillna(value='-1')
test = test.fillna(value='-1')

# Append the ground-truth label column; concat aligns on the row index.
# NOTE(review): assumes `train` and `y_train_or` list the same records in the
# same order — TODO confirm.
train = pd.concat([train, y_train_or['linked_id']], axis=1)

# Keep only the columns used by the pipeline
train = train.loc[:, ['name','record_id','address','email','phone','linked_id']]
test = test.loc[:, ['name','record_id','address','email','phone']]

In [None]:
#Treat strings that are too short as missing values
# Values of length <= 2 are replaced by the '-1' sentinel, column by column,
# first in train and then in test.
for frame in (train, test):
    for column in ('name', 'address', 'email', 'phone'):
        frame[column] = frame[column].apply(lambda x : '-1' if len(x)<=2 else x)

train = train.fillna(value='-1')
test = test.fillna(value='-1')

In [None]:
#Create list of valid features
def _valid_ids_and_values(frame, column):
    """Return (record ids, values) as lists for the rows whose `column` is not the '-1' sentinel."""
    is_valid = frame[column] != '-1'
    return frame.record_id[is_valid].tolist(), frame[column][is_valid].tolist()


# Test side: ids and strings are parallel lists, one pair per field
test_name_id, test_name_list = _valid_ids_and_values(test, 'name')
test_ad_id, test_address_list = _valid_ids_and_values(test, 'address')
test_em_id, test_email_list = _valid_ids_and_values(test, 'email')
test_nu_id, test_number_list = _valid_ids_and_values(test, 'phone')

# Train side
train_name_id, train_name_list = _valid_ids_and_values(train, 'name')
train_ad_id, train_address_list = _valid_ids_and_values(train, 'address')
train_em_id, train_email_list = _valid_ids_and_values(train, 'email')
train_nu_id, train_number_list = _valid_ids_and_values(train, 'phone')



In [None]:
#Initialization of the tfidf model used to create small blocks
# Test records are the sources (queries), training records the targets.
best_matches = StringMatch(
    source_name_id=test_name_id,
    target_name_id=train_name_id,
    source_names=test_name_list,
    target_names=train_name_list,
    source_ad_id=test_ad_id,
    target_ad_id=train_ad_id,
    source_addresses=test_address_list,
    target_addresses=train_address_list,
    source_em_id=test_em_id,
    target_em_id=train_em_id,
    source_emails=test_email_list,
    target_emails=train_email_list,
    source_nu_id=test_nu_id,
    target_nu_id=train_nu_id,
    source_numbers=test_number_list,
    target_numbers=train_number_list,
)
#Tokenization and creation of the tfidf vectorizers
best_matches.tokenize()
#Sparse dot multiplication; one {query id: [candidate ids]} dict per field
(match_output_name,
 match_output_ad,
 match_output_em,
 match_output_nu) = best_matches.match()

In [None]:
#create unique dict of good candidates from the 4 matrices similarity scores
def merge_candidate_dicts(*match_dicts):
    """
    Merge several {query id: [candidate ids]} dicts into one.

    For every query the candidate lists are concatenated in the order the
    dicts are given, non-string entries are dropped and duplicates are removed
    while keeping the first occurrence. Unlike ``list(set(...))`` this order
    does not depend on string hashing, so downstream tie-breaking is
    reproducible between runs.
    """
    merged = {}
    for match_dict in match_dicts:
        for query_id, candidate_ids in match_dict.items():
            # dict keys act as an insertion-ordered set
            unique_ids = merged.setdefault(query_id, {})
            for candidate_id in candidate_ids:
                if isinstance(candidate_id, str):
                    unique_ids[candidate_id] = None
    return {query_id: list(unique_ids) for query_id, unique_ids in merged.items()}


d = merge_candidate_dicts(match_output_name, match_output_ad, match_output_em, match_output_nu)

In [None]:
#create dataframes to evaluate performance with Recall and Precision as metrics

# Index `train` by record_id. Rebinding (instead of inplace=True) behind a
# guard keeps the cell re-runnable: a second run no longer fails because the
# 'record_id' column has already been moved into the index.
if 'record_id' in train.columns:
    train = train.set_index('record_id')
y_train = train[['linked_id']]
train_2 = train[['name','address','email','phone']].copy()

# Test features indexed by record_id, carrying the full record id as "linked_id"
test_2 = test[['name','record_id','address','email','phone']].copy()
test_2['linked_id'] = test_2['record_id']
test_2 = test_2.set_index('record_id')

# Ground truth for the test set: the part of the record id before the first '-'.
# Work on an explicit copy so the assignment does not target a slice of test_2.
y_test = test_2[['linked_id']].copy()
y_test['linked_id'] = y_test['linked_id'].apply(lambda x : x.split('-')[0])

test_3 = test[['name','record_id','address','email','phone']].set_index('record_id')

# Features + label, indexed by record_id, as expected by recall_at_k / precision_at_k
train_m = train_2.merge(y_train, left_index=True, right_index=True)
test_m = test_3.merge(y_test, left_index=True, right_index=True)

In [None]:
#Create new merged features from name,address,phone,email

# `train_copy` was never defined anywhere in the notebook (it only existed as
# leftover kernel state), so build it here from `train`. `train` may or may
# not have 'record_id' moved into its index at this point; handle both.
_train_with_id = train if 'record_id' in train.columns else train.reset_index()
train_copy = _train_with_id[['record_id','name','address','email','phone']].fillna('')
test = test.fillna('')

# Turn the '-1' missing-value sentinel into an empty string so it does not
# end up inside the concatenated text.
for column in ('name', 'address', 'email', 'phone'):
    train_copy[column] = train_copy[column].apply(lambda x : '' if x=='-1' else x)
    test[column] = test[column].apply(lambda x : '' if x=='-1' else x)

# 'complessivo' ("overall") = all four fields joined by single spaces
train_copy['complessivo']=train_copy['name']+' '+train_copy['address']+' '+train_copy['email']+' '+train_copy['phone']
train_copy = train_copy.set_index('record_id')

test['complessivo']=test['name']+' '+test['address']+' '+test['email']+' '+test['phone']

In [None]:
# State and settings for the fuzzy re-ranking cells below.
# cc: counter of processed test records.
# K: presumably the number of predictions kept per query — TODO confirm.
# tot: not referenced in the visible part of the notebook.
cc, K, tot = 0, 10, 10
# query record id -> list of predictions
match_df = dict()

In [None]:
#Vectorized fuzzy string matching, so that all the similarity scores for one test instance are computed in a single call
def partial_match_totale(x,y):
    """Return fuzzywuzzy's partial_token_set_ratio of `x` and `y`, multiplied by 0.1."""
    ratio = fuzz.partial_token_set_ratio(x, y)
    return ratio * 0.1

# Element-wise version that broadcasts over array-like arguments
partial_match_vector_totale = np.vectorize(partial_match_totale)

In [None]:
#Use small blocks created with tfidf to perform fast fuzzy matching

# For every test record, score its candidate block against the concatenated
# text and keep the K best candidates.
# NOTE(review): the stored predictions are the candidates' `linked_id` values,
# while recall_at_k / precision_at_k compare predictions against training
# record ids — confirm which identifier each consumer expects.
for query_id, query_text in zip(test['record_id'],test['complessivo']):
    candidate_ids = d.get(query_id)
    if not candidate_ids:
        # No candidate block for this query: report it and predict nothing.
        print(query_id)
        match_df[query_id]=[]
    else:
        candidate_texts = train_copy.loc[candidate_ids,'complessivo']

        #Fuzzy match with features concatenated; np.vectorize broadcasts the
        #single query text against every candidate text.
        scores = pd.Series(
            partial_match_vector_totale(query_text, candidate_texts.values),
            index=candidate_texts.index,
        )
        top_candidates = scores.sort_values(ascending=False)[:K]
        match_df[query_id]=list(y_train.loc[top_candidates.index.values]['linked_id'].values)

    cc+=1
    # Report progress periodically instead of printing on every iteration.
    if cc % 1000 == 0:
        print("processed", cc, "records; last predictions:", match_df[query_id])

In [None]:
#create kaggle format predictions and calculate metrics

# One row per query with space-separated predictions, written to disk
pred_df_kaggle = prediction_dict_to_kaggle_df(match_df)
pred_df_kaggle.to_csv("../panama-papers-polimi/preds/kaggle_fuzzy_partial_token_set_top_10"+'.csv', index=False)
# One row per (query, prediction) pair, used by the metric functions
pred_df = prediction_dict_to_df(match_df)

#%% 5. Compute recall@K;
recall_dict = recall_at_k(pred_df, train_m, test_m)
for metric_name in ("AverageRecall", "AverageFilteredRecall"):
    print(recall_dict[metric_name])

#%% 6. Compute precision@K;
precision_dict = precision_at_k(pred_df, train_m, test_m)
for metric_name in ("AveragePrecision", "AverageFilteredPrecision"):
    print(precision_dict[metric_name])
