In [2]:
import numpy as np
import pandas as pd
import Levenshtein

In [62]:
def leventstein_similarity(str1:str, str2:str):
    """Return a length-normalised Levenshtein similarity of two strings.

    The edit distance between ``str1`` and ``str2`` is divided by the length
    of the longer string and subtracted from 1, so identical strings score
    1.0 and the score drops as more edits are needed.

    Parameters
    ----------
    str1, str2 : str
        The strings to compare.

    Returns
    -------
    float
        ``1 - distance / max(len(str1), len(str2))``, or 1.0 when both
        strings are empty.
    """
    longest = max(len(str1), len(str2))
    if longest == 0:
        # Two empty strings are identical; without this guard the
        # normalisation below would divide by zero.
        return 1.0
    return 1 - Levenshtein.distance(str1, str2) / longest

In [86]:
def same_similarity(str1:str, str2:str):
    """Exact-match similarity: the result of comparing the two values with ``==``."""
    is_identical = str1 == str2
    return is_identical

In [66]:
def tokenize_date(date):
    """Split a '-'-delimited date string and return its distinct parts as a set."""
    parts = date.split('-')
    return set(parts)

def date_similarity(date1, date2, threshold=1):
    """Decide whether two '-'-separated date strings share enough components.

    Each date is turned into a set of tokens with ``tokenize_date`` and the
    Jaccard index of the two sets (shared tokens / all distinct tokens) is
    compared with ``threshold``. Sets are unordered, so token position is
    ignored: "2000-09-14" and "2000-14-09" have a Jaccard index of 1.

    Parameters
    ----------
    date1, date2 : str or missing
        Date strings to compare. ``None`` / NaN are treated as missing.
    threshold : float, default 1
        Minimum Jaccard index for the dates to count as similar. The default
        of 1 requires the two token sets to be equal.

    Returns
    -------
    bool
        True when the Jaccard index is at least ``threshold``; False
        otherwise, or when either date is missing.
    """
    # Missing values may be None or NaN (e.g. an empty cell in a pandas
    # column). NaN has no ``.split``, so both must be caught before tokenising.
    if pd.isna(date1) or pd.isna(date2):
        return False

    tokens1 = tokenize_date(date1)
    tokens2 = tokenize_date(date2)

    shared = len(tokens1 & tokens2)
    total = len(tokens1 | tokens2)

    return bool(shared / total >= threshold)

# Quick check: the same three components with day and month swapped.
date1 = "2000-09-14"
date2 = "2000-14-09"
threshold = 0.7  # minimum Jaccard overlap; tune as needed

print(
    "Dates are similar."
    if date_similarity(date1, date2, threshold)
    else "Dates are not similar."
)

Dates are similar.


In [68]:
def get_similarity_matrix(similarity_function, col1, col2):
    """Build the pairwise similarity matrix between two collections.

    Parameters
    ----------
    similarity_function : callable
        Called as ``similarity_function(a, b)`` for every pair; its result is
        stored in a float array.
    col1, col2 : sized iterables
        Entries to compare. ``col1`` indexes the rows, ``col2`` the columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(col1), len(col2))`` where element ``[i, j]`` is
        the similarity of the i-th entry of ``col1`` and the j-th of ``col2``.
    """
    shape = (len(col1), len(col2))
    scores = np.zeros(shape)

    for row, left in enumerate(col1):
        for col, right in enumerate(col2):
            scores[row, col] = similarity_function(left, right)

    return scores

In [75]:
def create_result(df1, df2, similarity_matrix):
    """Pair every row of ``df1`` with its highest-scoring row of ``df2``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with ``id``, ``name`` and ``birthdate`` columns.
    similarity_matrix : numpy.ndarray
        Scores with one row per ``df1`` row and one column per ``df2`` row.

    Returns
    -------
    pandas.DataFrame
        One row per ``df1`` row with the columns ``df1``, ``df2``, ``name1``,
        ``birthdate1``, ``name2``, ``birthdate2`` and ``score``.
    """
    matches = pd.DataFrame()
    matches['df1'] = df1['id']

    # Position (within df2) of the best-scoring candidate for each df1 row.
    best_positions = similarity_matrix.argmax(axis=1)
    best_rows = df2.iloc[best_positions]

    matches['df2'] = best_rows['id'].values
    matches['name1'] = df1['name'].values
    matches['birthdate1'] = df1['birthdate'].values
    matches['name2'] = best_rows['name'].values
    matches['birthdate2'] = best_rows['birthdate'].values
    matches['score'] = similarity_matrix.max(axis=1)

    return matches

In [38]:
# Load the three player datasets from XML into DataFrames.
# The paths are relative, so they resolve against the kernel's working directory.
df_eafc = pd.read_xml('../Football-Project/data/xml-files/eafc_final.xml')
df_fm23 = pd.read_xml('../Football-Project/data/xml-files/fm23_final.xml')
df_tm = pd.read_xml('../Football-Project/data/xml-files/tm_final.xml')

In [39]:
# Draw a 100-row random sample from each dataset; the fixed random_state
# makes the same rows come back on every run.
df_eafc_sampled = df_eafc.sample(n=100, random_state=42)
df_fm23_sampled = df_fm23.sample(n=100, random_state=42)
df_tm_sampled = df_tm.sample(n=100, random_state=42)

In [69]:
# Compare the FM23 sample (rows) against the full df_tm frame (columns),
# not df_tm_sampled.
# date_similarity is called with its default threshold, so a date pair only
# scores when the two token sets are equal.
similarity_matrix_name = get_similarity_matrix(leventstein_similarity, df_fm23_sampled['name'], df_tm['name'])
similarity_matrix_date = get_similarity_matrix(date_similarity, df_fm23_sampled['birthdate'], df_tm['birthdate'])


In [90]:
# Combined score = element-wise mean of the name and date matrices; each FM23
# sample row is paired with the df_tm row that maximises it.
df = create_result(df_fm23_sampled, df_tm, (similarity_matrix_name+similarity_matrix_date)/2)

In [91]:
# Show the pairs whose best combined score is still below 0.5.
df[df['score']<.5]

Unnamed: 0,df1,df2,name1,birthdate1,name2,birthdate2,score
15641,fm_16641,tm_5467,Arnau Pradas,2006-03-24,Arnau Tenas,2001-05-30,0.333333
19,fm_1019,tm_4090,Christian Fuchs,1986-04-07,Christian Burgess,1991-10-07,0.352941
16821,fm_17821,tm_2131,Timothé Viel,2006-05-19,Timothy Tillman,1999-01-04,0.3
12959,fm_13959,tm_10005,Ollie Eagle,2005-12-08,Ollie Tanner,2002-05-13,0.333333
12852,fm_13852,tm_6059,Ifeoluwa Oni,2005-03-13,Gianluca Mancini,1996-04-17,0.1875
6973,fm_7973,tm_7001,Saad El Haddad,2005-07-24,Munir El Haddadi,1995-09-01,0.3125
16526,fm_17526,tm_8228,Mathis Coudour,2006-02-08,Mathis Touho,2004-11-24,0.357143
3978,fm_4978,tm_8582,Esteban Achak,2005-03-03,Esteban Salles,1994-03-02,0.285714
11071,fm_12071,tm_1158,Jay Spearing,1988-11-25,Ryan Spaulding,1998-09-10,0.25
7326,fm_8326,tm_6406,Davide Zanoni,2005-03-25,Davide Faraoni,1991-10-25,0.392857


In [None]:
# Spot check: print one name from each dataset to compare them by eye.
print(df_eafc['name'][0], df_fm23['name'][14173])

Rúben Fernandes Rúben Fernandes
