In [1]:
import numpy as np
import pandas as pd
import Levenshtein

In [2]:
def leventstein_similarity(str1:str, str2:str):
    return 1 - Levenshtein.distance(str1, str2) / max(len(str1), len(str2))

In [3]:
def same_similarity(str1:str, str2:str):
    return str1==str2

In [4]:
def tokenize_date(date):
    # Tokenize the date string into year, month, and day
    return set(date.split('-'))

def date_similarity(date1, date2, threshold=1):
    
    if date1 is None or date2 is None:
        return 0
    
    tokens1 = tokenize_date(date1)
    tokens2 = tokenize_date(date2)
    
    intersection = len(tokens1.intersection(tokens2))
    union = len(tokens1.union(tokens2))
    jaccard_similarity = intersection / union
    
    # if jaccard_similarity >= threshold:
    #     return True
    # else:
    #     return False
    return jaccard_similarity

date1 = "2000-09-14"
date2 = "2000-14-09"
threshold = 0.7  # You can adjust this threshold as needed

if date_similarity(date1, date2, threshold):
    print("Dates are similar.")
else:
    print("Dates are not similar.")

Dates are similar.


In [5]:
def get_similarity_matrix(similarity_function, col1, col2):

    similarity_matrix = np.zeros((len(col1), len(col2)))

    for i, entry1 in enumerate(col1):
        for j, entry2 in enumerate(col2):

            similarity_matrix[i, j] = similarity_function(entry1, entry2) 
        # print(f"Row {i}")     

    return similarity_matrix

In [13]:
def create_result(df1, df2, similarity_matrix, normalization=1):
    matches = pd.DataFrame()
    matches['df1'] = df1['id']
    matches['df2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['id'].values
    matches['name1'] = df1['name'].values
    matches['birthdate1'] = df1['birthdate'].values
    matches['name2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['name'].values
    matches['birthdate2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['birthdate'].values
    matches['nat1'] = df1['nationality']
    matches['nat2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['nationality'].values
    # matches['positions1'] = df1['positions']
    # matches['positions2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['positions'].values
    matches['club1'] = df1['club']
    matches['club2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['club'].values
    # matches['current_mv_1'] = df1['current_market_value']
    # matches['current_mv_2'] = df2.iloc[similarity_matrix.argmax(axis=1)]['current_market_value'].values
    matches['score'] = similarity_matrix.max(axis=1)/normalization
    
    return matches

In [14]:
# df_eafc = pd.read_xml('../Football-Project/data/xml-files/eafc_final.xml')
df_fm23 = pd.read_xml('../../Football-Project/data/xml-files/fm23_final.xml')
df_tm = pd.read_xml('../../Football-Project/data/xml-files/tm_final.xml')
subset_size = 600
df_fm23_sampled = df_fm23.sample(n=subset_size, random_state=42)

In [15]:
similarity_matrix_name = get_similarity_matrix(leventstein_similarity, df_fm23_sampled['name'], df_tm['name'])
similarity_matrix_date = get_similarity_matrix(date_similarity, df_fm23_sampled['birthdate'], df_tm['birthdate'])

In [16]:
df_name = create_result(df_fm23_sampled, df_tm, similarity_matrix_name)
df_date = create_result(df_fm23_sampled, df_tm, similarity_matrix_date)
df = create_result(df_fm23_sampled, df_tm, (1.4*similarity_matrix_name) + (0.6*similarity_matrix_date), 2)
perfect_matches = df[df.score == 1]
perfect_matches["score"] = True
perfect_matches.to_csv('perfect_matches_fm_tm.csv', index=False)
print(len(df[df.score <= 0.50]))
print(len(perfect_matches))

236
180


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  perfect_matches["score"] = True


In [17]:
pd.set_option('display.max_rows', None)
display(df[df.score <= 0.50].sort_values(by=['score'], ascending=False))
non_matches = df[df.score <= 0.50].sort_values(by=['score'], ascending=False)
non_matches["score"] = False
non_matches.to_csv('non_matches_fm_tm.csv', index=False)

Unnamed: 0,df1,df2,name1,birthdate1,name2,birthdate2,nat1,nat2,club1,club2,score
4168,fm_5168,tm_2417,Ramez Al-Mazry,2004-01-08,Saleh Al-Saeed,2004-08-26,GER,Saudi Arabia,FC Viktoria Köln,,0.5
7621,fm_8621,tm_5258,Moussa Seck,2006-01-13,Moustapha Seck,1996-02-23,ITA,Spain,Reggina,Portimonense SC,0.5
15337,fm_16337,tm_1182,André Mesquita,1997-10-10,Rodrigues,1997-10-10,POR,Brazil,Santa Clara,,0.5
7947,fm_8947,tm_5551,Samuel Gningue,2006-07-01,Samuel Gigot,1993-10-12,ITA,France,Lazio,Olympique Marseille,0.5
16545,fm_17545,tm_8446,Jonathan Do Marcolino,2006-05-10,Alan Do Marcolino,2002-03-19,FRA,Gabon,Rennes,Stade Rennais FC,0.5
1023,fm_2023,tm_4666,Sonny Kittel,1993-01-06,Soner Dikmen,1993-09-01,GER,Turkey,HSV,Konyaspor,0.5
11550,fm_12550,tm_10390,Tomas Galvez,2005-01-28,Tim Goller,2005-01-26,FIN,Germany,Man City,Hertha BSC,0.5
14789,fm_15789,tm_5211,Adrián Diéguez,1996-02-04,Maxime Dominguez,1996-02-01,ESP,Switzerland,Ponferrada,Gil Vicente FC,0.5
11952,fm_12952,tm_5604,Lemar Gordon,2005-11-17,Leny Yoro,2005-11-13,ENG,France,Fulham,LOSC Lille,0.5
8908,fm_9908,tm_5996,Osman Zorlu,1999-07-15,Tommaso Pobega,1999-07-15,TUR,Italy,Giresunspor,AC Milan,0.5


In [18]:
corner_cases = df[(df.score > 0.50) & (df.score < 1)]
corner_cases["score"] = False
corner_cases.to_csv('corner_cases_fm_tm.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corner_cases["score"] = False
