In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df = pd.read_csv("data/football_rec/players_21.csv")
df.rename(columns={'height_cm':'Height(cm)','weight_kg':'Weight(kg)','skill_moves':'Skill Moves',
                        'attacking_crossing':'Crossing','attacking_finishing':'Finishing',
                        'attacking_heading_accuracy':'Heading Accuracy','attacking_short_passing':'Short Passing',
                        'attacking_volleys':'Volleys','skill_dribbling':'Dribbling',
                        'skill_curve':'Curve','skill_fk_accuracy':'FK. Accuracy',
                        'skill_long_passing':'Long Passing','skill_ball_control':'Ball Control',
                        'movement_acceleration':'Acceleration','movement_sprint_speed':'Sprint Speed',
                        'movement_agility':'Agility','movement_reactions':'Reactions',
                        'movement_balance':'Balance','power_shot_power':'Shot Power',
                        'power_jumping':'Jumping','power_stamina':'Stamina',
                        'power_strength':'Strength','power_long_shots':'Long Shots',
                        'mentality_aggression':'Aggression','mentality_interceptions':'Interceptions',
                        'mentality_positioning':'Positioning','mentality_vision':'Vision',
                        'mentality_penalties':'Penalties','mentality_composure':'Composure',
                        'defending_marking':'Marking','defending_standing_tackle':'Standing Tackle',
                        'defending_sliding_tackle':'Sliding Tackle'},inplace=True)
df.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,Height(cm),Weight(kg),nationality,club_name,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,33,1987-06-24,170,72,Argentina,FC Barcelona,...,66+3,65+3,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,35,1985-02-05,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3
2,200389,https://sofifa.com/player/200389/jan-oblak/210002,J. Oblak,Jan Oblak,27,1993-01-07,188,87,Slovenia,Atlético Madrid,...,32+3,36+3,36+3,36+3,32+3,32+3,33+3,33+3,33+3,32+3
3,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,31,1988-08-21,184,80,Poland,FC Bayern München,...,64+3,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3
4,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,28,1992-02-05,175,68,Brazil,Paris Saint-Germain,...,67+3,62+3,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3


In [3]:
def sim_pos(row):
    return row.player_positions.partition(',')[0]

def foot_trans(row):
    if row.preferred_foot=='Right':
        return 5-row.weak_foot
    else:
        return row.weak_foot-5
    
def pos_trans(row):
    if row.player_positions in ['ST','CF']:
        return 'ST'
    if row.player_positions in ['LW','RW','LM','RM']:
        return 'WF'
    if row.player_positions in ['CAM','CDM','CM']:
        return 'MF'
    if row.player_positions in ['LWB','RWB','LB','RB']:
        return 'WB'
    if row.player_positions in ['CB']:
        return 'CB'
    if row.player_positions in ['GK']:
        return 'GK'

def calc_marking(row):
    return int((10*row['defending']-3*row['Standing Tackle']-2*row['Interceptions']
                -row['Heading Accuracy']-row['Sliding Tackle'])/3) if row['player_positions']!='GK' else np.nan
    
df['player_positions']=df.apply(sim_pos,axis=1)
df['Right Foot']=df.apply(foot_trans,axis=1)
df['rough_position']=df.apply(pos_trans,axis=1)
df['Marking']=df.apply(calc_marking, axis=1)

In [4]:
gkdata=df[df['player_positions']=='GK'].copy()
gkdata=gkdata.dropna(axis=1,how='all')
gkdata=gkdata.dropna(axis=0,how='any',subset=('gk_diving','gk_handling','gk_kicking','gk_reflexes','gk_speed','gk_positioning'))
nogkdata=df[df['player_positions']!='GK'].copy()
nogkdata=nogkdata.dropna(axis=1,how='all')
nogkdata=nogkdata.dropna(axis=0,how='any',subset=('pace','shooting','passing','dribbling','defending','physic','pace'))

In [5]:
nogkdata

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,Height(cm),Weight(kg),nationality,club_name,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,Right Foot,rough_position
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,33,1987-06-24,170,72,Argentina,FC Barcelona,...,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3,-1,WF
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,35,1985-02-05,187,83,Portugal,Juventus,...,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3,1,ST
3,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,31,1988-08-21,184,80,Poland,FC Bayern München,...,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3,1,ST
4,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,28,1992-02-05,175,68,Brazil,Paris Saint-Germain,...,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3,0,WF
5,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,29,1991-06-28,181,70,Belgium,Manchester City,...,80+3,80+3,79+3,75+3,69+3,69+3,69+3,75+3,0,MF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,256679,https://sofifa.com/player/256679/kevin-angulo/...,K. Angulo,Kevin Angulo,24,1996-04-13,176,73,Colombia,América de Cali,...,47+2,47+2,47+2,47+2,46+2,46+2,46+2,47+2,3,MF
18940,257710,https://sofifa.com/player/257710/mengxuan-zhan...,Zhang Mengxuan,Mengxuan Zhang,21,1999-04-26,177,70,China PR,Chongqing Dangdai Lifan FC SWM Team,...,41+2,41+2,42+2,45+2,47+2,47+2,47+2,45+2,3,CB
18941,250989,https://sofifa.com/player/250989/zhenghao-wang...,Wang Zhenghao,王政豪,20,2000-06-28,185,74,China PR,Tianjin TEDA FC,...,42+2,42+2,42+2,44+2,47+2,47+2,47+2,44+2,3,CB
18942,257697,https://sofifa.com/player/257697/zitong-chen/2...,Chen Zitong,Zitong Chen,23,1997-02-20,186,80,China PR,Shijiazhuang Ever Bright F.C.,...,49+2,49+2,47+2,47+2,49+2,49+2,49+2,47+2,3,MF


In [6]:
# attributes=['short_name','Height(cm)','Weight(kg)','Right Foot','Skill Moves',
#             'Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
#             'Dribbling','Curve','FK. Accuracy','Long Passing','Ball Control',
#             'Acceleration','Sprint Speed','Agility','Reactions','Balance',
#             'Shot Power','Jumping','Stamina','Strength','Long Shots',
#             'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
#             'Marking','Standing Tackle','Sliding Tackle']

attributes=['short_name','Height(cm)','Weight(kg)','Right Foot','Skill Moves',
            'Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
            'Dribbling','Curve','FK. Accuracy','Long Passing','Ball Control',
            'Acceleration','Sprint Speed','Balance', 'Shot Power','Jumping','Stamina','Strength','Long Shots',
            'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
            'Marking','Standing Tackle','Sliding Tackle']


In [7]:
rec_features = nogkdata[attributes].copy()

In [8]:
vectors = rec_features.iloc[:, 1:].apply(lambda x: x/np.linalg.norm(x), axis=1)


In [9]:
def get_similar_names(name):
    
    input_vector = vectors.iloc[rec_features.index[rec_features['short_name'] == name].tolist()[0]].values
    similarity_scores = cosine_similarity([input_vector], vectors)[0]
    closest_rows = rec_features.iloc[np.argsort(similarity_scores)[::-1][:10]]
    
    return closest_rows['short_name'].tolist()

In [10]:
get_similar_names("M. Salah")

['K. Benzema',
 'V. Germain',
 'D. McGoldrick',
 'D. Bouanga',
 'F. Thauvin',
 'Iago Aspas',
 'Luo Guofu',
 'Leo Baptistao',
 'Borja Mayoral',
 'R. Lewandowski']

In [11]:
vectors.iloc[rec_features.index[rec_features['short_name'] == "Neymar Jr"].tolist()[0]].values

array([0.38953205, 0.15064776, 0.        , 0.00860844, 0.20229842,
       0.17647309, 0.11836609, 0.20229842, 0.17647309, 0.18938575,
       0.18292942, 0.1786252 , 0.2001463 , 0.19799419, 0.16571253,
       0.16356042, 0.16356042, 0.19584208, 0.13558298, 0.19153786,
       0.1592562 , 0.19584208, 0.16356042, 0.14203931, 0.18938575,
       0.20229842, 0.18077731, 0.19584208, 0.14634353, 0.1398872 ,
       0.11406187])

In [12]:
rec_features.index[rec_features['short_name']=="Neymar Jr"].tolist()[0]

4

In [13]:
vectors.iloc[4].values

array([0.38953205, 0.15064776, 0.        , 0.00860844, 0.20229842,
       0.17647309, 0.11836609, 0.20229842, 0.17647309, 0.18938575,
       0.18292942, 0.1786252 , 0.2001463 , 0.19799419, 0.16571253,
       0.16356042, 0.16356042, 0.19584208, 0.13558298, 0.19153786,
       0.1592562 , 0.19584208, 0.16356042, 0.14203931, 0.18938575,
       0.20229842, 0.18077731, 0.19584208, 0.14634353, 0.1398872 ,
       0.11406187])

In [27]:
player_stats.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,AerWon%
0,1,Max Aarons,ENG,DF,Norwich City,Premier League,22.0,2000,34,32,...,0.03,1.41,1.16,0.0,0.06,0.03,5.53,0.47,1.59,22.7
1,2,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,34.0,1987,34,34,...,0.0,0.06,1.39,0.0,0.03,0.0,6.77,2.02,1.36,59.8
2,3,Salis Abdul Samed,GHA,MF,Clermont Foot,Ligue 1,22.0,2000,31,29,...,0.0,0.36,1.24,0.0,0.0,0.0,8.76,0.88,0.88,50.0
3,4,Laurent Abergel,FRA,MF,Lorient,Ligue 1,29.0,1993,34,34,...,0.03,0.79,2.23,0.0,0.0,0.0,8.87,0.43,0.43,50.0
7,8,Tammy Abraham,ENG,FW,Roma,Serie A,24.0,1997,37,36,...,0.5,0.7,0.64,0.03,0.03,0.0,3.67,2.39,2.89,45.3


In [165]:
player_stats_og = pd.read_csv(r"data/football_rec/player_stats.csv",sep=';',encoding='latin-1')

In [166]:
att_attributes = ["Player","Pos","Goals","SoT%","G/Sh","G/SoT","Assists","Tkl","TklWon","TklAtt3rd","TklDri%","Press","Press%","PresAtt3rd",
                 "PresMid3rd","TouAttPen","TouAtt3rd","TouMid3rd","Car3rd","DriSucc%","PKwon","AerWon%"]

In [167]:
fw_features = player_stats_og[att_attributes].copy()

In [168]:
fw_features.columns[fw_features.isna().any()].tolist()

[]

In [169]:
norms = np.linalg.norm(fw_features.iloc[:, 2:], axis=0)
norms[norms == 0] = 1e-10

stats_vectors = fw_features.iloc[:, 2:].apply(lambda x: x/norms, axis=1)

In [170]:
def get_similar_players(name):
    
    input_vector = stats_vectors.iloc[fw_features.index[fw_features['Player'] == name].tolist()[0]].values
    similarity_scores = cosine_similarity([input_vector], stats_vectors)[0]
    closest_rows = fw_features.iloc[np.argsort(similarity_scores)[::-1][:10]]
    return closest_rows['Player'].tolist()

In [196]:
n_closest = get_similar_players('Sadio Mané')

In [197]:
n_closest

['Sadio Mané',
 'Iago Aspas',
 'Marko Arnautovi?',
 'Andrea Belotti',
 'Alexis Sánchez',
 'Raheem Sterling',
 'Kai Havertz',
 'Kingsley Coman',
 'José Luis Morales',
 'Joaquín Correa']

In [195]:
fw_features[fw_features['Player'].isin(n_closest)]

Unnamed: 0,Player,Pos,Goals,SoT%,G/Sh,G/SoT,Assists,Tkl,TklWon,TklAtt3rd,...,Press%,PresAtt3rd,PresMid3rd,TouAttPen,TouAtt3rd,TouMid3rd,Car3rd,DriSucc%,PKwon,AerWon%
121,Marko Arnautovi?,FW,0.47,35.2,0.13,0.38,0.03,0.74,0.5,0.2,...,29.7,7.06,6.62,6.22,22.3,19.4,1.1,59.3,0.0,48.0
136,Iago Aspas,FW,0.5,44.0,0.17,0.38,0.15,0.41,0.2,0.2,...,24.8,6.01,4.61,6.56,28.3,21.3,1.84,59.0,0.0,36.1
261,Andrea Belotti,FW,0.53,29.1,0.09,0.31,0.07,0.66,0.46,0.39,...,27.8,6.64,5.2,6.51,21.5,17.6,1.18,56.5,0.07,41.9
555,Kingsley Coman,FWDF,0.4,43.5,0.13,0.3,0.13,1.06,0.6,0.4,...,28.7,5.7,5.17,8.08,39.1,23.1,2.45,57.7,0.0,36.7
575,Joaquín Correa,FW,0.53,45.7,0.17,0.38,0.09,1.14,0.61,0.7,...,24.5,10.4,7.37,7.98,28.4,17.5,2.28,40.6,0.0,38.2
1144,Kai Havertz,FWMF,0.4,41.1,0.14,0.35,0.15,1.04,0.65,0.25,...,27.0,9.35,6.37,7.01,26.2,20.3,1.19,48.8,0.05,39.6
1573,José Luis Morales,FWMF,0.43,42.1,0.13,0.31,0.23,0.83,0.53,0.3,...,23.8,6.23,7.15,5.4,23.1,16.7,1.92,45.8,0.03,38.6
1640,Sadio Mané,FW,0.51,37.4,0.16,0.43,0.06,1.05,0.58,0.42,...,31.6,6.45,6.17,7.99,27.8,22.2,1.88,59.8,0.03,34.9
2378,Alexis Sánchez,FW,0.51,39.4,0.15,0.38,0.2,1.22,1.12,0.31,...,33.5,8.06,6.94,5.82,32.3,27.1,1.94,72.7,0.0,32.4
2565,Raheem Sterling,FW,0.55,46.3,0.2,0.44,0.21,1.19,0.76,0.55,...,29.3,5.59,6.23,9.19,36.0,16.1,1.4,45.0,0.08,15.2


In [161]:
fw_features[fw_features['Player']=="Thomas Müller"]

Unnamed: 0,Player,Goals,SoT%,G/Sh,G/SoT,Assists,Tkl,TklWon,TklAtt3rd,TklDri%,...,Press%,PresAtt3rd,PresMid3rd,TouAttPen,TouAtt3rd,TouMid3rd,Car3rd,DriSucc%,PKwon,AerWon%
1869,Thomas Müller,0.28,43.6,0.15,0.33,0.63,1.48,0.88,0.46,29.8,...,32.4,6.9,7.68,6.16,31.8,28.6,1.9,53.8,0.0,41.3
