In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('../data/2021-22/cleaned_players.csv')

df = pd.DataFrame(data)

In [3]:
print(df.columns)

Index(['first_name', 'second_name', 'goals_scored', 'assists', 'total_points',
       'minutes', 'goals_conceded', 'creativity', 'influence', 'threat',
       'bonus', 'bps', 'ict_index', 'clean_sheets', 'red_cards',
       'yellow_cards', 'selected_by_percent', 'now_cost', 'element_type'],
      dtype='object')


In [5]:
# Combinez certaines caractéristiques en une seule chaîne de texte
# Combine all columns into a single 'Caracteristiques' column
df['Caracteristiques'] = (
    df['goals_scored'].astype(str) + ' ' +
    df['assists'].astype(str) + 'b ' +
    df['total_points'].astype(str) + 'p ' +
    df['minutes'].astype(str) + 'cj ' +
    df['goals_conceded'].astype(str) + 'gc ' +
    df['creativity'].astype(str) + 'cr ' +
    df['influence'].astype(str) + 'inf ' +
    df['threat'].astype(str) + 'th ' +
    df['bonus'].astype(str) + 'bns ' +
    df['bps'].astype(str) + 'bps ' +
    df['ict_index'].astype(str) + 'ict ' +
    df['clean_sheets'].astype(str) + 'cs ' +
    df['red_cards'].astype(str) + 'rc ' +
    df['yellow_cards'].astype(str) + 'yc ' +
    df['selected_by_percent'].astype(str) + 'sbp ' +
    df['now_cost'].astype(str) + 'nc ' +
    df['element_type'].astype(str) + 'et'
)

In [6]:
# Utilisez TF-IDF pour créer une matrice de caractéristiques
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Caracteristiques'])

In [7]:
# Calcul de la similarité cosinus entre les joueurs
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [8]:
# Display the cosine similarity matrix
print(cosine_sim)

[[1.         0.19298389 0.1172453  ... 0.13581157 0.20052823 0.01685944]
 [0.19298389 1.         0.53105125 ... 0.77038262 0.64019309 0.04812575]
 [0.1172453  0.53105125 1.         ... 0.593655   0.66611643 0.06577815]
 ...
 [0.13581157 0.77038262 0.593655   ... 1.         0.79001227 0.08356677]
 [0.20052823 0.64019309 0.66611643 ... 0.79001227 1.         0.11027664]
 [0.01685944 0.04812575 0.06577815 ... 0.08356677 0.11027664 1.        ]]


In [41]:
#access to player and team data
team_player = pd.read_csv('../data/2021-22/fbref_overview.csv')
real_name_to_field_name = pd.read_csv('../data/2021-22/id_dict.csv')

In [28]:
#team for testing
dict_team_player = {"Allan":"Everton",
                    "André Gomes":"Chelsea",
                    "Andros Townsend":"Leicester City",
                    "Anthony Gordon":"Crystal Palace",
                    "Asmir Begović":"West Ham",
                    "Ben Godfrey":"Everton",
                    "Bernard":"Newcastle Utd",
                    "Cenk Tosun":"Everton",
                    "Davy Klaassen":"Brighton",
                    "Demarai Gray":"Manchester City",
}

In [51]:
def filtered_player_by_club(similar_player_indices,dict_team_player):
    #filtrer les joueurs dans similar_players pour enlver ceux ou le club apparairait dans le dict 3 fois 
    new_similar_players_indices = []
    for x in similar_player_indices : 
        player_name = df.loc[x, 'first_name'] + ' ' + df.loc[x, 'second_name']
        try : 
            team = team_player.loc[team_player['player'] == player_name]["squad"].values[0]
        except :
            try:
                new_name = real_name_to_field_name.loc[real_name_to_field_name[' FPL_Name'] == player_name][" Understat_Name"].values[0]
                team = team_player.loc[team_player['player'] == new_name]["squad"].values[0]
            except:
                print("player not found", player_name)
                pass
        if sum(1 for t in dict_team_player.values() if t == team) < 3 or player_name not in dict_team_player.keys():
            new_similar_players_indices.append(x)
        else:
            pass
    return new_similar_players_indices

In [40]:
team_player.loc[team_player['player'] == "Kayky da Silva Chagas"]

Unnamed: 0,pens_made,age,minutes_90s,pens_att,goals_pens,player,assists,xg,position,npxg_per90,...,cards_red,goals_assists_pens_per90,xg_per90,goals_per90,games_starts,goals_assists_per90,xg_xa_per90,goals,xa_per90,goals_pens_per90


In [52]:
def get_recommendations(player,dict_team_player, cosine_similarities, df, num_recommendations=5):
    name_parts = player.split()
    
    # Assuming you want to match both first_name and second_name
    first_name_match = df['first_name'] == name_parts[0]
    second_name_match = df['second_name'] == name_parts[1]

    # Retrieve the index of the row where both first and second names match
    player_index = df.index[(first_name_match) & (second_name_match)][0]
    print(df.iloc[player_index])
    # Get the element_type of the target player
    target_position = df.loc[player_index, 'element_type']

    # Calculate cosine similarities for the target player
    similar_players = list(enumerate(cosine_similarities[player_index]))
    
    # Sort the players by similarity in descending order
    similar_players = sorted(similar_players, key=lambda x: x[1], reverse=True)
    
    # Exclude the target player itself
    similar_players = similar_players[1:]
    
    # Filter similar players based on the same position (element_type)
    similar_player_indices = [x[0] for x in similar_players if df.loc[x[0], 'element_type'] == target_position]
    similar_player_indices = filtered_player_by_club(similar_player_indices,dict_team_player)
    # Take only the top num_recommendations recommendations
    similar_player_indices = similar_player_indices[:num_recommendations]
    
    # Return the DataFrame with recommended players
    return df.iloc[similar_player_indices]


In [53]:

player_name = "Folarin Balogun"
recommendations = get_recommendations(player_name,dict_team_player, cosine_sim, df)
print(recommendations)


first_name                                                       Folarin
second_name                                                      Balogun
goals_scored                                                           0
assists                                                                0
total_points                                                           2
minutes                                                               69
goals_conceded                                                         1
creativity                                                           3.7
influence                                                            4.2
threat                                                              17.0
bonus                                                                  0
bps                                                                    2
ict_index                                                            1.9
clean_sheets                                       

In [15]:

# Example of usage
user_choice = 'Max	Aarons'
recommendations = get_recommendations(user_choice, cosine_sim, df, num_recommendations=5)

# Print the top 5 recommendations
print(f"Top 5 Recommendations for {user_choice}:")
print(recommendations)



IndexError: invalid index to scalar variable.