In [29]:
import pandas as pd
from os import path

# Directory where data is stored
DATA_DIR = '../resources/code-soccer-files-main/data'

# Load the CSV data
df = pd.read_csv(path.join(DATA_DIR, 'player_match.csv'))

In [30]:
# processing
 
# Fill missing values with 0
df.fillna(0, inplace=True)

# Group by player_id and aggregate
columns_to_convert = ['shot', 'goal', 'goal_allowed', 'assist', 'pass', 'pass_accurate', 'tackle', 
                     'accel', 'counter', 'opportunity', 'keypass', 'own_goal', 'interception', 
                     'smart', 'clearance', 'cross', 'air_duel', 'air_duel_won', 'gk_leave_line', 
                     'gk_save_attempt', 'throw', 'corner']

agg_funcs = {col: 'sum' for col in columns_to_convert}
agg_funcs['min'] = 'sum'  # Make sure to sum up the minutes as well
grouped = df.groupby('player_id').agg(agg_funcs).reset_index()

# a small value to avoid division by zero
epsilon = 1e-10 

# Calculate per 90 metrics for relevant columns
for col in columns_to_convert:
    grouped[col+'_per_90'] = (grouped[col] / (grouped['min'] + epsilon)) * 90

# Save the grouped DataFrame to a new CSV file
grouped.to_csv('player_per_90.csv', index=False)

In [31]:
from sklearn.preprocessing import StandardScaler

# Create a copy of the grouped DataFrame for normalization
normalized_per_90 = grouped.copy()

# Normalize the per 90 metrics
scaler = StandardScaler()
columns_to_normalize = [col+'_per_90' for col in columns_to_convert]
normalized_per_90[columns_to_normalize] = scaler.fit_transform(normalized_per_90[columns_to_normalize])

# Save the grouped DataFrame to a CSV file
grouped.to_csv('player_per_90.csv', index=False)

# Save the normalized DataFrame to a new CSV file
normalized_per_90.to_csv('player_per_90_normalized.csv', index=False)

In [32]:
import faiss
import numpy as np

# Assuming df_normalized from previous steps is your DataFrame with player vectors
vectors = normalized_per_90.values.astype('float32')

# Build the index
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

# Search for the 5 most similar players to the player at index 0
D, I = index.search(vectors[0:1], 5)

print(f"Indices of the 5 most similar players to player at index 0: {I[0]}")
print(f"Distances: {D[0]}")

Indices of the 5 most similar players to player at index 0: [0 4 3 7 5]
Distances: [    0.     8330.235 21798.494 28163.    33589.23 ]


In [33]:
# 1. Load the CSV file and extract only the player_id and player_name columns
names_df = pd.read_csv(path.join(DATA_DIR, 'players.csv'))[['player_id', 'player_name']]

# 1. Create transformations for player_id to player_name and vice versa using names_df
id_to_name = pd.Series(names_df.player_name.values, index=names_df.player_id).to_dict()
name_to_id = {v: k for k, v in id_to_name.items()}

# Adjust the id_to_index and index_to_id mappings to use the player IDs from normalized_df
id_to_index = {player_id: idx for idx, player_id in enumerate(normalized_per_90['player_id'])}
index_to_id = {idx: player_id for player_id, idx in id_to_index.items()}


In [34]:
def find_similar_players(player_name, columns=None, k=5):
    # If no columns are specified, use all columns in normalized_df (excluding 'player_id')
    if columns is None:
        columns = [col for col in normalized_per_90.columns if col != 'player_id']
    
    # Extract vectors based on the specified columns
    specific_vectors = normalized_per_90[columns].values.astype('float32')
    
    # Build the FAISS index for the specified vectors
    specific_index = faiss.IndexFlatL2(specific_vectors.shape[1])
    specific_index.add(specific_vectors)
    
    # Find the player_id corresponding to the player_name using name_to_id
    if player_name not in name_to_id:
        return f"Player {player_name} not found in the dataset."
    
    player_id = name_to_id[player_name]
    
    # Get the vector for the given player_id based on the specified columns
    player_vector = specific_vectors[id_to_index[player_id]].reshape(1, -1)
    
    # Search for the k most similar players in FAISS using the specific index
    D, I = specific_index.search(player_vector, k+1)  # k+1 because the player will be most similar to themselves
    
    # Convert indices to player_ids and then use id_to_name to get player names, excluding the input player
    similar_players_ids = [index_to_id[idx] for idx in I[0] if idx != id_to_index[player_id]]
    similar_players_names = [id_to_name[player_id] for player_id in similar_players_ids]
    
    return similar_players_names[:k]

In [38]:
# Test the function
player_name = "L. Messi"
comparison_cols = ['shot', 'goal', 'goal_allowed', 'assist', 'pass', 'pass_accurate', 'tackle', 'accel', 'counter', 'opportunity', 'keypass']    #['shot', 'goal', 'goal_allowed', 'assist', 'pass', 'pass_accurate', 'tackle', 'accel', 'counter', 'opportunity', 'keypass', 'own_goal', 'interception', 'smart', 'clearance', 'cross', 'air_duel', 'air_duel_won', 'gk_leave_line', 'gk_save_attempt', 'throw', 'corner']
print(f"Players most similar to {player_name}:")
print(find_similar_players(player_name, comparison_cols))


Players most similar to L. Messi:
['X. Shaqiri', 'Salem Al Dawsari', 'A. Griezmann', 'Yasir Al Shahrani', 'S. Arias']
