In [1]:
import pandas as pd
from os import path

# Folder that holds the raw CSV inputs (relative to this notebook)
DATA_DIR = '../resources/code-soccer-files-main/data'

# Read the player/match table into a DataFrame
player_match_csv = path.join(DATA_DIR, 'player_match.csv')
df = pd.read_csv(player_match_csv)

In [2]:
# Replace every missing value in the loaded table with 0 (modifies df in place)
df.fillna(0, inplace=True)

# Stat columns that are totalled per player and then turned into per-90 rates
columns_to_convert = [
    'shot', 'goal', 'goal_allowed', 'assist', 'pass', 'pass_accurate', 'tackle',
    'accel', 'counter', 'opportunity', 'keypass', 'own_goal', 'interception',
    'smart', 'clearance', 'cross', 'air_duel', 'air_duel_won', 'gk_leave_line',
    'gk_save_attempt', 'throw', 'corner',
]

# Every stat column, plus the minutes column 'min', is summed within each player
agg_funcs = dict.fromkeys(columns_to_convert + ['min'], 'sum')
grouped = df.groupby('player_id').agg(agg_funcs).reset_index()

# Tiny offset added to the minutes so the division below never divides by exactly zero
epsilon = 1e-10

# Per-90 rate = total / minutes * 90, stored as a new '<stat>_per_90' column per stat
minutes = grouped['min'] + epsilon
grouped = grouped.assign(**{
    col + '_per_90': (grouped[col] / minutes) * 90
    for col in columns_to_convert
})

# Persist the per-player totals and per-90 rates
grouped.to_csv('player_per_90.csv', index=False)

In [3]:
from sklearn.preprocessing import StandardScaler

# Work on a copy so `grouped` keeps the un-normalized per-90 values
normalized_per_90 = grouped.copy()

# Standardize only the '<stat>_per_90' columns with StandardScaler; player_id, the raw
# totals and the minutes column are left exactly as they are in `grouped`.
scaler = StandardScaler()
columns_to_normalize = [col + '_per_90' for col in columns_to_convert]
normalized_per_90[columns_to_normalize] = scaler.fit_transform(normalized_per_90[columns_to_normalize])

# `grouped` is not modified in this cell and was already written to
# 'player_per_90.csv' right after it was built, so it is not saved a second time here.
# Only the normalized frame is new and needs to be written.
normalized_per_90.to_csv('player_per_90_normalized.csv', index=False)

In [4]:
import faiss
import numpy as np

# Build the player vectors from the standardized per-90 columns only.
# Taking every column of normalized_per_90 would also put player_id, the raw summed
# totals and the total minutes into the L2 distance. Those columns are not scaled, so
# they would dominate the standardized features, and player_id is an identifier rather
# than a statistic.
# np.ascontiguousarray yields the C-contiguous float32 layout faiss expects; an array
# taken from a DataFrame column selection is not guaranteed to be C-ordered.
vectors = np.ascontiguousarray(
    normalized_per_90[columns_to_normalize].to_numpy(), dtype='float32'
)

# Flat (exhaustive) L2 index; row i of the index is row i of normalized_per_90
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

# Search for the 5 nearest rows to the player at row 0. The query vector is itself
# stored in the index, so that player is expected to show up among its own neighbours.
# D receives the distances and I the matching row positions.
D, I = index.search(vectors[0:1], 5)

In [5]:
# Read the players file, keeping only the player_id and player_name columns
players_csv = path.join(DATA_DIR, 'players.csv')
names_df = pd.read_csv(players_csv)[['player_id', 'player_name']]

# Lookup tables between player_id and player_name, in both directions
id_to_name = names_df.set_index('player_id')['player_name'].to_dict()
name_to_id = {name: pid for pid, name in id_to_name.items()}

# Lookup tables between player_id and row position in normalized_per_90, in both directions
id_to_index = {pid: position for position, pid in enumerate(normalized_per_90['player_id'])}
index_to_id = {position: pid for pid, position in id_to_index.items()}


In [6]:
def rank_players(columns_dict, k=5):
    """
    Return the names of the k best players by aggregate rank over several columns.

    Each requested column of ``normalized_per_90`` is ranked on its own, the
    per-column ranks are summed for every player, and the players with the
    smallest rank sums are returned, best first.

    Parameters:
    - columns_dict (dict): Dictionary where keys are column names and values are booleans.
                           If True, the largest value in that column gets rank 1.
                           If False, the smallest value in that column gets rank 1.
    - k (int): Number of players to return.

    Returns:
    - List of player names. An empty list is returned when columns_dict is empty.
      For backward compatibility, an error message string (not a list) is returned
      when a requested column is not present in ``normalized_per_90``.

    Raises:
    - KeyError: if a selected player_id has no entry in ``id_to_name``.
    """

    # Validate up front so an unknown column yields a readable message
    for column in columns_dict:
        if column not in normalized_per_90.columns:
            return f"Column {column} not found in the dataset."

    # Nothing to rank by: there is no meaningful "top k"
    if not columns_dict:
        return []

    # One rank column per criterion; all of them share normalized_per_90's index
    rank_df = pd.DataFrame({
        column: normalized_per_90[column].rank(ascending=not high)
        for column, high in columns_dict.items()
    })

    # Keep the aggregate as its own Series instead of adding it to rank_df, so a
    # criterion that happens to be called 'aggregate_rank' is neither overwritten
    # nor counted twice.
    aggregate_rank = rank_df.sum(axis=1)

    # Index labels of the k rows with the lowest (best) rank sum
    best_labels = aggregate_rank.sort_values().head(k).index

    # These are index *labels*, so select with .loc. Positional .iloc only gave the
    # right rows while normalized_per_90 kept a default 0..n-1 index.
    top_k_ids = normalized_per_90.loc[best_labels, 'player_id'].tolist()

    # Convert player_ids to player names using id_to_name
    return [id_to_name[player_id] for player_id in top_k_ids]

In [7]:
# Try the function out: "shot" is ranked from highest to lowest (True),
# "goal" from lowest to highest (False)
columns_dict = {"shot": True, "goal": False}

print(f"Top 5 players based on aggregate ranking across {list(columns_dict.keys())}:")
top_players = rank_players(columns_dict, k=5)
print(top_players)

Top 5 players based on aggregate ranking across ['shot', 'goal']:
['M. Berg', 'O. Giroud', 'R. Sterling', 'Trézéguet', 'Gabriel Jesus']
