In [12]:
from functions import *

# Models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import pickle

# Data
import pandas as pd
import numpy as np

# Dataviz
import plotly.express as px

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [13]:
# Read the match dataset from disk; both names are bound to the same frame.
dataset = dataset_with_goals_and_assists = pd.read_feather(
    "dataset_with_goals_and_assists.feather"
)
# Alternative kept for reference: work without the goals / assists columns.
# dataset = dataset_with_goals_and_assists.drop(['goals', 'goal_assist'], axis=1)

In [14]:
# Predictors are the columns returned by feature_selection (alpha=0);
# the 'win' column is the classification target.
# NOTE(review): the classifier trained on these features scores ~100% accuracy —
# confirm that selected_features contains neither 'win' nor a direct proxy for it.
selected_features = feature_selection(df=dataset, alpha=0).columns
X = dataset.loc[:, selected_features]
y = dataset.loc[:, 'win']


With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator


Coordinate descent with no regularization may lead to unexpected results and is discouraged.


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.315e+02, tolerance: 1.249e-01 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.



In [15]:
# Hold out 20% of the rows for the final evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics on the test
# split. Re-fitting on X_test would standardise the two splits with different
# means/variances and leak test-set information into the evaluation.
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

In [16]:
# Logistic-regression classifier with the iteration cap raised to 1000
LOG_clf = LogisticRegression(max_iter=1000)

# 5-fold cross-validated accuracy on the scaled training split
LOG_scores = cross_val_score(
    LOG_clf,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=8,
)
print("Cross-validation scores:", LOG_scores)
print("Mean cross-validation score:", LOG_scores.mean())

# Refit on the whole training split, then predict the held-out test split
LOG_clf.fit(X_train_scaled, y_train)
LOG_pred = LOG_clf.predict(X_test_scaled)

# Test-set accuracy, per-class report and confusion matrix
print("Accuracy on test set:", accuracy_score(y_test, LOG_pred))
print(classification_report(y_test, LOG_pred))
print(confusion_matrix(y_test, LOG_pred))

Cross-validation scores: [1.         0.99882904 1.         1.         1.        ]
Mean cross-validation score: 0.9997658079625292
Accuracy on test set: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       674
           1       1.00      1.00      1.00       393

    accuracy                           1.00      1067
   macro avg       1.00      1.00      1.00      1067
weighted avg       1.00      1.00      1.00      1067

[[674   0]
 [  0 393]]


In [54]:
# First row of the fitted coefficient matrix: one coefficient per feature
coefficients = LOG_clf.coef_[0]
# Cube root shrinks the magnitudes while preserving each coefficient's sign
cbrt_coefficients = np.cbrt(coefficients)
# Raw (unscaled) coefficient keyed by feature / column name
features_weights = dict(zip(X.columns, coefficients))

In [55]:
# Feature names, in the same order as the fitted coefficients
column_names = X_train.columns

# Ensure the lengths match
assert len(coefficients) == len(column_names), "Coefficients and column names must have the same length."

# Order the features by their cube-root-scaled coefficient (ascending)
sorted_indices = np.argsort(cbrt_coefficients)
sorted_coefficients = cbrt_coefficients[sorted_indices]
sorted_column_names = [column_names[i] for i in sorted_indices]

# Bar chart of the scaled coefficients. The axes are labelled explicitly so the
# figure itself states that the plotted values are cube roots, not raw coefficients.
fig = px.bar(
    x=sorted_column_names,
    y=sorted_coefficients,
    labels={"x": "Feature", "y": "Cube root of logistic-regression coefficient"},
    title="Logistic-regression feature weights (cube-root scale)",
)
fig.show()

# Prediction

In [19]:
# Load the player match data from a project-relative path instead of an absolute
# path on one user's machine; this is the same location the notebook reads when
# computing the cluster feature weights.
# NOTE(review): assumes the working directory is the Seasonal-Player-Rating folder — confirm.
players = pd.read_feather("Project2-3 - match-seasonal index/match_data.feather")
# Minutes played per row: end minute minus start minute
players['mins_played'] = players['end_minute'] - players['start_minute']
# Append ['goals', 'goal_assist'] to the column list to use them separately from the logistic model:
# players_data = players[['competition','mins_played'] + ['player_id','game_id','team_id','team','season','full_name','cluster_label'] + list(selected_features) + ['goals', 'goal_assist']].fillna(0)
# Identifier columns followed by the model features; missing values become 0
players_data = players[
    ['competition', 'mins_played']
    + ['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label']
    + list(selected_features)
].fillna(0)

In [20]:
# Per-cluster feature weights computed from the match data file. A forward slash
# is used as the path separator: the former "\m" was an invalid escape sequence
# in a non-raw string and only resolved as a path on Windows.
cluster_features_weights = get_cluster_features_weights(
    pd.read_feather("Project2-3 - match-seasonal index/match_data.feather")
)
# For every cluster keep only the features present in features_weights,
# in the same key order as features_weights.
cluster_features_weights = {
    cluster: {feature: weights[feature] for feature in features_weights}
    for cluster, weights in cluster_features_weights.items()
}

In [21]:
# def calculate_playerank(row: pd.DataFrame | pd.Series, features_weights: dict, cluster_features_weights: dict) -> float:

#     # Get the cluster weights for the current player
#     cluster = row['cluster_label'].values[0]
#     cluster_weights = cluster_features_weights[cluster]
#     # Use alpha and beta as the same weights of xG and xA for goals and assists
#     alpha = 1 
#     beta = 0.2
    
#     # Calculate Goals and Assists
#     goals = row['goals']
#     assists = row['goal_assist']

#     # Drop goals and assists
#     row = row.drop(['goals', 'goal_assist'], axis=1)

#     # Drop indicators
#     row = row.drop(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label'] + ['competition','mins_played'], axis=1)
    
#     # Calculate playerank by summing the products
#     playerank = (np.array(list(features_weights.values())) * np.array(list(cluster_weights.values())) * row.values).sum()
#     # Add the weighted goals and assists values. Weights are chosen as the same weights of xG and xA cluster-wise
#     playerank = playerank*(1-alpha-beta) + goals*alpha + assists*beta
        
#     return playerank

In [22]:
def calculate_playerank(row: pd.DataFrame | pd.Series, features_weights: dict, cluster_features_weights: dict) -> float:

    # Get the cluster weights for the current player
    cluster = row['cluster_label'].values[0]
    cluster_weights = cluster_features_weights[cluster]
    # cluster_weights['goals'] = cluster_weights['goals'] * cluster_features_weights[cluster]['xG']
    # cluster_weights['goal_assist'] = cluster_weights['goal_assist'] * cluster_features_weights[cluster]['xA']

    # Drop indicators
    row = row.drop(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label'] + ['competition','mins_played'], axis=1)
    
    # Calculate playerank by summing the products
    playerank = (np.array(list(features_weights.values())) * np.array(list(cluster_weights.values())) * row.values).sum()

    return playerank

In [23]:
# Playerank of every (player, game) group of the 2023 season
players_playerank = (
    players_data.loc[players_data['season'] == 2023]
    .groupby(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name'])
    .apply(lambda group: calculate_playerank(group, features_weights, cluster_features_weights))
)





In [24]:
# Last non-null cluster / competition per player, in row order (indexed by player_id)
player_cluster = (
    players_data[['player_id', 'cluster_label', 'competition']]
    .groupby('player_id')
    .last()
)
# Total minutes played per player and season
player_minutes = (
    players_data[['player_id', 'mins_played', 'season']]
    .groupby(['player_id', 'season'])
    .sum()
    .reset_index()
)
# Attach the 2023 minutes to each player's cluster and index by player_id again
player_cluster = player_cluster.merge(
    player_minutes[player_minutes['season'] == 2023],
    left_on='player_id',
    right_on='player_id',
).set_index('player_id')
# Mean per-game playerank of each player, as a Series named 'playerank'
players_playerank_series = (
    players_playerank
    .groupby(['player_id', 'full_name'])
    .mean()
    .rename('playerank')
)
# Combine cluster, minutes and playerank into the final ranking table
ranking = player_cluster.merge(
    players_playerank_series,
    left_index=True,
    right_on='player_id',
)

In [25]:
# 40 highest playerank values among players with more than 2000 minutes
(
    ranking.loc[ranking['mins_played'] > 2000]
    .sort_values(by='playerank', ascending=False)
    .head(40)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster_label,competition,season,mins_played,playerank
player_id,full_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
244855,Jude Bellingham,Mobile finisher,La Liga,2023,2143.0,11.360321
212690,Vangelis Pavlidis,All-round finisher,Eredivisie,2023,2782.0,10.506381
209737,Lautaro Martínez,All-round finisher,Serie A,2023,2361.0,9.840866
209244,Phil Foden,Chance creator,Premier League,2023,2597.0,9.594894
244851,Cole Palmer,One-to-one explorer,Premier League,2023,2186.0,9.459614
53041,Luuk de Jong,Target man,Eredivisie,2023,2480.0,9.380232
199248,Borja Mayoral,Mobile finisher,La Liga,2023,2243.0,9.294008
432234,Santiago Giménez,All-round finisher,Eredivisie,2023,2461.0,9.019867
178186,Jarrod Bowen,One-to-one explorer,Premier League,2023,2970.0,8.924275
441264,Brian Brobbey,All-round finisher,Eredivisie,2023,2186.0,8.694018


In [26]:
# ranking.groupby('cluster_label').sum() / ranking.groupby('cluster_label').count()

In [27]:
# Five highest playerank values of every cluster (more than 2000 minutes played)
for cluster in ranking['cluster_label'].unique():
    print(
        ranking.loc[(ranking['mins_played'] > 2000) & (ranking['cluster_label'] == cluster)]
        .sort_values(by='playerank', ascending=False)
        .head(5)
    )

                                 cluster_label     competition  season  \
player_id full_name                                                      
543612    Federico Gatti     Buildup initiator         Serie A    2023   
510162    Ryan Flamingo      Buildup initiator      Eredivisie    2023   
462424    William Saliba     Buildup initiator  Premier League    2023   
95090     André Ramalho      Buildup initiator      Eredivisie    2023   
226597    Gabriel Magalhães  Buildup initiator  Premier League    2023   

                             mins_played  playerank  
player_id full_name                                  
543612    Federico Gatti          2339.0   2.452368  
510162    Ryan Flamingo           2539.0   2.302959  
462424    William Saliba          3133.0   2.249328  
95090     André Ramalho           2146.0   2.203498  
226597    Gabriel Magalhães       2741.0   2.117557  
                           cluster_label     competition  season  mins_played  \
player_id full_name   

In [28]:
# 40 highest playerank values among Serie A players with more than 2000 minutes
(
    ranking.loc[(ranking['mins_played'] > 2000) & (ranking['competition'] == 'Serie A')]
    .sort_values(by='playerank', ascending=False)
    .head(40)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster_label,competition,season,mins_played,playerank
player_id,full_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
209737,Lautaro Martínez,All-round finisher,Serie A,2023,2361.0,9.840866
439492,Lewis Ferguson,Roaming playmaker,Serie A,2023,2882.0,7.984007
458249,Joshua Zirkzee,Mobile finisher,Serie A,2023,2557.0,7.617533
171384,Nicolò Barella,Chance creator,Serie A,2023,2548.0,7.246367
57249,Henrikh Mkhitaryan,Roaming playmaker,Serie A,2023,2598.0,6.581564
203325,Frank Anguissa,Roaming playmaker,Serie A,2023,2385.0,6.473591
103086,Duván Zapata,All-round finisher,Serie A,2023,2536.0,6.163
204644,Teun Koopmeiners,Chance creator,Serie A,2023,2373.0,5.979986
120743,Adrien Rabiot,Roaming playmaker,Serie A,2023,2374.0,5.768393
215147,Albert Gudmundsson,Chance creator,Serie A,2023,2837.0,5.751448
