In [138]:
from functions import *

# Models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import pickle

# Data
import pandas as pd
import numpy as np

# Dataviz
import plotly.express as px

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [269]:
# Load the match-level dataset from disk.
# The alternative below would strip the 'goals' and 'goal_assist' columns; it
# is disabled, so `dataset` is just a second name for the full frame.
# dataset = dataset_with_goals_and_assists.drop(['goals', 'goal_assist'], axis=1)
dataset = dataset_with_goals_and_assists = pd.read_feather("dataset_with_goals_and_assists.feather")

In [280]:
# Keep only the columns retained by the project helper `feature_selection`
# (called with alpha=0.01).
# The target must never be part of the design matrix: if the helper returns
# the 'win' column together with the predictors, the classifier would simply
# read the label back. Dropping it is a no-op when it is not present.
# NOTE(review): the model below scores 1.0 on every fold and on the test set,
# which usually signals leakage -- confirm what `feature_selection` returns.
selected_features = feature_selection(df=dataset, alpha=0.01).columns.drop('win', errors='ignore')
X = dataset[selected_features]
y = dataset['win']

In [281]:
# Hold out 20% of the rows for the final evaluation (fixed seed so the split
# is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics (mean / std) on the training split ONLY and
# reuse them for the test split. Calling `fit_transform` on the test split
# would re-fit the scaler on test data, so train and test would be scaled with
# different statistics and the evaluation would no longer mimic unseen data.
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

In [282]:
# Logistic-regression classifier for the 'win' target.
LOG_clf = LogisticRegression(max_iter=1000)

# 5-fold cross-validated accuracy on the (scaled) training split.
LOG_scores = cross_val_score(
    estimator=LOG_clf,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
    n_jobs=8,
)
print("Cross-validation scores:", LOG_scores)
print("Mean cross-validation score:", LOG_scores.mean())

# Fit on the whole training split, then score the held-out test split.
# NOTE(review): a perfect score on every fold and on the test set is
# suspicious; check the features for target leakage before trusting it.
LOG_pred = LOG_clf.fit(X_train_scaled, y_train).predict(X_test_scaled)
print("Accuracy on test set:", accuracy_score(y_test, LOG_pred))
for summary in (classification_report(y_test, LOG_pred), confusion_matrix(y_test, LOG_pred)):
    print(summary)

Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0
Accuracy on test set: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       674
           1       1.00      1.00      1.00       393

    accuracy                           1.00      1067
   macro avg       1.00      1.00      1.00      1067
weighted avg       1.00      1.00      1.00      1067

[[674   0]
 [  0 393]]


In [283]:
# Coefficients of the fitted logistic regression (first and only row of
# `coef_` is used), paired with the column labels of X in column order.
coefficients = LOG_clf.coef_[0]
features_weights = dict(zip(X.columns, coefficients))

In [284]:
# Feature names, in the same order as the columns the model was trained on.
column_names = X_train.columns

# One coefficient per feature is required for the pairing below to make sense.
assert len(coefficients) == len(column_names), "Coefficients and column names must have the same length."

# Order the features from the most negative to the most positive coefficient.
sorted_indices = np.argsort(coefficients)
sorted_coefficients = coefficients[sorted_indices]
sorted_column_names = column_names[sorted_indices].tolist()

# Bar chart of the sorted coefficients (Plotly Express).
fig = px.bar(x=sorted_column_names, y=sorted_coefficients)
fig.show()

# Prediction

In [329]:
# Per-appearance match data.
# NOTE(review): absolute, machine-specific path -- the notebook only runs on
# the original author's machine unless this is made relative/configurable.
players = pd.read_feather(r"C:\Users\leoac\OneDrive - Università degli Studi di Milano\Unimi\Subjects\Labs\Football analytics\SPR - Project\Seasonal-Player-Rating\Project2-3 - match-seasonal index\match_data.feather")

# Minutes on the pitch for each appearance.
players['mins_played'] = players['end_minute'].sub(players['start_minute'])

# Context columns, identifier columns, then the model features; missing values become 0.
# Append ['goals', 'goal_assist'] to the selection if they should be handled
# separately from the logistic-regression features:
# players_data = players[['player_id','game_id','team_id','team','season','full_name','cluster_label'] + list(selected_features) + ['goals', 'goal_assist']].fillna(0)
players_data = players[
    [
        'competition', 'mins_played',
        'player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label',
        *selected_features,
    ]
].fillna(0)

In [330]:
# Per-cluster feature weights, computed by the project helper
# `get_cluster_features_weights` from the match data file.
# The path is a raw string: "\m" is not a valid escape sequence, so the
# non-raw literal triggers a SyntaxWarning on recent Python versions (and is
# slated to become an error). The resulting path is unchanged.
cluster_features_weights = get_cluster_features_weights(pd.read_feather(r"Project2-3 - match-seasonal index\match_data.feather"))

# Keep, for every cluster, only the features used by the logistic model,
# in the same order as `features_weights`.
cluster_features_weights = {
    cluster: {feature: weights[feature] for feature in features_weights}
    for cluster, weights in cluster_features_weights.items()
}

In [287]:
# def calculate_playerank(row: pd.DataFrame | pd.Series, features_weights: dict, cluster_features_weights: dict) -> float:

#     # Get the cluster weights for the current player
#     cluster = row['cluster_label'].values[0]
#     cluster_weights = cluster_features_weights[cluster]
#     # Use the cluster's xG and xA weights as alpha and beta, i.e. as the weights of goals and assists
#     alpha = cluster_features_weights[cluster]['xG']
#     beta = 0 # cluster_features_weights[cluster]['xA']
    
#     # Calculate Goals and Assists
#     goals = row['goals']
#     assists = row['goal_assist']

#     # Drop goals and assists
#     row = row.drop(['goals', 'goal_assist'], axis=1)

#     # Drop indicators
#     row = row.drop(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label'], axis=1)
    
#     # Calculate playerank by summing the products
#     playerank = (np.array(list(features_weights.values())) * np.array(list(cluster_weights.values())) * row.values).sum()
#     # Add the weighted goals and assists values. Weights are chosen as the same weights of xG and xA cluster-wise
#     playerank = playerank*(1-alpha-beta) + goals*alpha + assists*beta
        
#     return playerank

In [332]:
def calculate_playerank(row: pd.DataFrame | pd.Series, features_weights: dict, cluster_features_weights: dict) -> float:

    # Get the cluster weights for the current player
    cluster = row['cluster_label'].values[0]
    cluster_weights = cluster_features_weights[cluster]
    # cluster_weights['goals'] = cluster_weights['goals'] * cluster_features_weights[cluster]['xG']
    # cluster_weights['goal_assist'] = cluster_weights['goal_assist'] * cluster_features_weights[cluster]['xA']

    # Drop indicators
    row = row.drop(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label'] + ['competition','mins_played'], axis=1)
    
    # Calculate playerank by summing the products
    playerank = (np.array(list(features_weights.values())) * np.array(list(cluster_weights.values())) * row.values).sum()

    return playerank

In [333]:
# Score the 2023 season: one playerank value per
# (player, game, team, season, name) group of rows.
players_playerank = (
    players_data.loc[players_data['season'] == 2023]
    .groupby(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name'])
    .apply(lambda group: calculate_playerank(group, features_weights, cluster_features_weights))
)





In [347]:
# Last recorded cluster label and competition for each player ...
player_cluster = players_data.groupby('player_id')[['cluster_label', 'competition']].last()
# ... joined with the player's total minutes played.
player_cluster = player_cluster.merge(
    players_data.groupby('player_id')[['mins_played']].sum(),
    right_on='player_id',
    left_index=True,
)

In [348]:
# Average the per-game scores of each player into a single 'playerank' value.
players_playerank_series = (
    players_playerank.groupby(['player_id', 'full_name']).mean().rename('playerank')
)
# Attach the player's cluster, competition and minutes to the averaged score.
ranking = player_cluster.merge(players_playerank_series, right_on='player_id', left_index=True)

In [336]:
# ranking.groupby('cluster_label').sum() / ranking.groupby('cluster_label').count()

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [337]:
# Print the five highest-ranked players of every cluster.
for cluster in ranking['cluster_label'].unique():
    in_cluster = ranking['cluster_label'] == cluster
    top_five = ranking.loc[in_cluster].sort_values(by='playerank', ascending=False).head(5)
    print(top_five)

                                     cluster_label competition  playerank
player_id full_name                                                      
247359    Alessandro Buongiorno  Buildup initiator     Serie A   1.562528
543612    Federico Gatti         Buildup initiator     Serie A   1.559263
510162    Ryan Flamingo          Buildup initiator  Eredivisie   1.351816
554605    Dean Huijsen           Buildup initiator     Serie A   1.319025
170856    Damon Mirani           Buildup initiator  Eredivisie   1.183828
                           cluster_label     competition  playerank
player_id full_name                                                
207283    Mathias Jensen    Wide creator  Premier League   2.225117
161919    Federico Dimarco  Wide creator         Serie A   0.926953
202072    Bart Nieuwkoop    Wide creator      Eredivisie   0.915947
219435    Jordy Bruijn      Wide creator      Eredivisie   0.908626
466044    Jesús Vázquez     Wide creator         La Liga   0.882150
      

In [349]:
# Top 40 Serie A players by playerank (displayed as the cell output).
(
    ranking.loc[ranking['competition'] == 'Serie A']
    .sort_values(by='playerank', ascending=False)
    .head(40)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster_label,competition,mins_played,playerank
player_id,full_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
209737,Lautaro Martínez,All-round finisher,Serie A,5048.0,8.546492
218329,Victor Osimhen,All-round finisher,Serie A,4327.0,6.824948
215145,Sam Lammers,Mobile finisher,Serie A,3342.0,6.188773
225479,Dusan Vlahovic,Target man,Serie A,3988.0,5.704125
195899,Gianluca Scamacca,Mobile finisher,Serie A,2105.0,5.183011
458249,Joshua Zirkzee,Mobile finisher,Serie A,3387.0,5.16773
120638,Paulo Dybala,Chance creator,Serie A,3436.0,5.006822
44346,Olivier Giroud,Target man,Serie A,4291.0,4.815105
103086,Duván Zapata,All-round finisher,Serie A,3967.0,4.771592
116796,Domenico Berardi,Chance creator,Serie A,3443.0,4.556105
