In [1]:
from functions import *

# Models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import pickle

# Data
import pandas as pd
import numpy as np

# Dataviz
import plotly.express as px

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [71]:
dataset_with_goals = pd.read_feather("dataset_with_goals.feather")
dataset = dataset_with_goals.drop('goals', axis=1)

In [72]:
selected_features = feature_selection(df=dataset, alpha=0.01).columns
X = dataset[selected_features]
y = dataset['win']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.fit_transform(X_test)

In [5]:
# Create the Logistic Regression model
LOG_clf = LogisticRegression(max_iter=1000)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
LOG_scores = cross_val_score(LOG_clf, X_train_scaled, y_train, cv=5, scoring='accuracy', n_jobs=8)

# Print the cross-validation scores
print("Cross-validation scores:", LOG_scores)
print("Mean cross-validation score:", np.mean(LOG_scores))

# Train the model on the entire training set
LOG_clf.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
LOG_pred = LOG_clf.predict(X_test_scaled)
print("Accuracy on test set:", accuracy_score(y_test, LOG_pred))
print(classification_report(y_test, LOG_pred))
print(confusion_matrix(y_test, LOG_pred))

Cross-validation scores: [0.8793911  0.86299766 0.88862837 0.88511137 0.88276671]
Mean cross-validation score: 0.8797790413226771
Accuracy on test set: 0.8734770384254921
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       674
           1       0.84      0.81      0.83       393

    accuracy                           0.87      1067
   macro avg       0.87      0.86      0.86      1067
weighted avg       0.87      0.87      0.87      1067

[[612  62]
 [ 73 320]]


In [7]:
coefficients = LOG_clf.coef_[0]
features_weights = {col: coeff for col, coeff in zip(X, coefficients)}

In [8]:
# Example column names, replace these with your actual column names
column_names = X_train.columns

# Ensure the lengths match
assert len(coefficients) == len(column_names), "Coefficients and column names must have the same length."

# Sort coefficients and column names by the coefficients
sorted_indices = np.argsort((coefficients))
sorted_coefficients = coefficients[sorted_indices]
sorted_column_names = [column_names[i] for i in sorted_indices]

# Plot using Plotly Express
fig = px.bar(x=sorted_column_names, y=sorted_coefficients)
fig.show()

# Prediction

In [74]:
players = pd.read_feather(r"C:\Users\leoac\OneDrive - Università degli Studi di Milano\Unimi\Subjects\Labs\Football analytics\SPR - Project\Seasonal-Player-Rating\Project2-3 - match-seasonal index\match_data.feather")
players_data = players[['player_id','game_id','team_id','team','season','full_name','cluster_label'] + list(selected_features) + ['goals']].fillna(0)

In [75]:
players_data

Unnamed: 0,player_id,game_id,team_id,team,season,full_name,cluster_label,accurate_throws,aerial_won,ball_recovery,...,touches_attack_third,defensive_aerials,defensive_actions_height,dribbles_final_third,SGA,xGoT,xG,prog_carries_final_third,opxG,goals
0,80607,2292978,1,Manchester United,2022,Christian Eriksen,Roaming playmaker,0.0,0.0,5.0,...,33,0.0,58.725000,0,0.00000,0.000000,0.075655,2,0.075655,0.0
1,154296,2367682,54,Fulham,2023,João Palhinha,Ball stealer,0.0,2.0,7.0,...,5,1.0,40.775000,0,0.00000,0.000000,0.000000,0,0.000000,0.0
2,191866,2367740,94,Brentford,2023,Kristoffer Ajer,First line breaker,0.0,0.0,1.0,...,2,0.0,38.250000,0,0.00000,0.000000,0.000000,0,0.000000,0.0
3,168991,2367666,91,Bournemouth,2023,Philip Billing,Roaming playmaker,0.0,2.0,2.0,...,9,2.0,38.600000,0,0.00000,0.000000,0.140282,0,0.140282,0.0
4,427623,2367593,31,Crystal Palace,2023,Chris Richards,Ball stopper,0.0,0.0,0.0,...,0,0.0,0.000000,0,0.00000,0.000000,0.000000,0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84674,224467,2301806,2893,Girona,2022,Iván Martín,Roaming playmaker,0.0,1.0,2.0,...,8,1.0,0.000000,0,0.46407,0.684316,0.331423,0,0.331423,1.0
84675,213431,2372336,175,Atlético de Madrid,2023,Mario Hermoso,First line breaker,5.0,3.0,6.0,...,8,4.0,32.680000,0,0.00000,0.000000,0.000000,0,0.000000,0.0
84676,170154,2301800,181,Mallorca,2022,Pablo Maffeo,Wide creator,6.0,1.0,10.0,...,13,1.0,58.062500,1,0.00000,0.000000,0.019835,2,0.019835,0.0
84677,500046,2301909,178,Barcelona,2022,Gavi,Roaming playmaker,0.0,0.0,2.0,...,9,0.0,56.600000,0,0.00000,0.000000,0.000000,2,0.000000,0.0


In [23]:
cluster_features_weights = get_cluster_features_weights(pd.read_feather("Project2-3 - match-seasonal index\match_data.feather"))
cluster_features_weights = {cluster:
                            {feature: cluster_features_weights[cluster][feature]
                              for feature in features_weights.keys()}
                                for cluster in cluster_features_weights.keys()}

In [56]:
def calculate_playerank(row: pd.DataFrame | pd.Series, features_weights: dict, cluster_features_weights: dict) -> float:

    # Get the cluster weights for the current player
    cluster = row['cluster_label'].values[0]
    cluster_weights = cluster_features_weights[cluster]
    
    # Calculate Goals and Assists
    goals = row['goals']
    assists = row['goal_assist']

    # Drop indicators
    row = row.drop(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name', 'cluster_label'], axis=1)
    
    # Calculate playerank by summing the products
    playerank = (np.array(list(features_weights.values())) * np.array(list(cluster_weights.values())) * row.values).sum()
    playerank = playerank*(1-alpha-beta) + goals*alpha + assists*beta # Fixed or by cluster?
        
    return playerank

In [59]:
players_playerank = players_data[players_data['season'] == 2023].groupby(['player_id', 'game_id', 'team_id', 'team', 'season', 'full_name']).apply(lambda row: calculate_playerank(row, features_weights, cluster_features_weights))





In [65]:
ranking = players_playerank.groupby(['player_id', 'full_name']).mean()

In [67]:
ranking.sort_values(ascending=False).head(20)

player_id  full_name         
209244     Phil Foden            21.796842
60307      Pascal Groß           19.073059
461358     Julián Álvarez        18.708667
172780     James Maddison        18.520920
184029     Martin Ødegaard       18.198668
141746     Bruno Fernandes       18.121635
165809     Bernardo Silva        18.092518
231653     Joey Veerman          16.925308
80209      Isco                  16.908861
223340     Bukayo Saka           16.592130
244851     Cole Palmer           16.318300
229596     Michel Vlap           15.927698
519638     Johan Bakayoko        15.800244
201658     Marcus Tavernier      15.690763
232787     Conor Gallagher       15.437267
59859      Ilkay Gündogan        15.218475
424876     Dominik Szoboszlai    15.209493
116796     Domenico Berardi      15.092657
120638     Paulo Dybala          14.943823
88935      Steven Berghuis       14.794689
dtype: float64