In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

import sklearn.discriminant_analysis as DA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR

import matplotlib.pyplot as plt

In [23]:
from fonctions_tests import add_manager_win_percentage

df = pd.read_csv("./data/matchs_2013_2022.csv")

add_manager_win_percentage(df)

df.head()

Unnamed: 0.1,Unnamed: 0,game_id,season,round,date,home_club_id,away_club_id,home_club_goals,away_club_goals,home_club_position,...,referee,home_club_formation,away_club_formation,home_club_name,away_club_name,aggregate,competition_type,results,home_club_manager_win_percentage,away_club_manager_win_percentage
0,12,2330874,2013,8. Matchday,2013-09-28,618,595,2,2,6.0,...,Antony Gautier,4-3-3 Attacking,4-2-3-1,AS Saint-Étienne,SC Bastia,2:2,domestic_league,0,0.571429,0.166667
1,13,2330880,2013,10. Matchday,2013-10-19,14171,855,1,2,16.0,...,Philippe Kalt,4-4-2 double 6,4-4-2 double 6,Thonon Évian Grand Genève FC,EA Guingamp,1:2,domestic_league,-1,0.393939,0.254237
2,14,2330916,2013,15. Matchday,2013-11-30,1423,1082,0,1,18.0,...,Bartolomeu Varela,4-4-2 double 6,4-3-1-2,Valenciennes FC,Lille Olympique Sporting Club Lille Métropole,0:1,domestic_league,-1,0.214286,0.276923
3,15,2331018,2013,21. Matchday,2014-01-18,273,14171,0,0,15.0,...,Benoît Millot,4-3-3 Attacking,4-2-3-1,Stade Rennais Football Club,Thonon Évian Grand Genève FC,0:0,domestic_league,0,0.313433,0.178218
4,16,2331034,2013,24. Matchday,2014-02-08,1147,273,3,1,20.0,...,Benoît Bastien,4-1-4-1,4-3-3 Defending,AC Ajaccio,Stade Rennais Football Club,3:1,domestic_league,1,0.166667,0.338235


In [24]:
X = df[["attendance", "home_club_manager_win_percentage", "away_club_manager_win_percentage"]]
y = df[["results"]]
X.head()

Unnamed: 0,attendance,home_club_manager_win_percentage,away_club_manager_win_percentage
0,31952.0,0.571429,0.166667
1,8620.0,0.393939,0.254237
2,14874.0,0.214286,0.276923
3,13391.0,0.313433,0.178218
4,5981.0,0.166667,0.338235


## Calcul des scores de précision de différents modèles

In [25]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

X_train , X_test , y_train , y_test = train_test_split(
        X, y, test_size = 0.15, random_state = 1234)

print(X_train.shape, X_test.shape)

(3466, 3) (612, 3)


In [26]:
# Define the names of the classifications method
names = ["LogisticRegression",
         "NBayes",
         "LDA",
         "QDA",
         "KNN",
         "SVM rbf",
         "SVM sigmoid",
         "RandomForest",
         "DecisionTree"]

# Define the classifiers
classifiers = [LogisticRegression(C=1e5),
               GaussianNB(),
               DA.LinearDiscriminantAnalysis(),
               DA.QuadraticDiscriminantAnalysis(), 
               KNeighborsClassifier(n_neighbors=5),
               SVC(kernel='rbf', gamma = 1e-4),
               SVC(kernel='sigmoid', gamma = 1e-4),
               RandomForestClassifier(),
               DecisionTreeClassifier()]

from warnings import simplefilter
simplefilter(action='ignore')

# Write the head line of the table describing the accuracy of each method
print('Name  Accuracy\n'+14*'-')

# Create a new figure for the histogram
plt.figure()

# Loop through all different classfication methods
for name, clf in zip(names, classifiers):

    # Predict the diagnosis using the classifier
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Print the classifier's name and its accuracy score
    print('{:6}  {:3.3f}'.format(name, accuracy_score(y_test,y_pred)))

Name  Accuracy
--------------
LogisticRegression  0.529
NBayes  0.474
LDA     0.539
QDA     0.538
KNN     0.340
SVM rbf  0.377
SVM sigmoid  0.433
RandomForest  0.462
DecisionTree  0.418


<Figure size 640x480 with 0 Axes>