In [1]:
# Installation des bibliothèques nécessaires
# pip install requests pandas numpy scikit-learn xgboost matplotlib seaborn

import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

In [2]:
dataset = pd.read_csv('EPL.csv')
dataset.head()

Unnamed: 0,Date,Season,HomeTeam,AwayTeam,FTH Goals,FTA Goals,FT Result,HTH Goals,HTA Goals,HT Result,...,H Fouls,A Fouls,H Corners,A Corners,H Yellow,A Yellow,H Red,A Red,Display_Order,League
0,16/01/2025,2024/25,Ipswich Town,Brighton & Hove Albion,0,2,A,0.0,1.0,A,...,13.0,14.0,1.0,9.0,2.0,2.0,0.0,0.0,20250116,Premier League
1,16/01/2025,2024/25,Man United,Southampton,3,1,H,0.0,1.0,A,...,7.0,10.0,4.0,4.0,1.0,3.0,0.0,0.0,20250116,Premier League
2,15/01/2025,2024/25,Everton,Aston Villa,0,1,A,0.0,0.0,D,...,17.0,10.0,8.0,5.0,2.0,1.0,0.0,0.0,20250115,Premier League
3,15/01/2025,2024/25,Leicester,Crystal Palace,0,2,A,0.0,0.0,D,...,7.0,6.0,4.0,3.0,0.0,0.0,0.0,0.0,20250115,Premier League
4,15/01/2025,2024/25,Newcastle,Wolves,3,0,H,1.0,0.0,H,...,10.0,13.0,4.0,2.0,0.0,2.0,0.0,0.0,20250115,Premier League


In [3]:
# Conversion de la colonne 'Date' en format datetime
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d/%m/%Y')

In [4]:
# On garde uniquement les matchs à partir de la saison 2018/2019
dataset = dataset[dataset['Date'] >= '01/08/2018']  
dataset = dataset[dataset['Date'] <= '31/05/2024']  # On garde jusqu'à la saison 2023/2024

In [5]:
dataset['FT Result'].unique()

array(['H', 'A', 'D'], dtype=object)

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2440 entries, 209 to 2648
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           2440 non-null   datetime64[ns]
 1   Season         2440 non-null   object        
 2   HomeTeam       2440 non-null   object        
 3   AwayTeam       2440 non-null   object        
 4   FTH Goals      2440 non-null   int64         
 5   FTA Goals      2440 non-null   int64         
 6   FT Result      2440 non-null   object        
 7   HTH Goals      2440 non-null   float64       
 8   HTA Goals      2440 non-null   float64       
 9   HT Result      2440 non-null   object        
 10  Referee        2440 non-null   object        
 11  H Shots        2440 non-null   float64       
 12  A Shots        2440 non-null   float64       
 13  H SOT          2440 non-null   float64       
 14  A SOT          2440 non-null   float64       
 15  H Fouls        2440 non-

In [7]:
df_train = dataset[dataset['Date'] < '01/08/2023']
df_test = dataset[dataset['Date'] >= '01/08/2023']
print(f"Taille du dataset d'entraînement: {df_train.shape[0]} matchs")
print(f"Taille du dataset de test: {df_test.shape[0]} matchs")

Taille du dataset d'entraînement: 1856 matchs
Taille du dataset de test: 584 matchs


In [8]:
df_train['Day'] = df_train['Date'].dt.day
df_train['Month'] = df_train['Date'].dt.month
df_train['Year'] = df_train['Date'].dt.year
df_train['Weekday'] = df_train['Date'].dt.weekday  # Lundi=0, Dimanche=6

df_test['Day'] = df_test['Date'].dt.day
df_test['Month'] = df_test['Date'].dt.month
df_test['Year'] = df_test['Date'].dt.year
df_test['Weekday'] = df_test['Date'].dt.weekday


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Day'] = df_train['Date'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Month'] = df_train['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['Year'] = df_train['Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [9]:
df_test.head()

Unnamed: 0,Date,Season,HomeTeam,AwayTeam,FTH Goals,FTA Goals,FT Result,HTH Goals,HTA Goals,HT Result,...,H Yellow,A Yellow,H Red,A Red,Display_Order,League,Day,Month,Year,Weekday
209,2024-05-19,2023/24,Chelsea,Bournemouth,2,1,H,1.0,0.0,H,...,2.0,3.0,0.0,0.0,20240519,Premier League,19,5,2024,6
210,2024-05-19,2023/24,Brighton,Man United,0,2,A,0.0,0.0,D,...,1.0,3.0,0.0,0.0,20240519,Premier League,19,5,2024,6
211,2024-05-19,2023/24,Crystal Palace,Aston Villa,5,0,H,2.0,0.0,H,...,1.0,4.0,0.0,0.0,20240519,Premier League,19,5,2024,6
212,2024-05-19,2023/24,Arsenal,Everton,2,1,H,1.0,1.0,D,...,4.0,3.0,0.0,0.0,20240519,Premier League,19,5,2024,6
213,2024-05-19,2023/24,Sheffield United,Tottenham,0,3,A,0.0,1.0,A,...,2.0,0.0,0.0,0.0,20240519,Premier League,19,5,2024,6


In [10]:
df_train_encoded = pd.get_dummies(df_train, columns=['HomeTeam', 'AwayTeam'])
df_test_encoded = pd.get_dummies(df_test, columns=['HomeTeam', 'AwayTeam'])


In [11]:
df_train_encoded, df_test_encoded = df_train_encoded.align(df_test_encoded, join='left', axis=1, fill_value=0)

In [12]:
print(df_train_encoded.shape)
print(df_test_encoded.shape)


(1856, 85)
(584, 85)


In [13]:
df_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1856 entries, 793 to 2648
Data columns (total 85 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1856 non-null   datetime64[ns]
 1   Season                     1856 non-null   object        
 2   FTH Goals                  1856 non-null   int64         
 3   FTA Goals                  1856 non-null   int64         
 4   FT Result                  1856 non-null   object        
 5   HTH Goals                  1856 non-null   float64       
 6   HTA Goals                  1856 non-null   float64       
 7   HT Result                  1856 non-null   object        
 8   Referee                    1856 non-null   object        
 9   H Shots                    1856 non-null   float64       
 10  A Shots                    1856 non-null   float64       
 11  H SOT                      1856 non-null   float64       
 12  A SOT    

In [14]:
columns_to_drop = ['FTH Goals', 'FTA Goals', 'FT Result', 
                   'HTH Goals', 'HTA Goals', 'HT Result', 
                   'Display_Order', 'Referee', 'League', 'Date', 'Season']


In [15]:
# Sélectionner les features (sans les variables cibles)
X_train = df_train_encoded.drop(columns=columns_to_drop)
X_test = df_test_encoded.drop(columns=columns_to_drop)

In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1856 entries, 793 to 2648
Data columns (total 74 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   H Shots                    1856 non-null   float64
 1   A Shots                    1856 non-null   float64
 2   H SOT                      1856 non-null   float64
 3   A SOT                      1856 non-null   float64
 4   H Fouls                    1856 non-null   float64
 5   A Fouls                    1856 non-null   float64
 6   H Corners                  1856 non-null   float64
 7   A Corners                  1856 non-null   float64
 8   H Yellow                   1856 non-null   float64
 9   A Yellow                   1856 non-null   float64
 10  H Red                      1856 non-null   float64
 11  A Red                      1856 non-null   float64
 12  Day                        1856 non-null   int32  
 13  Month                      1856 non-null   int32  


In [17]:

# Transformation simple: H -> 0, D -> 1, A -> 2 & Sélection de la variable cible
result_mapping = {'H': 0, 'D': 1, 'A': 2}
y_train = df_train_encoded['FT Result'].map(result_mapping)
y_test = df_test_encoded['FT Result'].map(result_mapping)


# Entraîner votre modèle
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Évaluer votre modèle
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Précision du modèle: {accuracy:.4f}")

Précision du modèle: 0.5736
