In [51]:
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# Assembler les résultats des matches de tous les saisons

In [52]:
def concat_season():
    seasons=os.listdir(".")
    header=["Date","Time","Comp","Round","Day","Venue","Result","GF","GA","Opponent","Poss","Attendance","Captain","Formation","Referee","Match Report","Notes","team","season"]
    data=pd.DataFrame(columns=header)
    for season in seasons:
        if "-" in season:
            equipes=os.listdir(f"./{season}")
            for equipe in equipes:
                if "Stats" in equipe:
                    fils=pd.read_csv(f"./{season}/{equipe}/{equipe}.csv")
                    fils["team"]= " ".join(equipe.split("-")[:-1])
                    fils["season"]=season.split("-")[0]
                    data=pd.concat([data, fils], ignore_index=True)
    data.to_csv("dataset.csv")
    return data

In [53]:
data=concat_season()

In [54]:
data.head()

Unnamed: 0.1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Captain,Formation,Referee,Match Report,Notes,team,season,Unnamed: 0,xG,xGA
0,2010-08-15,,Premier League,Matchweek 1,Sun,Away,D,1,1,Liverpool,...,,4-5-1,Martin Atkinson,https://fbref.com/en/matches/4d2dd8b3/Liverpoo...,,Arsenal,2010,0.0,,
1,2010-08-21,,Premier League,Matchweek 2,Sat,Home,W,6,0,Blackpool,...,,4-3-3,Mike Jones,https://fbref.com/en/matches/9e517d4d/Arsenal-...,,Arsenal,2010,1.0,,
2,2010-08-28,,Premier League,Matchweek 3,Sat,Away,W,2,1,Blackburn,...,,4-2-3-1,Chris Foy,https://fbref.com/en/matches/2b39e7c0/Blackbur...,,Arsenal,2010,2.0,,
3,2010-09-11,,Premier League,Matchweek 4,Sat,Home,W,4,1,Bolton,...,,4-5-1,Stuart Attwell,https://fbref.com/en/matches/8e9a6c63/Arsenal-...,,Arsenal,2010,3.0,,
4,2010-09-18,,Premier League,Matchweek 5,Sat,Away,D,1,1,Sunderland,...,,4-2-3-1,Phil Dowd,https://fbref.com/en/matches/66b3fe8e/Sunderla...,,Arsenal,2010,5.0,,


# Vérifier les valeurs manquants

In [55]:
data.isna().sum()

Date               0
Time            3040
Comp               0
Round              0
Day                0
Venue              0
Result             0
GF                 0
GA                 0
Opponent           0
Poss            3040
Attendance       884
Captain         3800
Formation          0
Referee            0
Match Report       0
Notes           9880
team               0
season             0
Unnamed: 0         0
xG              5320
xGA             5320
dtype: int64

In [56]:
data=data.drop(["Time","Comp","xG","xGA","Poss","Notes","Unnamed: 0"],axis=1)

# Encoder les carctérististiques non numérique

In [57]:
data["Date"]=pd.to_datetime(data["Date"])

In [58]:
data.dtypes

Date            datetime64[ns]
Round                   object
Day                     object
Venue                   object
Result                  object
GF                      object
GA                      object
Opponent                object
Attendance              object
Captain                 object
Formation               object
Referee                 object
Match Report            object
team                    object
season                  object
dtype: object

In [59]:
data

Unnamed: 0,Date,Round,Day,Venue,Result,GF,GA,Opponent,Attendance,Captain,Formation,Referee,Match Report,team,season
0,2010-08-15,Matchweek 1,Sun,Away,D,1,1,Liverpool,44722,,4-5-1,Martin Atkinson,https://fbref.com/en/matches/4d2dd8b3/Liverpoo...,Arsenal,2010
1,2010-08-21,Matchweek 2,Sat,Home,W,6,0,Blackpool,60032,,4-3-3,Mike Jones,https://fbref.com/en/matches/9e517d4d/Arsenal-...,Arsenal,2010
2,2010-08-28,Matchweek 3,Sat,Away,W,2,1,Blackburn,25059,,4-2-3-1,Chris Foy,https://fbref.com/en/matches/2b39e7c0/Blackbur...,Arsenal,2010
3,2010-09-11,Matchweek 4,Sat,Home,W,4,1,Bolton,59876,,4-5-1,Stuart Attwell,https://fbref.com/en/matches/8e9a6c63/Arsenal-...,Arsenal,2010
4,2010-09-18,Matchweek 5,Sat,Away,D,1,1,Sunderland,38950,,4-2-3-1,Phil Dowd,https://fbref.com/en/matches/66b3fe8e/Sunderla...,Arsenal,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9875,2023-04-29,Matchweek 34,Sat,Away,L,0,6,Brighton,31584,Rúben Neves,4-4-2,David Coote,https://fbref.com/en/matches/21f920e0/Brighton...,Wolverhampton Wanderers,2022
9876,2023-05-06,Matchweek 35,Sat,Home,W,1,0,Aston Villa,31641,Rúben Neves,4-4-2,Stuart Attwell,https://fbref.com/en/matches/217a7faf/Wolverha...,Wolverhampton Wanderers,2022
9877,2023-05-13,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,73570,Rúben Neves,4-4-2,John Brooks,https://fbref.com/en/matches/f5d61382/Manchest...,Wolverhampton Wanderers,2022
9878,2023-05-20,Matchweek 37,Sat,Home,D,1,1,Everton,31684,Rúben Neves,4-4-2,David Coote,https://fbref.com/en/matches/ff2b58c3/Wolverha...,Wolverhampton Wanderers,2022


### Venue

In [60]:
data["Venue_code"]=data["Venue"].astype("category").cat.codes

### Resultat

In [61]:
resultat_encoder=LabelEncoder()
data["Result_code"]=resultat_encoder.fit_transform(data["Result"])

In [62]:
data.head(10)[["Result","Result_code"]]

Unnamed: 0,Result,Result_code
0,D,0
1,W,2
2,W,2
3,W,2
4,D,0
5,L,1
6,L,1
7,W,2
8,W,2
9,W,2


### Round

In [63]:
round_encoder=LabelEncoder()
data["Round_code"]=round_encoder.fit_transform(data["Round"])

In [64]:
data.head(10)[["Round","Round_code"]]

Unnamed: 0,Round,Round_code
0,Matchweek 1,0
1,Matchweek 2,11
2,Matchweek 3,22
3,Matchweek 4,32
4,Matchweek 5,33
5,Matchweek 6,34
6,Matchweek 7,35
7,Matchweek 8,36
8,Matchweek 9,37
9,Matchweek 10,1


### Formation

In [65]:
Formation_encoder=LabelEncoder()
data["Formation_code"]=Formation_encoder.fit_transform(data["Formation"])

In [66]:
data.head(76)[["Formation","Formation_code"]]

Unnamed: 0,Formation,Formation_code
0,4-5-1,19
1,4-3-3,16
2,4-2-3-1,12
3,4-5-1,19
4,4-2-3-1,12
...,...,...
71,4-4-2,18
72,4-4-2,18
73,4-4-2,18
74,4-5-1,19


### Referee

In [67]:
Referee_encoder=LabelEncoder()
data["Referee_code"]=Referee_encoder.fit_transform(data["Referee"])

In [68]:
data[data["Referee"]=="Chris Foy"].head(76)[["Referee","Referee_code"]]


Unnamed: 0,Referee,Referee_code
2,Chris Foy,4
15,Chris Foy,4
25,Chris Foy,4
34,Chris Foy,4
62,Chris Foy,4
...,...,...
1160,Chris Foy,4
1169,Chris Foy,4
1183,Chris Foy,4
1194,Chris Foy,4


### Day

In [69]:
data["Day_code"]=data["Date"].dt.dayofweek

In [70]:
data.head(10)

Unnamed: 0,Date,Round,Day,Venue,Result,GF,GA,Opponent,Attendance,Captain,...,Referee,Match Report,team,season,Venue_code,Result_code,Round_code,Formation_code,Referee_code,Day_code
0,2010-08-15,Matchweek 1,Sun,Away,D,1,1,Liverpool,44722,,...,Martin Atkinson,https://fbref.com/en/matches/4d2dd8b3/Liverpoo...,Arsenal,2010,0,0,0,19,24,6
1,2010-08-21,Matchweek 2,Sat,Home,W,6,0,Blackpool,60032,,...,Mike Jones,https://fbref.com/en/matches/9e517d4d/Arsenal-...,Arsenal,2010,1,2,11,16,28,5
2,2010-08-28,Matchweek 3,Sat,Away,W,2,1,Blackburn,25059,,...,Chris Foy,https://fbref.com/en/matches/2b39e7c0/Blackbur...,Arsenal,2010,0,2,22,12,4,5
3,2010-09-11,Matchweek 4,Sat,Home,W,4,1,Bolton,59876,,...,Stuart Attwell,https://fbref.com/en/matches/8e9a6c63/Arsenal-...,Arsenal,2010,1,2,32,19,41,5
4,2010-09-18,Matchweek 5,Sat,Away,D,1,1,Sunderland,38950,,...,Phil Dowd,https://fbref.com/en/matches/66b3fe8e/Sunderla...,Arsenal,2010,0,0,33,12,35,5
5,2010-09-25,Matchweek 6,Sat,Home,L,2,3,West Brom,60025,,...,Michael Oliver,https://fbref.com/en/matches/2b9dadaa/Arsenal-...,Arsenal,2010,1,1,34,12,25,5
6,2010-10-03,Matchweek 7,Sun,Away,L,0,2,Chelsea,41828,,...,Mike Dean,https://fbref.com/en/matches/a10d839a/North-We...,Arsenal,2010,0,1,35,19,27,6
7,2010-10-16,Matchweek 8,Sat,Home,W,2,1,Birmingham City,60070,,...,Martin Atkinson,https://fbref.com/en/matches/44dd7154/Arsenal-...,Arsenal,2010,1,2,36,19,24,5
8,2010-10-24,Matchweek 9,Sun,Away,W,3,0,Manchester City,47393,,...,Mark Clattenburg,https://fbref.com/en/matches/06943b36/Manchest...,Arsenal,2010,0,2,37,19,21,6
9,2010-10-30,Matchweek 10,Sat,Home,W,1,0,West Ham,60086,,...,Mike Jones,https://fbref.com/en/matches/2dc0c516/Arsenal-...,Arsenal,2010,1,2,1,12,28,5


### Team and Opponent

In [71]:
data["team"]=data["team"].replace("Blackburn Rovers","Blackburn")
data["team"]=data["team"].replace("Bolton Wanderers","Bolton")
data["team"]=data["team"].replace("Manchester United","Manchester Utd")
data["team"]=data["team"].replace("Newcastle United","Newcastle Utd")
data["team"]=data["team"].replace("Tottenham Hotspur","Tottenham")
data["team"]=data["team"].replace("West Bromwich Albion","West Brom")
data["team"]=data["team"].replace("West Ham United","West Ham")
data["team"]=data["team"].replace("Wolverhampton Wanderers","Wolves")
data["team"]=data["team"].replace("Queens Park Rangers","QPR")
data["team"]=data["team"].replace("Brighton and Hove Albion","Brighton")
data["team"]=data["team"].replace("Huddersfield Town","Huddersfield")
data["team"]=data["team"].replace("Sheffield United","Sheffield Utd")
data["team"]=data["team"].replace("Nottingham Forest","Nott'ham Forest")



In [72]:
label_encoder = LabelEncoder()
data["Opponent_code"] = label_encoder.fit_transform(data["Opponent"])
data["team_code"] = label_encoder.transform(data["team"])

# Feature Engineering

## Avr Goals

In [73]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [74]:
grouped_matches = data.groupby("team")

In [75]:
cols = ["GF", "GA"]
new_cols = [f"{c}_rolling" for c in cols]

In [76]:
matches_rolling = data.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [77]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Round,Day,Venue,Result,GF,GA,Opponent,Attendance,Captain,...,Venue_code,Result_code,Round_code,Formation_code,Referee_code,Day_code,Opponent_code,team_code,GF_rolling,GA_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3,2010-09-11,Matchweek 4,Sat,Home,W,4,1,Bolton,59876,,...,1,2,32,19,41,5,5,0,3.000000,0.666667
Arsenal,4,2010-09-18,Matchweek 5,Sat,Away,D,1,1,Sunderland,38950,,...,0,0,33,12,35,5,31,0,4.000000,0.666667
Arsenal,5,2010-09-25,Matchweek 6,Sat,Home,L,2,3,West Brom,60025,,...,1,1,34,12,25,5,35,0,2.333333,1.000000
Arsenal,6,2010-10-03,Matchweek 7,Sun,Away,L,0,2,Chelsea,41828,,...,0,1,35,19,27,6,11,0,2.333333,1.666667
Arsenal,7,2010-10-16,Matchweek 8,Sat,Home,W,2,1,Birmingham City,60070,,...,1,2,36,19,24,5,2,0,1.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,9875,2023-04-29,Matchweek 34,Sat,Away,L,0,6,Brighton,31584,Rúben Neves,...,0,1,27,18,9,5,8,38,1.666667,0.666667
Wolves,9876,2023-05-06,Matchweek 35,Sat,Home,W,1,0,Aston Villa,31641,Rúben Neves,...,1,2,28,18,41,5,1,38,1.000000,2.666667
Wolves,9877,2023-05-13,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,73570,Rúben Neves,...,0,1,29,18,13,5,21,38,1.000000,2.000000
Wolves,9878,2023-05-20,Matchweek 37,Sat,Home,D,1,1,Everton,31684,Rúben Neves,...,1,0,30,18,9,5,13,38,0.333333,2.666667


In [78]:
matches_rolling[matches_rolling['team']=="Wolves"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Round,Day,Venue,Result,GF,GA,Opponent,Attendance,Captain,...,Venue_code,Result_code,Round_code,Formation_code,Referee_code,Day_code,Opponent_code,team_code,GF_rolling,GA_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Wolves,725,2010-09-11,Matchweek 4,Sat,Away,L,1,2,Fulham,25280,,...,0,1,32,18,35,5,14,38,1.333333,1.000000
Wolves,726,2010-09-18,Matchweek 5,Sat,Away,L,1,3,Tottenham,35940,,...,0,1,33,19,28,5,33,38,1.000000,1.333333
Wolves,727,2010-09-26,Matchweek 6,Sun,Home,L,1,2,Aston Villa,27511,,...,1,1,34,19,22,6,1,38,1.000000,2.000000
Wolves,728,2010-10-02,Matchweek 7,Sat,Away,L,0,2,Wigan Athletic,14042,,...,0,1,35,17,19,5,37,38,1.000000,2.333333
Wolves,729,2010-10-16,Matchweek 8,Sat,Home,D,1,1,West Ham,28582,,...,1,0,36,8,21,5,36,38,0.666667,2.333333
Wolves,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolves,9875,2023-04-29,Matchweek 34,Sat,Away,L,0,6,Brighton,31584,Rúben Neves,...,0,1,27,18,9,5,8,38,1.666667,0.666667
Wolves,9876,2023-05-06,Matchweek 35,Sat,Home,W,1,0,Aston Villa,31641,Rúben Neves,...,1,2,28,18,41,5,1,38,1.000000,2.666667
Wolves,9877,2023-05-13,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,73570,Rúben Neves,...,0,1,29,18,13,5,21,38,1.000000,2.000000
Wolves,9878,2023-05-20,Matchweek 37,Sat,Home,D,1,1,Everton,31684,Rúben Neves,...,1,0,30,18,9,5,13,38,0.333333,2.666667


In [79]:
matches_rolling = matches_rolling.droplevel('team')

In [80]:
matches_rolling.columns

Index(['Date', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent',
       'Attendance', 'Captain', 'Formation', 'Referee', 'Match Report', 'team',
       'season', 'Venue_code', 'Result_code', 'Round_code', 'Formation_code',
       'Referee_code', 'Day_code', 'Opponent_code', 'team_code', 'GF_rolling',
       'GA_rolling'],
      dtype='object')

In [81]:
dataCleaned=matches_rolling[["team_code","Opponent_code","season","Round_code","Venue_code","Referee_code","Formation_code","Day_code","Result_code","GF","GF_rolling","GA","GA_rolling"]]
dataCleaned.to_csv("dataCleaned.csv")