# Modelling Strategy
We want to solve a classification problem:\
*Which range will the total number of goals in a match fall into?*
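- Target = **global** (total goals in the match), binned into two classes: *under 2,5* / *over 2,5*
- Features = **average goals scored / conceded by the home team at home and by the away team away, computed over their previous matches**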
- Model = **Random Forest Classifier**

In [70]:
import pandas as pd
import numpy as np
from pathlib import Path

### Feature Engineering

In [71]:
# The results CSV sits under data/2022_23/results, two directory levels above
# the current working directory.
path = Path.cwd().parent.parent.joinpath('data', '2022_23', 'results', 'ligue1.csv')

# Index the matches by kick-off timestamp; parse_dates=True parses that index.
df = pd.read_csv(
    path,
    index_col='date_time',
    parse_dates=True,
)
df.head()

Unnamed: 0_level_0,journée,home_team,away_team,1st_home_team_goal,1st_away_team_goal,1st_total_goal,2nd_home_team_goal,2nd_away_team_goal,2nd_total_goal,total_home_team_goal,total_away_team_goal,global
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2023-06-03 19:00:00,38,AC Ajaccio,Marseille,0,0,0,1,0,1,1,0,1
2023-06-03 19:00:00,38,Nice,Lyon,3,1,4,0,0,0,3,1,4
2023-06-03 19:00:00,38,Troyes,Lille,0,0,0,1,1,2,1,1,2
2023-06-03 19:00:00,38,Reims,Montpellier,1,0,1,0,3,3,1,3,4
2023-06-03 19:00:00,38,PSG,Clermont,2,2,4,0,1,1,2,3,5


In [72]:
# Keep the matchday, both teams and the full-match goal counts; the per-half
# columns are not used afterwards.
df = df.loc[
    :,
    [
        'journée',
        'home_team',
        'away_team',
        'total_home_team_goal',
        'total_away_team_goal',
        'global',
    ],
]

In [73]:
def avg_per_match(results=None):
    """Add each team's pre-match goal averages to the match results.

    Matches are ordered by ``journée``. For every match, the expanding mean of
    goals over the team's earlier matches in the same role (home team at home,
    away team away) is computed. The trailing ``shift()`` excludes the current
    match, so a team's first home (or away) match gets NaN.

    Parameters
    ----------
    results : pandas.DataFrame, optional
        Match results with the columns ``journée``, ``home_team``,
        ``away_team``, ``total_home_team_goal`` and ``total_away_team_goal``.
        When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        A copy of the input sorted by ``journée`` with four extra columns:
        ``ht_scored_avg`` / ``ht_conceded_avg`` (home team, home matches) and
        ``at_scored_avg`` / ``at_conceded_avg`` (away team, away matches).
        The input frame is left untouched.
    """
    source = df if results is None else results
    data = source.copy().sort_values(by='journée', ascending=True)

    def prior_mean(goals):
        # Running mean of the group, shifted one match down so that each row
        # only sees the matches played before it.
        return goals.expanding().mean().shift()

    by_home_team = data.groupby('home_team')
    by_away_team = data.groupby('away_team')

    # All four series are computed once (no per-row loop) before any column is
    # attached; they share data's index, so the assignment is positional.
    averages = {
        'ht_scored_avg': by_home_team['total_home_team_goal'].transform(prior_mean),
        'ht_conceded_avg': by_home_team['total_away_team_goal'].transform(prior_mean),
        'at_scored_avg': by_away_team['total_away_team_goal'].transform(prior_mean),
        'at_conceded_avg': by_away_team['total_home_team_goal'].transform(prior_mean),
    }
    for column, values in averages.items():
        data[column] = values
    return data

In [74]:
# Build the feature table (called without arguments, avg_per_match works on the
# notebook-level `df`), then preview its last ten rows.
data = avg_per_match()
data.tail(10)

Unnamed: 0_level_0,journée,home_team,away_team,total_home_team_goal,total_away_team_goal,global,ht_scored_avg,ht_conceded_avg,at_scored_avg,at_conceded_avg
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-06-03 19:00:00,38,Monaco,Toulouse,1,2,3,2.0,1.722222,1.222222,1.611111
2023-06-03 19:00:00,38,Brest,Rennes,1,2,3,1.277778,1.333333,1.333333,1.333333
2023-06-03 19:00:00,38,Lorient,Strasbourg,2,1,3,1.333333,1.111111,1.388889,1.722222
2023-06-03 19:00:00,38,Nantes,Angers,1,0,1,1.055556,1.444444,0.722222,2.444444
2023-06-03 19:00:00,38,Auxerre,Lens,1,3,4,0.944444,1.388889,1.333333,0.833333
2023-06-03 19:00:00,38,PSG,Clermont,2,3,5,2.388889,1.222222,1.222222,1.055556
2023-06-03 19:00:00,38,Reims,Montpellier,1,3,4,1.5,1.111111,1.833333,1.777778
2023-06-03 19:00:00,38,Troyes,Lille,1,1,2,1.0,1.611111,1.333333,1.0
2023-06-03 19:00:00,38,Nice,Lyon,3,1,4,1.166667,0.944444,1.611111,1.388889
2023-06-03 19:00:00,38,AC Ajaccio,Marseille,1,0,1,0.5,1.666667,1.777778,0.833333


### Modelling

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [76]:
# Keep the matches played after journée 2 and only the target and feature
# columns, selected by name rather than by position.
# Rows whose averages are still NaN (a team with no earlier home or away match)
# are dropped, and .copy() makes `dataset` independent of `data`, so adding the
# 'class' column later does not raise SettingWithCopyWarning.
dataset = (
    data.loc[
        data['journée'] > 2,
        ['global', 'ht_scored_avg', 'ht_conceded_avg', 'at_scored_avg', 'at_conceded_avg'],
    ]
    .dropna()
    .copy()
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 360 entries, 2022-08-21 18:45:00 to 2023-06-03 19:00:00
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   global           360 non-null    int64  
 1   ht_scored_avg    360 non-null    float64
 2   ht_conceded_avg  360 non-null    float64
 3   at_scored_avg    360 non-null    float64
 4   at_conceded_avg  360 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 16.9 KB


In [77]:
# Split the total goals at the 2.5 line: up to 2 goals -> 'under 2,5',
# 3 goals or more -> 'over 2,5'.
# Open-ended edges keep the bins independent of the sample: data-dependent
# [min, 2, max] edges are not strictly increasing (and pd.cut fails) whenever
# the smallest or largest total in the data is exactly 2.
labels_name = ['under 2,5', 'over 2,5']
dataset['class'] = pd.cut(x=dataset['global'], bins=[-np.inf, 2, np.inf],
                          labels=labels_name)

In [78]:
# Integer codes for the two goal ranges: 0 = under 2.5 goals, 1 = over 2.5 goals.
encod = dict([('under 2,5', 0), ('over 2,5', 1)])

# Replace the text labels by their codes.
encoded_class = dataset['class'].map(encod)
dataset['class'] = encoded_class

In [79]:
# Inspect the frame again now that the 'class' column has been added.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 360 entries, 2022-08-21 18:45:00 to 2023-06-03 19:00:00
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   global           360 non-null    int64   
 1   ht_scored_avg    360 non-null    float64 
 2   ht_conceded_avg  360 non-null    float64 
 3   at_scored_avg    360 non-null    float64 
 4   at_conceded_avg  360 non-null    float64 
 5   class            360 non-null    category
dtypes: category(1), float64(4), int64(1)
memory usage: 17.3 KB


In [80]:
# Features: every column except the raw goal total and the label derived from it.
X = dataset.drop(columns=['global', 'class'])
# Target: the encoded goal range.
y = dataset['class']

In [81]:
# Hold out 30% of the matches for evaluation; both seeds are fixed so the
# split and the forest are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# fit() returns the estimator itself, so the model can be built and trained in one step.
model = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [84]:
# Report the accuracy on the held-out matches, then the confusion matrix and
# the per-class precision / recall / f1 for the stored predictions.
model_name = model.__class__.__name__
test_score = model.score(x_test, y_test)
print(f"score {model_name}: {test_score}")
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

score RandomForestClassifier: 0.5925925925925926
[[23 23]
 [21 41]]
              precision    recall  f1-score   support

           0       0.52      0.50      0.51        46
           1       0.64      0.66      0.65        62

    accuracy                           0.59       108
   macro avg       0.58      0.58      0.58       108
weighted avg       0.59      0.59      0.59       108

