In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# Load the match log, order it chronologically, then derive the numeric
# feature columns used by the model.
matches = (
    pd.read_csv("matches.csv", index_col=0)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by="date")
    .assign(
        # Integer category codes for the venue and opponent labels.
        venue_c=lambda frame: frame["venue"].astype("category").cat.codes,
        opp_c=lambda frame: frame["opponent"].astype("category").cat.codes,
        # Remove everything from the first ':' onward and cast the rest to int.
        hour=lambda frame: frame["time"].str.replace(":.+", "", regex=True).astype("int"),
        # Day of the week as an integer (Monday=0 ... Sunday=6).
        day_c=lambda frame: frame["date"].dt.dayofweek,
    )
)


In [3]:
# Binary label: 1 when the result is a win ("W"), 0 for every other result.
matches['target'] = matches['result'].eq("W").astype("int")

In [4]:
# Discard columns that none of the later cells shown here reference.
matches = matches.drop(
    [
        'round', 'day', 'comp', 'attendance', 'captain',
        'formation', 'referee', 'match report', 'notes',
    ],
    axis="columns",
)

In [5]:
# Render the prepared frame as this cell's output.
matches

Unnamed: 0,date,time,venue,result,gf,ga,opponent,xg,xga,poss,...,fk,pk,pkatt,season,team,venue_c,opp_c,hour,day_c,target
0,2020-08-21,19:00,Away,D,0.0,0.0,Bordeaux,0.3,0.4,56.0,...,1.0,0.0,0.0,2021,Nantes,0,3,19,4,0
0,2020-08-21,19:00,Home,D,0.0,0.0,Nantes,0.4,0.3,44.0,...,0.0,0.0,0.0,2021,Bordeaux,1,15,19,4,0
0,2020-08-22,21:00,Home,D,1.0,1.0,Rennes,0.5,1.4,52.0,...,0.0,0.0,0.0,2021,Lille,1,20,21,5,0
0,2020-08-22,17:00,Home,L,0.0,1.0,Angers,0.5,2.1,54.0,...,0.0,0.0,0.0,2021,Dijon,1,1,17,5,0
0,2020-08-22,21:00,Away,D,1.0,1.0,Lille,1.4,0.5,48.0,...,1.0,0.0,0.0,2021,Rennes,0,8,21,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,2022-08-28,15:00,Home,L,0.0,7.0,Montpellier,0.3,2.6,52.0,...,1.0,0.0,0.0,2023,Brest,1,14,15,6,0
3,2022-08-28,13:00,Away,L,1.0,3.0,Nantes,0.5,1.5,51.0,...,0.0,0.0,0.0,2023,Toulouse,0,15,13,6,0
4,2022-08-28,13:00,Home,W,3.0,1.0,Toulouse,1.5,0.5,49.0,...,0.0,0.0,0.0,2023,Nantes,1,23,13,6,1
2,2022-08-28,17:05,Away,D,1.0,1.0,Reims,0.9,0.5,74.0,...,1.0,0.0,0.0,2023,Lyon,0,19,17,6,0


In [6]:
# Classifier reused by every fit below; random_state is pinned so that
# re-running the notebook gives the same forest.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [7]:
# Chronological split: fit on matches before 2022, evaluate on 2022 onward.
# The test side uses >= so that a row dated exactly 2022-01-01 lands in the
# test set; with a strict > it would belong to neither set.
train = matches[matches['date'] < '2022-01-01']
test = matches[matches['date'] >= '2022-01-01']

# Baseline feature set: encoded venue and opponent plus the row's xg / xga.
# NOTE(review): xg and xga sit on the same row as that match's gf/ga, so they
# look like same-match values -- confirm they are legitimately known before
# the result being predicted.
predictors = ['venue_c', 'opp_c', 'xg', 'xga']

In [8]:
# Fit the forest on the training window, then score the held-out matches.
rf.fit(X=train[predictors], y=train['target'])

preds = rf.predict(X=test[predictors])

# Share of test rows whose win / not-win label was predicted correctly.
acc = accuracy_score(y_true=test['target'], y_pred=preds)

In [9]:
# Put each test label next to the prediction made for it.
combined = pd.DataFrame({"actual": test['target'], "prediction": preds})

In [10]:
# A notebook cell renders only its final expression, and this cell ends with an
# assignment, so the confusion table and the precision are printed explicitly
# instead of being computed and thrown away.
print(pd.crosstab(index=combined['actual'], columns=combined['prediction']))
print(precision_score(test['target'], preds))

# Per-team grouping of the data; the "Lille" rows are pulled out as a sample group.
grouped_matches = matches.groupby('team')
group = grouped_matches.get_group("Lille")

In [11]:
def rolling_averages(group, cols, new_cols, window=3):
    """Append trailing rolling means of ``cols`` to a date-ordered copy of ``group``.

    The rows are sorted by their 'date' column, and for every row the mean of
    the ``window`` rows immediately before it is computed for each column in
    ``cols``. ``closed='left'`` keeps the current row out of its own window, so
    a row's rolling value is built from earlier rows only.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain 'date' and every column in ``cols``.
        The notebook calls this once per team.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names for the derived columns, matched to ``cols`` by position.
    window : int, default 3
        Number of preceding rows that make up each average.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``group`` with ``new_cols`` added. Rows where any of the
        new columns is NaN are dropped, which includes the first ``window``
        rows because they lack a full history. ``group`` itself is not modified.
    """
    # sort_values returns a new frame, so the assignment below never touches
    # the caller's data.
    ordered = group.sort_values('date')
    ordered[new_cols] = ordered[cols].rolling(window, closed='left').mean()
    return ordered.dropna(subset=new_cols)

In [12]:
# Per-match statistics to average, and the names given to their rolling
# versions (source name with an "R" suffix).
cols = [
    'gf', 'ga', 'sh', 'poss',
    'sot', 'dist', 'fk',
]
new_cols = [f"{stat}R" for stat in cols]

In [13]:
# Build the rolling features one team at a time. Iterating the groups and
# concatenating keeps the 'team' column in every piece (it is selected again
# further down) without relying on DataFrameGroupBy.apply handing the grouping
# column to the function, which recent pandas versions deprecate.
matches_rolling = pd.concat(
    [rolling_averages(team_matches, cols, new_cols) for _, team_matches in matches.groupby('team')]
)
# Give every row a unique 0..n-1 label; later cells join on the index.
matches_rolling = matches_rolling.reset_index(drop=True)
# NOTE(review): round() with no digits rounds every numeric column to 0
# decimals, which includes xg / xga and the rolling means themselves --
# confirm that this loss of precision is intended.
matches_rolling = round(matches_rolling)
matches_rolling = matches_rolling.drop(columns=['pk', 'pkatt', 'day_c', 'hour', 'time', 'season'])
matches_rolling["date"] = pd.to_datetime(matches_rolling["date"])

In [14]:
def make_predictions(data, predictors, cutoff='2022-01-01', model=None):
    """Fit a classifier on rows before ``cutoff`` and score it on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'date', 'target' and every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2022-01-01'
        Rows with ``date < cutoff`` are used for fitting and rows with
        ``date >= cutoff`` for evaluation, so a row dated exactly at the cutoff
        is evaluated rather than falling into neither set.
    model : estimator with ``fit`` / ``predict``, optional
        Defaults to the notebook-level ``rf``. The estimator is refit in place.

    Returns
    -------
    combined : pandas.DataFrame
        Columns 'actual' and 'predicted', indexed like the evaluation rows.
    precision : float
        ``precision_score`` of the predictions against the actual labels.
    """
    if model is None:
        # Fall back to the classifier defined at notebook level.
        model = rf
    train = data[data['date'] < cutoff]
    test = data[data['date'] >= cutoff]
    model.fit(train[predictors], train['target'])
    preds = model.predict(test[predictors])
    # Keep the evaluation rows' index so callers can join the predictions back
    # onto the source frame.
    combined = pd.DataFrame(dict(actual=test['target'], predicted=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [16]:
# Refit using the baseline predictors together with the rolling-average columns.
combined, precision = make_predictions(matches_rolling, [*predictors, *new_cols])

In [17]:
# Attach the match details to each prediction; rows are paired on the index
# that both frames share.
combined = pd.merge(
    combined,
    matches_rolling[['date', 'team', 'opponent', 'result']],
    left_index=True,
    right_index=True,
)

In [18]:
class MissingDict(dict):
    """dict whose lookup of an absent key returns the key itself."""

    def __missing__(self, key):
        # Called by dict.__getitem__ when the key is absent; nothing is stored.
        return key
    
# Explicit team-name replacements; presumably the spelling on the right is the
# one used in the 'opponent' column -- verify against the data.
map_values = {"Paris Saint Germain": "Paris S-G"}
# Names without an entry come back unchanged from the lookup.
mapping = MissingDict(map_values)

In [19]:
# Add the normalised team name, then put the predictions in date order.
combined = (
    combined
    .assign(new_team=combined["team"].map(mapping))
    .sort_values(by="date")
)

In [20]:
# Pair each team's prediction with its opponent's prediction for the same
# date: the left side's (date, new_team) must equal the right side's
# (date, opponent). Overlapping columns get the _x / _y suffixes.
merged = pd.merge(
    combined,
    combined,
    left_on=["date", "new_team"],
    right_on=["date", "opponent"],
).sort_values(by="date")

In [22]:
# Render the paired predictions (_x = one side, _y = its opponent) as the cell's output.
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2022-01-07,Marseille,Bordeaux,W,Marseille,0,0,Bordeaux,Marseille,L,Bordeaux
1,0,0,2022-01-07,Bordeaux,Marseille,L,Bordeaux,1,1,Marseille,Bordeaux,W,Marseille
2,0,0,2022-01-08,Rennes,Lens,L,Rennes,1,0,Lens,Rennes,W,Lens
3,1,0,2022-01-08,Lens,Rennes,W,Lens,0,0,Rennes,Lens,L,Rennes
13,0,0,2022-01-09,Metz,Strasbourg,L,Metz,1,1,Strasbourg,Metz,W,Strasbourg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,0,0,2022-08-28,Brest,Montpellier,L,Brest,1,1,Montpellier,Brest,W,Montpellier
411,0,0,2022-08-28,Angers,Troyes,L,Angers,1,0,Troyes,Angers,W,Troyes
423,0,0,2022-08-28,Toulouse,Nantes,L,Toulouse,1,1,Nantes,Toulouse,W,Nantes
416,1,1,2022-08-28,Marseille,Nice,W,Marseille,0,0,Nice,Marseille,L,Nice


In [21]:
# (rows, columns) of the paired-prediction frame.
merged.shape

(425, 13)

In [23]:
# Rows where side x is predicted to win and side y is predicted not to win.
merged[merged['predicted_x'].eq(1) & merged['predicted_y'].eq(0)].shape

(127, 13)

In [24]:
# Same selection as the previous count, narrowed to rows where side x actually won.
merged[
    merged['predicted_x'].eq(1)
    & merged['predicted_y'].eq(0)
    & merged['actual_x'].eq(1)
].shape

(85, 13)