In [1366]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

In [1367]:
all_matches = pd.read_csv("matches.csv", index_col=0)
all_matches['gf'] = pd.to_numeric(all_matches['gf'].str.split('(').str[0].str.strip(), errors='coerce')
all_matches['ga'] = pd.to_numeric(all_matches['ga'].str.split('(').str[0].str.strip(), errors='coerce')

all_matches['pk'] = pd.to_numeric(all_matches['pk'], errors='coerce').astype('float64')
all_matches['pkatt'] = pd.to_numeric(all_matches['pkatt'], errors='coerce').astype('float64')

In [1368]:
all_matches.shape


(1078, 28)

In [1369]:
del all_matches["comp"]
del all_matches["notes"]

In [1370]:
all_matches["date"] = pd.to_datetime(all_matches["date"])


In [1371]:
all_matches["venue_code"] = all_matches["venue"].astype("category").cat.codes
all_matches["opponent_code"] = all_matches["opponent"].astype("category").cat.codes
all_matches["hour"] = all_matches["time"].str.replace(":.+", "",regex=True).astype("int")
all_matches["day_code"]=all_matches["date"].dt.dayofweek
all_matches["target"] = (all_matches["result"] == "W").astype("int")

In [1372]:
rf = RandomForestClassifier(
    n_estimators=750,
    min_samples_split=35,
    max_depth=15,
    class_weight='balanced',  
    random_state=1
)

In [1373]:
train = all_matches[all_matches["date"]<'2024-01-01']
test = all_matches[all_matches["date"]>'2024-01-01']

In [1374]:
predictors = ["venue_code", "opponent_code","hour","day_code"]

In [1375]:
rf.fit(train[predictors], train["target"])

In [1376]:
preds = rf.predict(test[predictors])

In [1377]:
precision = accuracy_score(test["target"], preds)

In [1378]:
precision

0.6390728476821192

In [1379]:
combined = pd.DataFrame(dict(actual=test["target"],predicted=preds))

In [1380]:
pd.crosstab(index=combined["actual"], columns = combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,114,54
1,55,79


In [1381]:
precision_score(test["target"],preds)

0.5939849624060151

In [1382]:
grouped_matches = all_matches.groupby("team")


In [1383]:
group = grouped_matches.get_group("Arsenal").sort_values("date")

In [1384]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(10, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [1385]:
columns = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_columns = [f"{c}_rolling" for c in columns]

rolling_averages(group, columns, new_columns)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,opp formation,referee,match report,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opponent_code,hour,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2024-09-19,21:00,League phase,Thu,Away,D,0.0,0.0,Atalanta,0.8,1.2,46.0,22858.0,Gabriel Jesus,4-3-3,3-4-3,Clément Turpin,Match Report,6.0,2.0,14.9,1.0,0.0,0.0,2024,Arsenal,0,8,21,3,0,1.9,0.8,12.0,4.2,15.24,0.1,0.1,0.1
8,2024-10-01,20:00,League phase,Tue,Home,W,2.0,0.0,Paris S-G,0.7,0.4,35.0,60103.0,Bukayo Saka,4-4-2,4-3-3,Slavko Vinčič,Match Report,6.0,5.0,15.7,1.0,0.0,0.0,2024,Arsenal,1,61,20,1,1,1.5,0.8,10.8,3.6,15.29,0.2,0.1,0.1
11,2024-10-22,20:00,League phase,Tue,Home,W,1.0,0.0,Shakhtar,1.8,0.7,56.0,59594.0,Gabriel Jesus,4-4-2,4-2-3-1,Benoît Bastien,Match Report,12.0,4.0,16.0,0.0,0.0,1.0,2024,Arsenal,1,74,20,1,1,1.6,0.6,10.3,3.5,15.68,0.3,0.1,0.1
15,2024-11-06,21:00,League phase,Wed,Away,L,0.0,1.0,Inter,1.8,1.1,62.0,75222.0,Bukayo Saka,4-4-2,3-5-2,István Kovács,Match Report,20.0,4.0,14.6,0.0,0.0,0.0,2024,Arsenal,0,38,21,2,0,1.5,0.5,10.1,3.5,15.64,0.3,0.1,0.2
18,2024-11-26,20:00,League phase,Tue,Away,W,5.0,1.0,Sporting Cp,4.0,1.0,48.0,47386.0,Martin Ødegaard,4-3-3,3-4-3,Szymon Marciniak,Match Report,12.0,8.0,14.6,0.0,1.0,1.0,2024,Arsenal,0,82,20,1,1,1.3,0.6,11.0,3.6,15.47,0.2,0.1,0.2
22,2024-12-11,20:00,League phase,Wed,Home,W,3.0,0.0,Monaco,2.3,0.5,54.0,60157.0,Martin Ødegaard,4-3-3,4-2-3-1,Davide Massa,Match Report,16.0,8.0,14.2,0.0,0.0,0.0,2024,Arsenal,1,54,20,2,1,1.2,0.7,10.9,3.7,15.45,0.2,0.1,0.2


In [1386]:
matches_rolling = all_matches.groupby("team").apply(lambda x: rolling_averages(x, columns, new_columns))

  matches_rolling = all_matches.groupby("team").apply(lambda x: rolling_averages(x, columns, new_columns))


In [1387]:
matches_rolling = matches_rolling.droplevel('team')

In [1388]:
matches_rolling.index = range(matches_rolling.shape[0])

In [1389]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2024-01-01']
    test = data[data["date"] > '2024-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [1390]:
combined, precision = make_predictions(matches_rolling, predictors + new_columns)

In [1391]:
precision

0.6666666666666666