In [81]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import statsmodels.api as sm

In [82]:
# Directory holding the CSV exports. The value must end with a path separator
# because file names are appended to it by plain string formatting.
# Set the PYTHIA_DATA_PATH environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
FILES_PATH = os.path.join(
    os.environ.get("PYTHIA_DATA_PATH", "/home/guillaume/pythia/code/data/"),
    "",  # joining with "" appends a trailing separator only if one is missing
)

In [83]:
def _read_table(filename):
    """Read one comma-separated file located under FILES_PATH into a DataFrame."""
    return pd.read_csv(f"{FILES_PATH}{filename}", delimiter=",")


# One frame per raw CSV export; names mirror the file names.
fixtures_data = _read_table("fixtures.csv")
odds_data = _read_table("odds.csv")
players_data = _read_table("players.csv")
results_data = _read_table("results.csv")
startingXI_data = _read_table("startingXI.csv")
teams_data = _read_table("teams.csv")

In [84]:
# Attach the team name for each side of the match. The same left join is run
# twice: once keyed on the home team id, once on the away team id. The join key
# coming from teams_data ("TeamID") is dropped afterwards since it duplicates
# the id column already present in the results.
for id_column, name_column in (
    ("HomeTeamID", "HomeTeamName"),
    ("AwayTeamID", "AwayTeamName"),
):
    results_data = (
        results_data
        .merge(teams_data, how="left", left_on=id_column, right_on="TeamID")
        .rename(columns={"TeamName": name_column})
        .drop(columns="TeamID")
    )

In [85]:
# Training set: only the matches whose SeasonID is 1.
is_first_season = results_data["SeasonID"] == 1
train_results = results_data[is_first_season]

## Feature engineering

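Let's compute some overall statistics for season 1 (the training season): the total and the average number of goals scored by home teams and by away teams.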

In [86]:
# League-wide goal totals over the training matches.
total_home_goals = train_results["HomeScore"].sum()
total_away_goals = train_results["AwayScore"].sum()

# League-wide goals per match; used as the baseline when computing strengths.
average_home_goals = train_results["HomeScore"].mean()
average_away_goals = train_results["AwayScore"].mean()

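We compute, for every team, the average number of goals scored and conceded per match, separately for home and away games. The ground truth (home & away goals scored) is extracted further down, when the model inputs are built.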

In [88]:
# Per-team averages over the matches played at home:
# goals scored come from HomeScore, goals conceded from AwayScore.
home_strength = (
    train_results
    .groupby(["HomeTeamID"])
    .agg(
        HomeSeasonGoalsScoredAvg=pd.NamedAgg(column="HomeScore", aggfunc="mean"),
        HomeSeasonGoalsConcededAvg=pd.NamedAgg(column="AwayScore", aggfunc="mean"),
    )
    .reset_index()
)

# Per-team averages over the matches played away:
# goals scored come from AwayScore, goals conceded from HomeScore.
away_strength = (
    train_results
    .groupby(["AwayTeamID"])
    .agg(
        AwaySeasonGoalsScoredAvg=pd.NamedAgg(column="AwayScore", aggfunc="mean"),
        AwaySeasonGoalsConcededAvg=pd.NamedAgg(column="HomeScore", aggfunc="mean"),
    )
    .reset_index()
)

In [89]:
# Strength = team average divided by the league average of the same kind of
# goal. Goals scored at home and goals conceded away are both "home goals", so
# they are compared with average_home_goals; the other two use
# average_away_goals. A defence strength above 1 means the team concedes more
# than the league average.
home_strength = home_strength.assign(
    HomeAttackStrength=home_strength["HomeSeasonGoalsScoredAvg"].div(average_home_goals),
    HomeDefenceStrength=home_strength["HomeSeasonGoalsConcededAvg"].div(average_away_goals),
)

away_strength = away_strength.assign(
    AwayAttackStrength=away_strength["AwaySeasonGoalsScoredAvg"].div(average_away_goals),
    AwayDefenceStrength=away_strength["AwaySeasonGoalsConcededAvg"].div(average_home_goals),
)

In [90]:
# The raw per-team averages were only needed to derive the strength ratios.
home_strength = home_strength.drop(
    ["HomeSeasonGoalsScoredAvg", "HomeSeasonGoalsConcededAvg"], axis="columns"
)
away_strength = away_strength.drop(
    ["AwaySeasonGoalsScoredAvg", "AwaySeasonGoalsConcededAvg"], axis="columns"
)

In [91]:
# Give every training match the strengths of its home team and of its away team.
train_results = (
    train_results
    .merge(home_strength, how="left", on=["HomeTeamID"])
    .merge(away_strength, how="left", on=["AwayTeamID"])
)

In [97]:
# Model inputs: the four strength ratios of each match.
feature_columns = ["HomeAttackStrength", "HomeDefenceStrength", "AwayAttackStrength", "AwayDefenceStrength"]
X = train_results.loc[:, feature_columns]

# Targets, kept as single-column frames: goals scored by each side.
y_home = train_results.loc[:, ["HomeScore"]]
y_away = train_results.loc[:, ["AwayScore"]]

## Build model

In [99]:
# Add the intercept column to the design matrix shared by both models.
X = sm.add_constant(X)

# Two independent Poisson regressions on the same features:
# one for the home side's goals, one for the away side's goals.
home_model = sm.GLM(endog=y_home, exog=X, family=sm.families.Poisson())
poisson_home = home_model.fit()

away_model = sm.GLM(endog=y_away, exog=X, family=sm.families.Poisson())
poisson_away = away_model.fit()

In [None]:
# Feature values of a single match (row position 1 of the training frame),
# returned as a Series; it is the input of the predictions below.
X_test = train_results.iloc[1].loc[
    ["HomeAttackStrength", "HomeDefenceStrength", "AwayAttackStrength", "AwayDefenceStrength"]
]

In [113]:
# The models were fitted with a leading constant column, so the single input
# row needs a leading 1 before the four feature values.
exog_row = [1, *X_test.values.tolist()]
predicted_home_goals = poisson_home.predict(exog_row)
predicted_away_goals = poisson_away.predict(exog_row)

In [137]:
# Monte Carlo draw of goal counts from the predicted Poisson means.
# The draws come from a seeded Generator created in this cell, so re-running
# the cell (or the whole notebook) reproduces the same simulated goals and the
# same probability tables downstream. The global np.random state used before
# is not seeded anywhere in the visible notebook.
N_SIMULATIONS = 100  # simulated matches per predicted mean
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Resulting shape: (N_SIMULATIONS, number of predicted means).
home_simulated_goals = rng.poisson(
    lam=predicted_home_goals, size=(N_SIMULATIONS, len(predicted_home_goals))
)
away_simulated_goals = rng.poisson(
    lam=predicted_away_goals, size=(N_SIMULATIONS, len(predicted_away_goals))
)

In [189]:
def count_goals(simulated_goals: "pd.Series | np.ndarray | list[int]") -> pd.Series:
    """Return the empirical probability of each simulated goal count.

    Parameters
    ----------
    simulated_goals
        Simulated goal counts. A ``pd.Series`` is used as is; any other
        array-like (list, NumPy array of any shape) is flattened into a
        Series first.

    Returns
    -------
    pd.Series
        Share of simulations that produced each distinct goal count, indexed
        by goal count in ascending order. Empty when there are no simulations.
    """
    if not isinstance(simulated_goals, pd.Series):
        # Plain lists and arrays have no ``value_counts``; wrap them so the
        # function accepts the inputs its signature advertises.
        simulated_goals = pd.Series(np.ravel(simulated_goals))
    return simulated_goals.value_counts().sort_index() / len(simulated_goals)
# Empirical goal distribution of each side over all simulated matches,
# named so the two Series become distinct columns when put side by side.
home_goals_proba = count_goals(
    pd.Series(home_simulated_goals.flatten())
).rename("HomeGoalsProba")

away_goals_proba = count_goals(
    pd.Series(away_simulated_goals.flatten())
).rename("AwayGoalsProba")

In [190]:
def generate_goals_proba_table(home_goals_proba, away_goals_proba):
    """Put the home and away goal distributions side by side in one table.

    The two inputs become the columns of a single frame whose index is the
    union of their goal counts, sorted in ascending order. A goal count that
    is missing from one side gets probability 0 for that side.
    """
    side_by_side = pd.concat([home_goals_proba, away_goals_proba], axis="columns")
    ordered = side_by_side.sort_index()
    return ordered.fillna(0)

# One row per goal count, one column per side.
goals_proba_matrix = generate_goals_proba_table(
    home_goals_proba=home_goals_proba,
    away_goals_proba=away_goals_proba,
)

In [191]:
# Probability of every exact score, obtained as the product of the two
# marginal distributions: entry (i, j) is
# P(home side scores i) * P(away side scores j).
home_marginal = goals_proba_matrix["HomeGoalsProba"].values
away_marginal = goals_proba_matrix["AwayGoalsProba"].values
scores_proba_matrix = np.outer(home_marginal, away_marginal)

# Display with goal counts as labels: rows are home goals, columns are away goals.
goal_counts = goals_proba_matrix.index
pd.DataFrame(data=scores_proba_matrix, index=goal_counts, columns=goal_counts)

Unnamed: 0,0,1,2,3,4,5,6
0,0.0725,0.1015,0.0754,0.029,0.0058,0.0029,0.0029
1,0.1,0.14,0.104,0.04,0.008,0.004,0.004
2,0.055,0.077,0.0572,0.022,0.0044,0.0022,0.0022
3,0.0175,0.0245,0.0182,0.007,0.0014,0.0007,0.0007
4,0.005,0.007,0.0052,0.002,0.0004,0.0002,0.0002
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
