In [26]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [27]:
FILES_PATH = "/home/guillaume/pythia/code/data/"

In [96]:
fixtures_data = pd.read_csv(f"{FILES_PATH}fixtures.csv", delimiter=",")
odds_data = pd.read_csv(f"{FILES_PATH}odds.csv", delimiter=",")
players_data = pd.read_csv(f"{FILES_PATH}players.csv", delimiter=",")
results_data = pd.read_csv(f"{FILES_PATH}results.csv", delimiter=",")
startingXI_data = pd.read_csv(f"{FILES_PATH}startingXI.csv", delimiter=",")
teams_data = pd.read_csv(f"{FILES_PATH}teams.csv", delimiter=",")

In [97]:
results_data_season1 = results_data[results_data["SeasonID"]==1]

## Feature engineering

### General statistics for season 1

In [98]:
def season_statistics(df : pd.DataFrame) -> list[int, int, float, float]:
    """
    Calculate the total number of goals scored at home/away. 
    As well the average number of goals scored at home/away. 
    """
    return df.HomeScore.sum(), df.AwayScore.sum(), df.HomeScore.mean(), df.AwayScore.mean()
TOTAL_LEAGUE_GOALS_HOME, TOTAL_LEAGUE_GOALS_AWAY, AVERAGE_LEAGUE_GOALS_HOME, AVERAGE_LEAGUE_GOALS_AWAY = season_statistics(train_results)

### Team-oriented statistics

- Calculation of attack & defence strengths for home & away teams
- Shots and goals conceded features generation

In [129]:
def home_average_stats(df : pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the average number of goals scored/conceded for every home team. 
    """
    return df.groupby(["HomeTeamID"]).agg(
        in_HomeSeasonGoalsScoredAvg = ("HomeScore", "mean"),
        in_HomeSeasonGoalsConcededAvg = ("AwayScore", "mean"),
        # in_HomeSeasonShotsAvg = ("HomeShots", "mean"),
        # in_HomeSeasonShotsConcededAvg = ("AwayShots", "mean")
    ).reset_index()

def away_average_stats(df : pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the average number of goals scored/conceded for every away team.  
    """
    return df.groupby(["AwayTeamID"]).agg(
    in_AwaySeasonGoalsScoredAvg = ("AwayScore", "mean"),
    in_AwaySeasonGoalsConcededAvg = ("HomeScore", "mean"),
    # in_AwaySeasonShotsAvg = ("AwayShots", "mean"),
    # in_AwaySeasonShotsConcededAvg = ("HomeShots", "mean")
).reset_index()

In [130]:
def get_home_strength(df : pd.DataFrame) -> pd.DataFrame:
    """
    Divide the average number of goals scored by a home team by the average number of home goals scored thoughout the season.
    """
    df["in_HomeAttackStrength"] = df["in_HomeSeasonGoalsScoredAvg"]/AVERAGE_LEAGUE_GOALS_HOME
    df["in_HomeDefenceStrength"] = df["in_HomeSeasonGoalsConcededAvg"]/AVERAGE_LEAGUE_GOALS_AWAY
    return df.drop(columns=["in_HomeSeasonGoalsScoredAvg", "in_HomeSeasonGoalsConcededAvg"])

def get_away_strength(df : pd.DataFrame) -> pd.DataFrame:
    """
    Divide the average number of goals scored by an away team by the average number of away goals scored thoughout the season.
    """
    df["in_AwayAttackStrength"] = df["in_AwaySeasonGoalsScoredAvg"]/AVERAGE_LEAGUE_GOALS_AWAY
    df["in_AwayDefenceStrength"] = df["in_AwaySeasonGoalsConcededAvg"]/AVERAGE_LEAGUE_GOALS_HOME
    return df.drop(columns=["in_AwaySeasonGoalsScoredAvg", "in_AwaySeasonGoalsConcededAvg"])

In [131]:
def merge_results_stats(train_df : pd.DataFrame, stats_df : pd.DataFrame, _home_boolean : bool) -> pd.DataFrame:
    """
    Add the new features to the existing results dataset. 
    """
    if _home_boolean:
        train_df = train_df.merge(stats_df, on = ["HomeTeamID"], how = "left")
    else:
        train_df = train_df.merge(stats_df, on = ["AwayTeamID"], how = "left")
    return train_df

Home advantage added as a feature

In [132]:
def intercept_term_advantage(df : pd.DataFrame) -> pd.DataFrame:
    """
    Add an intercept term to capture any constant effects such as the advantage of playing at home.
    """
    df["in_InterceptTerm"] = 1
    return df

In [133]:
train_results, test_results = train_test_split(results_data_season1, test_size=0.2)

In [None]:
# df = train_results
# home_strength = home_average_stats(df)
# away_strength = away_average_stats(df)
# home_strength = get_home_strength(home_strength)
# away_strength = get_away_strength(away_strength)
# train_results_home = merge_results_stats(df, home_strength, True)
# train_results_away = merge_results_stats(df, away_strength, False)
# train_results_home = intercept_term_advantage(train_results_home)
# train_results_away = intercept_term_advantage(train_results_away)

In [None]:
def generate_features(df):
    home_strength = home_average_stats(df)
    away_strength = away_average_stats(df)
    home_strength = get_home_strength(home_strength)
    away_strength = get_away_strength(away_strength)
    train_results_home = merge_results_stats(df, home_strength, True)
    train_results_away = merge_results_stats(df, away_strength, False)
    train_results_home = intercept_term_advantage(train_results_home)
    train_results_away = intercept_term_advantage(train_results_away)
    return train_results_home, train_results_away

train_results_home, train_results_away = generate_features(train_results)
test_results_home, test_results_away = generate_features(test_results)



## Input dataset creation and model training

In [140]:
class DataPreprocessing:
    def generate_inputs_model(self, df : pd.DataFrame, home_boolean : bool):
        """
        Generate the model input dataset as well as the ground truth - here the number of goals.  
        """
        if home_boolean:
            X = df[[x for x in df.columns if x.startswith("in_")]]
            # X = df[["InterceptTerm", "HomeSeasonShotsAvg", "HomeSeasonShotsConcededAvg", "HomeAttackStrength", "HomeDefenceStrength"]]
            y = df[["HomeScore"]]
        else:
            X = df[[x for x in df.columns if x.startswith("in_")]]
            # X = df[["InterceptTerm", "AwaySeasonShotsAvg", "AwaySeasonShotsConcededAvg", "AwayAttackStrength", "AwayDefenceStrength"]]
            y = df[["AwayScore"]]
        return X, y

In [141]:
class TrainModel(DataPreprocessing):
    def __init__(self):
        pass
    
    def model_training(self, X : pd.DataFrame, y : pd.DataFrame):
        """
        Fit a Poisson regression model to the train data. 
        """
        return sm.GLM(y, X, family=sm.families.Poisson()).fit()

In [142]:
train_model = TrainModel()
## Generate the model inputs and ground truths for home & away team
X_home, y_home = train_model.generate_inputs_model(train_results_home, True)
X_away, y_away = train_model.generate_inputs_model(train_results_away, False)

## - Build a Poisson regression model to generate predicitons for team playing at home
## - Build another model for team playing away 

model_home = train_model.model_training(X_home, y_home)
model_away = train_model.model_training(X_away, y_away)

## Model testing

In [145]:
class TestModel:
    def __init__(self):
        pass

    def predict_expected_value_goal(self, model, X : pd.Series) -> np.array:
        """
        Predict the expected value of the poisson distribution for a fixture and a team. 
        """
        return model.predict(X.values.tolist())
    
    def random_draw_goal_distrib(self, predicted_lambda : np.array, n : int = 10000) -> np.array:
        """
        Draw samples from a Poisson distribution given its expected value. 
        """
        return np.random.poisson(lam=predicted_lambda, size=(n, len(predicted_lambda))).flatten()
    
    def count_goals(self, simulated_goals : np.array, home_boolean : bool) -> pd.Series:
        """
        Calculate the probability for a set of goals to be scored for a certain fixture. 
        """
        if home_boolean:
            return simulated_goals.value_counts().sort_index().rename("HomeGoalsProbability")/len(simulated_goals)
        else:
            return simulated_goals.value_counts().sort_index().rename("AwayGoalsProbability")/len(simulated_goals)
        
    def generate_goals_proba_table(self,home_goals_proba : pd.Series, away_goals_proba : pd.Series) -> pd.DataFrame:
        """
        Concatenate the goal probabilities for home and away team. 
        """
        return pd.concat([home_goals_proba, away_goals_proba], axis = 1).sort_index().fillna(0)
    
    def calculate_scores_proba(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Multiply the probability of Home Team to score X goals with the probability of Away Team to score Y goals.
        Return a matrix containing the probability of every possible outcome (e.g.: 0-1, 2-2, etc.) 
        """
        scores_proba_matrix = np.outer(df["HomeGoalsProbability"].values, df["AwayGoalsProbability"].values)
        return pd.DataFrame(scores_proba_matrix, index = df.index, columns = df.index)
    
    def calculate_probability_home_win(self, df : pd.DataFrame) -> list[float, float, float]:
        """
        Calculate the probability for the home/away team to win, draw or lose.
        Given the matrix containing the every outcome probability, it sums the all the probabilities for:
        - Home Team to win
        - Away Team to win
        - Draw
        """
        probability_home_win = 0
        probability_away_win = 0
        probability_draw = 0
        for i in range(len(df)):
            for j in range(len(df)):
                if i==j:
                    probability_draw += df.iloc[i, j]
                elif i<j:
                    probability_away_win += df.iloc[i, j]
                else:
                    probability_home_win += df.iloc[i, j]
        return probability_home_win, probability_draw, probability_away_win
    
    def game_outcome(self, probability_win_home : float, probability_draw : float, probability_win_away : float) -> int:
        """
        Find the result with the highest probability and return it as the game outcome.  
        """
        return np.argmax([probability_win_home, probability_draw, probability_win_away])

In [121]:
data_preprocessing = DataPreprocessing()
X_test_home, _ = data_preprocessing.generate_inputs_model(test_results_home, home_boolean=True)
X_test_away, _ = data_preprocessing.generate_inputs_model(test_results_away, home_boolean=False)

In [123]:
test_model = TestModel()

probability_win_home_test = []
probability_draw_test = []
probability_win_away_test = []
for i in range(len(X_test_home)):
    X_test_home_i = X_test_home.iloc[i]
    X_test_away_i = X_test_away.iloc[i]

    predicted_lambda_home = test_model.predict_expected_value_goal(model_home, X_test_home_i)
    predicted_lambda_away = test_model.predict_expected_value_goal(model_away, X_test_away_i)
    simulated_goals_home = test_model.random_draw_goal_distrib(predicted_lambda_away)
    simulated_goals_away = test_model.random_draw_goal_distrib(predicted_lambda_away)
    home_goals_proba = test_model.count_goals(pd.Series(simulated_goals_home), home_boolean = True)
    away_goals_proba = test_model.count_goals(pd.Series(simulated_goals_away), home_boolean = False)
    goals_proba_matrix = test_model.generate_goals_proba_table(home_goals_proba, away_goals_proba)
    scores_proba_df = test_model.calculate_scores_proba(goals_proba_matrix)
    probability_win_home, probability_draw, probability_win_away  = test_model.calculate_probability_home_win(scores_proba_df)

    probability_win_home_test.append(probability_win_home)
    probability_draw_test.append(probability_draw)
    probability_win_away_test.append(probability_win_away)


In [128]:
pd.DataFrame({
    "HomeTeamWinProbability" : probability_win_home_test,
    "TeamsDrawProbability" : probability_draw_test,
    "AwayTeamWinProbability" : probability_win_away_test
})

Unnamed: 0,HomeTeamWinProbability,TeamsDrawProbability,AwayTeamWinProbability
0,0.332099,0.336788,0.331113
1,0.407968,0.187864,0.404168
2,0.395258,0.201078,0.403663
3,0.328936,0.339124,0.331940
4,0.384020,0.222066,0.393915
...,...,...,...
147,0.413875,0.171268,0.414857
148,0.282565,0.431016,0.286418
149,0.345162,0.303071,0.351767
150,0.426798,0.145311,0.427891
