In [295]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from scipy.stats import poisson
from sklearn.preprocessing import StandardScaler

In [296]:
FILES_PATH = "/home/guillaume/pythia/code/data/"
MODEL_FEATURES_HOME = [
    "HomeScore",
    "in_HomeSeasonShotsAvg", 
    "in_HomeSeasonShotsConcededAvg",
    "in_HomeAttackStrength", 
    "in_HomeDefenceStrength",
    "in_HomeGoalkeeperAdjustedRating",
    "in_StrengthDifference",
    # "in_AwayAttackStrength",
    "in_AwayDefenceStrength",
    # "in_InterceptTerm"
    ]

MODEL_FEATURES_AWAY = [
    "AwayScore",
    "in_AwaySeasonShotsAvg", 
    "in_AwaySeasonShotsConcededAvg",
    "in_AwayAttackStrength", 
    "in_AwayDefenceStrength",
    "in_AwayGoalkeeperAdjustedRating",
    "in_StrengthDifference",
    # "in_HomeAttackStrength",
    "in_HomeDefenceStrength",
    # "in_InterceptTerm"
    ]

In [297]:
fixtures_data = pd.read_csv(f"{FILES_PATH}fixtures.csv", delimiter=",")
odds_data = pd.read_csv(f"{FILES_PATH}odds.csv", delimiter=",")
players_data = pd.read_csv(f"{FILES_PATH}players.csv", delimiter=",")
results_data = pd.read_csv(f"{FILES_PATH}results.csv", delimiter=",")
startingXI_data = pd.read_csv(f"{FILES_PATH}startingXI.csv", delimiter=",")
teams_data = pd.read_csv(f"{FILES_PATH}teams.csv", delimiter=",")
goalkeeper_games = pd.read_parquet(f"{FILES_PATH}goalkeeper_games.pq")
goalkeeper_teams = pd.read_parquet(f"{FILES_PATH}goalkeeper_teams.pq")

## Classes creation

In [411]:
class DataPreprocessing: 
    @staticmethod
    def players_stats_preprocessing(df_gk_games : pd.DataFrame, df_gk_teams : pd.DataFrame) -> [pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """ 
        Preprocess the goalkeeper datasets and make them a parent class variables so it can be used in the model pipeline.
        """
        goalkeeper_games_home = df_gk_games.copy()
        goalkeeper_games_away= df_gk_games.copy()

        goalkeeper_games_home = goalkeeper_games_home.rename(columns = {"TeamID" : "HomeTeamID",
                                                                        "GoalkeeperAdjustedRating" : "in_HomeGoalkeeperAdjustedRating",
                                                                        "PlayerID" : "HomePlayerID",
                                                                        "PositionID" : "HomePositionID"
                                                                        })
        goalkeeper_games_away = goalkeeper_games_away.rename(columns = {"TeamID" : "AwayTeamID",
                                                                        "GoalkeeperAdjustedRating" : "in_AwayGoalkeeperAdjustedRating",
                                                                        "PlayerID" : "AwayPlayerID",
                                                                        "PositionID" : "AwayPositionID"
                                                                        })
        goalkeeper_teams = df_gk_teams.rename(columns = {"GoalkeeperAdjustedRating" : "in_GoalkeeperAdjustedRating"})
        return goalkeeper_games_home, goalkeeper_games_away, goalkeeper_teams
    
    goalkeeper_games_home, goalkeeper_games_away, goalkeeper_teams = players_stats_preprocessing(goalkeeper_games, goalkeeper_teams) 

    def game_outcome_ground_truth(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Encode the results of the game.
        - 0 if Home Team won
        - 1 if a draw
        - 2 if Away Team won 
        """
        df["GameOutcome"] = 0 
        df.loc[df["HomeScore"] == df["AwayScore"], "GameOutcome"] = 1  
        df.loc[df["HomeScore"] < df["AwayScore"], "GameOutcome"] = 2
        return df
    
    
    def season_split(self, df : pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split the results dataset into two dataframes for season 1 & 2 
        """
        results_data_season1 = df.copy()
        results_data_season2 = df.copy()
        results_data_season1 = results_data_season1[results_data_season1["SeasonID"]==1]
        results_data_season2 = results_data_season2[results_data_season2["SeasonID"]==2]
        return results_data_season1, results_data_season2

    def split_input_output(self, df : pd.DataFrame, home_boolean : bool) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Generate the model input dataset as well as the ground truth - here the number of goals.  
        """
        if home_boolean:
            y = df[["HomeScore"]]
            X = df[MODEL_FEATURES_HOME].drop(columns = ["HomeScore"])
            
        else:
            y = df[["AwayScore"]]
            X = df[MODEL_FEATURES_AWAY].drop(columns = ["AwayScore"])
        return X, y
    
    def select_input_features(self, df : pd.DataFrame, home_boolean : bool) -> pd.DataFrame:
        """
        Return a dataset used for training the model only with the relevant features. 
        """
        if home_boolean:
            return df[MODEL_FEATURES_HOME]
        else:
            return df[MODEL_FEATURES_AWAY]

    
    def merge_results_stats(self, results_df : pd.DataFrame, stats_df : pd.DataFrame, _home_boolean : bool) -> pd.DataFrame:
        """
        Add the new features to the existing results dataset. 
        """
        if _home_boolean:
            results_df = results_df.merge(stats_df, on = ["HomeTeamID"], how = "left")
        else:
            results_df = results_df.merge(stats_df, on = ["AwayTeamID"], how = "left")
        return results_df
    
    def normalised_data(self, df : pd.DataFrame, trained_scaler) -> pd.DataFrame:
        """ 
        Normalise the data using the Standard Scaler method.
        """
        if trained_scaler is None:
            scaler = StandardScaler()
            df_scaled = scaler.fit_transform(df)
            return pd.DataFrame(df_scaled, columns = df.columns), scaler
        else:
            df_scaled = trained_scaler.transform(df)
            return pd.DataFrame(df_scaled, columns = df.columns)
        

Creation of two goalkeeper rating datasets for home & away games

In [412]:
data_preprocessing = DataPreprocessing()
# goalkeeper_games_home, goalkeeper_games_away, goalkeeper_teams = data_preprocessing.players_stats_preprocessing(goalkeeper_games, goalkeeper_teams)

Encode the outcome of the game in the results dataset
- 0 if Home Team won
- 1 if a draw
- 2 if Away Team won 

In [413]:
results_data = data_preprocessing.game_outcome_ground_truth(results_data)

We extract the unique id of each team and make it a global list as it can be useful throughout the model implementation.

In [414]:
ALL_TEAM_IDS = results_data["HomeTeamID"].sort_values().unique().tolist()

Split dataset into a train dataset for season 1 and a test dataset for season 2

In [415]:
results_data_season1, results_data_season2 = data_preprocessing.season_split(results_data)

## Feature engineering

In [416]:
class FeatureEngineering(DataPreprocessing):
    def __init__(self):
        """
        We generate variables that wil be used throughout the training of the model.
        - The results datasets for season 1 and season 2.

        We generate general statistics about the first season that will be used throughout the model training and testing. 
        - Calcualtion of the total number of goals scored at home/away. 
        - Calculation of the average number of goals scored at home/away. 
        """
        self.results_data_season1, self.results_data_season2 = self.season_split(results_data)
        self.total_league_goals_home, self.total_league_goals_away, self.average_league_goals_home, self.average_league_goals_away = self.season_statistics(self.results_data_season1)
        

    def season_statistics(self, df : pd.DataFrame) -> list[int, int, float, float]:
        """
        Calculate the total number of goals scored at home/away. 
        As well the average number of goals scored at home/away. 
        """
        return df.HomeScore.sum(), df.AwayScore.sum(), df.HomeScore.mean(), df.AwayScore.mean()
    
    def generate_average_stats_home(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Calculate the average number of goals scored/conceded for every home team. 
        """
        return df.groupby(["HomeTeamID"]).agg(
            in_HomeSeasonGoalsScoredAvg = ("HomeScore", "mean"),
            in_HomeSeasonGoalsConcededAvg = ("AwayScore", "mean"),
            in_HomeSeasonShotsAvg = ("HomeShots", "mean"),
            in_HomeSeasonShotsConcededAvg = ("AwayShots", "mean"),
        ).reset_index()

    def generate_average_stats_away(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Calculate the average number of goals scored/conceded for every away team.  
        """
        return df.groupby(["AwayTeamID"]).agg(
        in_AwaySeasonGoalsScoredAvg = ("AwayScore", "mean"),
        in_AwaySeasonGoalsConcededAvg = ("HomeScore", "mean"),
        in_AwaySeasonShotsAvg = ("AwayShots", "mean"),
        in_AwaySeasonShotsConcededAvg = ("HomeShots", "mean"),
    ).reset_index()

    def calculate_strength_home(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Divide the average number of goals scored by a home team by the average number of home goals scored thoughout the season.
        """
        df["in_HomeAttackStrength"] = df["in_HomeSeasonGoalsScoredAvg"]/self.average_league_goals_home
        df["in_HomeDefenceStrength"] = df["in_HomeSeasonGoalsConcededAvg"]/self.average_league_goals_away
        return df.drop(columns=["in_HomeSeasonGoalsScoredAvg", "in_HomeSeasonGoalsConcededAvg"])

    def calculate_strength_away(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Divide the average number of goals scored by an away team by the average number of away goals scored thoughout the season.
        """
        df["in_AwayAttackStrength"] = df["in_AwaySeasonGoalsScoredAvg"]/self.average_league_goals_away
        df["in_AwayDefenceStrength"] = df["in_AwaySeasonGoalsConcededAvg"]/self.average_league_goals_home
        return df.drop(columns=["in_AwaySeasonGoalsScoredAvg", "in_AwaySeasonGoalsConcededAvg"])
    
    def generate_features_dataset(self, df : pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Calculate the the average attack & defence strenght scores for every team.
        """
        df_features_home = self.generate_average_stats_home(df)
        df_features_away = self.generate_average_stats_away(df)
        df_features_home = self.calculate_strength_home(df_features_home)
        df_features_away = self.calculate_strength_away(df_features_away)

        ### TO BE CHANGED
        df_features_home["in_StrengthDifference"] = abs(df_features_home["in_HomeAttackStrength"] - df_features_away["in_AwayDefenceStrength"])
        return df_features_home, df_features_away
    
    def add_home_away_features(self, df_results : pd.DataFrame, df_features_home : pd.DataFrame, df_features_away : pd.DataFrame) -> pd.DataFrame:
        """
        Merge the results of season 1 with the team features previously calculated. 
        """
        df_merged = df_results.merge(df_features_home, on = ["HomeTeamID"], how = "left")
        return df_merged.merge(df_features_away, on = ["AwayTeamID"], how = "left")
    
    def add_goalkeeper_rating_feature(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Add the goalkeeper ratings to the train results dataset for home and away teams.  
        """
        df_merged = df.merge(FeatureEngineering.goalkeeper_games_home, on = ["MatchID", "HomeTeamID"], how = "left")
        return df_merged.merge(FeatureEngineering.goalkeeper_games_away, on = ["MatchID", "AwayTeamID"], how = "left")

In [417]:
feature_engineering = FeatureEngineering()

### Team-oriented statistics
- Calculation of attack & defence strengths for home & away teams
- Shots and goals conceded features generation

In [418]:
df_features_home, df_features_away = feature_engineering.generate_features_dataset(results_data_season1)

In [419]:
train_results_merged = feature_engineering.add_home_away_features(results_data_season1, df_features_home, df_features_away)

In [420]:
train_results_merged = feature_engineering.add_goalkeeper_rating_feature(train_results_merged)

## Input dataset creation and model training

In [421]:
class TrainModel(DataPreprocessing):
    def __init__(self):
        pass
    
    def model_training(self, X : pd.DataFrame, y : pd.DataFrame):
        """
        Fit a Poisson regression model to the train data. 
        """
        return sm.GLM(y, X, family=sm.families.Poisson()).fit()
    
    def run_model_training(self, df : pd.DataFrame):
        ## Generate the model inputs and ground truths for home & away team
        train_features_home = self.select_input_features(df, home_boolean = True)
        train_features_away = self.select_input_features(df, home_boolean = False)

        ## Normalise the data
        train_features_scaled_home, scaler_home = self.normalised_data(train_features_home, None)
        train_features_scaled_away, scaler_away = self.normalised_data(train_features_away, None)

        ## Generate model inputs and outputs
        X_home, y_home = self.split_input_output(train_features_scaled_home, home_boolean = True)
        X_away, y_away = self.split_input_output(train_features_scaled_away, home_boolean = False)

        ## - Build a Poisson regression model to generate predicitons for team playing at home
        ## - Build another model for team playing away 
        model_home = self.model_training(X_home, y_home)
        model_away = self.model_training(X_away, y_away)
        return model_home, model_away, scaler_home, scaler_away

In [422]:
train_model = TrainModel()
model_home, model_away, scaler_home, scaler_away = train_model.run_model_training(train_results_merged)

## Model testing

In [431]:
class TestModel(FeatureEngineering):
    def predict_expected_value_goal(self, model, X : pd.Series) -> np.array:
        """
        Predict the expected value of the poisson distribution for a fixture and a team. 
        """
        return model.predict(X.values.tolist())
    
    def random_draw_goal_distrib(self, predicted_lambda : np.array, n : int = 10000) -> np.array:
        """
        Draw samples from a Poisson distribution given its expected value. 
        """
        return np.random.poisson(lam=predicted_lambda, size=(n, len(predicted_lambda))).flatten()
    
    def count_goals(self, simulated_goals : np.array, home_boolean : bool) -> pd.Series:
        """
        Calculate the probability for a set of goals to be scored for a certain fixture. 
        """
        if home_boolean:
            return simulated_goals.value_counts().sort_index().rename("HomeGoalsProbability")/len(simulated_goals)
        else:
            return simulated_goals.value_counts().sort_index().rename("AwayGoalsProbability")/len(simulated_goals)
        
    def generate_goals_proba_table(self,home_goals_proba : pd.Series, away_goals_proba : pd.Series) -> pd.DataFrame:
        """
        Concatenate the goal probabilities for home and away team. 
        """
        return pd.concat([home_goals_proba, away_goals_proba], axis = 1).sort_index().fillna(0)
    
    def calculate_scores_proba(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Multiply the probability of Home Team to score X goals with the probability of Away Team to score Y goals.
        Return a matrix containing the probability of every possible outcome (e.g.: 0-1, 2-2, etc.) 
        """
        scores_proba_matrix = np.outer(df["HomeGoalsProbability"].values, df["AwayGoalsProbability"].values)
        return pd.DataFrame(scores_proba_matrix, index = df.index, columns = df.index)
    
    def calculate_outcome_probability(self, df : pd.DataFrame) -> list[float, float, float]:
        """
        Calculate the probability for the home/away team to win, draw or lose.
        Given the matrix containing the every outcome probability, it sums the all the probabilities for:
        - Home Team to win
        - Away Team to win
        - Draw
        """
        probability_home_win = 0
        probability_away_win = 0
        probability_draw = 0
        for i in range(len(df)):
            for j in range(len(df)):
                if i==j:
                    probability_draw += df.iloc[i, j]
                elif i<j:
                    probability_away_win += df.iloc[i, j]
                else:
                    probability_home_win += df.iloc[i, j]
        return probability_home_win, probability_draw, probability_away_win
    
    def game_outcome(self, probability_win_home : float, probability_draw : float, probability_win_away : float) -> int:
        """
        Find the result with the highest probability and return it as the game outcome.  
        """
        return np.argmax([probability_win_home, probability_draw, probability_win_away])
    
    def generate_input_testing(self, df : pd.DataFrame, df_features_home : pd.DataFrame, df_features_away : pd.DataFrame) -> pd.DataFrame:
        """
        Generate inputs for testing. 
        """
        test_results_home = df.merge(df_features_home, on = "HomeTeamID", how = "left")
        test_results_away = df.merge(df_features_away, on = "AwayTeamID", how = "left")
        X_test_home, _ = self.split_input_output(test_results_home, home_boolean=True)
        X_test_away, _ = self.split_input_output(test_results_away, home_boolean=False)

        return X_test_home, X_test_away
    
    def random_goalkeeper_selection(self, df_gk_teams: pd.DataFrame, n_draw : int = 1) -> pd.DataFrame:
        return df_gk_teams.groupby(["TeamID"]).apply(lambda x: x.sample(n=n_draw))[["TeamID", "in_GoalkeeperAdjustedRating"]].reset_index(drop=True)
    
    def merge_random_players_results(self, df_results : pd.DataFrame, df_gk : pd.DataFrame) -> pd.DataFrame:
        """
        Merge the goalkeeper selected at random for every team with the results of season 2. 
        """
        df_gk_home = df_gk.rename(columns = {"in_GoalkeeperAdjustedRating" : "in_HomeGoalkeeperAdjustedRating"})
        df_gk_away = df_gk.rename(columns = {"in_GoalkeeperAdjustedRating" : "in_AwayGoalkeeperAdjustedRating"})

        df_merged = df_results.merge(df_gk_home, left_on = ["HomeTeamID"], right_on = ["TeamID"], how = "left").drop(columns = ["TeamID"])
        return df_merged.merge(df_gk_away, left_on = ["AwayTeamID"], right_on = ["TeamID"], how = "left").drop(columns = ["TeamID"])
    
    def run_model_testing(self, df : pd.DataFrame, df_features_home : pd.DataFrame, df_features_away : pd.DataFrame, scaler_home : pd.DataFrame, scaler_away : pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        test_results_merged = self.add_home_away_features(df, df_features_home, df_features_away)
        goalkeeper_random_selection = self.random_goalkeeper_selection(goalkeeper_teams)
        test_results_merged = self.merge_random_players_results(test_results_merged, goalkeeper_random_selection)

        test_features_home = train_model.select_input_features(test_results_merged, home_boolean = True)
        test_features_away = train_model.select_input_features(test_results_merged, home_boolean = False)
        
        test_features_scaled_home = self.normalised_data(test_features_home, scaler_home)
        test_features_scaled_away = self.normalised_data(test_features_away, scaler_away)

        X_test_home, _ = self.split_input_output(test_features_scaled_home, home_boolean=True)
        X_test_away, _ = self.split_input_output(test_features_scaled_away, home_boolean=False)
        return X_test_home, X_test_away
    
    def model_testing_metrics_generation(self, X_test_home, X_test_away, model_home, model_away):
        probability_win_home_test = []
        probability_draw_test = []
        probability_win_away_test = []
        game_outcome_test = []
        for i in range(len(X_test_home)):
            X_test_home_i = X_test_home.iloc[i]
            X_test_away_i = X_test_away.iloc[i]

            predicted_lambda_home = self.predict_expected_value_goal(model_home, X_test_home_i)
            predicted_lambda_away = self.predict_expected_value_goal(model_away, X_test_away_i)

            simulated_goals_home = self.random_draw_goal_distrib(predicted_lambda_home)
            simulated_goals_away = self.random_draw_goal_distrib(predicted_lambda_away)

            home_goals_proba = self.count_goals(pd.Series(simulated_goals_home), home_boolean = True)
            away_goals_proba = self.count_goals(pd.Series(simulated_goals_away), home_boolean = False)
            goals_proba_matrix = self.generate_goals_proba_table(home_goals_proba, away_goals_proba)
                    
            scores_proba_df = self.calculate_scores_proba(goals_proba_matrix)
            probability_win_home, probability_draw, probability_win_away  = self.calculate_outcome_probability(scores_proba_df)

            probability_win_home_test.append(probability_win_home)
            probability_draw_test.append(probability_draw)
            probability_win_away_test.append(probability_win_away)
            game_outcome_test.append(self.game_outcome(probability_win_home, probability_draw, probability_win_away))

        return pd.DataFrame({
        "HomeTeamWinProbability" : probability_win_home_test,
        "TeamsDrawProbability" : probability_draw_test,
        "AwayTeamWinProbability" : probability_win_away_test,
        "PredictedGameOutcome" : game_outcome_test
        })


Goalkeeper random selection for every game of season 2

In [438]:
test_model = TestModel()
X_test_home, X_test_away = test_model.run_model_testing(results_data_season2, df_features_home, df_features_away, scaler_home, scaler_away)
model_test_outcomes = test_model.model_testing_metrics_generation(X_test_home, X_test_away, model_home, model_away)

  return df_gk_teams.groupby(["TeamID"]).apply(lambda x: x.sample(n=n_draw))[["TeamID", "in_GoalkeeperAdjustedRating"]].reset_index(drop=True)


In [440]:
model_test_outcomes["PredictedGameOutcome"].value_counts()

PredictedGameOutcome
0    377
2    360
1     19
Name: count, dtype: int64

### Compare results 2 prediction with the ground truth

In [441]:
def merge_season2_outcomes(df_season : pd.DataFrame, df_results : pd.DataFrame) -> pd.DataFrame:
    """
    Concatenate the predictions of win/draw/lose with the original dataset for season 2.
    """
    return pd.concat([df_season.reset_index(drop=True), df_results.reset_index(drop=True)], axis=1)
test_results_prediction = merge_season2_outcomes(results_data_season2, outcomes_df)

Calculate the predicted odds using the probabilities computed by the model for season 2

In [442]:
def calculate_predicted_odds(df : pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the odds based on the predicted probabilities. 
    """
    df["PredictedOddsHomeWin"] = 1/df["HomeTeamWinProbability"]
    df["PredictedOddsDraw"] = 1/df["TeamsDrawProbability"]
    df["PredictedOddsAwayWin"] = 1/df["AwayTeamWinProbability"]
    return df
test_results_prediction = calculate_predicted_odds(test_results_prediction)

Calculate the expected value of points for every team at every game
$$E[X] = \sum_{x=0,1,3} x*P(X=x)$$
- x represents the possible points a team can earn in a game 
- P(X=x) is the probability of earning x points, as computed by the model.

In [443]:
def calculate_points_expected_value_per_game(df : pd.DataFrame) -> pd.DataFrame:
    df["HomePredictedPoints"] = df["HomeTeamWinProbability"]*3 + df["TeamsDrawProbability"]
    df["AwayPredictedPoints"] = df["AwayTeamWinProbability"]*3 + df["TeamsDrawProbability"]
    return df
test_results_prediction = calculate_points_expected_value_per_game(test_results_prediction)

In [444]:
def calculate_points_season_home_away(df : pd.DataFrame, home_boolean : bool) -> pd.DataFrame:
    if home_boolean:
        return df.groupby(["HomeTeamID"]).agg(
            season_predicted_points_home = ("HomePredictedPoints", "sum")
        ).reset_index(drop=False)

    else:
        return df.groupby(["AwayTeamID"]).agg(
            season_predicted_points_away = ("AwayPredictedPoints", "sum")
        ).reset_index(drop=False)
predicted_points_home = calculate_points_season_home_away(test_results_prediction, True)
predicted_points_away = calculate_points_season_home_away(test_results_prediction, False)

In [445]:
def calculate_points_season(df_home, df_away):
    df = df_home.merge(df_away, left_on="HomeTeamID", right_on="AwayTeamID", how = "inner")
    df["PredictedPoints"] = df["season_predicted_points_home"].values + df["season_predicted_points_away"].values
    return df.rename(columns = {"HomeTeamID" : "TeamID"}).drop(columns = "AwayTeamID")
predicted_points = calculate_points_season(predicted_points_home, predicted_points_away).sort_values(by = ["PredictedPoints"], ascending=False)

In [446]:
def generate_ranking(df):
    df["ranking_season2"] = np.arange(1, len(df)+1)
    return df
predicted_points = generate_ranking(predicted_points)

### Monte-Carlo simualtion using the outcome probabilities to compute the final ranking probabilities of season 2

We employ Monte Carlo simulations to predict the outcomes of all games in Season 2, leveraging the probabilities generated by the model: $$P(X = \text{Home team win}), P(X = \text{Draw}), P(X = \text{Away team win})$$

In [447]:
def MC_simulation_single_season(df : pd.DataFrame) -> pd.DataFrame:
    """
    Returns the total number of points for every team after simulating
    the whole season given the probabilities calculated by the model.
    """
    points = {_team: 0 for _team in ALL_TEAM_IDS}
    for i in range(len(df)):
        _game = df.iloc[i]
        
        team_id_home = _game["HomeTeamID"] ## Home Team ID
        team_id_away = _game["AwayTeamID"] ## Away Team ID

        proba_win_home = _game["HomeTeamWinProbability"] ## Probability of Home Team to win that game
        proba_draw = _game["TeamsDrawProbability"] ## Probability of teams to draw
        proba_win_away = _game["AwayTeamWinProbability"] ## Probability of Away Team to win that game

        ## Generation of a random number {0, 1, 2} with the probabilities of [Home, Draw, Away]
        _outcome = np.random.choice(np.arange(0, 3), p=[proba_win_home, proba_draw, proba_win_away])

        ## 3, 1 or 0 points are then added to the team's total points of the season
        if _outcome == 0:
            points[team_id_home] += 3 
        elif _outcome == 1:
            points[team_id_home] += 1
            points[team_id_away] += 3 
        else:
            points[team_id_away] += 3 
    return points
    


In [None]:
def MC_simulation_multiple_seasons(df : pd.DataFrame, n_iterations : int = 100) -> pd.DataFrame:
    """
    Simulate a number of n_iterations seasons using the probabilities of the model.
    It returns the probability of every ranking for every team.  
    """
    position_counts = {_team: [0] * len(ALL_TEAM_IDS) for _team in ALL_TEAM_IDS} ## Dictionnary containing the history of every team's ranking
    for _iter in range(n_iterations):
        season_points = MC_simulation_single_season(df) ## Simulation of a single season

        ## We save the points in a dataframe and ranked the teams accordingly
        df_season_points = pd.DataFrame(season_points.items(), columns = ["TeamID", "SeasonPoints"])\
            .sort_values(by = ["SeasonPoints"], ascending=False)\
                .reset_index(drop=True)
        
        ## The ranking of every team for this season is saved in a dictionnary
        for _pos, team_val in df_season_points.iterrows():
            team_id = team_val["TeamID"]
            position_counts[team_id][_pos] += 1

    ## Generation of ranking probabilities by dividing the number of times a team finishes in a specific position by the total number of simulations
    for _key, _val in position_counts.items():
        position_counts[_key] = [x/n_iterations for x in _val]

    ## Creation of a dataframe for the results with the ranking as index and the teams as columns
    df_results = pd.DataFrame(position_counts, index = position_counts.keys())
    df_results.columns = [f"TeamID_{x}" for x in df_results.columns]
    return df_results

df_ranking_season2_proba = MC_simulation_multiple_seasons(test_results_prediction)

In [None]:
for i in ALL_TEAM_IDS:
    ranking_team = df_ranking_season2_proba[f"TeamID_{i}"]
    ranking_team_plot = ranking_team[ranking_team!=0]
    plt.bar(ranking_team_plot.index, ranking_team_plot.values)
    plt.show()

## Real ranking of season 2