In [21]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import pyarrow as py

In [2]:
## Data directory. It can be overridden with the PYTHIA_DATA_DIR environment variable; the
## original location is kept as the default. os.path.join(..., "") guarantees the trailing
## separator that the f"{FILES_PATH}<file name>" concatenations below rely on.
FILES_PATH = os.path.join(os.environ.get("PYTHIA_DATA_DIR", "/home/guillaume/pythia/code/data/"), "")

In [3]:
## Load every raw table from the data directory (comma-separated files).
## Match-level tables
fixtures_data = pd.read_csv(f"{FILES_PATH}fixtures.csv", sep=",")
odds_data = pd.read_csv(f"{FILES_PATH}odds.csv", sep=",")
## Player-level table
players_data = pd.read_csv(f"{FILES_PATH}players.csv", sep=",")
## Results and line-ups
results_data = pd.read_csv(f"{FILES_PATH}results.csv", sep=",")
startingXI_data = pd.read_csv(f"{FILES_PATH}startingXI.csv", sep=",")
## Team-level table
teams_data = pd.read_csv(f"{FILES_PATH}teams.csv", sep=",")

## Feature engineering on players

Let's first check whether any players share the same name.

In [5]:
## Flag every repeated occurrence of a player name (the first occurrence is not flagged).
_repeated_name_mask = players_data["PlayerName"].duplicated()

## Players who share their name with at least one other player must be told apart later on,
## so the repeated names are kept in a global variable.
HOMONYMOUS_PLAYER_NAMES = players_data.loc[_repeated_name_mask, "PlayerName"].values

## Display the flagged rows.
players_data[_repeated_name_mask]

Unnamed: 0,PlayerName,TeamID,PositionID
92,Frank Smith,12,3
414,Fred Williams,24,3
650,Robert Pitts,8,1


We create a unique ID for every player

In [6]:
class GoalkeeperPreprocessing:
    """
    Helpers that prepare the players and starting XI datasets for the goalkeeper rating.

    Methods that need the players table or the list of homonymous names accept them as
    optional arguments. When omitted, they fall back to the notebook globals `players_data`
    and `HOMONYMOUS_PLAYER_NAMES`, which are looked up when the method is called (not when
    the class is defined), so re-running an earlier cell is always taken into account.
    """

    def create_player_ID(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Create a unique player ID (1, 2, ..., number of rows, following the row order).

        The "PlayerID" column is added to `df` in place and the same dataframe is returned.
        """
        df["PlayerID"] = np.arange(1, len(df)+1)
        return df

    def link_players_to_teamID(self, player_names : list, homonymous_player_names : list = None, players : pd.DataFrame = None):
        """
        Link the team ID to the list of players in the dataset containing the starting XI of every game.
        It uses the players' name except for the ones which are not unique.

        Parameters:
        - player_names: names of the players of one starting XI.
        - homonymous_player_names: names shared by several players, which are skipped.
          Defaults to the global HOMONYMOUS_PLAYER_NAMES.
        - players: table with the "PlayerName" and "TeamID" columns. Defaults to the global players_data.

        Returns the "TeamID" value of the first player whose name is not homonymous and is
        present in `players`, or None when no player of the list can be resolved.
        """
        if homonymous_player_names is None:
            homonymous_player_names = HOMONYMOUS_PLAYER_NAMES
        if players is None:
            players = players_data

        ## A set makes every membership test O(1)
        ambiguous_names = set(homonymous_player_names)

        for _name in player_names:
            if _name in ambiguous_names:
                continue
            team_ids = players.loc[players["PlayerName"] == _name, "TeamID"].values
            ## A name missing from the players table is skipped instead of raising an IndexError
            if len(team_ids) > 0:
                return team_ids[0]
        return None

    def explode_startingXI_dataset(self, df : pd.DataFrame, players : pd.DataFrame = None) -> pd.DataFrame:
        """
        Return a StartingXI dataset with one unique player per row per game.

        Parameters:
        - df: dataset with a list-like "StartingXI" column and a "TeamID" column.
        - players: table merged on ["PlayerName", "TeamID"] (left join). Defaults to the global players_data.

        The input dataframe is not modified; a new dataframe is returned.
        """
        if players is None:
            players = players_data

        df = df.explode("StartingXI").rename(columns = {"StartingXI" : "PlayerName"})
        df = df.merge(players, on = ["PlayerName", "TeamID"], how = "left")
        df["MatchPlayed"] = 1 ## This variable will be useful later on for any calculations (e.g.: to count the number of games played)
        return df

In [7]:
gk_preprocessing = GoalkeeperPreprocessing()

## Creation of player ID
players_data = gk_preprocessing.create_player_ID(players_data)

## Split every comma-separated line-up into a list of names. Values which are not strings
## (e.g. already split) are left untouched, so that re-running this cell does not fail.
startingXI_data["StartingXI"] = startingXI_data["StartingXI"].map(lambda x: x.split(",") if isinstance(x, str) else x)

## Add team ID to the dataset using players' name
startingXI_data["TeamID"] = startingXI_data["StartingXI"].map(lambda x: gk_preprocessing.link_players_to_teamID(x))

## Explode the dataframe to get one player per row, per game, per team
## (the method returns a new dataframe, startingXI_data itself is not exploded)
startingXI_data_exploded = gk_preprocessing.explode_startingXI_dataset(startingXI_data)

## We merge the season results with the players dataset
df_merged = startingXI_data_exploded.merge(results_data, on = ["MatchID"], how = "left")

## Keep season 1 only. The explicit copy makes results_with_players an independent dataframe,
## so that columns can be added to it later without touching df_merged.
results_with_players = df_merged[df_merged["SeasonID"]==1].copy()

## Goalkeeper rating calculation

The goalkeeper rating is the ratio of goals conceded to shots conceded. However, this rating becomes less reliable for goalkeepers who haven't faced many shots. Therefore, we calculate an adjusted rating using the formula below:
$$\text{Adjusted rating} = \frac{n}{n+k}\cdot\text{Goalkeeper rating} + \frac{k}{n+k}\cdot\text{Baseline rating}$$
- n: number of shots faced by the goalkeeper
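- k: weight given to the baseline (the larger k is, the more shots a goalkeeper must face before their own rating dominates the adjusted rating)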
- baseline rating: the average rating of goalkeepers for season 1

Calculation of a global goalkeeper ratio for the whole league

In [8]:
## League-wide totals over every row of results_data, home and away sides added together
goals_league = sum(results_data[side].sum() for side in ("HomeScore", "AwayScore"))
shots_league = sum(results_data[side].sum() for side in ("HomeShots", "AwayShots"))

## Goals per shot across the league. It will be used as a global variable.
## NOTE(review): results_data is not filtered by season here, whereas the player statistics
## are restricted to SeasonID == 1 - confirm that this is intended.
RATIO_LEAGUE = goals_league/shots_league
print(f"The global league ratio is {RATIO_LEAGUE}")

The global league ratio is 0.11351712532171847


In [9]:
class GoalkeeperRatingGeneration:
    """
    Build the conceded goals / conceded shots rating per player and shrink it towards a baseline ratio.
    """

    def create_new_variables(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Creation of new variables with no distinction of home/away.

        A row is considered "home" when TeamID equals HomeTeamID. The Home*/Away* score and shot
        columns are replaced by TeamScore, OpponentScore, TeamShots and OpponentShots.
        The input dataframe is not modified; a new dataframe is returned.
        """
        is_home = df["TeamID"] == df["HomeTeamID"]

        ## Vectorised selection instead of four row-wise apply calls
        return df.assign(
            TeamScore = np.where(is_home, df["HomeScore"], df["AwayScore"]),
            OpponentScore = np.where(is_home, df["AwayScore"], df["HomeScore"]),
            TeamShots = np.where(is_home, df["HomeShots"], df["AwayShots"]),
            OpponentShots = np.where(is_home, df["AwayShots"], df["HomeShots"]),
        ).drop(columns = ["HomeScore", "AwayScore", "HomeShots", "AwayShots"])

    def create_goalkeeper_rating(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Create variables by summing the number of goals and shots conceded per player (PlayerID)
        over all the rows of `df`, then compute the ratio goals conceded / shots conceded.

        Returns a new dataframe with the columns PlayerID, GoalkeeperConcededGoalsSeason,
        GoalkeeperConcededShotsSeason and GoalkeeperRating (0 when no shot was conceded).
        No global variable is read or modified.
        """
        per_player = df.groupby("PlayerID")

        ## Add the new variables in a dataframe
        players_features = pd.DataFrame({
            "GoalkeeperConcededGoalsSeason" : per_player["OpponentScore"].sum(),  ## Sum of conceded goals
            "GoalkeeperConcededShotsSeason" : per_player["OpponentShots"].sum(),  ## Sum of conceded shots
        }).reset_index()

        ## Fill the NaN with 0
        players_features = players_features.fillna(0)

        ## Calculate the ratio of goals conceded / shots conceded (0 when no shot was conceded)
        goals = players_features["GoalkeeperConcededGoalsSeason"]
        shots = players_features["GoalkeeperConcededShotsSeason"]
        players_features["GoalkeeperRating"] = (goals / shots).where(shots != 0, 0.0)
        return players_features

    def adjusted_rating_formula(self, gk_rating : float, n_shots : int, ratio_league : float, k : int) -> float:
        """
        Calculate the adjusted goalkeeper ratio:
        n_shots/(n_shots+k) * gk_rating + k/(n_shots+k) * ratio_league

        When both n_shots and k are 0 (scalar inputs), there is no weight at all and the
        baseline ratio_league is returned instead of dividing by zero.
        """
        total_weight = n_shots + k
        ## The guard only applies to scalars, array-like inputs are computed element-wise as before
        if np.ndim(total_weight) == 0 and total_weight == 0:
            return ratio_league
        return n_shots/total_weight*gk_rating + k/total_weight*ratio_league

    def calculate_adjusted_rating(self, df : pd.DataFrame, ratio_league : float, k : int) -> pd.DataFrame:
        """
        Add the adjusted ratio into the dataset.

        The "GoalkeeperAdjustedRating" column is computed from the GoalkeeperRating and
        GoalkeeperConcededShotsSeason columns, added to `df` in place, and `df` is returned.
        """
        df["GoalkeeperAdjustedRating"] = [
            self.adjusted_rating_formula(rating, n_shots, ratio_league, k)
            for rating, n_shots in zip(df["GoalkeeperRating"], df["GoalkeeperConcededShotsSeason"])
        ]
        return df

In [10]:
gk_rating = GoalkeeperRatingGeneration()

## Home/away-neutral score and shot columns, then the totals and raw rating per player
results_with_players = gk_rating.create_new_variables(results_with_players)
players_features = gk_rating.create_goalkeeper_rating(results_with_players)

## The median number of conceded shots motivates the choice of k
median_shots_conceded = players_features["GoalkeeperConcededShotsSeason"].median()
print(f"The median of the number of shots conceded during season 1 is {median_shots_conceded}.\nWe take k = 34 so that the original goalkeeper rating counts for 90% of the adjusted rating if the number of conceded shots is equal to the median.")

## Shrink every rating towards the league ratio with k = 34
players_features = gk_rating.calculate_adjusted_rating(
    df = players_features,
    ratio_league = RATIO_LEAGUE,
    k = 34,
)

The median of the number of shots conceded during season 1 is 302.0.
We take k = 34 so that the original goalkeeper rating counts for 90% of the adjusted rating if the number of conceded shots is equal to the median.


Generation of:
- One dataset with the goalkeeper adjusted rating for every game of the season
- One dataset with all the goalkeeper adjusted ratings per team

In [None]:
## Attach the season features of every player to the per-game rows
results_with_players = results_with_players.merge(players_features, on = ["PlayerID"], how = "left")

## Rows of players whose PositionID is 1 (the position kept for the goalkeeper datasets)
goalkeeper_rows = results_with_players[results_with_players["PositionID"]==1]

## One row per game and goalkeeper
games_columns = ["MatchID", "TeamID", "PlayerID", "PositionID", "GoalkeeperAdjustedRating"]
goalkeeper_games = goalkeeper_rows[games_columns].drop_duplicates()
goalkeeper_games = goalkeeper_games.sort_values(by="PlayerID").reset_index(drop=True)

## One row per team and goalkeeper
teams_columns = ["TeamID", "PlayerID", "PositionID", "GoalkeeperAdjustedRating"]
goalkeeper_teams = goalkeeper_rows[teams_columns].drop_duplicates()
goalkeeper_teams = goalkeeper_teams.sort_values(by="PlayerID").reset_index(drop=True)


## Save goalkeeper rating files as parquet files

In [26]:
## Destination files, written next to the input CSV files
games_parquet_path = f"{FILES_PATH}goalkeeper_games.pq"
teams_parquet_path = f"{FILES_PATH}goalkeeper_teams.pq"

## Per-game dataset first, then the per-team dataset
goalkeeper_games.to_parquet(games_parquet_path)
goalkeeper_teams.to_parquet(teams_parquet_path)