In [51]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import pyarrow as py

In [52]:
## Folder the notebook is launched from
current_directory = os.getcwd()
## Global location of the data files; the trailing slash lets file names be appended directly
FILES_PATH = current_directory + "/data/"

In [53]:
## Load every raw comma-separated table from the data folder into its own DataFrame
fixtures_data = pd.read_csv(FILES_PATH + "fixtures.csv", sep=",")
odds_data = pd.read_csv(FILES_PATH + "odds.csv", sep=",")
players_data = pd.read_csv(FILES_PATH + "players.csv", sep=",")
results_data = pd.read_csv(FILES_PATH + "results.csv", sep=",")
startingXI_data = pd.read_csv(FILES_PATH + "startingXI.csv", sep=",")
teams_data = pd.read_csv(FILES_PATH + "teams.csv", sep=",")

In [63]:
## Keep only the matches whose SeasonID is 1
results_data = results_data.loc[results_data["SeasonID"].eq(1)]

# Objectives of this notebook and discussion

As we have the line-up for every game of Season 1, it would be interesting to compute a performance score for each player based on their contributions throughout the season.

To achieve this, we first compile the starting XI of each team for every game and then aggregate the data over the season using a unique player ID.

This approach allows us to generate statistics for each player for Season 1 using the available game data, providing insights into their impact. The goal is to summarize each player’s performance and measure their influence on the season's outcomes.

It is important to note that this metric represents an average impact and cannot precisely capture their contribution without specific player-related data. However, by aggregating data over Season 1, we aim to evaluate the overall defensive and attacking influence of each player.

Ideally, this analysis would cover all players, regardless of their position on the field. However, due to time constraints, we will focus exclusively on goalkeepers.

Goalkeepers likely have the most measurable impact on a game. Unlike outfield players, whose performance often depends on their partnerships (e.g., Saliba and Gabriel perform well together), we can evaluate a goalkeeper's performance more directly. This can be done using metrics such as the number of goals and shots they conceded.

An appropriate rating for goalkeepers could be calculated as:

$$\text{Rating} = \frac{\text{Number of conceded goals}}{\text{Number of conceded shots}}$$


While we may not know whether every shot was on target, we will assume that the percentage of shots on target is consistent across teams. Moreover, as teams all play each other, it should not significantly affect our analysis.

Additionally, we will assume that goalkeepers perform similarly in home and away games. While this may not be entirely accurate, we can reasonably assume that their shot-blocking percentage remains consistent regardless of where they play. Although goalkeepers may face more shots when playing away (as teams often perform worse), using the ratio of goals conceded to shots conceded ensures that this factor does not influence the results.

## Feature engineering on goalkeepers

Let's first check if there are any players with the same name

In [55]:
## Names that appear more than once in the players table must not be used to identify a team,
## so the list of repeated names is kept as a global variable.
HOMONYMOUS_PLAYER_NAMES = players_data.loc[players_data["PlayerName"].duplicated(), "PlayerName"].values

## Display the rows whose name was already seen earlier in the table
players_data.loc[players_data["PlayerName"].duplicated()]

Unnamed: 0,PlayerName,TeamID,PositionID
92,Frank Smith,12,3
414,Fred Williams,24,3
650,Robert Pitts,8,1


### Creation of a class to preprocess the data needed to compute statistics about goalkeepers

The two main outcomes of the class are:
- Creating a unique player ID
- Adding the starting XI to every game played (season results data)

In [66]:
class GoalkeeperPreprocessing:
    """
    Build the player-level dataset used to rate goalkeepers.

    The pipeline gives every player a unique ID, attaches a team ID to each
    starting XI, expands each line-up to one row per player and joins the
    result with the match results.
    """

    def create_player_ID(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Create a unique player ID.

        The "PlayerID" column (1..len(df), following the row order) is written
        into `df` itself and the same object is returned.
        """
        df["PlayerID"] = np.arange(1, len(df)+1)
        return df

    def link_players_to_teamID(self, player_names : list, homonymous_player_names : list = HOMONYMOUS_PLAYER_NAMES, players : pd.DataFrame = None) -> "int | None":
        """
        Return the team ID of a line-up, looked up through its players' names.

        The first name of `player_names` that is not in `homonymous_player_names`
        is searched in `players` and the "TeamID" of its first match is returned.

        Parameters
        ----------
        player_names : names of the players of one starting XI.
        homonymous_player_names : names shared by several players, which are skipped.
        players : table with "PlayerName" and "TeamID" columns. When omitted, the
            notebook-level `players_data` is used (previous behaviour).

        Returns
        -------
        The team ID, or None when every name of the line-up is homonymous.

        Raises
        ------
        IndexError when the selected name is absent from `players`.
        """
        if players is None:
            # Backward-compatible fallback for callers relying on the notebook-level table
            players = players_data
        for _name in player_names:
            if _name in homonymous_player_names:
                continue
            return players.loc[players["PlayerName"] == _name, "TeamID"].values[0]
        return None

    def explode_startingXI_dataset(self, df : pd.DataFrame, players : pd.DataFrame = None) -> pd.DataFrame:
        """
        Return a StartingXI dataset with one unique player per row per game.

        `df` must hold a list-like "StartingXI" column and a "TeamID" column. Each
        player row is enriched with the columns of `players` through a left join on
        ("PlayerName", "TeamID"). When `players` is omitted, the notebook-level
        `players_data` is used (previous behaviour).
        """
        if players is None:
            players = players_data
        exploded = df.explode("StartingXI").rename(columns = {"StartingXI" : "PlayerName"})
        exploded = exploded.merge(players, on = ["PlayerName", "TeamID"], how = "left")
        exploded["MatchPlayed"] = 1 ## Constant column, handy for later counts (e.g.: number of games played)
        return exploded

    def run_goalkeeper_preprocessing(self, results_data : pd.DataFrame, startingXI_data : pd.DataFrame, players_data : pd.DataFrame) -> pd.DataFrame:
        """
        Run the preprocessing pipeline for the goalkeeper ratings generation.

        Every step works on the frames received as arguments, so the result does
        not depend on which notebook-level variables happen to exist.

        Parameters
        ----------
        results_data : match results, with a "MatchID" column.
        startingXI_data : one row per line-up, with "MatchID" and a comma-separated "StartingXI" string.
        players_data : players table with "PlayerName" and "TeamID"; a "PlayerID" column is added to it in place.

        Returns
        -------
        One row per starting player per match, joined with the match results.
        """
        ## Creation of a unique player ID
        players_with_ids = self.create_player_ID(players_data)

        ## Names shared by several players cannot identify a team on their own
        homonymous_names = players_with_ids.loc[players_with_ids["PlayerName"].duplicated(), "PlayerName"].values

        ## Add team ID to the dataset using players' name
        lineups = startingXI_data.copy()
        lineups["StartingXI"] = lineups["StartingXI"].str.split(",")
        lineups["TeamID"] = lineups["StartingXI"].map(
            lambda names: self.link_players_to_teamID(names, homonymous_names, players_with_ids)
        )

        ## Explode the dataframe to get one player per row, per game, per team
        lineups_exploded = self.explode_startingXI_dataset(lineups, players_with_ids)

        ## Right join: only the matches present in `results_data` are kept
        return lineups_exploded.merge(results_data, on = ["MatchID"], how = "right")

We end up with a dataset containing one row per starting player per game.

In [67]:
## Run the whole preprocessing pipeline on the raw tables
gk_preprocessing = GoalkeeperPreprocessing()
results_with_players = gk_preprocessing.run_goalkeeper_preprocessing(
    results_data=results_data,
    startingXI_data=startingXI_data,
    players_data=players_data,
)
results_with_players

Unnamed: 0,MatchID,PlayerName,TeamID,PositionID,PlayerID,MatchPlayed,SeasonID,Gameweek,HomeTeamID,HomeScore,HomeShots,AwayTeamID,AwayScore,AwayShots
0,1,Stephen Sallee,7,1,628,1,1,1,7,1,17,1,1,12
1,1,Porter Harris,7,2,635,1,1,1,7,1,17,1,1,12
2,1,David Larson,7,2,636,1,1,1,7,1,17,1,1,12
3,1,Julius Skinner,7,2,631,1,1,1,7,1,17,1,1,12
4,1,Gerald Nelson,7,2,634,1,1,1,7,1,17,1,1,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16627,756,Paul Johnson,22,3,365,1,1,54,21,1,21,22,0,12
16628,756,Lance Silvera,22,3,369,1,1,54,21,1,21,22,0,12
16629,756,John Ostlund,22,3,363,1,1,54,21,1,21,22,0,12
16630,756,Larry Velasquez,22,4,373,1,1,54,21,1,21,22,0,12


## Goalkeeper rating calculation

However, the goalkeeper rating becomes less reliable for goalkeepers who haven't conceded many shots. Therefore, we calculate an adjusted rating using the formula below:
$$\text{Adjusted rating} = \frac{n}{n+k} \cdot \text{Goalkeeper rating} + \frac{k}{n+k} \cdot \text{Baseline rating}$$
- n: number of shots faced by the goalkeeper
- k: influence of the baseline
- baseline rating: the league-wide ratio of goals to shots for season 1 (total goals divided by total shots over all games)

### Creation of a class to generate a score for every goalkeeper based on their performances of the season

In [None]:
class GoalkeeperRatingGeneration:
    """
    Compute a goals-conceded / shots-conceded rating per player and shrink it
    towards the league-wide ratio.
    """

    @staticmethod
    def baseline_gk_score(df : pd.DataFrame) -> float:
        """
        Calculation of an average goalkeeper score for the whole league.

        Returns the sum of "HomeScore" and "AwayScore" divided by the sum of
        "HomeShots" and "AwayShots" over all rows of `df`.
        """
        goals_league = df["HomeScore"].sum() + df["AwayScore"].sum()
        shots_league = df["HomeShots"].sum() + df["AwayShots"].sum()
        return goals_league/shots_league

    ## League baseline, computed once when the class is defined from the notebook-level `results_data`.
    ## `__func__` unwraps the staticmethod: before Python 3.10 a staticmethod object
    ## cannot be called from inside the class body.
    gk_ratio_league = baseline_gk_score.__func__(results_data)

    def create_new_variables(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Creation of new variables with no distinction of home/away.

        For each row, "TeamScore"/"TeamShots" take the home values when "TeamID"
        equals "HomeTeamID" and the away values otherwise; "OpponentScore" and
        "OpponentShots" take the other side. The four Home/Away score and shot
        columns are dropped.

        A new DataFrame is returned and `df` is left untouched.
        """
        is_home = df["TeamID"] == df["HomeTeamID"]
        return df.assign(
            TeamScore = np.where(is_home, df["HomeScore"], df["AwayScore"]),
            OpponentScore = np.where(is_home, df["AwayScore"], df["HomeScore"]),
            TeamShots = np.where(is_home, df["HomeShots"], df["AwayShots"]),
            OpponentShots = np.where(is_home, df["AwayShots"], df["HomeShots"]),
        ).drop(columns = ["HomeScore", "AwayScore", "HomeShots", "AwayShots"])

    def create_goalkeeper_rating(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Create variables by summing the number of goals and shots conceded per player throughout the season.

        `df` must contain "PlayerID", "OpponentScore" and "OpponentShots".

        Returns one row per "PlayerID" with "GoalkeeperConcededGoalsSeason",
        "GoalkeeperConcededShotsSeason" and "GoalkeeperRating" (goals / shots,
        set to 0 when no shot was conceded). No other table is modified.
        """
        ## Season totals of conceded goals and conceded shots
        season_totals = df.groupby("PlayerID")[["OpponentScore", "OpponentShots"]].sum()

        ## Add the new variables in a dataframe
        players_features = season_totals.rename(columns = {
            "OpponentScore" : "GoalkeeperConcededGoalsSeason",
            "OpponentShots" : "GoalkeeperConcededShotsSeason",
        }).reset_index()

        ## Fill the NaN with 0
        players_features = players_features.fillna(0)

        ## Calculate the ratio of goals conceded / shots conceded (0 when no shot was conceded)
        goals = players_features["GoalkeeperConcededGoalsSeason"]
        shots = players_features["GoalkeeperConcededShotsSeason"]
        players_features["GoalkeeperRating"] = (goals / shots).where(shots != 0, 0)
        return players_features

    def adjusted_rating_formula(self, gk_rating : float, n_shots : int, k : int) -> float:
        """
        Calculate the adjusted goalkeeper ratio.

        Weighted average of `gk_rating` (weight n_shots/(n_shots+k)) and of the
        class-level `gk_ratio_league` (weight k/(n_shots+k)). Only arithmetic
        operators are used, so pandas Series are accepted as well as scalars.
        `n_shots + k` must not be zero.
        """
        return n_shots/(n_shots+k)*gk_rating + k/(k+n_shots)*GoalkeeperRatingGeneration.gk_ratio_league

    def calculate_adjusted_rating(self, df : pd.DataFrame, k : int) -> pd.DataFrame:
        """
        Add the adjusted ratio into the dataset.

        The "GoalkeeperAdjustedRating" column is computed from "GoalkeeperRating"
        and "GoalkeeperConcededShotsSeason", written into `df` and `df` is returned.
        """
        df["GoalkeeperAdjustedRating"] = self.adjusted_rating_formula(
            df["GoalkeeperRating"], df["GoalkeeperConcededShotsSeason"], k
        )
        return df

We run the pipeline to generate goalkeepers' score.
- Calculation of a score using the total number of goals conceded divided by the total number of shots conceded over the season
- Adjustment of the score using the baseline

In [None]:
## Influence of the baseline in the adjusted rating, named once so that the message
## and the computation below cannot drift apart
K_BASELINE_INFLUENCE = 34

gk_rating = GoalkeeperRatingGeneration()
## The preprocessing output is not overwritten, so this cell can be re-run safely
results_by_side = gk_rating.create_new_variables(results_with_players)
players_features = gk_rating.create_goalkeeper_rating(results_by_side)

median_shots_conceded = players_features["GoalkeeperConcededShotsSeason"].median()
print(f"The median of the number of shots conceded during season 1 is {median_shots_conceded}.\nWe take k = {K_BASELINE_INFLUENCE} so that the original goalkeeper rating counts for 90% of the adjusted rating if the number of conceded shots is equal to the median.")

players_features = gk_rating.calculate_adjusted_rating(players_features, K_BASELINE_INFLUENCE)

The median of the number of shots conceded during season 1 is 302.0.
We take k = 34 so that the original goalkeeper rating counts for 90% of the adjusted rating if the number of conceded shots is equal to the median.


## Generation of the datasets containing the goalkeeper scores
- One dataset with the goalkeeper adjusted rating for every game of the season
- One dataset with all the goalkeeper adjusted ratings per team

In [None]:
## Attach the season ratings to every player row. A new name is used so that re-running
## this cell does not merge the rating columns a second time.
results_with_ratings = results_with_players.merge(players_features, on = ["PlayerID"], how = "left")

## Rows of players whose PositionID is 1 (the goalkeepers)
goalkeeper_rows = results_with_ratings[results_with_ratings["PositionID"]==1]

## One row per goalkeeper per game
goalkeeper_games = (
    goalkeeper_rows[["MatchID", "TeamID", "PlayerID", "PositionID", "GoalkeeperAdjustedRating"]]
    .drop_duplicates()
    .sort_values(by="PlayerID")
    .reset_index(drop=True)
)

## One row per goalkeeper per team
goalkeeper_teams = (
    goalkeeper_rows[["TeamID", "PlayerID", "PositionID", "GoalkeeperAdjustedRating"]]
    .drop_duplicates()
    .sort_values(by="PlayerID")
    .reset_index(drop=True)
)


## Save goalkeeper rating files as parquet files

In [26]:
## Persist both goalkeeper rating tables in the data folder
goalkeeper_games.to_parquet(FILES_PATH + "goalkeeper_games.pq")
goalkeeper_teams.to_parquet(FILES_PATH + "goalkeeper_teams.pq")