In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import statsmodels.api as sm
from scipy.stats import poisson
from scipy import stats
from sklearn.preprocessing import StandardScaler
import numpy as np, scipy.stats as st
import json
from project_code.utils import GoalkeeperPreprocessing

In [2]:
## Directory the notebook is launched from; the data path below is derived from it
current_directory = os.getcwd()
## Global path (with a trailing slash) to the folder that holds the input files
FILES_PATH = current_directory + "/data/"

## Dictionary used to decode the numeric position identifier into a readable label
POSITION_ENCODING = {
    1: "Goalkeeper",
    2: "Defender",
    3: "Midfielder",
    4: "Forward",
}

In [3]:
## Comma-separated CSV inputs, all read from FILES_PATH
fixtures_data = pd.read_csv(FILES_PATH + "fixtures.csv", sep=",")
odds_data = pd.read_csv(FILES_PATH + "odds.csv", sep=",")
players_data = pd.read_csv(FILES_PATH + "players.csv", sep=",")
results_data = pd.read_csv(FILES_PATH + "results.csv", sep=",")
startingXI_data = pd.read_csv(FILES_PATH + "startingXI.csv", sep=",")
teams_data = pd.read_csv(FILES_PATH + "teams.csv", sep=",")

## Goalkeeper tables stored as parquet files in the same folder
goalkeeper_games = pd.read_parquet(FILES_PATH + "goalkeeper_games.pq")
goalkeeper_teams = pd.read_parquet(FILES_PATH + "goalkeeper_teams.pq")

In [4]:
## Keep only the rows whose SeasonID is 1, as a frame independent from the raw results
results_data_season1 = results_data.loc[results_data["SeasonID"] == 1].copy()

In [5]:
## Some players share their name with at least one other player, so a name alone does not identify them.
## The names that occur more than once are collected here and used as a global variable.
is_repeated_name = players_data["PlayerName"].duplicated()
homonymous_player_names = players_data.loc[is_repeated_name, "PlayerName"].values

In [6]:
## Run the project preprocessing on the season results, starting XI and players tables,
## then pass its output through create_new_variables of the same helper object
gk_preprocessing = GoalkeeperPreprocessing(homonymous_player_names)
results_with_players = gk_preprocessing.create_new_variables(
    gk_preprocessing.run_goalkeeper_preprocessing(
        results_data_season1,
        startingXI_data,
        players_data,
    )
)

In [7]:
def results_encoding(team_score, opponent_score):
    """Encode the outcome of a game from the two scores.

    Returns 3 when ``team_score`` is strictly greater than ``opponent_score``,
    1 when both scores are equal, and 0 in every other case.
    """
    if team_score > opponent_score:
        return 3
    if team_score == opponent_score:
        return 1
    return 0
    
## Vectorised outcome encoding: 3 when the team outscores its opponent, 1 on equal scores, 0 otherwise.
## Gives the same values as calling results_encoding on every row, without a Python-level loop over the rows.
results_with_players["PointsScored"] = np.select(
    [
        results_with_players["TeamScore"] > results_with_players["OpponentScore"],
        results_with_players["TeamScore"] == results_with_players["OpponentScore"],
    ],
    [3, 1],
    default=0,
)
## One 0/1 indicator column per outcome, derived from the points
results_with_players["TeamWin"] = results_with_players["PointsScored"].eq(3).astype(int)
results_with_players["TeamLose"] = results_with_players["PointsScored"].eq(0).astype(int)
results_with_players["TeamDraw"] = results_with_players["PointsScored"].eq(1).astype(int)

In [8]:
## Constant 1 on every row, so that summing this column in a groupby counts the rows of each group
results_with_players["GamePlayed"] = 1

In [9]:
## Columns that are summed per team / per player and then divided by the number of games played
cols_feature = [
    "TeamScore", "OpponentScore",
    "TeamShots", "OpponentShots",
    "PointsScored",
    "TeamWin", "TeamLose", "TeamDraw",
    "GamePlayed",
]

In [10]:
def normalise_data(df : pd.DataFrame, cols_feature : list = cols_feature) -> pd.DataFrame:
    """Turn summed features into per-game averages.

    Every column listed in ``cols_feature`` except ``"GamePlayed"`` is divided
    by the ``"GamePlayed"`` column of the same row. ``"GamePlayed"`` itself is
    left untouched so it keeps the number of games.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns of ``cols_feature``, including ``"GamePlayed"``.
    cols_feature : list
        Names of the columns to normalise.

    Returns
    -------
    pd.DataFrame
        A new frame with the normalised columns. The input frame is not
        modified, so calling the function again on the same input gives the
        same result instead of dividing a second time.
    """
    # Work on a copy: the caller's frame must not be divided in place
    normalised = df.copy()
    for col_name in cols_feature:
        if col_name != "GamePlayed":
            normalised[col_name] = normalised[col_name] / normalised["GamePlayed"]
    return normalised

In [11]:
## Reduce to the team-level columns and drop repeated rows
## (presumably the same team/match figures are repeated for every player of the match -- TODO confirm),
## then sum the features per team
results_teams = (
    results_with_players[
        [
            "MatchID",
            "TeamID",
            "TeamScore",
            "OpponentScore",
            "TeamShots",
            "OpponentShots",
            "PointsScored",
            "TeamWin",
            "TeamLose",
            "TeamDraw",
            "GamePlayed",
        ]
    ]
    .drop_duplicates()
    .groupby(["TeamID"])
    .agg({col: "sum" for col in cols_feature})
    .reset_index()
)

## Per-game averages, with every column prefixed by "Team_"
results_teams = normalise_data(results_teams).add_prefix("Team_")

In [12]:
## Aggregation rules per player: every feature column is summed,
## while TeamID and PositionID are reduced to a single value
agg_rules = dict.fromkeys(cols_feature, "sum")
for id_column in ("TeamID", "PositionID"):
    # NOTE(review): takes one element of the set of distinct values; if a player
    # has several distinct values, which one is kept depends on set ordering
    agg_rules[id_column] = lambda values: next(iter(set(values)))

## Per-game averages for every player, with every column prefixed by "Player_"
players_features = normalise_data(
    results_with_players.groupby("PlayerID").agg(agg_rules).reset_index()
).add_prefix("Player_")

In [13]:
## Attach the team-level columns to every player row through the player's team id
players_teams_data = pd.merge(
    players_features,
    results_teams,
    how="left",
    left_on="Player_TeamID",
    right_on="Team_TeamID",
)

In [14]:
## Relative gap between the player-level value and the team-level value, for every feature
for col_name in cols_feature:
    player_values = players_teams_data[f"Player_{col_name}"]
    team_values = players_teams_data[f"Team_{col_name}"]
    players_teams_data[f"Ratio_{col_name}"] = (player_values - team_values) / team_values

## GamePlayed is the exception: overwrite it with the plain player / team quotient
players_teams_data["Ratio_GamePlayed"] = (
    players_teams_data["Player_GamePlayed"] / players_teams_data["Team_GamePlayed"]
)

In [15]:
## Add the player name (right join: every row of players_teams_data is kept)
players_named = players_data[["PlayerID", "PlayerName"]].merge(
    players_teams_data,
    how="right",
    left_on="PlayerID",
    right_on="Player_PlayerID",
)

## Add the team columns the same way, then drop the now redundant join keys
## and give the position column its plain name
players_teams_data = (
    teams_data.merge(
        players_named,
        how="right",
        left_on="TeamID",
        right_on="Team_TeamID",
    )
    .drop(columns=["Player_TeamID", "Team_TeamID", "Player_PlayerID"])
    .rename(columns={"Player_PositionID": "PositionID"})
)

## Statistics exploration

We calculate statistics on the number of games played by each player to evaluate the relevance of the data under the following assumptions:

- If a player has participated in fewer than 10% of the games, the data may not be representative or reliable for analysis.
- If a player has participated in more than 90% of the games, it may be difficult to assess their individual impact, as there is limited opportunity for comparison with other players in the same position who have rarely played.

In [48]:
## Spread of the share of team games each player appears in
games_played_share = players_teams_data["Ratio_GamePlayed"]
min_percentage = games_played_share.min()
median_percentage = games_played_share.median()
max_percentage = games_played_share.max()

print(
    f"Minimum percentage of games played is {min_percentage}",
    f"Median of percentage of games played is {median_percentage}",
    f"Maximum percentage of games played is {max_percentage}",
    sep="\n",
)

Minimum percentage of games played is 0.2222222222222222
Median of percentage of games played is 0.4444444444444444
Maximum percentage of games played is 0.6666666666666666


Our assumptions hold, no players need to be filtered out.

### 1. Most influential players

We will use a simple metric: the relative difference between the average number of points the team earns in the games a player appears in and the team's average over all of its games.
- We find the player with the highest positive impact
- And the player with the highest negative impact

In [18]:
## Find the player with the highest negative impact
player_negative = players_teams_data.iloc[players_teams_data["Ratio_PointsScored"].idxmin()]

## Find the player with the highest positive impact
player_positive = players_teams_data.iloc[players_teams_data["Ratio_PointsScored"].idxmax()]

We extract their team and their position on the pitch.
- They both play for the same team
- They are both goalkeepers

Therefore, it is a perfect example of how a single player can influence the outcome of a game. This is probably even more true for goalkeepers, as their individual performance directly impacts the number of goals conceded.

In [24]:
def show_player_info(player_data : pd.Series) -> tuple[int, str, int, str, str, float]:
    """Extract the descriptive fields of one player row.

    Parameters
    ----------
    player_data : pd.Series
        One row holding the keys ``"PlayerID"``, ``"PlayerName"``,
        ``"PositionID"``, ``"TeamName"`` and ``"Ratio_PointsScored"``.

    Returns
    -------
    tuple
        ``(player_id, name, position_id, position_label, team_name, ratio_points)``
        where ``position_label`` is ``position_id`` decoded through
        ``POSITION_ENCODING``.

    Raises
    ------
    KeyError
        If a key is missing from ``player_data`` or the position id is not
        in ``POSITION_ENCODING``.
    """
    position_player = player_data["PositionID"]
    return (
        player_data["PlayerID"],
        player_data["PlayerName"],
        position_player,
        POSITION_ENCODING[position_player],
        player_data["TeamName"],
        player_data["Ratio_PointsScored"],
    )

## Player with the lowest Ratio_PointsScored
(
    player_id_negative,
    name_player_negative,
    position_player_negative,
    position_player_negative_decoded,
    team_player_negative,
    ratio_points_negative,
) = show_player_info(player_negative)
print(
    f"{name_player_negative} is the player with the highest negative influence. "
    f"He is a {position_player_negative_decoded} playing for {team_player_negative}."
)

## Player with the highest Ratio_PointsScored
(
    player_id_positive,
    name_player_positive,
    position_player_positive,
    position_player_positive_decoded,
    team_player_positive,
    ratio_points_positive,
) = show_player_info(player_positive)
print(
    f"{name_player_positive} is the player with the highest positive influence. "
    f"He is a {position_player_positive_decoded} playing for {team_player_positive}."
)


Charles Bates is the player with the highest negative influence. He is a Goalkeeper playing for New York C.
Charles Sanchez is the player with the highest positive influence. He is a Goalkeeper playing for New York C.


In [38]:
## Rows of the two extreme players, restricted to their names, team and ratio columns
extreme_player_ids = [player_id_negative, player_id_positive]
extreme_players = players_teams_data.loc[players_teams_data["PlayerID"].isin(extreme_player_ids)]
columns_to_show = [
    name
    for name in extreme_players.columns
    if name.startswith("Ratio_") or name in ["TeamName", "PlayerName"]
]
extreme_players[columns_to_show]

Unnamed: 0,TeamName,PlayerName,Ratio_TeamScore,Ratio_OpponentScore,Ratio_TeamShots,Ratio_OpponentShots,Ratio_PointsScored,Ratio_TeamWin,Ratio_TeamLose,Ratio_TeamDraw,Ratio_GamePlayed
300,New York C,Charles Sanchez,0.19889,-0.028486,-0.000482,-0.000875,0.677019,0.878261,-0.12639,0.173913,0.425926
301,New York C,Charles Bates,-0.204583,0.0313,-0.032918,0.078161,-0.802198,-1.0,0.159213,-0.307692,0.240741


We see that when Charles Sanchez is playing, the team earns almost 68% more points per game than its overall average.