In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import statsmodels.api as sm
from scipy.stats import poisson
from scipy import stats
from sklearn.preprocessing import StandardScaler
import numpy as np, scipy.stats as st
import json
from project_code.utils import GoalkeeperPreprocessing

In [2]:
## Directory the notebook kernel is currently running in.
current_directory = os.getcwd()
## Global path to the data folder. The trailing slash matters: the loading
## cell builds each file path by appending the file name directly to it.
FILES_PATH = f"{current_directory}/data/"

In [3]:
## Comma-separated source tables, one DataFrame per CSV file.
fixtures_data = pd.read_csv(f"{FILES_PATH}fixtures.csv", sep=",")
odds_data = pd.read_csv(f"{FILES_PATH}odds.csv", sep=",")
players_data = pd.read_csv(f"{FILES_PATH}players.csv", sep=",")
results_data = pd.read_csv(f"{FILES_PATH}results.csv", sep=",")
startingXI_data = pd.read_csv(f"{FILES_PATH}startingXI.csv", sep=",")
teams_data = pd.read_csv(f"{FILES_PATH}teams.csv", sep=",")

## Tables stored as parquet files in the same folder.
goalkeeper_games = pd.read_parquet(f"{FILES_PATH}goalkeeper_games.pq")
goalkeeper_teams = pd.read_parquet(f"{FILES_PATH}goalkeeper_teams.pq")

In [4]:
## Work on a copy of the raw results and keep only the rows of season 1.
results_data_season1 = (
    results_data
    .copy()
    .loc[lambda frame: frame["SeasonID"] == 1]
)

In [5]:
## Some players share their name with at least one other player, so a name alone
## does not identify them. Those repeated names are collected here and reused as
## a global variable.
## duplicated() flags every occurrence after the first one, so a name carried by
## three or more players shows up more than once in the array.
homonymous_player_names = players_data.loc[
    players_data["PlayerName"].duplicated(),
    "PlayerName",
].values

In [6]:
## Project helper from project_code.utils, initialised with the ambiguous names.
gk_preprocessing = GoalkeeperPreprocessing(homonymous_player_names)

## Run the helper's preprocessing on the season-1 results, the starting
## line-ups and the player table.
results_with_players = gk_preprocessing.run_goalkeeper_preprocessing(
    results_data_season1,
    startingXI_data,
    players_data,
)

## Second pass by the same helper (presumably adds derived columns — see
## project_code.utils for the details).
results_with_players = gk_preprocessing.create_new_variables(
    results_with_players
)

In [7]:
def results_encoding(team_score, opponent_score):
    """Encode one match result as points from the team's point of view.

    Parameters
    ----------
    team_score
        Goals scored by the team.
    opponent_score
        Goals scored by the opponent.

    Returns
    -------
    int
        3 when the team scored more than the opponent, 1 when both scores
        are equal, 0 in every other case.
    """
    if team_score > opponent_score:
        return 3
    if team_score == opponent_score:
        return 1
    return 0
    
## Points earned on each row, computed from the two scores.
results_with_players["PointsScored"] = results_with_players.apply(
    lambda row: results_encoding(row["TeamScore"], row["OpponentScore"]),
    axis=1,
)

## One 0/1 indicator per outcome, derived from the points (3 = win, 0 = loss, 1 = draw).
results_with_players["TeamWin"] = results_with_players["PointsScored"].eq(3).astype("int64")
results_with_players["TeamLose"] = results_with_players["PointsScored"].eq(0).astype("int64")
results_with_players["TeamDraw"] = results_with_players["PointsScored"].eq(1).astype("int64")

In [8]:
## Constant 1 on every row: "GamePlayed" is part of cols_feature, so summing it
## in the grouped aggregations counts the rows (games) of each group.
results_with_players["GamePlayed"] = 1

In [50]:
## Per-row columns that are summed in the per-team and per-player aggregations.
cols_feature = [
    # goals
    "TeamScore",
    "OpponentScore",
    # shots
    "TeamShots",
    "OpponentShots",
    # points and outcome indicators
    "PointsScored",
    "TeamWin",
    "TeamLose",
    "TeamDraw",
    # row counter
    "GamePlayed",
]

In [51]:
## Season totals per team.
## The selected columns are derived from cols_feature so that the selection and
## the aggregation rules cannot drift apart.
## results_with_players carries several rows per (match, team) — presumably one
## per player — so the team-level columns are de-duplicated before summing,
## leaving one row per match and team.
results_teams = (
    results_with_players[["MatchID", "TeamID"] + cols_feature]
    .drop_duplicates()
    .groupby(["TeamID"])
    .agg({col: "sum" for col in cols_feature})
    .reset_index()
    # Prefix every column (including the key) to keep team and player columns
    # apart after the merge.
    .add_prefix("Team_")
)

In [53]:
## Per-player totals: every column of cols_feature is summed over the rows in
## which the player appears.
agg_rules = {col: "sum" for col in cols_feature}

## Attach one team to each player. "first" keeps the team of the player's first
## row, which is deterministic; taking an element out of a set would depend on
## set iteration order when a player appears with several teams.
## NOTE(review): the totals of such a player still pool all of their teams —
## confirm whether grouping by (PlayerID, TeamID) is wanted instead.
agg_rules["TeamID"] = "first"

players_features = (
    results_with_players
    .groupby("PlayerID")
    .agg(agg_rules)
    .reset_index()
    # Prefix every column (including the key) to keep player and team columns
    # apart after the merge.
    .add_prefix("Player_")
)

In [55]:
## Left join: each player row receives the aggregated totals of its team.
players_teams_data = pd.merge(
    players_features,
    results_teams,
    how="left",
    left_on=["Player_TeamID"],
    right_on=["Team_TeamID"],
)

In [56]:
## Visual check of the merged player/team table.
players_teams_data

Unnamed: 0,Player_PlayerID,Player_TeamScore,Player_OpponentScore,Player_TeamShots,Player_OpponentShots,Player_PointsScored,Player_TeamWin,Player_TeamLose,Player_TeamDraw,Player_GamePlayed,...,Team_TeamID,Team_TeamScore,Team_OpponentScore,Team_TeamShots,Team_OpponentShots,Team_PointsScored,Team_TeamWin,Team_TeamLose,Team_TeamDraw,Team_GamePlayed
0,1,14,13,211,160,18,4,5,6,15,...,1,80,60,771,654,77,21,19,14,54
1,2,35,26,311,281,32,9,7,5,21,...,1,80,60,771,654,77,21,19,14,54
2,3,31,21,249,213,27,8,7,3,18,...,1,80,60,771,654,77,21,19,14,54
3,4,33,28,364,325,36,9,9,9,27,...,1,80,60,771,654,77,21,19,14,54
4,5,46,36,409,384,40,11,13,7,31,...,1,80,60,771,654,77,21,19,14,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,696,23,30,267,279,26,7,9,5,21,...,9,51,91,581,793,52,14,30,10,54
696,697,16,37,244,352,15,4,16,3,23,...,9,51,91,581,793,52,14,30,10,54
697,698,17,22,157,219,21,6,6,3,15,...,9,51,91,581,793,52,14,30,10,54
698,699,23,51,275,355,22,6,16,4,26,...,9,51,91,581,793,52,14,30,10,54


In [12]:
## Rows of the players whose PositionID is 4, without exact duplicate rows,
## ordered by player and re-indexed from 0.
forwards_games = (
    results_with_players
    .loc[results_with_players["PositionID"] == 4]
    .drop_duplicates()
    .sort_values(by="PlayerID")
    .reset_index(drop=True)
)

In [None]:
## Write both goalkeeper tables to parquet files in the data folder.
## NOTE(review): in the visible cells these two frames are only read from these
## same paths and never modified, so this looks like a no-op round trip —
## confirm whether this cell is still needed.
goalkeeper_games.to_parquet(f"{FILES_PATH}goalkeeper_games.pq")
goalkeeper_teams.to_parquet(f"{FILES_PATH}goalkeeper_teams.pq")

In [None]:
## Team / player table restricted to the players whose PositionID is 4.
## Rows are de-duplicated on the selected columns, ordered by player and
## re-indexed from 0.
## NOTE(review): PointsScored is computed per row (0, 1 or 3), so a player can
## keep several rows after drop_duplicates — confirm this is intended.
players_per_team = (
    results_with_players
    .loc[
        results_with_players["PositionID"] == 4,
        [
            "TeamID",
            "PlayerID",
            "PositionID",
            "PlayerTakenShotsSeason",
            "PlayerScoredGoalsSeason",
            "PointsScored",
        ],
    ]
    .drop_duplicates()
    .sort_values(by="PlayerID")
    .reset_index(drop=True)
)

In [26]:
## Show the first 60 rows of the table.
players_per_team.head(60)

Unnamed: 0,TeamID,PlayerID,PositionID
0,1,20,4
1,1,21,4
2,1,22,4
3,1,23,4
4,1,24,4
5,1,25,4
6,10,45,4
7,10,46,4
8,10,47,4
9,10,48,4
