In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# pd.set_option('display.max_columns', None) # show all columns

In [2]:
main_dfs = {}
individual_dfs = {}

OPPONENT_STATS = "teams_vs"
SQUAD_STATS = "teams"

for data_category in [SQUAD_STATS, OPPONENT_STATS]:
    individual_dfs[data_category] = {}
    for year in range(1992, 2023):
        try:
            individual_dfs[data_category][year] = pd.read_csv(
                f"./data/{data_category}/raw/PL{year}_{data_category}.csv",
                encoding='utf-8'
            )
        except FileNotFoundError:
            pass
   
    if data_category == OPPONENT_STATS:
        print("The following stats were only available from 2017 onwards;" +
        "they have been omitted from the combined dataframe for simplicity:")
        print(list(set(individual_dfs[OPPONENT_STATS][2017].columns) - set(individual_dfs[OPPONENT_STATS][2016].columns)))

    if len(individual_dfs[data_category].values()) > 0:
        main_dfs[data_category] = pd.concat(
            individual_dfs[data_category].values(),
            join="inner",
            ignore_index=True
        )

The following stats were only available from 2017 onwards;they have been omitted from the combined dataframe for simplicity:
['xg_per90', 'progressive_passes_received', 'pass_xa', 'npxg_per90', 'aerials_won', 'xg_net', 'npxg_per_shot', 'xg_assist_per90', 'npxg_xg_assist_per90', 'npxg_net', 'aerials_won_pct', 'xg_xg_assist_per90', 'xg', 'ball_recoveries', 'npxg_xg_assist', 'npxg', 'xg_assist', 'progressive_passes', 'shots_free_kicks', 'progressive_carries', 'aerials_lost']


In [3]:
# Check for null values
# main_dfs[OPPONENT_STATS].isnull().any().any()

In [4]:
# Save for exploratory data analysis
for data_category, df in main_dfs.items():
    df.to_csv(f"./data/{data_category}/{data_category}.csv",index=False)

In [5]:
# Columns pulled from team tables
DROPPED = [
    'pass_xa', 'ball_recoveries', 'aerials_lost', 'progressive_passes', 'npxg_per_shot',
    'progressive_carries', 'progressive_passes_received', 'xg_per90', 'npxg_xg_assist_per90',
    'aerials_won_pct', 'npxg_per90', 'npxg_xg_assist', 'xg_xg_assist_per90', 'xg', 'npxg_net',
    'xg_assist', 'xg_assist_per90', 'xg_net', 'npxg', 'aerials_won', 'shots_free_kicks'
]
STANDARD = [
    'players_used', 'avg_age', 'possession', 'games',
    'games_starts', 'minutes', 'minutes_90s', 'goals', 'assists',
    'goals_assists', 'goals_pens', 'pens_made', 'pens_att', 'cards_yellow',
    'cards_red', 'xg', 'npxg', 'xg_assist', 'npxg_xg_assist',
    'progressive_carries', 'progressive_passes', 'goals_per90',
    'assists_per90', 'goals_assists_per90', 'goals_pens_per90',
    'goals_assists_pens_per90', 'xg_per90', 'xg_assist_per90',
    'xg_xg_assist_per90', 'npxg_per90', 'npxg_xg_assist_per90'
]
GOALKEEPING = [
    'players_used', 'gk_games', 'gk_games_starts', 'gk_minutes',
    'minutes_90s', 'gk_goals_against', 'gk_goals_against_per90',
    'gk_shots_on_target_against', 'gk_saves', 'gk_save_pct', 'gk_wins',
    'gk_ties', 'gk_losses', 'gk_clean_sheets', 'gk_clean_sheets_pct',
    'gk_pens_att', 'gk_pens_allowed', 'gk_pens_saved', 'gk_pens_missed',
    'gk_pens_save_pct'
]
ADVANCED_GOALKEEPING = [
    'players_used', 'minutes_90s', 'gk_goals_against',
    'gk_pens_allowed', 'gk_free_kick_goals_against',
    'gk_corner_kick_goals_against', 'gk_own_goals_against', 'gk_psxg',
    'gk_psnpxg_per_shot_on_target_against', 'gk_psxg_net',
    'gk_psxg_net_per90', 'gk_passes_completed_launched',
    'gk_passes_launched', 'gk_passes_pct_launched', 'gk_passes',
    'gk_passes_throws', 'gk_pct_passes_launched', 'gk_passes_length_avg',
    'gk_goal_kicks', 'gk_pct_goal_kicks_launched',
    'gk_goal_kick_length_avg', 'gk_crosses', 'gk_crosses_stopped',
    'gk_crosses_stopped_pct', 'gk_def_actions_outside_pen_area',
    'gk_def_actions_outside_pen_area_per90', 'gk_avg_distance_def_actions'
]
SHOOTING = [
    'players_used', 'minutes_90s', 'goals', 'shots',
    'shots_on_target', 'shots_on_target_pct', 'shots_per90',
    'shots_on_target_per90', 'goals_per_shot', 'goals_per_shot_on_target',
    'average_shot_distance', 'shots_free_kicks', 'pens_made', 'pens_att',
    'xg', 'npxg', 'npxg_per_shot', 'xg_net', 'npxg_net'
]
PASSING = [
    'players_used', 'minutes_90s', 'passes_completed', 'passes',
    'passes_pct', 'passes_total_distance', 'passes_progressive_distance',
    'passes_completed_short', 'passes_short', 'passes_pct_short',
    'passes_completed_medium', 'passes_medium', 'passes_pct_medium',
    'passes_completed_long', 'passes_long', 'passes_pct_long', 'assists',
    'xg_assist', 'pass_xa', 'xg_assist_net', 'assisted_shots',
    'passes_into_final_third', 'passes_into_penalty_area',
    'crosses_into_penalty_area', 'progressive_passes'
]
PASS_TYPES = [
    'players_used', 'minutes_90s', 'passes', 'passes_live',
    'passes_dead', 'passes_free_kicks', 'through_balls', 'passes_switches',
    'crosses', 'throw_ins', 'corner_kicks', 'corner_kicks_in',
    'corner_kicks_out', 'corner_kicks_straight', 'passes_completed',
    'passes_offsides', 'passes_blocked'
]
# Goal-creating actions
GCA = [
    'players_used', 'minutes_90s', 'sca', 'sca_per90',
    'sca_passes_live', 'sca_passes_dead', 'sca_take_ons', 'sca_shots',
    'sca_fouled', 'sca_defense', 'gca', 'gca_per90', 'gca_passes_live',
    'gca_passes_dead', 'gca_take_ons', 'gca_shots', 'gca_fouled',
    'gca_defense'
]
DEFENCE = [
    'players_used', 'minutes_90s', 'tackles', 'tackles_won',
    'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',
    'challenge_tackles', 'challenges', 'challenge_tackles_pct',
    'challenges_lost', 'blocks', 'blocked_shots', 'blocked_passes',
    'interceptions', 'tackles_interceptions', 'clearances', 'errors'
]
POSSESSION = [
    'players_used', 'possession', 'minutes_90s', 'touches',
    'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd',
    'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball',
    'take_ons', 'take_ons_won', 'take_ons_won_pct', 'take_ons_tackled',
    'take_ons_tackled_pct', 'carries', 'carries_distance',
    'carries_progressive_distance', 'progressive_carries',
    'carries_into_final_third', 'carries_into_penalty_area', 'miscontrols',
    'dispossessed', 'passes_received', 'progressive_passes_received'
]
MISCELLANEOUS = [
    'players_used', 'minutes_90s', 'cards_yellow', 'cards_red',
    'cards_yellow_red', 'fouls', 'fouled', 'offsides', 'crosses',
    'interceptions', 'tackles_won', 'pens_won', 'pens_conceded',
    'own_goals', 'ball_recoveries', 'aerials_won', 'aerials_lost',
    'aerials_won_pct'
]

In [6]:
ALL_FEATURES = list(set(STANDARD + GOALKEEPING + ADVANCED_GOALKEEPING + SHOOTING + PASSING + PASS_TYPES + GCA + DEFENCE + POSSESSION + MISCELLANEOUS) - set(DROPPED))
all_features = pd.Series(ALL_FEATURES)
all_features

0      gk_def_actions_outside_pen_area_per90
1                                      shots
2                                    avg_age
3                              gk_goal_kicks
4                  crosses_into_penalty_area
                       ...                  
155                               sca_fouled
156               gk_free_kick_goals_against
157                        touches_live_ball
158                               gk_minutes
159                               goals_pens
Length: 160, dtype: object

In [9]:
teams_df = main_dfs['teams']
teams_df[['season_start_year', 'squad', *ALL_FEATURES]]

Unnamed: 0,season_start_year,squad,gk_def_actions_outside_pen_area_per90,shots,avg_age,gk_goal_kicks,crosses_into_penalty_area,cards_yellow_red,gk_saves,touches_def_pen_area,...,gca_take_ons,throw_ins,challenges_lost,gk_own_goals_against,shots_on_target,sca_fouled,gk_free_kick_goals_against,touches_live_ball,gk_minutes,goals_pens
0,1992,Arsenal,0.00,0.0,26.4,0.0,0.0,0.0,163.0,0.0,...,0.0,0.0,0.0,0.0,179.0,0.0,0.0,0.0,3880.0,40.0
1,1992,Aston Villa,0.00,0.0,27.0,0.0,0.0,0.0,149.0,0.0,...,0.0,0.0,0.0,0.0,217.0,0.0,0.0,0.0,3780.0,54.0
2,1992,Blackburn,0.00,0.0,26.1,0.0,0.0,0.0,138.0,0.0,...,0.0,0.0,0.0,0.0,230.0,0.0,0.0,0.0,3780.0,63.0
3,1992,Chelsea,0.00,0.0,26.3,0.0,0.0,0.0,156.0,0.0,...,0.0,0.0,0.0,0.0,195.0,0.0,0.0,0.0,3780.0,49.0
4,1992,Coventry City,0.00,0.0,26.4,0.0,0.0,0.0,134.0,0.0,...,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,3780.0,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,2022,Nott'ham Forest,1.00,362.0,26.5,338.0,50.0,0.0,112.0,2715.0,...,4.0,720.0,288.0,0.0,111.0,36.0,2.0,17906.0,3420.0,33.0
622,2022,Southampton,1.45,413.0,24.7,303.0,71.0,0.0,81.0,2639.0,...,7.0,752.0,314.0,5.0,132.0,38.0,1.0,21091.0,3420.0,34.0
623,2022,Tottenham,1.18,512.0,27.6,283.0,104.0,1.0,109.0,2734.0,...,7.0,664.0,299.0,1.0,186.0,38.0,1.0,23830.0,3420.0,63.0
624,2022,West Ham,0.68,466.0,28.2,302.0,80.0,0.0,115.0,2362.0,...,6.0,722.0,292.0,1.0,133.0,35.0,0.0,20402.0,3420.0,35.0


In [10]:
individual_score_dfs = {}

for year in range(1992, 2023):
    try:
        individual_score_dfs[year] = pd.read_csv(
            f"./data/scores/PL{year}_scores.csv",
            encoding='utf-8'
        )
    except FileNotFoundError:
        pass

df_1992 = individual_score_dfs[1992]
df_1992

Unnamed: 0,times,home_teams,home_scores,away_teams,away_scores,season_start_year,season_end_year
0,11.05. 22:00,Arsenal,1,Tottenham,3,1992,1993
1,11.05. 22:00,QPR,3,Sheffield Wed,1,1992,1993
2,09.05. 22:00,QPR,2,Aston Villa,1,1992,1993
3,09.05. 22:00,Wimbledon FC,1,Manchester Utd,2,1992,1993
4,08.05. 22:00,Arsenal,3,Crystal Palace,0,1992,1993
...,...,...,...,...,...,...,...
457,15.08. 22:00,Everton,1,Sheffield Wed,1,1992,1993
458,15.08. 22:00,Ipswich,1,Aston Villa,1,1992,1993
459,15.08. 22:00,Leeds,2,Wimbledon FC,1,1992,1993
460,15.08. 22:00,Sheffield Utd,2,Manchester Utd,1,1992,1993
