In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# pd.set_option('display.max_columns', None) # show all columns

In [2]:
main_dfs = {}
individual_dfs = {}

OPPONENT_STATS = "teams_vs"
SQUAD_STATS = "teams"

# Load one raw CSV per season for each category, then stack the seasons into a
# single dataframe per category. join="inner" keeps only the columns that are
# present in every loaded season.
for data_category in [SQUAD_STATS, OPPONENT_STATS]:
    individual_dfs[data_category] = {}
    missing_years = []
    for year in range(1992, 2023):
        try:
            individual_dfs[data_category][year] = pd.read_csv(
                f"./data/{data_category}/raw/PL{year}_{data_category}.csv",
                encoding='utf-8'
            )
        except FileNotFoundError:
            # Loading stays best-effort, but the gap is reported below instead
            # of being dropped silently.
            missing_years.append(year)

    if missing_years:
        print(f"No raw {data_category} file found for seasons: {missing_years}")

    season_dfs = individual_dfs[data_category]

    # The column comparison needs both seasons; skip it rather than raise a
    # KeyError when either file could not be loaded.
    if data_category == OPPONENT_STATS and 2016 in season_dfs and 2017 in season_dfs:
        print("The following stats were only available from 2017 onwards; " +
        "they have been omitted from the combined dataframe for simplicity:")
        # sorted() gives the same order on every run (set order is arbitrary)
        print(sorted(set(season_dfs[2017].columns) - set(season_dfs[2016].columns)))

    if season_dfs:
        main_dfs[data_category] = pd.concat(
            list(season_dfs.values()),
            join="inner",
            ignore_index=True
        )

The following stats were only available from 2017 onwards;they have been omitted from the combined dataframe for simplicity:
['pass_xa', 'xg', 'npxg_per90', 'aerials_lost', 'xg_xg_assist_per90', 'npxg_net', 'xg_assist_per90', 'aerials_won', 'npxg_xg_assist_per90', 'xg_assist', 'xg_per90', 'ball_recoveries', 'xg_net', 'npxg', 'shots_free_kicks', 'progressive_carries', 'npxg_xg_assist', 'npxg_per_shot', 'progressive_passes_received', 'aerials_won_pct', 'progressive_passes']


In [3]:
# Check for null values
# main_dfs[OPPONENT_STATS].isnull().any().any()

In [4]:
# Write each combined dataframe to disk so the exploratory data analysis can
# read it back without repeating the per-season load
for data_category, df in main_dfs.items():
    output_path = f"./data/{data_category}/{data_category}.csv"
    df.to_csv(output_path, index=False)

In [5]:
# Normalize data so features have same scale, such that the model will not
# give different weights to different features based on their range
# Only team name has values of type string; rest are numeric
for data_category, df in main_dfs.items():
    scaler = StandardScaler()
    encoder = OrdinalEncoder()

    # Encode on a copy. Writing the codes back into the frame held by main_dfs
    # would replace the team names for every later cell, and a second run of
    # this cell would find no string columns left to encode.
    df_encoded = df.copy()
    df_non_numeric = df_encoded.select_dtypes(exclude=[np.number])
    if len(df_non_numeric.columns) > 0:
        df_encoded[df_non_numeric.columns] = pd.DataFrame(
            encoder.fit_transform(df_non_numeric),
            columns=df_non_numeric.columns,
            # Align on the source index so the assignment cannot misalign rows
            index=df_encoded.index
        )

    # Standardize every column and write the training set for this category
    pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns) \
        .to_csv(f"./data/machine_learning/train_{data_category}.csv", index=False)

In [None]:
# Columns pulled from team tables
# Each constant below is a list of column-name strings; presumably one list per
# source stats table (TODO confirm against the scraping code).
# Every list starts with 'players_used' and contains 'minutes_90s', and some
# other names appear in more than one list (e.g. 'goals' is in both STANDARD
# and SHOOTING), so the lists overlap.
STANDARD = [
    'players_used', 'avg_age', 'possession', 'games',
    'games_starts', 'minutes', 'minutes_90s', 'goals', 'assists',
    'goals_assists', 'goals_pens', 'pens_made', 'pens_att', 'cards_yellow',
    'cards_red', 'xg', 'npxg', 'xg_assist', 'npxg_xg_assist',
    'progressive_carries', 'progressive_passes', 'goals_per90',
    'assists_per90', 'goals_assists_per90', 'goals_pens_per90',
    'goals_assists_pens_per90', 'xg_per90', 'xg_assist_per90',
    'xg_xg_assist_per90', 'npxg_per90', 'npxg_xg_assist_per90'
]
# Goalkeeper columns; all but the two shared ones carry the 'gk_' prefix
GOALKEEPING = [
    'players_used', 'gk_games', 'gk_games_starts', 'gk_minutes',
    'minutes_90s', 'gk_goals_against', 'gk_goals_against_per90',
    'gk_shots_on_target_against', 'gk_saves', 'gk_save_pct', 'gk_wins',
    'gk_ties', 'gk_losses', 'gk_clean_sheets', 'gk_clean_sheets_pct',
    'gk_pens_att', 'gk_pens_allowed', 'gk_pens_saved', 'gk_pens_missed',
    'gk_pens_save_pct'
]
# Further 'gk_' columns; repeats 'gk_goals_against' and 'gk_pens_allowed'
# from GOALKEEPING
ADVANCED_GOALKEEPING = [
    'players_used', 'minutes_90s', 'gk_goals_against',
    'gk_pens_allowed', 'gk_free_kick_goals_against',
    'gk_corner_kick_goals_against', 'gk_own_goals_against', 'gk_psxg',
    'gk_psnpxg_per_shot_on_target_against', 'gk_psxg_net',
    'gk_psxg_net_per90', 'gk_passes_completed_launched',
    'gk_passes_launched', 'gk_passes_pct_launched', 'gk_passes',
    'gk_passes_throws', 'gk_pct_passes_launched', 'gk_passes_length_avg',
    'gk_goal_kicks', 'gk_pct_goal_kicks_launched',
    'gk_goal_kick_length_avg', 'gk_crosses', 'gk_crosses_stopped',
    'gk_crosses_stopped_pct', 'gk_def_actions_outside_pen_area',
    'gk_def_actions_outside_pen_area_per90', 'gk_avg_distance_def_actions'
]
# Shot and goal columns
SHOOTING = [
    'players_used', 'minutes_90s', 'goals', 'shots',
    'shots_on_target', 'shots_on_target_pct', 'shots_per90',
    'shots_on_target_per90', 'goals_per_shot', 'goals_per_shot_on_target',
    'average_shot_distance', 'shots_free_kicks', 'pens_made', 'pens_att',
    'xg', 'npxg', 'npxg_per_shot', 'xg_net', 'npxg_net'
]
# Pass columns split by short / medium / long, plus assist-related columns
PASSING = [
    'players_used', 'minutes_90s', 'passes_completed', 'passes',
    'passes_pct', 'passes_total_distance', 'passes_progressive_distance',
    'passes_completed_short', 'passes_short', 'passes_pct_short',
    'passes_completed_medium', 'passes_medium', 'passes_pct_medium',
    'passes_completed_long', 'passes_long', 'passes_pct_long', 'assists',
    'xg_assist', 'pass_xa', 'xg_assist_net', 'assisted_shots',
    'passes_into_final_third', 'passes_into_penalty_area',
    'crosses_into_penalty_area', 'progressive_passes'
]
# Pass columns split by kind (live, dead, crosses, corners, ...); shares
# 'passes' and 'passes_completed' with PASSING
PASS_TYPES = [
    'players_used', 'minutes_90s', 'passes', 'passes_live',
    'passes_dead', 'passes_free_kicks', 'through_balls', 'passes_switches',
    'crosses', 'throw_ins', 'corner_kicks', 'corner_kicks_in',
    'corner_kicks_out', 'corner_kicks_straight', 'passes_completed',
    'passes_offsides', 'passes_blocked'
]
# Goal-creating actions ('gca*' columns); the list also holds the matching
# 'sca*' columns (presumably shot-creating actions; TODO confirm)
GCA = [
    'players_used', 'minutes_90s', 'sca', 'sca_per90',
    'sca_passes_live', 'sca_passes_dead', 'sca_take_ons', 'sca_shots',
    'sca_fouled', 'sca_defense', 'gca', 'gca_per90', 'gca_passes_live',
    'gca_passes_dead', 'gca_take_ons', 'gca_shots', 'gca_fouled',
    'gca_defense'
]
# Tackle, challenge, block, interception, clearance and error columns
DEFENCE = [
    'players_used', 'minutes_90s', 'tackles', 'tackles_won',
    'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',
    'challenge_tackles', 'challenges', 'challenge_tackles_pct',
    'challenges_lost', 'blocks', 'blocked_shots', 'blocked_passes',
    'interceptions', 'tackles_interceptions', 'clearances', 'errors'
]
# Touch, take-on, carry and pass-reception columns
POSSESSION = [
    'players_used', 'possession', 'minutes_90s', 'touches',
    'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd',
    'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball',
    'take_ons', 'take_ons_won', 'take_ons_won_pct', 'take_ons_tackled',
    'take_ons_tackled_pct', 'carries', 'carries_distance',
    'carries_progressive_distance', 'progressive_carries',
    'carries_into_final_third', 'carries_into_penalty_area', 'miscontrols',
    'dispossessed', 'passes_received', 'progressive_passes_received'
]
# Card, foul, offside, penalty, own-goal, recovery and aerial columns; repeats
# some names from other lists (e.g. 'cards_yellow', 'crosses', 'interceptions')
MISCELLANEOUS = [
    'players_used', 'minutes_90s', 'cards_yellow', 'cards_red',
    'cards_yellow_red', 'fouls', 'fouled', 'offsides', 'crosses',
    'interceptions', 'tackles_won', 'pens_won', 'pens_conceded',
    'own_goals', 'ball_recoveries', 'aerials_won', 'aerials_lost',
    'aerials_won_pct'
]