In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# pd.set_option('display.max_columns', None) # show all columns

In [17]:
COUNTRIES = ['ENG', 'SPA', 'ITA', 'GER', 'FRA']

In [21]:
dropped_cols = []
individual_dfs = {}

for country in COUNTRIES:
    for year in range(2018, 2023):
        try:
            df = pd.read_csv(
                f"./data/teams/raw/{country}{year}_teams_for_{year+1}.csv",
                encoding='utf-8'
            )
            cols_to_drop = df.columns[df.isnull().all()].tolist()
            df.drop(columns=cols_to_drop, inplace=True)
            individual_dfs[f"{year}_{country}"] = df
            for col in cols_to_drop:
                if col not in dropped_cols:
                    dropped_cols.append(col)
        except FileNotFoundError:
            print(f"File not found for {year}")
            pass

if len(individual_dfs.values()) > 0:
    main_df = pd.concat(
        individual_dfs.values(),
        join="inner",
        ignore_index=True
    )
    print(f"Dropped cols: {dropped_cols}")
else:
    print(f"No dataframes found for any year")

Dropped cols: []


In [22]:
print(main_df.shape)
display(main_df)

(485, 184)


Unnamed: 0,squad,players_used,avg_age,possession,games,games_starts,minutes,minutes_90s,goals,assists,...,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,season_start_year,season_end_year
0,Arsenal,28,26.7,58.1,38,418,3420,38.0,69,52,...,89,4,7,1,2000,555,637,46.6,2018,2019
1,Bournemouth,28,26.6,46.9,38,418,3420,38.0,55,43,...,57,8,5,3,1910,701,736,48.8,2018,2019
2,Brighton,21,27.4,42.4,38,418,3420,38.0,35,24,...,90,5,10,0,1979,824,780,51.4,2018,2019
3,Burnley,23,28.0,41.2,38,418,3420,38.0,43,32,...,106,1,3,4,1955,1016,1025,49.8,2018,2019
4,Chelsea,24,27.3,62.9,38,418,3420,38.0,61,52,...,83,5,2,0,2171,523,504,50.9,2018,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,Rennes,32,25.2,55.4,38,418,3420,38.0,66,47,...,54,3,5,1,2035,538,520,50.9,2022,2023
481,Strasbourg,29,27.3,45.9,38,418,3420,38.0,48,28,...,43,6,11,0,2132,677,573,54.2,2022,2023
482,Toulouse,29,25.1,51.3,38,418,3420,38.0,50,33,...,74,2,7,3,2168,483,460,51.2,2022,2023
483,Le Havre,35,25.2,55.7,38,418,3420,38.0,43,32,...,79,8,1,2,2251,602,542,52.6,2022,2023


In [23]:
# Save for exploratory data analysis

main_df.to_csv(f"./data/teams/teams.csv",index=False)

In [5]:
# Columns pulled from team tables
DROPPED = [
    'pass_xa', 'ball_recoveries', 'aerials_lost', 'progressive_passes', 'npxg_per_shot',
    'progressive_carries', 'progressive_passes_received', 'xg_per90', 'npxg_xg_assist_per90',
    'aerials_won_pct', 'npxg_per90', 'npxg_xg_assist', 'xg_xg_assist_per90', 'xg', 'npxg_net',
    'xg_assist', 'xg_assist_per90', 'xg_net', 'npxg', 'aerials_won', 'shots_free_kicks'
]
STANDARD = [
    'players_used', 'avg_age', 'possession', 'games',
    'games_starts', 'minutes', 'minutes_90s', 'goals', 'assists',
    'goals_assists', 'goals_pens', 'pens_made', 'pens_att', 'cards_yellow',
    'cards_red', 'xg', 'npxg', 'xg_assist', 'npxg_xg_assist',
    'progressive_carries', 'progressive_passes', 'goals_per90',
    'assists_per90', 'goals_assists_per90', 'goals_pens_per90',
    'goals_assists_pens_per90', 'xg_per90', 'xg_assist_per90',
    'xg_xg_assist_per90', 'npxg_per90', 'npxg_xg_assist_per90'
]
GOALKEEPING = [
    'players_used', 'gk_games', 'gk_games_starts', 'gk_minutes',
    'minutes_90s', 'gk_goals_against', 'gk_goals_against_per90',
    'gk_shots_on_target_against', 'gk_saves', 'gk_save_pct', 'gk_wins',
    'gk_ties', 'gk_losses', 'gk_clean_sheets', 'gk_clean_sheets_pct',
    'gk_pens_att', 'gk_pens_allowed', 'gk_pens_saved', 'gk_pens_missed',
    'gk_pens_save_pct'
]
ADVANCED_GOALKEEPING = [
    'players_used', 'minutes_90s', 'gk_goals_against',
    'gk_pens_allowed', 'gk_free_kick_goals_against',
    'gk_corner_kick_goals_against', 'gk_own_goals_against', 'gk_psxg',
    'gk_psnpxg_per_shot_on_target_against', 'gk_psxg_net',
    'gk_psxg_net_per90', 'gk_passes_completed_launched',
    'gk_passes_launched', 'gk_passes_pct_launched', 'gk_passes',
    'gk_passes_throws', 'gk_pct_passes_launched', 'gk_passes_length_avg',
    'gk_goal_kicks', 'gk_pct_goal_kicks_launched',
    'gk_goal_kick_length_avg', 'gk_crosses', 'gk_crosses_stopped',
    'gk_crosses_stopped_pct', 'gk_def_actions_outside_pen_area',
    'gk_def_actions_outside_pen_area_per90', 'gk_avg_distance_def_actions'
]
SHOOTING = [
    'players_used', 'minutes_90s', 'goals', 'shots',
    'shots_on_target', 'shots_on_target_pct', 'shots_per90',
    'shots_on_target_per90', 'goals_per_shot', 'goals_per_shot_on_target',
    'average_shot_distance', 'shots_free_kicks', 'pens_made', 'pens_att',
    'xg', 'npxg', 'npxg_per_shot', 'xg_net', 'npxg_net'
]
PASSING = [
    'players_used', 'minutes_90s', 'passes_completed', 'passes',
    'passes_pct', 'passes_total_distance', 'passes_progressive_distance',
    'passes_completed_short', 'passes_short', 'passes_pct_short',
    'passes_completed_medium', 'passes_medium', 'passes_pct_medium',
    'passes_completed_long', 'passes_long', 'passes_pct_long', 'assists',
    'xg_assist', 'pass_xa', 'xg_assist_net', 'assisted_shots',
    'passes_into_final_third', 'passes_into_penalty_area',
    'crosses_into_penalty_area', 'progressive_passes'
]
PASS_TYPES = [
    'players_used', 'minutes_90s', 'passes', 'passes_live',
    'passes_dead', 'passes_free_kicks', 'through_balls', 'passes_switches',
    'crosses', 'throw_ins', 'corner_kicks', 'corner_kicks_in',
    'corner_kicks_out', 'corner_kicks_straight', 'passes_completed',
    'passes_offsides', 'passes_blocked'
]
# Goal-creating actions
GCA = [
    'players_used', 'minutes_90s', 'sca', 'sca_per90',
    'sca_passes_live', 'sca_passes_dead', 'sca_take_ons', 'sca_shots',
    'sca_fouled', 'sca_defense', 'gca', 'gca_per90', 'gca_passes_live',
    'gca_passes_dead', 'gca_take_ons', 'gca_shots', 'gca_fouled',
    'gca_defense'
]
DEFENCE = [
    'players_used', 'minutes_90s', 'tackles', 'tackles_won',
    'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',
    'challenge_tackles', 'challenges', 'challenge_tackles_pct',
    'challenges_lost', 'blocks', 'blocked_shots', 'blocked_passes',
    'interceptions', 'tackles_interceptions', 'clearances', 'errors'
]
POSSESSION = [
    'players_used', 'possession', 'minutes_90s', 'touches',
    'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd',
    'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball',
    'take_ons', 'take_ons_won', 'take_ons_won_pct', 'take_ons_tackled',
    'take_ons_tackled_pct', 'carries', 'carries_distance',
    'carries_progressive_distance', 'progressive_carries',
    'carries_into_final_third', 'carries_into_penalty_area', 'miscontrols',
    'dispossessed', 'passes_received', 'progressive_passes_received'
]
MISCELLANEOUS = [
    'players_used', 'minutes_90s', 'cards_yellow', 'cards_red',
    'cards_yellow_red', 'fouls', 'fouled', 'offsides', 'crosses',
    'interceptions', 'tackles_won', 'pens_won', 'pens_conceded',
    'own_goals', 'ball_recoveries', 'aerials_won', 'aerials_lost',
    'aerials_won_pct'
]

In [6]:
ALL_FEATURES = list(set(STANDARD + GOALKEEPING + ADVANCED_GOALKEEPING + SHOOTING + PASSING + PASS_TYPES + GCA + DEFENCE + POSSESSION + MISCELLANEOUS) - set(DROPPED))
all_features = pd.Series(ALL_FEATURES)
all_features

0                          touches
1                         gk_saves
2                  sca_passes_dead
3                      minutes_90s
4                     players_used
                  ...             
155            gk_clean_sheets_pct
156                     clearances
157    gk_avg_distance_def_actions
158                     sca_fouled
159               cards_yellow_red
Length: 160, dtype: object

In [7]:
teams_df = main_dfs['teams']
teams_df[['season_start_year', 'squad', *ALL_FEATURES]]

NameError: name 'main_dfs' is not defined

In [None]:
individual_score_dfs = {}

for year in range(1992, 2023):
    try:
        individual_score_dfs[year] = pd.read_csv(
            f"./data/scores/PL{year}_scores.csv",
            encoding='utf-8'
        )
    except FileNotFoundError:
        pass

df_1992 = individual_score_dfs[1992]
df_1992

Unnamed: 0,times,home_teams,home_scores,away_teams,away_scores,season_start_year,season_end_year
0,11.05. 22:00,Arsenal,1,Tottenham,3,1992,1993
1,11.05. 22:00,QPR,3,Sheffield Wed,1,1992,1993
2,09.05. 22:00,QPR,2,Aston Villa,1,1992,1993
3,09.05. 22:00,Wimbledon FC,1,Manchester Utd,2,1992,1993
4,08.05. 22:00,Arsenal,3,Crystal Palace,0,1992,1993
...,...,...,...,...,...,...,...
457,15.08. 22:00,Everton,1,Sheffield Wed,1,1992,1993
458,15.08. 22:00,Ipswich,1,Aston Villa,1,1992,1993
459,15.08. 22:00,Leeds,2,Wimbledon FC,1,1992,1993
460,15.08. 22:00,Sheffield Utd,2,Manchester Utd,1,1992,1993
