In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# pd.options.display.max_columns = None  # show all columns
pd.options.display.max_rows = None  # show all rows (no truncation when a frame is displayed)

In [2]:
# Load the raw team-level and match-score tables (paths are relative to the
# notebook's working directory). Plain strings: there is nothing to interpolate.
teams_df = pd.read_csv("./data/teams/teams.csv")
scores_df = pd.read_csv("./data/scores/scores.csv")

In [7]:
scores_df.columns.tolist()

['time',
 'home_team',
 'home_score',
 'away_team',
 'away_score',
 'season_start_year',
 'season_end_year',
 'correct_year']

In [4]:
# Manually remove similar columns
# list(teams_df.columns)

# Keep each squad next to its 'minutes_90s' value.
# Selecting several columns requires a *list* key (double brackets); a bare
# 'squad', 'minutes_90s' is treated as one tuple label and raises KeyError.
teams_90s_played_df = teams_df[['squad', 'minutes_90s']]

# Columns whose values are already per-game averages, rates or percentages.
# Built section by section; the resulting flat list keeps the original order.
TEAMS_PER90_COLUMNS = (
    # --- Squad stats ---
    [
        'possession',
        'goals_per90',
        'assists_per90',
        'goals_assists_per90',
        'goals_pens_per90',
        'goals_assists_pens_per90',
        'xg_per90',
        'npxg_per90',
        'xg_assist_per90',
        'xg_xg_assist_per90',
        'npxg_xg_assist_per90',
    ]
    # --- GK stats ---
    + [
        'gk_goals_against_per90',
        'gk_save_pct',
        'gk_clean_sheets_pct',
        # post-shot xG
        # NOTE(review): unlike its neighbours this name has no _per90/_pct
        # suffix -- confirm it is not a season total before treating it as
        # already averaged.
        'gk_psxg',
        'gk_psnpxg_per_shot_on_target_against',
        'gk_psxg_net_per90',
        'gk_passes_pct_launched',  # long pass completion
        'gk_pct_passes_launched',  # percentage of passes that are long
        'gk_goal_kick_length_avg',
        'gk_crosses_stopped_pct',
        'gk_def_actions_outside_pen_area_per90',
    ]
    # --- Shooting stats ---
    + [
        'shots_on_target_pct',
        'shots_per90',
        'shots_on_target_per90',
        'goals_per_shot',
        'goals_per_shot_on_target',
        'average_shot_distance',
        'npxg_per_shot',
    ]
    # --- Passing stats ---
    + [
        'passes_pct',
        'passes_pct_short',
        'passes_pct_medium',
        'passes_pct_long',
    ]
    # --- Shot-creating actions ---
    + [
        'sca_per90',
        'gca_per90',
    ]
    # --- Possession ---
    + [
        'take_ons_won_pct',
        'take_ons_tackled_pct',
    ]
    # --- Defensive ---
    + [
        'challenge_tackles_pct',  # tackle success rate
        'aerials_won_pct',
    ]
)

# Raw totals that could be divided by games played for a "fairer" comparison.
# Built section by section; the resulting flat list keeps the original order.
# Commented-out names are columns deliberately left out of the list.
TEAMS_PRE_PER90_COLUMNS = (
    # --- Squad stats ---
    [
        'players_used',
        # 'goals',
        # 'assists',
        # 'goals_assists',
        # 'goals_pens',
        'pens_made',
        'pens_att',
        'cards_yellow',
        'cards_red',
        'cards_yellow_red',
        'fouls',
        'fouled',
        'offsides',
        'own_goals',
        # 'xg',
        # 'npxg',
        # 'xg_assist',
        # 'npxg_xg_assist',
        'progressive_carries',
    ]
    # --- GK stats ---
    + [
        'gk_goals_against',
        'gk_shots_on_target_against',
        # 'gk_saves',
        # 'gk_clean_sheets',
        'gk_pens_saved',
        'gk_pens_missed',
        'gk_free_kick_goals_against',
        'gk_corner_kick_goals_against',
        # 'gk_own_goals_against',  # same as own_goals
        'gk_psxg_net',  # PSxG - goals allowed
        # 'gk_passes_completed_launched',  # passes longer than 40 yards
        # 'gk_passes_launched',
        # 'gk_passes',
        # 'gk_passes_throws',
        # 'gk_goal_kicks',
        # 'gk_pct_goal_kicks_launched',
        # 'gk_crosses',
        # 'gk_crosses_stopped',
        # 'gk_def_actions_outside_pen_area',
    ]
    # --- Shooting stats (every entry currently left out) ---
    # 'shots',
    # 'shots_on_target',
    # 'shots_free_kicks',
    # 'xg_net',
    # 'npxg_net',
    # --- Passing stats ---
    + [
        # 'passes_completed',
        # 'passes',
        'passes_total_distance',
        'progressive_passes',
        'passes_progressive_distance',
        # 'passes_completed_short',
        # 'passes_short',
        # 'passes_completed_medium',
        # 'passes_medium',
        # 'passes_completed_long',
        # 'passes_long',
        'pass_xa',
        'xg_assist_net',
        'assisted_shots',
        'passes_into_final_third',
        'passes_into_penalty_area',
        'crosses_into_penalty_area',
    ]
    # --- Passing types ---
    + [
        'passes_live',
        'passes_dead',
        'passes_free_kicks',
        'through_balls',
        'passes_switches',
        'crosses',
        'throw_ins',
        'corner_kicks',
        'corner_kicks_in',
        'corner_kicks_out',
        'corner_kicks_straight',
        'passes_offsides',
        'passes_blocked',
    ]
    # --- Shot-creating actions ---
    + [
        # 'sca',
        'sca_passes_live',
        'sca_passes_dead',
        'sca_take_ons',
        'sca_shots',
        'sca_fouled',
        'sca_defense',
        # 'gca',
        'gca_passes_live',
        'gca_passes_dead',
        'gca_take_ons',
        'gca_shots',
        'gca_fouled',
        'gca_defense',
    ]
    # --- Defensive actions ---
    + [
        'tackles',
        'tackles_won',
        'tackles_def_3rd',
        'tackles_mid_3rd',
        'tackles_att_3rd',
        'challenge_tackles',
        'challenges',
        'challenges_lost',
        'blocks',
        'blocked_shots',
        'blocked_passes',
        'interceptions',
        'tackles_interceptions',
        'clearances',
        'errors',
        'ball_recoveries',
        'aerials_won',
        'aerials_lost',
    ]
    # --- Possession ---
    + [
        'touches',
        'touches_def_pen_area',
        'touches_def_3rd',
        'touches_mid_3rd',
        'touches_att_3rd',
        'touches_att_pen_area',
        'touches_live_ball',
        'take_ons',
        # 'take_ons_won',
        # 'take_ons_tackled',
        'carries',
        'carries_distance',
        'carries_progressive_distance',
        'carries_into_final_third',
        'carries_into_penalty_area',
        'miscontrols',
        'dispossessed',
        'passes_received',
        'progressive_passes_received',
    ]
)

# Categorical columns and values that do not need to be averaged per game.
# Built section by section; the resulting flat list keeps the original order.
TEAMS_ABSOLUTE_COLUMNS = (
    # --- Squad stats ---
    [
        'squad',  # team name
        'avg_age',
    ]
    # --- GK stats ---
    + [
        'gk_wins',
        'gk_ties',
        'gk_losses',
    ]
    # --- Season identifiers ---
    + [
        'season_start_year',
        'season_end_year',
    ]
)

In [5]:
def get_correlation(df, columns):
    '''Rank every distinct pair of the selected columns by correlation strength.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : list
        Column labels to compare. Non-numeric columns among them are ignored,
        so a mixed selection (e.g. every column of the frame) is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of distinct columns, with the columns
        ``col_1``, ``col_2`` and ``corr`` (absolute value of the pairwise
        correlation), sorted from strongest to weakest. Pairs whose
        correlation is undefined (NaN, e.g. a constant column) are omitted.
    '''
    # Restrict to numeric/bool data so string columns do not break .corr()
    numeric = df[columns].select_dtypes(include=['number', 'bool'])
    strength = numeric.corr().abs()  # absolute values: strength, not direction

    # Strict upper triangle (k=1): each pair appears once and self-pairs are
    # excluded by position. Filtering on "corr < 1" instead would also discard
    # distinct columns that are perfectly correlated, i.e. exact duplicates.
    labels = strength.columns.to_numpy()
    upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
    pairs = pd.DataFrame({
        'col_1': labels[upper_rows],
        'col_2': labels[upper_cols],
        'corr': strength.to_numpy()[upper_rows, upper_cols],
    })

    return (
        pairs.dropna(subset=['corr'])
        .sort_values('corr', ascending=False)
        .reset_index(drop=True)
    )

In [6]:
# print(list(teams_df.columns))

# display(get_correlation(teams_df, list(teams_df.columns)))