In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from constants import *

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# Lift pandas' display limits so frames are rendered in full
pd.options.display.max_columns = None  # no cap on the number of columns shown
pd.options.display.max_rows = None  # no cap on the number of rows shown

In [2]:
# Load the teams table and the scores table from the local data folder
teams_df = pd.read_csv("./data/teams/teams.csv")
scores_df = pd.read_csv("./data/scores/scores.csv")

In [3]:
# Categorise data - the column groupings live in constants.py
# teams_90s_played_df = teams_df[['squad', 'minutes_90s']]

# Keep only the columns listed in TEAM_COLUMNS_DICT_COMBINED (flattened across
# all of its values). dict.fromkeys de-duplicates while preserving order, so a
# column listed under more than one key is not selected twice (which would
# produce duplicate column labels in teams_df).
teams_df = teams_df[
    list(dict.fromkeys(
        column
        for category_columns in TEAM_COLUMNS_DICT_COMBINED.values()
        for column in category_columns
    ))
]

In [4]:
def get_correlation(df, columns):
    '''Returns the absolute pairwise correlations between the selected columns.

    Args:
        df: DataFrame holding the data.
        columns: labels of the columns of `df` to correlate.

    Returns:
        DataFrame with columns `col_1`, `col_2` and `corr`: one row per
        unordered pair of distinct columns, sorted by `corr` (the absolute
        correlation) in descending order, with a fresh 0..n-1 index.
        Pairs whose correlation is NaN are omitted.
    '''
    df_corr = df[columns].corr().abs() # absolute values: strength of correlation, sign ignored
    # Keep the strict upper triangle only (k=1): this removes the diagonal
    # (self-correlations) and the mirrored duplicates, without discarding
    # genuinely perfectly correlated pairs the way a `corr < 1` filter would
    strict_upper = np.triu(np.ones(df_corr.shape), k=1).astype(bool)
    df_corr = df_corr.where(strict_upper)
    # Masked cells are NaN; drop them explicitly instead of relying on stack() to do it
    df_corr = df_corr.stack().dropna().sort_values(ascending=False).reset_index()
    df_corr.columns = ['col_1', 'col_2', 'corr']
    return df_corr

def in_different_category(col1, col2):
    '''Returns True unless some category in TEAM_COLUMNS_DICT_COMBINED lists both columns'''
    shares_a_category = any(
        col1 in category_columns and col2 in category_columns
        for category_columns in TEAM_COLUMNS_DICT_COMBINED.values()
    )
    return not shares_a_category

In [5]:
# Absolute pairwise correlations across every column of teams_df
teams_corr_df = get_correlation(teams_df, teams_df.columns.tolist())

# Keep only the pairs whose two columns do not share a category
teams_corr_df = teams_corr_df.loc[
    [
        in_different_category(first_col, second_col)
        for first_col, second_col in zip(teams_corr_df['col_1'], teams_corr_df['col_2'])
    ]
].reset_index(drop=True)

In [6]:
# Note: the following correlation values are absolute (i.e. |corr|)
# as we are measuring the strength of the correlation for now
# KIV: are a few outliers skewing the metrics?

# Strong correlation between passes and carries
# passes_live	carries_distance	0.889320
# passes_into_final_third	carries	0.883365
# passes_pct	carries	0.835825

# Strong correlation between progressive actions and: g/a, possession
# teams_corr_df[
#     (teams_corr_df['col_1'].str.contains("progressive", regex=True)) |
#     (teams_corr_df['col_2'].str.contains("progressive", regex=True))
# ]

# Strong correlation between possession and g/a
# teams_corr_df[
#     (
#         (teams_corr_df['col_1'].str.contains("possession", regex=True)) &
#         (teams_corr_df['col_2'].str.contains("x", regex=True))
#     ) | (
#         (teams_corr_df['col_1'].str.contains("x", regex=True)) &
#         (teams_corr_df['col_2'].str.contains("possession", regex=True))
#     )
# ]

# Looking at defensive metrics
# teams_corr_df[
#     (teams_corr_df['col_1'].isin(TEAM_COLUMNS_DICT_COMBINED["defence"])) |
#     (teams_corr_df['col_2'].isin(TEAM_COLUMNS_DICT_COMBINED["defence"]))
# ]

# High press =/= high possession
# teams_corr_df[
#     (teams_corr_df['col_1'].str.contains("tackles_att_3rd", regex=True)) |
#     (teams_corr_df['col_2'].str.contains("tackles_att_3rd", regex=True))
# ]

# Q: Are technical and physical ability mutually exclusive?
# A: Probably not? More direct/long-ball teams may lose more aerial duels and
# have lower overall pass accuracy (because of long balls)
# passes_pct	aerials_lost	0.756948
# passes_pct_short	aerials_won	0.625621