### Exploratory Data Analysis

Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from pandas.plotting import scatter_matrix

Load cleaned data

In [None]:
# Load the cleaned match data. No date parsing is requested, so 'date' keeps
# whatever dtype read_csv infers for it.
# NOTE(review): a later cell sorts matches by 'date'; that order is only
# chronological if the stored format sorts correctly as-is (e.g. YYYY-MM-DD) — confirm.
df_data = pd.read_csv("../dataset/cleaned_data.csv")
df_data.head()

Brief information about data

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_data.describe()

When a football club plays at home, its goals, shots, shots on goal and possession are generally better than when it plays away.

In [None]:
# Per-match statistics (plus the match result) whose pairwise correlations we inspect.
cols = [
    'home_possession_pct',
    'away_possession_pct',
    'away_shot',
    'away_shot_on_goal',
    'home_shot',
    'home_shot_on_goal',
    'home_goal',
    'away_goal',
    'home_corner',
    'away_corner',
    'result',
]
info = df_data.loc[:, cols]

# Pairwise correlations of the selected columns, drawn as an annotated heatmap.
corr_matrix = info.corr()
sns.heatmap(corr_matrix, annot=True)

We can drop home_corner and away_corner when we consider result

In [None]:
# Pairwise scatter plots of the selected statistics; the diagonal histograms use 50 bins.
scatter_matrix(
    frame=info,
    figsize=(20, 20),
    hist_kwds=dict(bins=50),
)

In [None]:
# One 50-bin histogram per numeric column of the full dataset.
df_data.hist(
    figsize=(20, 15),
    bins=50,
)

Home teams win (earn 3 points) more often than away teams.

Top correlation with result

In [None]:
# Correlation of every numeric column with the match result, highest value first.
result_corr_raw = df_data.corr(numeric_only=True)['result']
result_corr_raw.sort_values(ascending=False)

In [None]:
# Preview the first rows of the per-match data again.
df_data.head()

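Compute the rolling mean of each statistic over a team's 10 most recent matches, including the current one (for the home team using only its home matches, and for the away team only its away matches)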

In [None]:
# Build "recent form" features: for each side (home/away) and each team, the
# rolling mean of that team's statistics over its matches on that side.
# The window covers the current match plus up to 9 earlier ones; min_periods=0
# lets a team's first matches be averaged over however many rows exist so far.
# NOTE(review): because the window includes the current match, every feature
# contains that match's own statistic (e.g. its goals) — confirm this is
# intended if these columns are later used to predict `result`.
# NOTE(review): ordering relies on 'date' sorting chronologically as stored — confirm.
ROLLING_WINDOW = 10
STATS = ['goal', 'possession_pct', 'shot_on_goal', 'shot', 'corner']

types = ['home', 'away']
df_mean = df_data.copy()

# The loop variable is `side`, not `type`, so the builtin `type` is not
# shadowed for the rest of the notebook.
for side in types:
    stat_cols = [f'{side}_{stat}' for stat in STATS]
    # Iterating the groupby yields each team's rows once (rows with a missing
    # team key are skipped instead of raising KeyError).
    for _team, team_matches in df_data.groupby(side):
        ordered = team_matches.sort_values('date', ascending=True)
        # One rolling computation per team covers all statistics at once.
        rolled = ordered[stat_cols].rolling(ROLLING_WINDOW, min_periods=0).mean()
        for stat in STATS:
            # Assignment aligns on the row index, so each mean lands on its own match row.
            df_mean.loc[ordered.index, f'mean_{ROLLING_WINDOW}_{side}_{stat}'] = rolled[f'{side}_{stat}']

df_mean.head()

Drop unnecessary columns

In [None]:
# Keep only the rolling-mean features and the remaining columns: drop the
# identifiers and the raw per-match statistics.
# A dedicated name is used so the column list of the earlier correlation cell
# (`cols`) is not overwritten.
cols_to_drop = [
    'date', 'home', 'away',
    'home_possession_pct', 'away_possession_pct',
    'away_shot', 'away_shot_on_goal', 'home_shot', 'home_shot_on_goal',
    'home_goal', 'away_goal', 'home_corner', 'away_corner',
]

# Guard against typos in the list above, since the drop below tolerates absent names.
unknown_cols = set(cols_to_drop) - set(df_data.columns)
assert not unknown_cols, f"columns not present in df_data: {sorted(unknown_cols)}"

# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second run would otherwise raise KeyError because the
# columns are already gone.
df_mean = df_mean.drop(columns=cols_to_drop, errors='ignore')
df_mean.head()

In [None]:
# Remaining columns with their dtypes and non-null counts.
df_mean.info()

Top correlation with result

In [None]:
# Correlation of every remaining numeric column with the match result, highest value first.
result_corr_rolling = df_mean.corr(numeric_only=True)['result']
result_corr_rolling.sort_values(ascending=False)

Save analyzed data to csv file

In [None]:
# Persist the engineered features; index=False leaves the row index out of the file.
analyzed_path = "../dataset/analyzed_data.csv"
df_mean.to_csv(analyzed_path, index=False)