In [228]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import matplotlib.colors as col
import matplotlib.cm as cm
import seaborn as sns
import re
import lxml

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases, so use whichever name this
# installation actually provides.
plt.style.use(
    'seaborn-v0_8-colorblind'
    if 'seaborn-v0_8-colorblind' in plt.style.available
    else 'seaborn-colorblind'
)

In [229]:
# Read data
# Saved Wikipedia "List of ... seasons" pages, keyed by the three-letter club
# code used to look each one up. All paths are relative to the working directory.
dict_datasets = dict(
    ARS="./List of Arsenal F.C. seasons - Wikipedia.html",
    CHE="./List of Chelsea F.C. seasons - Wikipedia.html",
    LIV="./List of Liverpool F.C. seasons - Wikipedia.html",
    MCI="List of Manchester City F.C. seasons - Wikipedia.html",
    MCU="List of Manchester United F.C. seasons - Wikipedia.html",
    TOT="List of Tottenham Hotspur F.C. seasons - Wikipedia.html",
)

# Arsenal
# Uses the table at position 3 of the parsed page and keeps rows 107-126
# (20 rows; presumably the 2000-01 to 2019-20 seasons - confirm against the page).
df_ARS = pd.read_html(dict_datasets['ARS'])[3][107:127]

# Columns have a two-level header, so each one is addressed by a (top, sub) tuple.
ARS = pd.DataFrame({
    'Year': df_ARS[('Season', 'League[58]')].str[0:7],  # first 7 characters of the season label
    'Wins': pd.to_numeric(df_ARS[('W', 'Unnamed: 3_level_1')]),
    'Losses': pd.to_numeric(df_ARS[('L', 'Unnamed: 5_level_1')]),
})
# Win rate is wins over wins + losses; draws are not part of the denominator.
ARS['WinRate'] = ARS['Wins']/(ARS['Wins']+ARS['Losses'])
# Renumber rows from 0 so the per-club frames line up on the same index.
ARS = ARS.reset_index(drop=True)

# Chelsea
# Uses the table at position 2 of the parsed page and keeps rows 87-106 (20 rows).
df_CHE = pd.read_html(dict_datasets['CHE'])[2][87:107]

# NOTE(review): the parsed header labels do not describe the data being pulled
# (wins come from ('EFLCup', 'D'), losses from ('Top goalscorer(s)[a]', 'GF')).
# Presumably the two header levels are offset against each other in the parsed
# table - verify these columns against the page.
CHE = pd.DataFrame({
    'Year': df_CHE[('Season', 'Division')].str[0:7],  # first 7 characters of the season label
    'Wins': pd.to_numeric(df_CHE[('EFLCup', 'D')]),
    'Losses': pd.to_numeric(df_CHE[('Top goalscorer(s)[a]', 'GF')]),
})
# Win rate is wins over wins + losses; draws are not part of the denominator.
CHE['WinRate'] = CHE['Wins']/(CHE['Wins']+CHE['Losses'])
# Renumber rows from 0 so the per-club frames line up on the same index.
CHE = CHE.reset_index(drop=True)

# Liverpool
# Uses the table at position 2 of the parsed page and keeps rows 100-119 (20 rows).
df_LIV = pd.read_html(dict_datasets['LIV'])[2][100:120]

# NOTE(review): the parsed header labels do not describe the data being pulled
# (wins come from ('League Cup', 'D'), losses from ('UCL', 'GF')). Presumably
# the two header levels are offset against each other in the parsed table -
# verify these columns against the page.
LIV = pd.DataFrame({
    'Year': df_LIV[('Season', 'Division')].str[0:7],  # first 7 characters of the season label
    'Wins': pd.to_numeric(df_LIV[('League Cup', 'D')]),
    'Losses': pd.to_numeric(df_LIV[('UCL', 'GF')]),
})
# Win rate is wins over wins + losses; draws are not part of the denominator.
LIV['WinRate'] = LIV['Wins']/(LIV['Wins']+LIV['Losses'])
# Renumber rows from 0 so the per-club frames line up on the same index.
LIV = LIV.reset_index(drop=True)

# Manchester City
# Uses the table at position 0 of the parsed page and keeps rows 116-135 (20 rows).
df_MCI = pd.read_html(dict_datasets['MCI'])[0][116:136]

# The top-level label of the season column carries CSS text that ended up in
# the parsed header; it has to be matched exactly.
# NOTE(review): the parsed header labels do not describe the data being pulled
# (wins come from ('LC', 'D'), losses from ('CW', 'GF')). Presumably the two
# header levels are offset against each other - verify against the page.
MCI = pd.DataFrame({
    'Year': df_MCI[('.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Season', 'Division (Tier)')].str[0:7],
    # Coerce to numbers explicitly, as is done for every other club, so the
    # win-rate arithmetic does not depend on how the columns happened to parse.
    'Wins': pd.to_numeric(df_MCI[('LC', 'D')]),
    'Losses': pd.to_numeric(df_MCI[('CW', 'GF')]),
})
# Win rate is wins over wins + losses; draws are not part of the denominator.
MCI['WinRate'] = MCI['Wins']/(MCI['Wins']+MCI['Losses'])
# Renumber rows from 0 so the per-club frames line up on the same index.
MCI = MCI.reset_index(drop=True)

# Manchester United
# Uses the table at position 2 of the parsed page and keeps rows 106-125 (20 rows).
df_MCU = pd.read_html(dict_datasets['MCU'])[2][106:126]

# NOTE(review): the parsed header labels do not describe the data being pulled
# (wins come from ('EFL Cup', 'D'), losses from ('UEFAFIFA', 'GF')). Presumably
# the two header levels are offset against each other in the parsed table -
# verify these columns against the page.
MCU = pd.DataFrame({
    'Year': df_MCU[('Season', 'Division')].str[0:7],  # first 7 characters of the season label
    'Wins': pd.to_numeric(df_MCU[('EFL Cup', 'D')]),
    'Losses': pd.to_numeric(df_MCU[('UEFAFIFA', 'GF')]),
})
# Win rate is wins over wins + losses; draws are not part of the denominator.
MCU['WinRate'] = MCU['Wins']/(MCU['Wins']+MCU['Losses'])
# Renumber rows from 0 so the per-club frames line up on the same index.
MCU = MCU.reset_index(drop=True)

# Tottenham
# Uses the table at position 0 of the parsed page. The slice covers 21 rows
# (139-159); the row labelled 156 is discarded, leaving 20.
# NOTE(review): row 156 is presumably not a regular season row - confirm
# against the page.
df_TOT = pd.read_html(dict_datasets['TOT'])[0][139:160].drop(index=156)

# This table is addressed by integer column labels rather than header names.
TOT = pd.DataFrame({
    'Year': df_TOT[0].str[0:7],  # first 7 characters of the season label
    'Wins': pd.to_numeric(df_TOT[2]),
    'Losses': pd.to_numeric(df_TOT[4]),
})
# Win rate is wins over wins + losses; draws are not part of the denominator.
TOT['WinRate'] = TOT['Wins']/(TOT['Wins']+TOT['Losses'])
# Renumber rows from 0 so the per-club frames line up on the same index
# (this also closes the gap left by the dropped row).
TOT = TOT.reset_index(drop=True)

In [239]:
# Form a dataframe
# Start from Arsenal's season labels, attach each club's WinRate as its own
# column (values are aligned on the row index), then use the season label
# as the index of the combined frame.
Big6_Winrate = (
    ARS[['Year']]
    .assign(
        ARS=ARS['WinRate'],
        CHE=CHE['WinRate'],
        LIV=LIV['WinRate'],
        MCI=MCI['WinRate'],
        MCU=MCU['WinRate'],
        TOT=TOT['WinRate'],
    )
    .set_index('Year')
)

In [237]:
%matplotlib notebook
# Draw KDE
kde = Big6_Winrate.plot.kde(bw_method = 0.8)
[kde.spines[loc].set_visible(False) for loc in ['top', 'right']]
kde.set_title('KDE of Big6 Win % in EPL\n(2000-2020)')
kde.legend(loc='upper left',frameon=False,title='Big6')

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x11a52538e10>

In [238]:
# Line Plot
# One panel per club on a 3x2 grid. Both axes are shared so every panel uses
# the same scale and the clubs can be compared directly.
fig, grid = plt.subplots(3, 2, sharex=True, sharey=True)
plt.subplots_adjust(hspace=0.5)
axs = list(grid.flat)  # row-major: left-to-right, then top-to-bottom
fig.suptitle('Big6 Win % in EPL (2000-2020)')

titles = ['ARS','CHE','LIV','MCI','MCU','TOT']
axis_limits = [0, 20, 0.3, 1]   # [xmin, xmax, ymin, ymax]
threshold = 0.7                 # reference win rate drawn on every panel

for ax, club in zip(axs, titles):
    win_rate = Big6_Winrate[club]
    # pointplot treats the index as categories and places them at 0, 1, 2, ...
    # so the shading below uses the same integer positions.
    positions = range(len(win_rate))

    sns.pointplot(x=Big6_Winrate.index, y=win_rate, ax=ax, scale=0.5)
    ax.axhline(y=threshold, color='gray', linewidth=1, linestyle='--')
    # Shade the gap between the curve and the reference line:
    # red where the club is below it, blue where it is above.
    ax.fill_between(positions, threshold, win_rate, where=(threshold > win_rate),
                    color='red', interpolate=True, alpha=0.2)
    ax.fill_between(positions, threshold, win_rate, where=(threshold < win_rate),
                    color='blue', interpolate=True, alpha=0.2)

    # Hide the top and right frame lines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_ylabel('Win %')
    ax.set_xlabel('')
    ax.set_title(club, fontsize=8)
    ax.axis(axis_limits)
    # A tick every fifth position, including the right edge at 20.
    ax.set_xticks(np.arange(0, 21, 5))
    ax.set_xticklabels(['2000','2005','2010','2015','2020'])

<IPython.core.display.Javascript object>