## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import FootballDB as fdb
import Features as features

In [2]:
# Pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

## Load Data

In [3]:
# Load dataframe
db = fdb.LoadDB()
fduk = fdb.LoadTbl(db,'football-data-uk')
raw_df = fdb.LoadDframe(fduk)
raw_df # Show

Unnamed: 0,AwayTeam,Date,Div,FTAG,FTHG,FTR,HomeTeam,Source,Season,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,Referee
0,Chester,17/08/96,E3,1,2,H,Brighton,football-data-uk,9697,,,,,,,,,,,,,,,,
1,Barnet,17/08/96,E3,0,1,H,Cambridge,football-data-uk,9697,,,,,,,,,,,,,,,,
2,Hartlepool,17/08/96,E3,2,0,A,Colchester,football-data-uk,9697,,,,,,,,,,,,,,,,
3,Carlisle,17/08/96,E3,1,0,A,Doncaster,football-data-uk,9697,,,,,,,,,,,,,,,,
4,Hereford,17/08/96,E3,0,1,H,Fulham,football-data-uk,9697,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62298,Fulham,11/05/08,E0,1,0,A,Portsmouth,football-data-uk,0708,6,11,0,7,4,0,5,13,0,15,10,0,0,D,0,M Clattenburg
62299,Arsenal,11/05/08,E0,1,0,A,Sunderland,football-data-uk,0708,4,12,0,12,8,1,3,14,0,13,5,1,0,A,1,K Stroud
62300,Liverpool,11/05/08,E0,2,0,A,Tottenham,football-data-uk,0708,4,9,0,18,12,1,8,11,0,6,3,0,0,D,1,U Rennie
62301,Aston Villa,11/05/08,E0,2,2,D,West Ham,football-data-uk,0708,8,21,0,13,8,2,7,9,0,16,8,1,1,D,2,M Dean


In [6]:
df = raw_df
# Convert datatypes
df['Date'] = pd.to_datetime(raw_df['Date'],dayfirst=True,errors='ignore')
## Numeric
num_cols = ['FTAG','FTHG','AC','AF','AR','AS','AST','AY','HC','HF','HR','HS','HST','HTAG','HTHG','HY']
for c in num_cols:
    df[c] = pd.to_numeric(df[c])
df.dtypes

AwayTeam            object
Date        datetime64[ns]
Div                 object
FTAG               float64
FTHG               float64
FTR                 object
HomeTeam            object
Source              object
Season              object
AC                 float64
AF                 float64
AR                 float64
AS                 float64
AST                float64
AY                 float64
HC                 float64
HF                 float64
HR                 float64
HS                 float64
HST                float64
HTAG               float64
HTHG               float64
HTR                 object
HY                 float64
Referee             object
dtype: object

## Summary Statitistics

In [7]:
# Amount of data
df.shape

(62303, 25)

In [8]:
# Missing data
def mean(x):
    return sum(x)/len(x)

missing_df = df.isna()
missing_df.apply(mean,axis='rows')

AwayTeam    0.014333
Date        0.014333
Div         0.014333
FTAG        0.014333
FTHG        0.014333
FTR         0.014333
HomeTeam    0.014333
Source      0.000000
Season      0.000000
AC          0.273245
AF          0.273277
AR          0.242878
AS          0.273245
AST         0.273245
AY          0.242861
HC          0.273245
HF          0.273277
HR          0.242861
HS          0.273293
HST         0.273293
HTAG        0.185770
HTHG        0.185770
HTR         0.185770
HY          0.242861
Referee     0.278622
dtype: float64

In [9]:
# Missing data by season
missing_df = df.isna()
missing_df['Season'] = df['Season']
missing_df['Div'] = df['Div']
missing_df.groupby(['Season','Div']).aggregate(mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,AwayTeam,Date,FTAG,FTHG,FTR,HomeTeam,Source,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,Referee
Season,Div,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,E0,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E1,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E2,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,E3,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,E0,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,E1,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,E2,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,E3,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,E0,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,E1,False,False,False,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df.describe()

Unnamed: 0,FTAG,FTHG,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HY
count,61410.0,61410.0,45279.0,45277.0,47171.0,45279.0,45279.0,47172.0,45279.0,45277.0,47172.0,45276.0,45276.0,50729.0,50729.0,47172.0
mean,1.129295,1.472301,4.874092,11.997173,0.106082,9.734999,4.503456,1.67521,5.988604,11.36354,0.071229,12.160747,5.654917,0.500857,0.653472,1.312516
std,1.079638,1.235058,2.649477,3.941399,0.329626,4.127285,2.582091,1.267199,2.944028,3.809841,0.272653,4.664762,2.97725,0.710974,0.811419,1.139567
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,3.0,9.0,0.0,7.0,3.0,1.0,4.0,9.0,0.0,9.0,4.0,0.0,0.0,0.0
50%,1.0,1.0,5.0,12.0,0.0,9.0,4.0,2.0,6.0,11.0,0.0,12.0,5.0,0.0,0.0,1.0
75%,2.0,2.0,6.0,14.0,0.0,12.0,6.0,2.0,8.0,14.0,0.0,15.0,7.0,1.0,1.0,2.0
max,9.0,9.0,21.0,35.0,4.0,35.0,20.0,10.0,24.0,33.0,9.0,43.0,27.0,5.0,7.0,11.0


## Cleaning Data

In [11]:
# Standardise team names
clean_df = df
clean_df.loc[clean_df.loc[:,'HomeTeam']=='Middlesboro','HomeTeam'] = "Middlesbrough"
clean_df.loc[clean_df.loc[:,'AwayTeam']=='Middlesboro','AwayTeam'] = "Middlesbrough"

In [12]:
clean_df = df
# Filter seasons {0001->1819}
filter_seasons = ['0001','0102','0203','0304','0405','0506','0607','0708','0809','0910','1011','1112','1213','1314'\
                 ,'1415','1516','1617','1718','1819']
clean_df = clean_df[clean_df['Season'].isin(filter_seasons)]
# Filter divisions {E0, E1}
filter_div = ['E0','E1']
clean_df = clean_df[clean_df['Div'].isin(filter_div)]
clean_df # show

Unnamed: 0,AwayTeam,Date,Div,FTAG,FTHG,FTR,HomeTeam,Source,Season,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,Referee
728,Aston Villa,2015-08-08,E0,1.0,0.0,A,Bournemouth,football-data-uk,1516,3.0,13.0,0.0,7.0,3.0,4.0,6.0,13.0,0.0,11.0,2.0,0.0,0.0,D,3.0,M Clattenburg
729,Swansea,2015-08-08,E0,2.0,2.0,D,Chelsea,football-data-uk,1516,8.0,16.0,0.0,18.0,10.0,3.0,4.0,15.0,1.0,11.0,3.0,1.0,2.0,H,1.0,M Oliver
730,Watford,2015-08-08,E0,2.0,2.0,D,Everton,football-data-uk,1516,2.0,13.0,0.0,11.0,5.0,2.0,8.0,7.0,0.0,10.0,5.0,1.0,0.0,A,1.0,M Jones
731,Sunderland,2015-08-08,E0,2.0,4.0,H,Leicester,football-data-uk,1516,3.0,17.0,0.0,10.0,5.0,4.0,6.0,13.0,0.0,19.0,8.0,0.0,3.0,H,2.0,L Mason
732,Tottenham,2015-08-08,E0,0.0,1.0,H,Man United,football-data-uk,1516,2.0,12.0,0.0,9.0,4.0,3.0,1.0,12.0,0.0,9.0,1.0,0.0,1.0,H,2.0,J Moss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62298,Fulham,2008-05-11,E0,1.0,0.0,A,Portsmouth,football-data-uk,0708,6.0,11.0,0.0,7.0,4.0,0.0,5.0,13.0,0.0,15.0,10.0,0.0,0.0,D,0.0,M Clattenburg
62299,Arsenal,2008-05-11,E0,1.0,0.0,A,Sunderland,football-data-uk,0708,4.0,12.0,0.0,12.0,8.0,1.0,3.0,14.0,0.0,13.0,5.0,1.0,0.0,A,1.0,K Stroud
62300,Liverpool,2008-05-11,E0,2.0,0.0,A,Tottenham,football-data-uk,0708,4.0,9.0,0.0,18.0,12.0,1.0,8.0,11.0,0.0,6.0,3.0,0.0,0.0,D,1.0,U Rennie
62301,Aston Villa,2008-05-11,E0,2.0,2.0,D,West Ham,football-data-uk,0708,8.0,21.0,0.0,13.0,8.0,2.0,7.0,9.0,0.0,16.0,8.0,1.0,1.0,D,2.0,M Dean


In [13]:
# Use mean to calculate impute values
impute_df = clean_df
impute_vals = impute_df.groupby(['Div','FTR','Season']).mean().reset_index()
impute_vals['HTR']=impute_vals['FTR']
impute_vals['Referee']='Unknown'
impute_vals # Show impute vals
# Impute missing data with mean values
impute_df.loc[impute_df.isna().index]=impute_df.loc[impute_df.isna().index].apply(lambda x: x.fillna(impute_vals[(impute_vals['Div']==x['Div']) & (impute_vals['FTR']==x['FTR']) & (impute_vals['Season']==x['Season'])].iloc[0]),axis=1)
impute_df # Show

Unnamed: 0,AwayTeam,Date,Div,FTAG,FTHG,FTR,HomeTeam,Source,Season,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,Referee
728,Aston Villa,2015-08-08,E0,1.0,0.0,A,Bournemouth,football-data-uk,1516,3.0,13.0,0.0,7.0,3.0,4.0,6.0,13.0,0.0,11.0,2.0,0.0,0.0,D,3.0,M Clattenburg
729,Swansea,2015-08-08,E0,2.0,2.0,D,Chelsea,football-data-uk,1516,8.0,16.0,0.0,18.0,10.0,3.0,4.0,15.0,1.0,11.0,3.0,1.0,2.0,H,1.0,M Oliver
730,Watford,2015-08-08,E0,2.0,2.0,D,Everton,football-data-uk,1516,2.0,13.0,0.0,11.0,5.0,2.0,8.0,7.0,0.0,10.0,5.0,1.0,0.0,A,1.0,M Jones
731,Sunderland,2015-08-08,E0,2.0,4.0,H,Leicester,football-data-uk,1516,3.0,17.0,0.0,10.0,5.0,4.0,6.0,13.0,0.0,19.0,8.0,0.0,3.0,H,2.0,L Mason
732,Tottenham,2015-08-08,E0,0.0,1.0,H,Man United,football-data-uk,1516,2.0,12.0,0.0,9.0,4.0,3.0,1.0,12.0,0.0,9.0,1.0,0.0,1.0,H,2.0,J Moss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62298,Fulham,2008-05-11,E0,1.0,0.0,A,Portsmouth,football-data-uk,0708,6.0,11.0,0.0,7.0,4.0,0.0,5.0,13.0,0.0,15.0,10.0,0.0,0.0,D,0.0,M Clattenburg
62299,Arsenal,2008-05-11,E0,1.0,0.0,A,Sunderland,football-data-uk,0708,4.0,12.0,0.0,12.0,8.0,1.0,3.0,14.0,0.0,13.0,5.0,1.0,0.0,A,1.0,K Stroud
62300,Liverpool,2008-05-11,E0,2.0,0.0,A,Tottenham,football-data-uk,0708,4.0,9.0,0.0,18.0,12.0,1.0,8.0,11.0,0.0,6.0,3.0,0.0,0.0,D,1.0,U Rennie
62301,Aston Villa,2008-05-11,E0,2.0,2.0,D,West Ham,football-data-uk,0708,8.0,21.0,0.0,13.0,8.0,2.0,7.0,9.0,0.0,16.0,8.0,1.0,1.0,D,2.0,M Dean


In [14]:
# Check missing data
missing_df = clean_df.isna()
missing_df.apply(mean,axis='rows')

AwayTeam    0.0
Date        0.0
Div         0.0
FTAG        0.0
FTHG        0.0
FTR         0.0
HomeTeam    0.0
Source      0.0
Season      0.0
AC          0.0
AF          0.0
AR          0.0
AS          0.0
AST         0.0
AY          0.0
HC          0.0
HF          0.0
HR          0.0
HS          0.0
HST         0.0
HTAG        0.0
HTHG        0.0
HTR         0.0
HY          0.0
Referee     0.0
dtype: float64

## Save Cleaned Data

In [21]:
timestamp = datetime.datetime.now().strftime("%Y%m%d")
fname = "../Data/clean_data_{}.csv".format(timestamp)
clean_df.to_csv(fname,index=False)
fname #show filename

'../Data/clean_data_20200128.csv'

## Analysis

In [60]:
# List E0 teams
df = clean_df
df = df[df['Div']=='E0']
teams = df.HomeTeam.unique()
np.append(E0_teams,df.AwayTeam.unique())
teams

array(['Bournemouth', 'Chelsea', 'Everton', 'Leicester', 'Man United',
       'Norwich', 'Arsenal', 'Newcastle', 'Stoke', 'West Brom',
       'Aston Villa', 'Southampton', 'Sunderland', 'Swansea', 'Tottenham',
       'Watford', 'West Ham', 'Crystal Palace', 'Man City', 'Liverpool',
       'Brighton', 'Burnley', 'Huddersfield', 'QPR', 'Hull', 'Blackburn',
       'Bolton', 'Middlesbrough', 'Portsmouth', 'Birmingham', 'Charlton',
       'Fulham', 'Leeds', 'Derby', 'Ipswich', 'Reading', 'Wigan',
       'Wolves', 'Cardiff', 'Sheffield United', 'Coventry', 'Bradford',
       'Blackpool'], dtype=object)

In [43]:
# Assign colours to E0 teams
team_colours = dict()
team_colours['Bournemouth'] = ["#ed1c24","#000000"]
team_colours['Chelsea'] = ["#034694","#034694"]
team_colours['Everton'] = ["#274488","#ffffff"]
team_colours['Leicester'] = ["#0053a0","#ffffff"]
team_colours['Man United'] = ["#da030e","#ffe500"]
team_colours['Norwich'] = ["#00a650","#fff200"]
team_colours['Arsenal'] = ["#ef0107","#023474"]
team_colours['Newcastle'] = ["#000000","#ffffff"]
team_colours['Stoke'] = ["#1b449c","#e03a3e"]
team_colours['West Brom'] = ["#091453","#ffffff"]
team_colours['Aston Villa'] = ["#a3c5e9","#7b003a"]
team_colours['Southampton'] = ["#ff0000","#ffffff"]
team_colours['Sunderland'] = ["#ffffff","#eb172b"]
team_colours['Swansea'] = ["#222222","#8a0829"]
team_colours['Tottenham'] = ["#ffffff","#000040"]
team_colours['Watford'] = ["#fbee23","#ed2127"]
team_colours['West Ham'] = ["#7f0000","#7acbe5"]
team_colours['Crystal Palace'] = ["#292d6b","#c4122e"]
team_colours['Man City'] = ["#97c1e7","#97c1e7"]
team_colours['Liverpool'] = ["#dd0000","#dd0000"]
team_colours['Brighton'] = ["#005daa","#ffffff"]
team_colours['Burnley'] = ["#80bfff","#800000"]
team_colours['Huddersfield'] = ["#192552","#ffffff"]
team_colours['QPR'] = ["#fd35c8","#0c00cd"]
team_colours['Hull'] = ["#f5971d","#231f20"]
team_colours['Blackburn'] = ["#78bcff","#ff0000"]
team_colours['Bolton'] = ["#263c7e","#df0024"]
team_colours['Middlesbrough'] = ["#ffffff","#942923"]
team_colours['Portsmouth'] = ["#001489","#e1e4f3"]
team_colours['Birmingham'] = ["#2d5593","#ffffff"]
team_colours['Charlton'] = ["#d4021d","#000000"]
team_colours['Fulham'] = ["#000000","#cc0000"]
team_colours['Leeds'] = ["#1D428A","#FFCD00"]
team_colours['Derby'] = ["#000040","#bbbbda"]
team_colours['Ipswich'] = ["#3a64a3","#de2c37"]
team_colours['Reading'] = ["#004494","#dd9300"]
team_colours['Wigan'] = ["#1d59af","#006838"]
team_colours['Wolves'] = ["#fdb913","#000000"]
team_colours['Cardiff'] = ["#003366","#ffffff"]
team_colours['Sheffield United'] = ["#010101","#ec2227"]
team_colours['Coventry'] = ["#77bbff","#007711"]
team_colours['Bradford'] = ["#fdb913","#800000"]
team_colours['Blackpool'] = ["#ffffff","#ff5f00"]
team_colours

{'Bournemouth': ['#ed1c24', '#000000'],
 'Chelsea': ['#034694', '#034694'],
 'Everton': ['#274488', '#ffffff'],
 'Leicester': ['#0053a0', '#ffffff'],
 'Man United': ['#da030e', '#ffe500'],
 'Norwich': ['#00a650', '#fff200'],
 'Arsenal': ['#ef0107', '#023474'],
 'Newcastle': ['#000000', '#ffffff'],
 'Stoke': ['#1b449c', '#e03a3e'],
 'West Brom': ['#091453', '#ffffff'],
 'Aston Villa': ['#a3c5e9', '#7b003a'],
 'Southampton': ['#ff0000', '#ffffff'],
 'Sunderland': ['#ffffff', '#eb172b'],
 'Swansea': ['#222222', '#8a0829'],
 'Tottenham': ['#ffffff', '#000040'],
 'Watford': ['#fbee23', '#ed2127'],
 'West Ham': ['#7f0000', '#7acbe5'],
 'Crystal Palace': ['#292d6b', '#c4122e'],
 'Man City': ['#97c1e7', '#97c1e7'],
 'Liverpool': ['#dd0000', '#dd0000'],
 'Brighton': ['#005daa', '#ffffff'],
 'Burnley': ['#80bfff', '#800000'],
 'Huddersfield': ['#192552', '#ffffff'],
 'QPR': ['#fd35c8', '#0c00cd'],
 'Hull': ['#f5971d', '#231f20'],
 'Blackburn': ['#78bcff', '#ff0000'],
 'Bolton': ['#263c7e', '#df0

In [64]:

home_df

Unnamed: 0,AwayTeam,Date,Div,FTAG,FTHG,FTR,HomeTeam,Source,Season,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,Referee,PrimaryTeam,OpposingTeam,HomeAway,PC,PF,PR,PY,PS,PST,HTPG,OC,OF,OR,OY,OS,OST,HTOG,FTPG,FTOG
728,Aston Villa,2015-08-08,E0,1.0,0.0,A,Bournemouth,football-data-uk,1516,3.0,13.0,0.0,7.0,3.0,4.0,6.0,13.0,0.0,11.0,2.0,0.0,0.0,D,3.0,M Clattenburg,Bournemouth,Aston Villa,Home,3.0,13.0,0.0,4.0,7.0,3.0,0.0,6.0,13.0,0.0,3.0,11.0,2.0,0.0,1.0,0.0
729,Swansea,2015-08-08,E0,2.0,2.0,D,Chelsea,football-data-uk,1516,8.0,16.0,0.0,18.0,10.0,3.0,4.0,15.0,1.0,11.0,3.0,1.0,2.0,H,1.0,M Oliver,Chelsea,Swansea,Home,8.0,16.0,0.0,3.0,18.0,10.0,1.0,4.0,15.0,1.0,1.0,11.0,3.0,2.0,2.0,2.0
730,Watford,2015-08-08,E0,2.0,2.0,D,Everton,football-data-uk,1516,2.0,13.0,0.0,11.0,5.0,2.0,8.0,7.0,0.0,10.0,5.0,1.0,0.0,A,1.0,M Jones,Everton,Watford,Home,2.0,13.0,0.0,2.0,11.0,5.0,1.0,8.0,7.0,0.0,1.0,10.0,5.0,0.0,2.0,2.0
731,Sunderland,2015-08-08,E0,2.0,4.0,H,Leicester,football-data-uk,1516,3.0,17.0,0.0,10.0,5.0,4.0,6.0,13.0,0.0,19.0,8.0,0.0,3.0,H,2.0,L Mason,Leicester,Sunderland,Home,3.0,17.0,0.0,4.0,10.0,5.0,0.0,6.0,13.0,0.0,2.0,19.0,8.0,3.0,2.0,4.0
732,Tottenham,2015-08-08,E0,0.0,1.0,H,Man United,football-data-uk,1516,2.0,12.0,0.0,9.0,4.0,3.0,1.0,12.0,0.0,9.0,1.0,0.0,1.0,H,2.0,J Moss,Man United,Tottenham,Home,2.0,12.0,0.0,3.0,9.0,4.0,0.0,1.0,12.0,0.0,2.0,9.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62298,Fulham,2008-05-11,E0,1.0,0.0,A,Portsmouth,football-data-uk,0708,6.0,11.0,0.0,7.0,4.0,0.0,5.0,13.0,0.0,15.0,10.0,0.0,0.0,D,0.0,M Clattenburg,Portsmouth,Fulham,Home,6.0,11.0,0.0,0.0,7.0,4.0,0.0,5.0,13.0,0.0,0.0,15.0,10.0,0.0,1.0,0.0
62299,Arsenal,2008-05-11,E0,1.0,0.0,A,Sunderland,football-data-uk,0708,4.0,12.0,0.0,12.0,8.0,1.0,3.0,14.0,0.0,13.0,5.0,1.0,0.0,A,1.0,K Stroud,Sunderland,Arsenal,Home,4.0,12.0,0.0,1.0,12.0,8.0,1.0,3.0,14.0,0.0,1.0,13.0,5.0,0.0,1.0,0.0
62300,Liverpool,2008-05-11,E0,2.0,0.0,A,Tottenham,football-data-uk,0708,4.0,9.0,0.0,18.0,12.0,1.0,8.0,11.0,0.0,6.0,3.0,0.0,0.0,D,1.0,U Rennie,Tottenham,Liverpool,Home,4.0,9.0,0.0,1.0,18.0,12.0,0.0,8.0,11.0,0.0,1.0,6.0,3.0,0.0,2.0,0.0
62301,Aston Villa,2008-05-11,E0,2.0,2.0,D,West Ham,football-data-uk,0708,8.0,21.0,0.0,13.0,8.0,2.0,7.0,9.0,0.0,16.0,8.0,1.0,1.0,D,2.0,M Dean,West Ham,Aston Villa,Home,8.0,21.0,0.0,2.0,13.0,8.0,1.0,7.0,9.0,0.0,2.0,16.0,8.0,1.0,2.0,2.0


In [65]:
# Summarise season stats
df = clean_df
df = df.loc[df['Div']=='E0',:]
## Home stats
home_df = df
home_df.loc[:,'PrimaryTeam'] = home_df['HomeTeam']
home_df.loc[:,'OpposingTeam'] = home_df['AwayTeam']
home_df.loc[:,'HomeAway'] = 'Home'
### Primary team stats
home_df.loc[:,'PC'] = home_df['HC']
home_df.loc[:,'PF'] = home_df['HF']
home_df.loc[:,'PR'] = home_df['HR']
home_df.loc[:,'PY'] = home_df['HY']
home_df.loc[:,'PS'] = home_df['HS']
home_df.loc[:,'PST'] = home_df['HST']
home_df.loc[:,'HTPG'] = home_df['HTHG']
home_df.loc[:,'FTPG'] = home_df['FTHG']
### Opposing team stats
home_df.loc[:,'OC'] = home_df['AC']
home_df.loc[:,'OF'] = home_df['AF']
home_df.loc[:,'OR'] = home_df['AR']
home_df.loc[:,'OY'] = home_df['AY']
home_df.loc[:,'OS'] = home_df['AS']
home_df.loc[:,'OST'] = home_df['AST']
home_df.loc[:,'HTOG'] = home_df['HTAG']
home_df.loc[:,'FTOG'] = home_df['FTAG']
## Away stats
away_df = df
away_df.loc[:,'PrimaryTeam'] = away_df['AwayTeam']
away_df.loc[:,'OpposingTeam'] = away_df['HomeTeam']
away_df.loc[:,'HomeAway'] = 'Away'
### Primary team stats
away_df.loc[:,'PC'] = away_df['AC']
away_df.loc[:,'PF'] = away_df['AF']
away_df.loc[:,'PR'] = away_df['AR']
away_df.loc[:,'PY'] = away_df['AY']
away_df.loc[:,'PS'] = away_df['AS']
away_df.loc[:,'PST'] = away_df['AST']
away_df.loc[:,'HTPG'] = away_df['HTAG']
away_df.loc[:,'FTPG'] = away_df['FTAG']
### Opposing team stats
away_df.loc[:,'OC'] = away_df['HC']
away_df.loc[:,'OF'] = away_df['HF']
away_df.loc[:,'OR'] = away_df['HR']
away_df.loc[:,'OY'] = away_df['HY']
away_df.loc[:,'OS'] = away_df['HS']
away_df.loc[:,'OST'] = away_df['HST']
away_df.loc[:,'HTOG'] = away_df['HTHG']
away_df.loc[:,'FTOG'] = away_df['FTHG']
# Collate team stats
team_stats_df = pd.concat([home_df,away_df])
team_stats_df = team_stats_df.filter(['PrimaryTeam','OpposingTeam','HomeAway','Date','Season','Div','PC','PF','PR','PY','PS','PST','HTPG','FTPG','OC','OF','OR','OY','OS','OST','HTOG','FTOG'])
team_stats_df

Unnamed: 0,PrimaryTeam,OpposingTeam,HomeAway,Date,Season,Div,PC,PF,PR,PY,PS,PST,HTPG,FTPG,OC,OF,OR,OY,OS,OST,HTOG,FTOG
728,Aston Villa,Bournemouth,Away,2015-08-08,1516,E0,3.0,13.0,0.0,4.0,7.0,3.0,0.0,1.0,6.0,13.0,0.0,3.0,11.0,2.0,0.0,0.0
729,Swansea,Chelsea,Away,2015-08-08,1516,E0,8.0,16.0,0.0,3.0,18.0,10.0,1.0,2.0,4.0,15.0,1.0,1.0,11.0,3.0,2.0,2.0
730,Watford,Everton,Away,2015-08-08,1516,E0,2.0,13.0,0.0,2.0,11.0,5.0,1.0,2.0,8.0,7.0,0.0,1.0,10.0,5.0,0.0,2.0
731,Sunderland,Leicester,Away,2015-08-08,1516,E0,3.0,17.0,0.0,4.0,10.0,5.0,0.0,2.0,6.0,13.0,0.0,2.0,19.0,8.0,3.0,4.0
732,Tottenham,Man United,Away,2015-08-08,1516,E0,2.0,12.0,0.0,3.0,9.0,4.0,0.0,0.0,1.0,12.0,0.0,2.0,9.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62298,Fulham,Portsmouth,Away,2008-05-11,0708,E0,6.0,11.0,0.0,0.0,7.0,4.0,0.0,1.0,5.0,13.0,0.0,0.0,15.0,10.0,0.0,0.0
62299,Arsenal,Sunderland,Away,2008-05-11,0708,E0,4.0,12.0,0.0,1.0,12.0,8.0,1.0,1.0,3.0,14.0,0.0,1.0,13.0,5.0,0.0,0.0
62300,Liverpool,Tottenham,Away,2008-05-11,0708,E0,4.0,9.0,0.0,1.0,18.0,12.0,0.0,2.0,8.0,11.0,0.0,1.0,6.0,3.0,0.0,0.0
62301,Aston Villa,West Ham,Away,2008-05-11,0708,E0,8.0,21.0,0.0,2.0,13.0,8.0,1.0,2.0,7.0,9.0,0.0,2.0,16.0,8.0,1.0,2.0


In [54]:
# Sort seasons
df = clean_df
sorted_seasons = np.sort(df.Season.unique())
sorted_seasons

array(['0001', '0102', '0203', '0304', '0405', '0506', '0607', '0708',
       '0809', '0910', '1011', '1112', '1213', '1314', '1415', '1516',
       '1617', '1718', '1819'], dtype=object)

In [None]:
# Get season stats
team_stats

In [None]:
# Cluster last season stats

In [None]:
# Analyse cluster movements between seasons

In [13]:
analysis_df = clean_df
features.prev_n_matches(analysis_df,'Fulham',n=12,date=datetime.datetime.today(),homeaway='both')

Unnamed: 0,AwayTeam,Date,Div,FTAG,FTHG,FTR,HomeTeam,Source,Season,AC,AF,AR,AS,AST,AY,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,Referee
22977,Newcastle,2019-05-12,E0,4.0,0.0,A,Fulham,football-data-uk,1819,5.0,8.0,0.0,13.0,6.0,0.0,5.0,6.0,0.0,16.0,2.0,2.0,0.0,A,1.0,K Friend
22969,Fulham,2019-05-04,E0,0.0,1.0,H,Wolves,football-data-uk,1819,1.0,15.0,0.0,6.0,2.0,3.0,7.0,10.0,0.0,19.0,6.0,0.0,0.0,D,1.0,J Moss
22957,Cardiff,2019-04-27,E0,0.0,1.0,H,Fulham,football-data-uk,1819,3.0,8.0,0.0,13.0,8.0,0.0,10.0,10.0,0.0,8.0,2.0,0.0,0.0,D,0.0,C Kavanagh
22940,Fulham,2019-04-20,E0,1.0,0.0,A,Bournemouth,football-data-uk,1819,7.0,18.0,0.0,17.0,5.0,3.0,3.0,11.0,0.0,15.0,5.0,0.0,0.0,D,1.0,D Coote
22932,Everton,2019-04-13,E0,0.0,2.0,H,Fulham,football-data-uk,1819,3.0,10.0,0.0,8.0,1.0,1.0,7.0,8.0,0.0,12.0,5.0,0.0,0.0,D,2.0,L Probert
22918,Fulham,2019-04-02,E0,1.0,4.0,H,Watford,football-data-uk,1819,4.0,5.0,0.0,17.0,7.0,2.0,6.0,12.0,0.0,15.0,7.0,1.0,1.0,D,3.0,R East
22911,Man City,2019-03-30,E0,2.0,0.0,A,Fulham,football-data-uk,1819,11.0,12.0,0.0,24.0,7.0,0.0,0.0,4.0,0.0,5.0,0.0,2.0,0.0,A,2.0,K Friend
22907,Liverpool,2019-03-17,E0,2.0,1.0,A,Fulham,football-data-uk,1819,10.0,7.0,0.0,16.0,6.0,1.0,1.0,11.0,0.0,7.0,2.0,1.0,0.0,A,2.0,C Pawson
22896,Fulham,2019-03-09,E0,1.0,3.0,H,Leicester,football-data-uk,1819,5.0,13.0,0.0,6.0,3.0,2.0,6.0,9.0,0.0,18.0,8.0,0.0,1.0,H,0.0,D Coote
22891,Chelsea,2019-03-03,E0,2.0,1.0,A,Fulham,football-data-uk,1819,4.0,10.0,0.0,20.0,7.0,1.0,5.0,11.0,0.0,12.0,5.0,2.0,1.0,A,2.0,G Scott


In [17]:
# Can we cluster succesful teams from non-succesful teams?
## Win percentage by team + season
analysis_df = clean_df
analysis_df[Division="E0"]

SyntaxError: invalid syntax (<ipython-input-17-38a3c1fb8cd8>, line 4)