# EPL Analysis - Team Segmentation

## Abstract
The goal is to analyse season statistics of English Premier League (EPL) teams in order to identify possible clusters or segments, which can later be used for predictive modelling.

## 1. Set up the environment

In [6]:
import sys
# Put the parent of the current working directory on the module search path
# so that modules located one level up can be imported from this notebook.
sys.path.append("..")

In [36]:
import pandas as pd
import FootballDataAnalysis as fda

In [32]:
import importlib
# Re-execute the already-imported module so that edits made to its source
# since the first import are picked up without restarting the session.
importlib.reload(fda)

<module 'FootballDataAnalysis' from 'C:\\Users\\jonat\\Workspace\\FootballPrediction\\FootballDataAnalysis.py'>

## 2. Load data

In [57]:
# Load match data through the project helper module.
# The displayed output below shows one row per match (home side, away side,
# scores and further match statistics).
match_data = fda.LoadMatchData()
match_data  # Last expression of the cell, so the frame is rendered as output

Unnamed: 0,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,HC,AC,HF,AF,HO,AO,HY,AY,HR,AR
0,E0,0910,15/08/09,Aston Villa,Wigan,0,2,A,0,1,...,4,6,15,14,,,2,2,0,0
1,E0,0910,15/08/09,Blackburn,Man City,0,2,A,0,1,...,5,4,12,9,,,2,1,0,0
2,E0,0910,15/08/09,Bolton,Sunderland,0,1,A,0,1,...,4,7,16,10,,,2,1,0,0
3,E0,0910,15/08/09,Chelsea,Hull,2,1,H,1,1,...,12,4,13,15,,,1,2,0,0
4,E0,0910,15/08/09,Everton,Arsenal,1,6,A,0,3,...,4,9,11,13,,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,E0,1819,12/05/2019,Liverpool,Wolves,2,0,H,1,0,...,4,1,3,11,,,0,2,0,0
3796,E0,1819,12/05/2019,Man United,Cardiff,0,2,A,0,1,...,11,2,9,6,,,3,3,0,0
3797,E0,1819,12/05/2019,Southampton,Huddersfield,1,1,D,1,0,...,4,3,8,6,,,0,1,0,0
3798,E0,1819,12/05/2019,Tottenham,Everton,2,2,D,1,0,...,7,4,10,13,,,0,2,0,0


## 3. Prepare Data

In [59]:
# Tag every match with its row label so that the long-format rows produced
# below can be joined back onto the complete match record.
match_data['MatchIdx'] = match_data.index
# Unpivot HomeTeam/AwayTeam into (HomeAway, Team) pairs -- two rows per match --
# then re-attach all match columns through the shared MatchIdx key.
(
    match_data
    .melt(id_vars=['MatchIdx'],
          value_vars=['HomeTeam', 'AwayTeam'],
          var_name='HomeAway',
          value_name='Team')
    .merge(match_data, how='inner', on='MatchIdx')
)

Unnamed: 0,MatchIdx,HomeAway,Team,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,...,AC,HF,AF,HO,AO,HY,AY,HR,AR,match_idx
0,0,HomeTeam,Aston Villa,E0,0910,15/08/09,Aston Villa,Wigan,0,2,...,6,15,14,,,2,2,0,0,0
1,0,AwayTeam,Wigan,E0,0910,15/08/09,Aston Villa,Wigan,0,2,...,6,15,14,,,2,2,0,0,0
2,1,HomeTeam,Blackburn,E0,0910,15/08/09,Blackburn,Man City,0,2,...,4,12,9,,,2,1,0,0,1
3,1,AwayTeam,Man City,E0,0910,15/08/09,Blackburn,Man City,0,2,...,4,12,9,,,2,1,0,0,1
4,2,HomeTeam,Bolton,E0,0910,15/08/09,Bolton,Sunderland,0,1,...,7,16,10,,,2,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7595,3797,AwayTeam,Huddersfield,E0,1819,12/05/2019,Southampton,Huddersfield,1,1,...,3,8,6,,,0,1,0,0,3797
7596,3798,HomeTeam,Tottenham,E0,1819,12/05/2019,Tottenham,Everton,2,2,...,4,10,13,,,0,2,0,0,3798
7597,3798,AwayTeam,Everton,E0,1819,12/05/2019,Tottenham,Everton,2,2,...,4,10,13,,,0,2,0,0,3798
7598,3799,HomeTeam,Watford,E0,1819,12/05/2019,Watford,West Ham,1,4,...,2,10,10,,,1,0,1,0,3799


In [82]:
def Points(result):
    """Return the league points earned for a single match result.

    Args:
        result: Result label for one team in one match.

    Returns:
        3 when ``result`` equals ``'W'``, 1 when it equals ``'D'``, and 0 for
        every other value.
    """
    if result == 'W':
        return 3
    if result == 'D':
        return 1
    # Anything that is neither a win nor a draw earns no points.
    return 0

def Goals(ForAgainst, HomeAway, FTHG, FTAG):
    """Pick the goals scored by, or conceded by, one side of a match.

    Args:
        ForAgainst: ``'For'`` selects the goals the side scored; any other
            value selects the goals it conceded.
        HomeAway: ``'Home'`` means the side played at home; any other value
            is treated as the away side.
        FTHG: Value returned when the home side's tally is selected.
        FTAG: Value returned when the away side's tally is selected.

    Returns:
        ``FTHG`` or ``FTAG``, passed through unchanged.
    """
    is_home_side = bool(HomeAway == 'Home')
    wants_scored = bool(ForAgainst == 'For')
    # Home side + goals for, or away side + goals against, both refer to the
    # home tally; the two mixed combinations refer to the away tally.
    return FTHG if is_home_side == wants_scored else FTAG

def TeamMatchStatistics(match_data):
    """Match statistics by team.

    Every match row is expanded into two rows, one for the home side and one
    for the away side, and per-team figures are derived for each of them.

    Args:
        match_data: DataFrame of matches. The columns read here are
            ``HomeTeam``, ``AwayTeam``, ``FTR``, ``FTHG``, ``FTAG``,
            ``Season`` and ``Div``. The frame is not modified.

    Returns:
        DataFrame with the columns ``Season``, ``Div``, ``Team``, ``Result``,
        ``Points``, ``GS`` (goals scored), ``GC`` (goals conceded) and
        ``GD`` (``GS`` minus ``GC``, as integers).
    """
    # Add the join key on a copy so the caller's frame is left untouched.
    indexed = match_data.assign(MatchIdx=match_data.index)

    # Unpivot HomeTeam/AwayTeam into (HomeAway, Team) pairs, then re-attach
    # the full match record to each of the two rows through MatchIdx.
    stats = pd.melt(indexed,
                    id_vars=['MatchIdx'],
                    value_vars=['HomeTeam', 'AwayTeam'],
                    value_name='Team',
                    var_name='HomeAway')\
        .merge(indexed, on='MatchIdx', how='inner')

    # 'HomeTeam' / 'AwayTeam' -> 'Home' / 'Away' (literal, not regex, replace).
    stats['HomeAway'] = stats['HomeAway'].str.replace('Team', '', regex=False)
    sides = stats['HomeAway']

    # Result label from the team's point of view, as computed by the project
    # helper from the side played and the full-time result column.
    stats['Result'] = [fda._WinLossDraw(side, ftr)
                       for side, ftr in zip(sides, stats['FTR'])]
    stats['Points'] = stats['Result'].map(Points)

    # Iterate the needed columns in lockstep instead of building a Series
    # per row with DataFrame.apply(axis='columns').
    stats['GS'] = [Goals('For', side, home_goals, away_goals)
                   for side, home_goals, away_goals
                   in zip(sides, stats['FTHG'], stats['FTAG'])]
    stats['GC'] = [Goals('Against', side, home_goals, away_goals)
                   for side, home_goals, away_goals
                   in zip(sides, stats['FTHG'], stats['FTAG'])]
    stats['GD'] = stats['GS'].astype('int64') - stats['GC'].astype('int64')

    return stats.loc[:, ['Season', 'Div', 'Team', 'Result', 'Points', 'GS', 'GC', 'GD']]

# Preview the per-team match statistics; as the last expression of the cell
# the returned frame is rendered as output.
TeamMatchStatistics(match_data)

Unnamed: 0,Season,Div,Team,Result,Points,GS,GC,GD
0,0910,E0,Aston Villa,L,0,0,2,-2
1,0910,E0,Wigan,W,3,2,0,2
2,0910,E0,Blackburn,L,0,0,2,-2
3,0910,E0,Man City,W,3,2,0,2
4,0910,E0,Bolton,L,0,0,1,-1
...,...,...,...,...,...,...,...,...
7595,1819,E0,Huddersfield,D,1,1,1,0
7596,1819,E0,Tottenham,D,1,2,2,0
7597,1819,E0,Everton,D,1,2,2,0
7598,1819,E0,Watford,L,0,1,4,-3
