In [2]:
!pip install scipy



In [1]:
import pandas as pd
import pickle
from scipy.stats import poisson
import re

In [2]:
df_table = pd.read_csv('clean_epl_table.csv')
df_results = pd.read_csv('clean_epl_results.csv')
df_fixture = pd.read_csv('epl_fixture.csv')

In [7]:
df_results

Unnamed: 0,Home,Away,HomeGoals,AwayGoals,TotalGoals
0,Newcastle United,Nottingham Forest,1,3,4
1,Bournemouth,Fulham,3,0,3
2,Sheffield United,Luton Town,2,3,5
3,Burnley,Liverpool,0,2,2
4,Manchester United,Aston Villa,3,2,5
...,...,...,...,...,...
178,Brighton and Hove Albion,Luton Town,4,1,5
179,Everton,Fulham,0,1,1
180,Sheffield United,Crystal Palace,0,1,1
181,Newcastle United,Aston Villa,5,1,6


In [8]:
#split home and away
df_home = df_results[['Home', 'HomeGoals','AwayGoals']]
df_away = df_results[['Away', 'HomeGoals','AwayGoals']]

df_home = df_home.rename(columns={'Home':'Team','HomeGoals':'GoalsScored','AwayGoals':'GoalsConceded'})
df_away = df_away.rename(columns={'Home':'Team','HomeGoals':'GoalsConceded','AwayGoals':'GoalsScored'})

In [9]:
df_stats = pd.concat([df_home,df_away], ignore_index=True).groupby('Team').mean()
df_stats

Unnamed: 0_level_0,GoalsScored,GoalsConceded,Away
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Arsenal,2.444444,0.888889,
Aston Villa,2.888889,0.666667,
Bournemouth,1.222222,1.333333,
Brentford,1.777778,1.555556,
Brighton and Hove Albion,2.0,1.333333,
Burnley,1.0,2.4,
Chelsea,1.666667,1.444444,
Crystal Palace,0.888889,1.444444,
Everton,1.111111,1.0,
Fulham,1.888889,1.222222,


In [10]:
def point_prediction(home, away):
    if home in df_stats.index and away in df_stats.index:
        # 2 different lambda for home team and away team
        lamb_home = df_stats.at[home, 'GoalsScored'] * df_stats.at[away, 'GoalsConceded']
        lamb_away = df_stats.at[away, 'GoalsScored'] * df_stats.at[home, 'GoalsConceded']
        prob_home, prob_away, prob_draw = 0,0,0
        for x in range(0,11):
            for y in range(0,11):
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                # p = 0 or 1
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p
        # win = 3 points, draw = 1 point, lose = 0 point
        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)
    else:
        return(0,0)

In [11]:
point_prediction('Manchester United', 'Bournemouth')

(1.2591470824516895, 1.5256470201350134)

In [12]:
df_table['Club'] = df_table['Club'].apply(lambda x: re.sub(r'\s+\b\w{3}\b$', '', x))
df_table


Unnamed: 0,Position,Club,Played,Won,Draw,Lost,GoalsFor,GoalsAgainst,GoalDiff,Points
0,1,Liverpool,19,12,6,1,39,16,23,42
1,2,Arsenal,18,12,4,2,36,16,20,40
2,3,Aston Villa,19,12,3,4,40,25,15,39
3,4,Tottenham Hotspur,18,11,3,4,37,24,13,36
4,5,Manchester City,17,10,4,3,40,20,20,34
5,6,Manchester United,19,10,1,8,21,25,-4,31
6,7,West Ham United,18,9,3,6,31,30,1,30
7,8,Newcastle United,19,9,2,8,37,25,12,29
8,9,Brighton and Hove Albion,18,7,6,5,34,31,3,27
9,10,Bournemouth,18,7,4,7,27,32,-5,25


In [13]:
final_table = df_table.copy()
teams = final_table['Club'].values
fixtures = df_fixture[df_fixture['Home'].isin(teams)]
for index, row in fixtures.iterrows():
    home, away = row['Home'], row['Away']
    points_home, points_away = point_prediction(home,away)
    final_table.loc[final_table['Club'] == home, 'Points'] += points_home
    final_table.loc[final_table['Club'] == away, 'Points'] += points_away

final_table = final_table.sort_values('Points', ascending=False).reset_index()
final_table = final_table[['Club', 'Points']]
final_table = final_table.round(0)

  final_table.loc[final_table['Club'] == home, 'Points'] += points_home


In [14]:
final_table

Unnamed: 0,Club,Points
0,Liverpool,87.0
1,Aston Villa,86.0
2,Arsenal,81.0
3,Manchester City,74.0
4,Newcastle United,72.0
5,Tottenham Hotspur,65.0
6,West Ham United,63.0
7,Brighton and Hove Albion,57.0
8,Fulham,52.0
9,Manchester United,49.0


In [15]:
final_fixture = df_fixture.copy()
final_fixture['Winner'] = '?'

In [16]:
def get_winner(fixture):
    for index, row in fixture.iterrows():
        home, away = row['Home'], row['Away']
        points_home, points_away = point_prediction(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        fixture.loc[index, 'Winner'] = winner
    return fixture

In [17]:
get_winner(final_fixture)

Unnamed: 0,date,Home,Away,Winner
0,Saturday 30th December,Luton Town,Chelsea,Chelsea
1,Saturday 30th December,Aston Villa,Burnley,Aston Villa
2,Saturday 30th December,Crystal Palace,Brentford,Brentford
3,Saturday 30th December,Manchester City,Sheffield United,Manchester City
4,Saturday 30th December,Wolverhampton Wanderers,Everton,Everton
...,...,...,...,...
186,Saturday 30th December,Crystal Palace,Aston Villa,Aston Villa
187,Saturday 30th December,Liverpool,Wolverhampton Wanderers,Liverpool
188,Saturday 30th December,Luton Town,Fulham,Fulham
189,Saturday 30th December,Manchester City,West Ham United,Manchester City
