In [122]:
import psycopg2
import matplotlib
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [123]:
conn = psycopg2.connect("dbname=football")
cur = conn.cursor()

sql = """
SELECT
match.id,
country_id,
league_id,
season,
date,
home_team_api_id,
a.team_long_name AS home_team_long_name,
away_team_api_id,
b.team_long_name AS away_team_long_name,
home_team_goal,
away_team_goal,
goal
FROM
MATCH
INNER JOIN
team a
ON match.home_team_api_id = a.team_api_id
INNER JOIN
team b
ON match.away_team_api_id = b.team_api_id
WHERE goal IS NOT NULL;
"""

cur.execute(sql)

results = cur.fetchall()
colnames = [desc[0] for desc in cur.description]
conn.close()

In [124]:
results[0]

(7753,
 6,
 8,
 '2013/2014',
 '2014-03-29 00:00:00',
 8472,
 'Sunderland',
 8654,
 'West Ham',
 1,
 2,
 '<goal><value><event_incident_typefk>406</event_incident_typefk><elapsed>10</elapsed><comment>n</comment><sortorder>1</sortorder><type>goal</type><subtype>header</subtype><goal_type>n</goal_type><team>8654</team><player2>37169</player2><player1>47382</player1><stats><goals>1</goals><shoton>1</shoton></stats><id>3357995</id><n>202</n></value><value><event_incident_typefk>414</event_incident_typefk><elapsed>50</elapsed><comment>n</comment><sortorder>3</sortorder><type>goal</type><subtype>deflected</subtype><goal_type>n</goal_type><team>8654</team><player1>40015</player1><player2>47382</player2><stats><goals>1</goals><shoton>1</shoton></stats><id>3358213</id><n>235</n></value><value><event_incident_typefk>393</event_incident_typefk><elapsed>65</elapsed><comment>n</comment><sortorder>1</sortorder><type>goal</type><subtype>shot</subtype><goal_type>n</goal_type><team>8472</team><player1>24

In [125]:
len(results)

6711

In [126]:
colnames

['id',
 'country_id',
 'league_id',
 'season',
 'date',
 'home_team_api_id',
 'home_team_long_name',
 'away_team_api_id',
 'away_team_long_name',
 'home_team_goal',
 'away_team_goal',
 'goal']

In [127]:
df = pd.DataFrame(results, columns=colnames)
df.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...
1,7766,6,8,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...
2,7765,6,8,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...
3,7754,6,8,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...
4,8472,6,8,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...


In [128]:
df.head(1).goal.values[0]

'<goal><value><event_incident_typefk>406</event_incident_typefk><elapsed>10</elapsed><comment>n</comment><sortorder>1</sortorder><type>goal</type><subtype>header</subtype><goal_type>n</goal_type><team>8654</team><player2>37169</player2><player1>47382</player1><stats><goals>1</goals><shoton>1</shoton></stats><id>3357995</id><n>202</n></value><value><event_incident_typefk>414</event_incident_typefk><elapsed>50</elapsed><comment>n</comment><sortorder>3</sortorder><type>goal</type><subtype>deflected</subtype><goal_type>n</goal_type><team>8654</team><player1>40015</player1><player2>47382</player2><stats><goals>1</goals><shoton>1</shoton></stats><id>3358213</id><n>235</n></value><value><event_incident_typefk>393</event_incident_typefk><elapsed>65</elapsed><comment>n</comment><sortorder>1</sortorder><type>goal</type><subtype>shot</subtype><goal_type>n</goal_type><team>8472</team><player1>24159</player1><player2>25075</player2><stats><goals>1</goals><shoton>1</shoton></stats><id>3358283</id><n

In [129]:
df.head(1)[['home_team_goal', 'away_team_goal']]

Unnamed: 0,home_team_goal,away_team_goal
0,1,2


In [130]:
import xml.etree.ElementTree as ET

test = ET.fromstring(df.head(1).goal.values[0])
test

<Element 'goal' at 0x117a98c78>

In [131]:
for neigh in test.findall('value'):
    print(neigh.find('team').text)
    print(neigh.find('elapsed').text)

8654
10
8654
50
8472
65


## Let's pick 60 minutes as the time we care about

In [132]:
def score_at_minutes(goal_xml, minutes, home_team_id, away_team_id):
    parsed = ET.fromstring(goal_xml)
    home_score = 0
    away_score = 0
    for neigh in parsed.findall('value'):
        time = int(neigh.find('elapsed').text)
        if time and time < minutes:
            try:
                goal_type = neigh.find('goal_type').text
                if goal_type and goal_type in ('n', 'o', 'p'):
                    if int(neigh.find('team').text) == home_team_id:
                        home_score += 1
                    elif int(neigh.find('team').text) == away_team_id:
                        away_score += 1
                    else:
                        print("Balls")
            except:
                continue
    return home_score, away_score

score_at_minutes(df.head(1).goal.values[0], 60, 9788, 9905)

Balls
Balls


(0, 0)

In [133]:
def final_score(row):
    return row.home_team_goal, row.away_team_goal

final_score(df.head(1))

(0    1
 Name: home_team_goal, dtype: int64, 0    2
 Name: away_team_goal, dtype: int64)

In [134]:
dicty = {}
for index, row in df.iterrows():
    score_at_60 = score_at_minutes(
        row.goal,
        78,
        row.home_team_api_id,
        row.away_team_api_id
    )
    score_at_end = final_score(row)
    try:
        dicty[str(score_at_60)].append(str(score_at_end))
    except:
        dicty[str(score_at_60)] = [str(score_at_end)]

In [135]:
len(dicty)

44

In [136]:
from collections import Counter

dicty.keys()

dict_keys(['(1, 2)', '(1, 0)', '(2, 1)', '(0, 4)', '(1, 1)', '(2, 2)', '(3, 1)', '(3, 0)', '(2, 0)', '(0, 1)', '(0, 0)', '(3, 2)', '(4, 1)', '(0, 2)', '(0, 3)', '(1, 3)', '(5, 2)', '(4, 0)', '(3, 3)', '(4, 2)', '(2, 3)', '(1, 5)', '(0, 6)', '(1, 4)', '(2, 4)', '(2, 5)', '(6, 3)', '(5, 1)', '(5, 0)', '(4, 4)', '(4, 3)', '(6, 0)', '(5, 3)', '(3, 4)', '(1, 7)', '(7, 1)', '(0, 5)', '(7, 0)', '(2, 7)', '(6, 1)', '(5, 4)', '(1, 6)', '(2, 6)', '(3, 5)'])

In [137]:
new_dicty = dict((key, Counter(val)) for key, val in dicty.items())

In [138]:
new_dicty

{'(1, 2)': Counter({'(1, 2)': 196,
          '(1, 3)': 66,
          '(2, 2)': 68,
          '(1, 4)': 8,
          '(3, 3)': 2,
          '(3, 2)': 14,
          '(2, 1)': 12,
          '(2, 3)': 10,
          '(2, 4)': 1,
          '(0, 3)': 3,
          '(3, 1)': 5,
          '(4, 2)': 1,
          '(5, 1)': 1}),
 '(1, 0)': Counter({'(1, 0)': 516,
          '(0, 1)': 14,
          '(2, 0)': 170,
          '(1, 1)': 146,
          '(1, 2)': 24,
          '(2, 1)': 35,
          '(3, 0)': 24,
          '(4, 0)': 9,
          '(2, 2)': 4,
          '(2, 3)': 1,
          '(4, 1)': 1,
          '(0, 2)': 2,
          '(-1, -1)': 1,
          '(3, 1)': 2,
          '(1, 3)': 1}),
 '(2, 1)': Counter({'(3, 1)': 90,
          '(2, 1)': 276,
          '(5, 1)': 4,
          '(2, 2)': 73,
          '(4, 1)': 17,
          '(3, 2)': 24,
          '(4, 2)': 5,
          '(1, 2)': 22,
          '(5, 0)': 2,
          '(1, 3)': 2,
          '(2, 3)': 10,
          '(3, 0)': 11,
          '(3, 3)'

In [139]:
[(i, new_dicty['(1, 1)'][i] / len(new_dicty['(1, 1)']) * 100.0) for i in new_dicty['(1, 1)']]

[('(1, 2)', 869.2307692307692),
 ('(2, 1)', 1146.1538461538462),
 ('(1, 1)', 3623.0769230769233),
 ('(1, 3)', 100.0),
 ('(0, 3)', 38.46153846153847),
 ('(2, 2)', 146.15384615384613),
 ('(3, 2)', 46.15384615384615),
 ('(2, 0)', 146.15384615384613),
 ('(3, 1)', 207.6923076923077),
 ('(1, 4)', 7.6923076923076925),
 ('(0, 2)', 61.53846153846154),
 ('(3, 0)', 92.3076923076923),
 ('(4, 1)', 7.6923076923076925)]

In [140]:
sum(new_dicty['(1, 1)'].values())

844

In [141]:
for key in new_dicty['(1, 1)']:
    print(key)

(1, 2)
(2, 1)
(1, 1)
(1, 3)
(0, 3)
(2, 2)
(3, 2)
(2, 0)
(3, 1)
(1, 4)
(0, 2)
(3, 0)
(4, 1)


In [142]:
new_dicty['(1, 1)']['(1, 4)']

1

In [143]:
import operator
tah = new_dicty['(0, 2)']
sorted([(e, tah[e]/sum(tah.values())) for e in tah], key=lambda x: x[1], reverse=True)

[('(0, 2)', 0.4859550561797753),
 ('(1, 2)', 0.1853932584269663),
 ('(0, 3)', 0.15730337078651685),
 ('(1, 1)', 0.05056179775280899),
 ('(1, 3)', 0.033707865168539325),
 ('(2, 2)', 0.033707865168539325),
 ('(0, 4)', 0.019662921348314606),
 ('(2, 1)', 0.011235955056179775),
 ('(2, 3)', 0.0056179775280898875),
 ('(1, 4)', 0.0056179775280898875),
 ('(2, 0)', 0.0028089887640449437),
 ('(0, 5)', 0.0028089887640449437),
 ('(1, 5)', 0.0028089887640449437),
 ('(3, 3)', 0.0028089887640449437)]

In [144]:
small_df = df[df.season != '2018/2019']
small_dicty = {}
for index, row in small_df.iterrows():
    score_at_60 = score_at_minutes(
        row.goal,
        78,
        row.home_team_api_id,
        row.away_team_api_id
    )
    score_at_end = final_score(row)
    try:
        small_dicty[str(score_at_60)].append(str(score_at_end))
    except:
        small_dicty[str(score_at_60)] = [str(score_at_end)]
        
small_new_dicty = dict((key, Counter(val)) for key, val in small_dicty.items())

In [145]:
import operator
tah = small_new_dicty['(2, 0)']
sorted([(e, tah[e]/sum(tah.values())) for e in tah], key=lambda x: x[1], reverse=True)

[('(2, 0)', 0.55078125),
 ('(3, 0)', 0.177734375),
 ('(2, 1)', 0.134765625),
 ('(3, 1)', 0.037109375),
 ('(1, 1)', 0.03125),
 ('(4, 0)', 0.02734375),
 ('(2, 2)', 0.01953125),
 ('(3, 2)', 0.005859375),
 ('(4, 1)', 0.00390625),
 ('(5, 0)', 0.00390625),
 ('(3, 3)', 0.001953125),
 ('(2, 3)', 0.001953125),
 ('(1, 2)', 0.001953125),
 ('(0, 3)', 0.001953125)]

In [146]:
new_df = df[df.season == '2018/2019']
newer_dicty = {}
for index, row in new_df.iterrows():
    score_at_60 = score_at_minutes(
        row.goal,
        78,
        row.home_team_api_id,
        row.away_team_api_id
    )
    score_at_end = final_score(row)
    try:
        newer_dicty[str(score_at_60)].append(str(score_at_end))
    except:
        newer_dicty[str(score_at_60)] = [str(score_at_end)]
        
new_new_dicty = dict((key, Counter(val)) for key, val in newer_dicty.items())

In [147]:
new_new_dicty

{'(3, 1)': Counter({'(3, 1)': 9, '(4, 1)': 2}),
 '(0, 0)': Counter({'(0, 0)': 37,
          '(0, 1)': 8,
          '(1, 1)': 1,
          '(0, 3)': 1,
          '(2, 0)': 2,
          '(1, 0)': 6,
          '(-1, -1)': 3,
          '(2, 1)': 1,
          '(0, 2)': 1}),
 '(1, 2)': Counter({'(1, 2)': 12,
          '(2, 2)': 9,
          '(3, 1)': 3,
          '(1, 3)': 1,
          '(3, 2)': 2,
          '(2, 3)': 2}),
 '(1, 0)': Counter({'(1, 1)': 13,
          '(2, 0)': 10,
          '(1, 0)': 37,
          '(2, 1)': 4,
          '(-1, -1)': 1,
          '(2, 2)': 1,
          '(3, 0)': 2,
          '(1, 2)': 1}),
 '(0, 1)': Counter({'(1, 1)': 11,
          '(1, 2)': 2,
          '(0, 1)': 31,
          '(0, 2)': 8,
          '(0, 4)': 1,
          '(1, 0)': 1,
          '(2, 2)': 1,
          '(0, 3)': 1,
          '(2, 0)': 1}),
 '(2, 1)': Counter({'(3, 1)': 6,
          '(2, 1)': 12,
          '(2, 3)': 1,
          '(1, 2)': 4,
          '(4, 1)': 3,
          '(2, 2)': 6,
        

In [148]:
small_new_dicty['(0, 0)']

Counter({'(0, 0)': 460,
         '(2, 0)': 9,
         '(0, 1)': 105,
         '(1, 0)': 124,
         '(1, 1)': 24,
         '(0, 2)': 18,
         '(1, 2)': 4,
         '(2, 1)': 8,
         '(2, 2)': 2,
         '(3, 0)': 3})

In [149]:
new_new_dicty['(0, 0)']

Counter({'(0, 0)': 37,
         '(0, 1)': 8,
         '(1, 1)': 1,
         '(0, 3)': 1,
         '(2, 0)': 2,
         '(1, 0)': 6,
         '(-1, -1)': 3,
         '(2, 1)': 1,
         '(0, 2)': 1})

In [150]:
tah = small_new_dicty['(3, 0)']
sorted([(e, tah[e]/sum(tah.values())) for e in tah], key=lambda x: x[1], reverse=True)

[('(3, 0)', 0.5205992509363296),
 ('(4, 0)', 0.20599250936329588),
 ('(3, 1)', 0.1348314606741573),
 ('(4, 1)', 0.04119850187265917),
 ('(2, 1)', 0.0299625468164794),
 ('(5, 0)', 0.026217228464419477),
 ('(2, 2)', 0.00749063670411985),
 ('(5, 1)', 0.00749063670411985),
 ('(4, 2)', 0.00749063670411985),
 ('(3, 2)', 0.00749063670411985),
 ('(2, 3)', 0.003745318352059925),
 ('(6, 1)', 0.003745318352059925),
 ('(6, 0)', 0.003745318352059925)]

In [151]:
mah = new_new_dicty['(1, 0)']
sorted([(e, mah[e]/sum(mah.values())) for e in mah], key=lambda x: x[1], reverse=True)

[('(1, 0)', 0.5362318840579711),
 ('(1, 1)', 0.18840579710144928),
 ('(2, 0)', 0.14492753623188406),
 ('(2, 1)', 0.057971014492753624),
 ('(3, 0)', 0.028985507246376812),
 ('(-1, -1)', 0.014492753623188406),
 ('(2, 2)', 0.014492753623188406),
 ('(1, 2)', 0.014492753623188406)]

In [152]:
sum([mah[e] for e in mah if int(e.split(',')[0].lstrip('()')) + int(e.split(',')[1].rstrip(')').strip()) >= 3]) / sum(mah.values())

0.11594202898550725

In [153]:
sum([tah[e] for e in tah if int(e.split(',')[0].lstrip('()')) + int(e.split(',')[1].rstrip(')').strip()) >= 3]) / sum(tah.values())

1.0

In [154]:
df.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...
1,7766,6,8,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...
2,7765,6,8,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...
3,7754,6,8,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...
4,8472,6,8,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...


## ELO

In [155]:
HISTORY = 10
k_factor = 32
elo_width = 400.

def calculate_new_elos(rating_a, rating_b, score_a, k_factor, elo_width):
    """Calculates and returns the new Elo ratings for two players.
    score_a is 1 for a win by player A, 0 for a loss by player A, or 0.5 for a draw.
    """

    e_a = expected_result(rating_a, rating_b, elo_width)
    e_b = 1. - e_a
    new_rating_a = rating_a + k_factor * (score_a - e_a)
    score_b = 1. - score_a
    new_rating_b = rating_b + k_factor * (score_b - e_b)
    return new_rating_a, new_rating_b
    

def expected_result(elo_a, elo_b, elo_width):
    """
    https://en.wikipedia.org/wiki/Elo_rating_system#Mathematical_details
    """
    expect_a = 1.0/(1+10**((elo_b - elo_a)/elo_width))
    return expect_a


def update_end_of_season(elos):
    """Regression towards the mean
    
    Following 538 nfl methods
    https://fivethirtyeight.com/datalab/nfl-elo-ratings-are-back/
    """
    diff_from_mean = elos - np.mean(elos)
    elos -= diff_from_mean/3
    return elos

In [156]:
df.columns

Index(['id', 'country_id', 'league_id', 'season', 'date', 'home_team_api_id',
       'home_team_long_name', 'away_team_api_id', 'away_team_long_name',
       'home_team_goal', 'away_team_goal', 'goal'],
      dtype='object')

In [157]:
def build_season_teams(frame, season, previous_elos=None):
    season_teams = pd.DataFrame(pd.unique(frame[frame.season == season][['home_team_long_name', 'away_team_long_name']].values.ravel('K')), columns=['home_team_long_name'])
#    season_teams = frame[frame.season == season].home_team_long_name.drop_duplicates().reset_index()
    season_teams.loc[:, 'elo'] = 1000
    if previous_elos is not None:
        meany = np.mean(previous_elos.elo)
        previous_elos.loc[:, 'updated_elo'] = previous_elos.elo.apply(lambda x: x - (x - meany)/3)
        joiny = season_teams.merge(previous_elos, how='left', left_on='home_team_long_name', right_on='team_long_name')
        season_teams = joiny[['home_team_long_name', 'updated_elo']]
        season_teams.columns = ['home_team_long_name', 'elo']
        season_teams.fillna(1000., inplace=True)
    season_teams = season_teams[['home_team_long_name', 'elo']]
    season_teams.columns = ['team_long_name', 'elo']
    return season_teams


def calculate_result(row):
    if row.home_team_goal > row.away_team_goal:
        return 1
    elif row.home_team_goal == row.away_team_goal:
        return 0.5
    else:
        return 0

df.loc[:, 'home_elo'] = 1000.
df.loc[:, 'away_elo'] = 1000.
for key, frame in df.groupby(['country_id', 'league_id']):
    country, league = key
    sorted_frame = frame.sort_values(by='date')
    earliest_season = sorted_frame.head(1).season.values[0]
    season_frame = build_season_teams(sorted_frame, earliest_season)
    for index, row in sorted_frame.iterrows():
        new_season = row.season
        if new_season != earliest_season:
            earliest_season = new_season
            season_frame = build_season_teams(sorted_frame, new_season, season_frame)
            print("New season")
        
        home = row.home_team_long_name
        away = row.away_team_long_name
        home_elo = season_frame[season_frame.team_long_name == home].elo.values[0]
        away_elo = season_frame[season_frame.team_long_name == away].elo.values[0]
        df.loc[df.id == row.id, 'home_elo'] = home_elo
        df.loc[df.id == row.id, 'away_elo'] = away_elo
        expected_results = expected_result(home_elo, away_elo, elo_width)
        result = calculate_result(row)
        new_home_elo, new_away_elo = calculate_new_elos(home_elo, away_elo, result, k_factor, elo_width)
        season_frame.loc[season_frame.team_long_name == home, 'elo'] = new_home_elo
        season_frame.loc[season_frame.team_long_name == away, 'elo'] = new_away_elo
            
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season


In [158]:
df.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717
1,7766,6,8,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...,977.639499,970.314256
2,7765,6,8,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...,1076.975539,971.063257
3,7754,6,8,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...,961.061327,973.621643
4,8472,6,8,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...,1015.063435,985.278237


In [159]:
df.sort_values(by='home_elo').tail()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo
2613,9730,7,9,2017/2018,2018-04-29 00:00:00,9847,Paris Saint-Germain,9747,Guingamp,2,2,<goal><value><event_incident_typefk>39</event_...,1233.759736,952.676943
5597,16433,10,15,2015/2016,2016-04-16 00:00:00,9885,Juventus,8540,Palermo,4,0,<goal><value><event_incident_typefk>411</event...,1235.951515,883.940895
2333,13163,8,12,2014/2015,2015-05-23 00:00:00,8634,Barcelona,9783,Deportivo La Coruna,2,2,<goal><value><event_incident_typefk>406</event...,1239.647373,986.90949
1809,8567,6,8,2017/2018,2018-03-04 00:00:00,8456,Manchester City,8455,Chelsea,1,0,<goal><value><event_incident_typefk>393</event...,1245.105838,1091.68585
72,8632,6,8,2017/2018,2018-05-09 00:00:00,8456,Manchester City,10204,Brighton,3,1,<goal><value><event_incident_typefk>393</event...,1259.126906,962.006253


In [160]:
np.mean(df.home_team_goal)

1.5398599314558188

In [161]:
np.mean(df.away_team_goal)

1.1701683802711966

In [162]:
HOME_TEAM_BASE = np.mean(df.home_team_goal)
AWAY_TEAM_BASE = np.mean(df.away_team_goal)

k_factor = 32
score_k_factor = 0.05

def calculate_new_elos_score(rating_a, rating_b, score_a, home, k_factor):
    """Calculates and returns the new Elo ratings for two players.
    score_a is 1 for a win by player A, 0 for a loss by player A, or 0.5 for a draw.
    """
    if home:
        e_a = expected_result_score(rating_a, rating_b, HOME_TEAM_BASE)
    else:
        e_a = expected_result_score(rating_a, rating_b, AWAY_TEAM_BASE)
    new_rating_a = rating_a + (k_factor * (score_a - e_a))
    new_rating_b = rating_b + (k_factor * (score_a - e_a))
    return new_rating_a, new_rating_b
    

def expected_result_score(elo_a, elo_b, base):
    """
    https://en.wikipedia.org/wiki/Elo_rating_system#Mathematical_details
    """
    expect_a = (elo_a/base) * (elo_b/base) * base
    return expect_a


def update_end_of_season(elos):
    """Regression towards the mean
    
    Following 538 nfl methods
    https://fivethirtyeight.com/datalab/nfl-elo-ratings-are-back/
    """
    diff_from_mean = elos - np.mean(elos)
    elos -= diff_from_mean/3
    return elos

In [163]:
def build_season_frame(frame, season, base_score, previous_elos=None):
    season_teams = pd.DataFrame(pd.unique(frame[frame.season == season][['home_team_long_name', 'away_team_long_name']].values.ravel('K')), columns=['home_team_long_name'])
    season_teams.loc[:, 'elo'] = base_score
    if previous_elos is not None:
        meany = np.mean(previous_elos.elo)
        previous_elos.loc[:, 'updated_elo'] = previous_elos.elo.apply(lambda x: x - (x - meany)/3)
        joiny = season_teams.merge(previous_elos, how='left', left_on='home_team_long_name', right_on='team_long_name')
        season_teams = joiny[['home_team_long_name', 'updated_elo']]
        season_teams.columns = ['home_team_long_name', 'elo']
        season_teams.fillna(base_score, inplace=True)
    season_teams = season_teams[['home_team_long_name', 'elo']]
    season_teams.columns = ['team_long_name', 'elo']
    return season_teams

def get_elos(frame, names):
    return [frame[frame.team_long_name == entry].elo.values[0] for entry in names]

df.loc[:, 'home_elo'] = 1000.
df.loc[:, 'away_elo'] = 1000.
df.loc[:, 'home_attack_elo'] = HOME_TEAM_BASE
df.loc[:, 'home_defence_elo'] = AWAY_TEAM_BASE
df.loc[:, 'away_attack_elo'] = AWAY_TEAM_BASE
df.loc[:, 'away_defence_elo'] = HOME_TEAM_BASE

for key, frame in df.groupby(['country_id', 'league_id']):
    country, league = key
    sorted_frame = frame.sort_values(by='date')
    earliest_season = sorted_frame.head(1).season.values[0]
    season_frame = build_season_frame(sorted_frame, earliest_season, 1000.)
    home_attack_frame = build_season_frame(sorted_frame, earliest_season, HOME_TEAM_BASE)
    away_attack_frame = build_season_frame(sorted_frame, earliest_season, AWAY_TEAM_BASE)
    home_defence_frame = build_season_frame(sorted_frame, earliest_season, AWAY_TEAM_BASE)
    away_defence_frame = build_season_frame(sorted_frame, earliest_season, HOME_TEAM_BASE)    
    for index, row in sorted_frame.iterrows():
        new_season = row.season
        if new_season != earliest_season:
            earliest_season = new_season
            season_frame = build_season_frame(sorted_frame, new_season, 1000., season_frame)
            home_attack_frame = build_season_frame(sorted_frame, new_season, HOME_TEAM_BASE, home_attack_frame)
            away_attack_frame = build_season_frame(sorted_frame, new_season, AWAY_TEAM_BASE, away_attack_frame)
            home_defence_frame = build_season_frame(sorted_frame, new_season, AWAY_TEAM_BASE, home_defence_frame)
            away_defence_frame = build_season_frame(sorted_frame, new_season, HOME_TEAM_BASE, away_defence_frame)                
            print("New season")
        
        home = row.home_team_long_name
        away = row.away_team_long_name
        home_elo, away_elo = get_elos(season_frame, (home, away))
        home_attack_elo = get_elos(home_attack_frame, (home,))[0]
        home_defence_elo = get_elos(home_defence_frame, (home,))[0]
        away_attack_elo = get_elos(away_attack_frame, (away,))[0]
        away_defence_elo = get_elos(away_defence_frame, (away,))[0]
        df.loc[df.id == row.id, 'home_elo'] = home_elo
        df.loc[df.id == row.id, 'away_elo'] = away_elo
        df.loc[df.id == row.id, 'home_attack_elo'] = home_attack_elo
        df.loc[df.id == row.id, 'away_attack_elo'] = away_attack_elo
        df.loc[df.id == row.id, 'home_defence_elo'] = home_defence_elo
        df.loc[df.id == row.id, 'away_defence_elo'] = away_defence_elo        
        expected_results = expected_result(home_elo, away_elo, elo_width)
        result = calculate_result(row)
        new_home_elo, new_away_elo = calculate_new_elos(home_elo, away_elo, result, k_factor, elo_width)
        season_frame.loc[season_frame.team_long_name == home, 'elo'] = new_home_elo
        season_frame.loc[season_frame.team_long_name == away, 'elo'] = new_away_elo
        
        home_team_goals = row.home_team_goal
        away_team_goals = row.away_team_goal
        if home_team_goals >= 0 and away_team_goals >= 0:
            new_home_attack_elo, new_away_defence_elo = calculate_new_elos_score(home_attack_elo, away_defence_elo, home_team_goals, True, score_k_factor)
            new_away_attack_elo, new_home_defence_elo = calculate_new_elos_score(away_attack_elo, home_defence_elo, away_team_goals, False, score_k_factor)        
            home_attack_frame.loc[home_attack_frame.team_long_name == home, 'elo'] = new_home_attack_elo
            home_defence_frame.loc[home_defence_frame.team_long_name == home, 'elo'] = new_home_defence_elo
            away_attack_frame.loc[away_attack_frame.team_long_name == away, 'elo'] = new_away_attack_elo
            away_defence_frame.loc[away_defence_frame.team_long_name == away, 'elo'] = new_away_defence_elo
        


New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season
New season


In [164]:
home

'Brescia'

In [165]:
df.sort_values(by='away_defence_elo').head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo
1297,16800,10,15,2017/2018,2018-04-29 00:00:00,8535,Fiorentina,9875,SSC Napoli,3,0,<goal><value><event_incident_typefk>393</event...,1077.346396,1227.126806,1.680332,1.048039,1.784249,0.829564
5923,16806,10,15,2017/2018,2018-04-22 00:00:00,9885,Juventus,9875,SSC Napoli,0,1,<goal><value><event_incident_typefk>406</event...,1211.109857,1211.127624,1.793017,0.632106,1.78239,0.880848
5746,16543,10,15,2015/2016,2016-05-08 00:00:00,9876,Verona,9885,Juventus,2,1,<goal><value><event_incident_typefk>20</event_...,963.049527,1248.81981,1.522245,1.414516,1.633214,0.922603
5911,16794,10,15,2017/2018,2018-04-14 00:00:00,8564,AC Milan,9875,SSC Napoli,0,0,<goal />,1120.33557,1215.3994,1.485367,1.050322,1.866141,0.925484
5551,16419,10,15,2015/2016,2016-04-24 00:00:00,8535,Fiorentina,9885,Juventus,1,2,<goal><value><event_incident_typefk>411</event...,1080.463687,1239.678409,1.812434,0.97501,1.599866,0.927167


In [166]:
df.loc[:, 'expected_home_goals'] = df.apply(lambda x: expected_result_score(x.home_attack_elo, x.away_defence_elo, HOME_TEAM_BASE), axis=1)
df.loc[:, 'expected_away_goals'] = df.apply(lambda x: expected_result_score(x.away_attack_elo, x.home_defence_elo, AWAY_TEAM_BASE), axis=1)
df.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717,1.40645,1.131463,1.022914,1.28823,1.17662,0.989079
1,7766,6,8,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...,977.639499,970.314256,1.462535,1.224341,1.299059,1.593193,1.51319,1.3592
2,7765,6,8,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...,1076.975539,971.063257,1.593608,1.154404,0.85083,1.487305,1.539219,0.839368
3,7754,6,8,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...,961.061327,973.621643,1.48537,1.273296,0.899784,1.51014,1.456702,0.979082
4,8472,6,8,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...,1015.063435,985.278237,1.427784,0.989749,1.154784,1.508813,1.398997,0.976737


## Building Models

In [167]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

new_df = df[(df.home_team_goal >= 0) & (df.away_team_goal >= 0)]
X_train, X_test, y_train, y_test = train_test_split(new_df[['home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo']].values,
    new_df.home_team_goal, test_size=0.15, random_state=42)

lr = LinearRegression()

lr.fit(
    X_train,
    y_train
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [168]:
testy = lr.predict(X_test)

In [169]:
testy

array([1.63411563, 1.53707619, 1.54020824, ..., 2.05522614, 2.17199315,
       1.0859623 ])

In [170]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, testy)

1.508213408404409

In [171]:
lr.coef_

array([ 0.0025112 , -0.00263344,  0.78296954,  0.05083165, -0.13149658,
        0.32450907])

In [172]:
df.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717,1.40645,1.131463,1.022914,1.28823,1.17662,0.989079
1,7766,6,8,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...,977.639499,970.314256,1.462535,1.224341,1.299059,1.593193,1.51319,1.3592
2,7765,6,8,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...,1076.975539,971.063257,1.593608,1.154404,0.85083,1.487305,1.539219,0.839368
3,7754,6,8,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...,961.061327,973.621643,1.48537,1.273296,0.899784,1.51014,1.456702,0.979082
4,8472,6,8,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...,1015.063435,985.278237,1.427784,0.989749,1.154784,1.508813,1.398997,0.976737


In [173]:
from scipy.stats import poisson

x = np.arange(0, 10)
y = poisson.pmf(x, 1.406450)
y

array([2.45011532e-01, 3.44596469e-01, 2.42328852e-01, 1.13607805e-01,
       3.99459242e-02, 1.12363890e-02, 2.63390322e-03, 5.29207598e-04,
       9.30380034e-05, 1.45392555e-05])

In [174]:
new_df.expected_home_goals.apply(lambda y: poisson.pmf(x, y)[0]).head()

0    0.308319
1    0.220206
2    0.214549
3    0.233003
4    0.246844
Name: expected_home_goals, dtype: float64

In [175]:
new_df.home_team_goal.drop_duplicates()

0       1
2       3
4       2
5       0
8       4
46      5
228     6
457     7
1092    8
1782    9
Name: home_team_goal, dtype: int64

In [176]:
new_df.head(1)

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717,1.40645,1.131463,1.022914,1.28823,1.17662,0.989079


In [177]:
values = new_df.home_team_goal.values
probs = [poisson.pmf(x, e) for e in new_df.expected_home_goals.values]

In [178]:
from sklearn.metrics import log_loss
log_loss(values, probs)

1.547608313156479

In [179]:
away_values = new_df.away_team_goal.values
away_probs = [poisson.pmf(x[:-1], e) for e in new_df.expected_away_goals.values]
log_loss(away_values, away_probs)

1.3924873162935216

In [180]:
import xgboost

In [181]:
reg = xgboost.XGBRegressor(
    objective='count:poisson',
    max_depth=2,
    n_estimators=50,
    booster='gbtree',
    colsample_bytree=0.8,
    subsample=0.6
)

In [182]:
reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='count:poisson', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)

In [183]:
preds = reg.predict(X_test)

In [184]:
preds

array([1.5263484, 1.26412  , 1.5071039, ..., 1.9949919, 2.2121072,
       1.2111323], dtype=float32)

In [185]:
X_test[0]

array([1043.6940152 , 1042.72799128,    1.72769523,    1.13523469,
          1.36436616,    1.52448992])

In [186]:
handy = [poisson.pmf(x[:-1], e) for e in preds]

In [187]:
log_loss(y_test, handy)

1.5335642344437712

In [188]:
[(i, 100*e/sum(handy[0])) for i, e in enumerate(handy[0])]

[(0, 21.7334712672203),
 (1, 33.17284806135537),
 (2, 25.316660991967574),
 (3, 12.880681264725181),
 (4, 4.91510165665456),
 (5, 1.5004314631343434),
 (6, 0.381696848615436),
 (7, 0.08322890800182185),
 (8, 0.015879538325414517)]

## Getting serious

In [189]:
new_df.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
0,7753,6,8,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717,1.40645,1.131463,1.022914,1.28823,1.17662,0.989079
1,7766,6,8,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...,977.639499,970.314256,1.462535,1.224341,1.299059,1.593193,1.51319,1.3592
2,7765,6,8,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...,1076.975539,971.063257,1.593608,1.154404,0.85083,1.487305,1.539219,0.839368
3,7754,6,8,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...,961.061327,973.621643,1.48537,1.273296,0.899784,1.51014,1.456702,0.979082
4,8472,6,8,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...,1015.063435,985.278237,1.427784,0.989749,1.154784,1.508813,1.398997,0.976737


In [190]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest


pipeline = Pipeline(
    [
        ('feat_select', SelectKBest(k=5)),
        ('gb', xgboost.XGBRegressor(objective='count:poisson'))
    ]
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
handy = [poisson.pmf(x[:-1], e) for e in y_pred]
log_loss(y_test, handy)

1.5356407303942032

In [191]:
pipeline = Pipeline(
    [
        ('feat_select', SelectKBest()),
        ('gb', xgboost.XGBRegressor(objective='count:poisson'))
    ]
)

parameters = {}
parameters['feat_select__k'] = [3, 'all']
parameters['gb__max_depth'] = [2, 4, 6]
parameters['gb__n_estimators'] = [10, 50, 100]
parameters['gb__subsample'] = [0.6, 0.9]
parameters['gb__colsample_by_tree'] = [0.6, 0.8, 0.95]

CV = GridSearchCV(pipeline, parameters, cv=3)
CV.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('feat_select', SelectKBest(k=10, score_func=<function f_classif at 0x116516400>)), ('gb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_...       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'feat_select__k': [3, 'all'], 'gb__max_depth': [2, 4, 6], 'gb__n_estimators': [10, 50, 100], 'gb__subsample': [0.6, 0.9], 'gb__colsample_by_tree': [0.6, 0.8, 0.95]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [192]:
CV.best_score_

0.0775229299438998

In [193]:
CV.best_params_

{'feat_select__k': 'all',
 'gb__colsample_by_tree': 0.6,
 'gb__max_depth': 2,
 'gb__n_estimators': 50,
 'gb__subsample': 0.9}

In [194]:
y_pred = CV.predict(X_test)

In [195]:
handy = [poisson.pmf(x[:-1], e) for e in y_pred]
log_loss(y_test, handy)

1.5348428540107673

In [196]:
pipeline = Pipeline(
    [
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
#        ('poly', PolynomialFeatures()),
        ('gb', xgboost.XGBRegressor(objective='count:poisson'))
    ]
)

parameters = {}
#parameters['scaler__with_mean'] = [True, False]
#parameters['scaler__with_std'] = [True, False]
#parameters['poly__degree'] = [2, 3]
#parameters['poly__iteraction_only'] = [True, False]
parameters['gb__max_depth'] = [2]
parameters['gb__n_estimators'] = [35, 40, 45, 50]
parameters['gb__subsample'] = [0.8, 0.9, 0.95]
parameters['gb__colsample_by_tree'] = [0.2, 0.3, 0.4, 0.5]


CV = GridSearchCV(pipeline, parameters, cv=3, n_jobs=2)
CV.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1...       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'gb__max_depth': [2], 'gb__n_estimators': [35, 40, 45, 50], 'gb__subsample': [0.8, 0.9, 0.95], 'gb__colsample_by_tree': [0.2, 0.3, 0.4, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [197]:
CV.best_score_

0.07808628786864702

In [198]:
CV.best_params_

{'gb__colsample_by_tree': 0.2,
 'gb__max_depth': 2,
 'gb__n_estimators': 40,
 'gb__subsample': 0.9}

In [199]:
y_pred = CV.predict(X_test)
handy = [poisson.pmf(x[:-1], e) for e in y_pred]
log_loss(y_test, handy)

1.5349144746676286

## More Features

In [200]:
new_df.loc[:, 'country_id'] = new_df.country_id.astype(str)
new_df.loc[:, 'league_id'] = new_df.country_id.astype(str)
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
0,7753,6,6,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,2,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717,1.40645,1.131463,1.022914,1.28823,1.17662,0.989079
1,7766,6,6,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,0,<goal><value><event_incident_typefk>407</event...,977.639499,970.314256,1.462535,1.224341,1.299059,1.593193,1.51319,1.3592
2,7765,6,6,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,1,<goal><value><event_incident_typefk>411</event...,1076.975539,971.063257,1.593608,1.154404,0.85083,1.487305,1.539219,0.839368
3,7754,6,6,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,0,<goal><value><event_incident_typefk>80</event_...,961.061327,973.621643,1.48537,1.273296,0.899784,1.51014,1.456702,0.979082
4,8472,6,6,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,1,<goal><value><event_incident_typefk>393</event...,1015.063435,985.278237,1.427784,0.989749,1.154784,1.508813,1.398997,0.976737


In [201]:
X_train, X_test, y_train, y_test = train_test_split(new_df[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values,
    new_df.home_team_goal, test_size=0.2, random_state=19)

In [202]:
X_train[0]

array(['6', '6', '2017/2018', 1060.8837180400612, 1007.5394899949963,
       1.5879118115452453, 1.0569980708326523, 1.0164996776129402,
       1.5639324112393722, 1.6127354816730535, 0.9181911051039343],
      dtype=object)

In [203]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
ohe_country = OneHotEncoder(sparse=False)
ohe_league = OneHotEncoder(sparse=False)
ohe_season = OneHotEncoder(sparse=False)

country = ohe_country.fit_transform(X_train[:, 0].reshape(-1, 1))
league = ohe_league.fit_transform(X_train[:, 1].reshape(-1, 1))
season = ohe_season.fit_transform(X_train[:, 2].reshape(-1, 1))

modified_train = np.hstack((league, country, season, X_train[:, 3:]))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [204]:
pipeline = Pipeline(
    [
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('poly', PolynomialFeatures()),
        ('gb', xgboost.XGBRegressor(objective='count:poisson'))
    ]
)

parameters = {}
#parameters['scaler__with_mean'] = [False]
#parameters['scaler__with_std'] = [True, False]
parameters['poly__degree'] = [2]
#parameters['poly__iteraction_only'] = [True, False]
#parameters['gb__max_depth'] = [2, 3]
parameters['gb__n_estimators'] = [30]
parameters['gb__subsample'] = [0.6]
#parameters['gb__colsample_by_tree'] = [0.0, 0.01, 0.02]
parameters['gb__booster'] = ['gbtree']


CV = GridSearchCV(pipeline, parameters, cv=3, n_jobs=2)
CV.fit(modified_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('gb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'poly__degree': [2], 'gb__n_estimators': [30], 'gb__subsample': [0.6], 'gb__booster': ['gbtree']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [205]:
CV.best_score_

0.05919252689561855

In [206]:
CV.best_params_

{'gb__booster': 'gbtree',
 'gb__n_estimators': 30,
 'gb__subsample': 0.6,
 'poly__degree': 2}

In [207]:
country_test = ohe_country.transform(X_test[:, 0].reshape(-1, 1))
league_test = ohe_league.transform(X_test[:, 1].reshape(-1, 1))
season_test = ohe_season.transform(X_test[:, 2].reshape(-1, 1))

modified_test = np.hstack((league_test, country_test, season_test, X_test[:, 3:]))

y_pred = CV.predict(modified_test)
handy = [poisson.pmf(x[:-2], e) for e in y_pred]
log_loss(y_test, handy)



1.525431986909279

## Comparing to some odds

In [208]:
new_df[(new_df.home_team_long_name == 'Fulham')].sort_values(by='date', ascending=False).head(4).tail(1)

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
2342,8724,6,6,2018/2019,2018-08-11 00:00:00,9879,Fulham,9826,Crystal Palace,0,2,<goal><value><event_incident_typefk>393</event...,1000.0,1018.808843,1.53986,1.170168,1.157063,1.440781,1.440781,1.157063


In [209]:
test_row = new_df[(new_df.home_team_long_name == 'Fulham')].sort_values(by='date', ascending=False).head(4).tail(1)
test_row

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
2342,8724,6,6,2018/2019,2018-08-11 00:00:00,9879,Fulham,9826,Crystal Palace,0,2,<goal><value><event_incident_typefk>393</event...,1000.0,1018.808843,1.53986,1.170168,1.157063,1.440781,1.440781,1.157063


In [210]:
test_row_values = test_row[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values

In [211]:
country_test_row = ohe_country.transform(test_row_values[:, 0].reshape(-1, 1))
league_test_row = ohe_league.transform(test_row_values[:, 1].reshape(-1, 1))
season_test_row = ohe_season.transform(test_row_values[:, 2].reshape(-1, 1))

modified_test_row = np.hstack((league_test_row, country_test_row, season_test_row, test_row_values[:, 3:]))

y_pred_row = CV.predict(modified_test_row)
handy_row = [poisson.pmf(x, e) for e in y_pred_row][0]
print(handy_row)

[2.73852625e-01 3.54684385e-01 2.29687433e-01 9.91610552e-02
 3.21074865e-02 8.31689972e-03 1.79529315e-03 3.32171598e-04
 5.37771359e-05 7.73891933e-06]




In [212]:
for i, f in enumerate(handy_row):
    print("{0} home goals: prob: {1:.3f}".format(i, 100.*f/sum(handy_row)))

0 home goals: prob: 27.385
1 home goals: prob: 35.468
2 home goals: prob: 22.969
3 home goals: prob: 9.916
4 home goals: prob: 3.211
5 home goals: prob: 0.832
6 home goals: prob: 0.180
7 home goals: prob: 0.033
8 home goals: prob: 0.005
9 home goals: prob: 0.001


In [213]:
sum(handy_row[1:])

0.7261462402311294

Shit me - got one. According to what I can find, the ltp of the odds of Palace keeping a clean sheet (i.e. Fulham scoring 0 goals) was 4.0 (or 25%) and of Palace not keeping a clean sheet (i.e. Fulham scoring 1+ goals of 1.29 (or 77.5%).

So I think the actual probability is 27.39% and therefore I think whoever bought a Crystal Palace clean sheet at 25% probably did well, and anybody who bought *not* a Crystal Palace clean sheet at 79% did a bad job.

In [214]:
new_df[(new_df.home_team_long_name == 'Chelsea')].sort_values(by='date', ascending=False).head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
981,8749,6,6,2018/2019,2018-11-11 00:00:00,8455,Chelsea,8668,Everton,0,0,<goal />,1149.105549,1082.361124,1.84782,1.009403,1.199808,1.619296,1.943142,1.03497
316,8716,6,6,2018/2019,2018-09-29 00:00:00,8455,Chelsea,8650,Liverpool,1,1,<goal><value><event_incident_typefk>393</event...,1138.504247,1128.461549,1.884563,1.033611,1.680245,1.417542,1.734864,1.484162
2451,8763,6,6,2018/2019,2018-09-15 00:00:00,8455,Chelsea,8344,Cardiff,4,1,<goal><value><event_incident_typefk>393</event...,1129.287421,972.074269,1.774608,1.032289,1.103608,1.562687,1.800916,0.973571
2377,8738,6,6,2018/2019,2018-09-01 00:00:00,8455,Chelsea,8678,Bournemouth,2,0,<goal><value><event_incident_typefk>393</event...,1117.338808,1027.408037,1.768364,1.089098,1.220751,1.632813,1.87511,1.136177
2432,8755,6,6,2018/2019,2018-08-18 00:00:00,8455,Chelsea,9825,Arsenal,3,2,<goal><value><event_incident_typefk>393</event...,1092.357777,1063.411744,1.698183,1.045692,1.266613,1.447548,1.596379,1.131877


In [215]:
test_row = new_df[(new_df.home_team_long_name == 'Chelsea')].sort_values(by='date', ascending=False).head(5).tail(1)
test_row

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
2432,8755,6,6,2018/2019,2018-08-18 00:00:00,8455,Chelsea,9825,Arsenal,3,2,<goal><value><event_incident_typefk>393</event...,1092.357777,1063.411744,1.698183,1.045692,1.266613,1.447548,1.596379,1.131877


In [216]:
test_row_values = test_row[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values
country_test_row = ohe_country.transform(test_row_values[:, 0].reshape(-1, 1))
league_test_row = ohe_league.transform(test_row_values[:, 1].reshape(-1, 1))
season_test_row = ohe_season.transform(test_row_values[:, 2].reshape(-1, 1))

modified_test_row = np.hstack((league_test_row, country_test_row, season_test_row, test_row_values[:, 3:]))

y_pred_row = CV.predict(modified_test_row)
handy_row = [poisson.pmf(x, e) for e in y_pred_row][0]
print(handy_row)

[2.01097407e-01 3.22553378e-01 2.58682306e-01 1.38305864e-01
 5.54594718e-02 1.77910201e-02 4.75603152e-03 1.08978747e-03
 2.18497739e-04 3.89403243e-05]




In [217]:
for i, f in enumerate(handy_row):
    print("{0} home goals: prob: {1:.3f}".format(i, 100.*f/sum(handy_row)))

0 home goals: prob: 20.110
1 home goals: prob: 32.256
2 home goals: prob: 25.868
3 home goals: prob: 13.831
4 home goals: prob: 5.546
5 home goals: prob: 1.779
6 home goals: prob: 0.476
7 home goals: prob: 0.109
8 home goals: prob: 0.022
9 home goals: prob: 0.004


In [218]:
sum(handy_row[1:])

0.798895298106301

In [219]:
## Arsenal probability of clean sheet: 6.4 or 15.6%
## Arsenal probability of not clean sheet: 1.15 or 87%

In [220]:
new_df.home_team_long_name

0                  Sunderland
1                       Stoke
2           Manchester United
3                       Stoke
4                     Norwich
5                       Lille
6              Crystal Palace
7        West Bromwich Albion
8           Manchester United
9                     Swansea
10      Evian Thonon Gaillard
11                    Norwich
12                  Tottenham
13                    Everton
14                    Swansea
15                    Chelsea
16                   West Ham
17                    Arsenal
18                  Liverpool
19                  Tottenham
20              Saint-Etienne
21            Manchester City
22                   Guingamp
23           Newcastle United
24                  Liverpool
25                 Sunderland
26                  Liverpool
27                Southampton
28                  SC Bastia
29                 Sunderland
                ...          
6681                  Ternana
6682                Benevento
6683      

In [221]:
all_teams = pd.DataFrame(pd.unique(new_df[['home_team_long_name', 'away_team_long_name']].values.ravel('K')), columns=['home_team_long_name'])

In [222]:
all_teams

Unnamed: 0,home_team_long_name
0,Sunderland
1,Stoke
2,Manchester United
3,Norwich
4,Lille
5,Crystal Palace
6,West Bromwich Albion
7,Swansea
8,Evian Thonon Gaillard
9,Tottenham


In [223]:
conn = psycopg2.connect("dbname=football")
cur = conn.cursor()

sql = """
SELECT
event_name,
event_id,
open_date
from
market_changes
group by
event_name,
event_id,
open_date
"""

cur.execute(sql)

results = cur.fetchall()
colnames = [desc[0] for desc in cur.description]
conn.close()
match_df = pd.DataFrame(results, columns=colnames)
match_df.head()

Unnamed: 0,event_name,event_id,open_date
0,West Ham v Tottenham,28921881,2018-10-20 14:00:00
1,Arbroath v Airdrieonians,28919832,2018-09-29 14:00:00
2,Reading v Derby,28774943,2018-08-03 19:00:00
3,Colchester v Lincoln,28958868,2018-10-27 14:00:00
4,Armenia v Liechtenstein,28799539,2018-09-06 16:45:00


In [224]:
match_df.loc[:, 'home_team'] = match_df.event_name.apply(lambda x: x.split(' v ')[0])
match_df.loc[:, 'away_team'] = match_df.event_name.apply(lambda x: x.split(' v ')[1])
match_df.head()

Unnamed: 0,event_name,event_id,open_date,home_team,away_team
0,West Ham v Tottenham,28921881,2018-10-20 14:00:00,West Ham,Tottenham
1,Arbroath v Airdrieonians,28919832,2018-09-29 14:00:00,Arbroath,Airdrieonians
2,Reading v Derby,28774943,2018-08-03 19:00:00,Reading,Derby
3,Colchester v Lincoln,28958868,2018-10-27 14:00:00,Colchester,Lincoln
4,Armenia v Liechtenstein,28799539,2018-09-06 16:45:00,Armenia,Liechtenstein


In [225]:
all_teams.merge(
    match_df, 
    how='inner', 
    left_on=['home_team_long_name'],
    right_on=['away_team']
).home_team_long_name.drop_duplicates().shape

(33,)

In [226]:
df[['home_team_api_id', 'home_team_long_name']].drop_duplicates().to_csv('match_stats_teams.csv')
match_df[['home_team']].drop_duplicates().to_csv('match_odds_teams.csv')

In [227]:
new_df.groupby('country_id').size()

country_id
10    1884
6     1208
7     1213
8     1208
9     1194
dtype: int64

## Away Goals

In [228]:
X_train, X_test, y_train, y_test = train_test_split(new_df[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values,
    new_df.away_team_goal, test_size=0.2, random_state=23)

ohe_country_a = OneHotEncoder(sparse=False)
ohe_league_a = OneHotEncoder(sparse=False)
ohe_season_a = OneHotEncoder(sparse=False)

country = ohe_country_a.fit_transform(X_train[:, 0].reshape(-1, 1))
league = ohe_league_a.fit_transform(X_train[:, 1].reshape(-1, 1))
season = ohe_season_a.fit_transform(X_train[:, 2].reshape(-1, 1))

modified_train = np.hstack((league, country, season, X_train[:, 3:]))

pipeline = Pipeline(
    [
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('poly', PolynomialFeatures()),
        ('gb', xgboost.XGBRegressor(objective='count:poisson'))
    ]
)

parameters = {}
#parameters['scaler__with_mean'] = [False]
#parameters['scaler__with_std'] = [True, False]
parameters['poly__degree'] = [2]
#parameters['poly__iteraction_only'] = [True, False]
#parameters['gb__max_depth'] = [2, 3]
parameters['gb__n_estimators'] = [30]
parameters['gb__subsample'] = [0.6]
#parameters['gb__colsample_by_tree'] = [0.0, 0.01, 0.02]
parameters['gb__booster'] = ['gbtree']


CV_a = GridSearchCV(pipeline, parameters, cv=3, n_jobs=2)
CV_a.fit(modified_train, y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('gb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'poly__degree': [2], 'gb__n_estimators': [30], 'gb__subsample': [0.6], 'gb__booster': ['gbtree']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [229]:
CV_a.best_score_

0.045849245380143386

In [230]:
CV_a.best_params_

{'gb__booster': 'gbtree',
 'gb__n_estimators': 30,
 'gb__subsample': 0.6,
 'poly__degree': 2}

In [231]:
country_test = ohe_country_a.transform(X_test[:, 0].reshape(-1, 1))
league_test = ohe_league_a.transform(X_test[:, 1].reshape(-1, 1))
season_test = ohe_season_a.transform(X_test[:, 2].reshape(-1, 1))

modified_test = np.hstack((league_test, country_test, season_test, X_test[:, 3:]))

y_pred = CV_a.predict(modified_test)
handy = [poisson.pmf(x[:-3], e) for e in y_pred]
log_loss(y_test, handy)



1.3805115322775756

In [232]:
test_row = new_df[(new_df.home_team_long_name == 'Chelsea')].sort_values(by='date', ascending=False).head(5).tail(1)
test_row

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
2432,8755,6,6,2018/2019,2018-08-18 00:00:00,8455,Chelsea,9825,Arsenal,3,2,<goal><value><event_incident_typefk>393</event...,1092.357777,1063.411744,1.698183,1.045692,1.266613,1.447548,1.596379,1.131877


In [233]:
test_row_values = test_row[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values
country_test_row = ohe_country.transform(test_row_values[:, 0].reshape(-1, 1))
league_test_row = ohe_league.transform(test_row_values[:, 1].reshape(-1, 1))
season_test_row = ohe_season.transform(test_row_values[:, 2].reshape(-1, 1))

modified_test_row = np.hstack((league_test_row, country_test_row, season_test_row, test_row_values[:, 3:]))

y_pred_row = CV_a.predict(modified_test_row)
handy_row = [poisson.pmf(x, e) for e in y_pred_row][0]
print(handy_row)



[3.52054378e-01 3.67534080e-01 1.91847209e-01 6.67608866e-02
 1.74240845e-02 3.63804302e-03 6.33001072e-04 9.44048422e-05
 1.23194735e-05 1.42901736e-06]


In [234]:
for i, f in enumerate(handy_row):
    print("{0} away goals: prob: {1:.3f}".format(i, 100.*f/sum(handy_row)))

0 away goals: prob: 35.205
1 away goals: prob: 36.753
2 away goals: prob: 19.185
3 away goals: prob: 6.676
4 away goals: prob: 1.742
5 away goals: prob: 0.364
6 away goals: prob: 0.063
7 away goals: prob: 0.009
8 away goals: prob: 0.001
9 away goals: prob: 0.000


In [235]:
test_row = new_df[(new_df.date >= '2018-10-01') & (new_df.date < '2018-11-01')].head(3).tail(1)
test_row_values = test_row[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values
country_test_row = ohe_country.transform(test_row_values[:, 0].reshape(-1, 1))
league_test_row = ohe_league.transform(test_row_values[:, 1].reshape(-1, 1))
season_test_row = ohe_season.transform(test_row_values[:, 2].reshape(-1, 1))

modified_test_row = np.hstack((league_test_row, country_test_row, season_test_row, test_row_values[:, 3:]))

y_pred_row = CV.predict(modified_test_row)
handy_row = [poisson.pmf(x, e) for e in y_pred_row][0]
print(handy_row)

[2.90091825e-01 3.59005391e-01 2.22144956e-01 9.16390719e-02
 2.83521613e-02 7.01748750e-03 1.44742436e-03 2.55895895e-04
 3.95857441e-05 5.44329391e-06]




In [236]:
for i, f in enumerate(handy_row):
    print("{0} home goals: prob: {1:.3f}".format(i, 100.*f/sum(handy_row)))

0 home goals: prob: 29.009
1 home goals: prob: 35.901
2 home goals: prob: 22.215
3 home goals: prob: 9.164
4 home goals: prob: 2.835
5 home goals: prob: 0.702
6 home goals: prob: 0.145
7 home goals: prob: 0.026
8 home goals: prob: 0.004
9 home goals: prob: 0.001


In [237]:
y_pred_row_a = CV_a.predict(modified_test_row)
handy_row_a = [poisson.pmf(x, e) for e in y_pred_row_a][0]
for i, f in enumerate(handy_row_a):
    print("{0} away goals: prob: {1:.3f}".format(i, 100.*f/sum(handy_row_a)))

0 away goals: prob: 27.176
1 away goals: prob: 35.406
2 away goals: prob: 23.064
3 away goals: prob: 10.016
4 away goals: prob: 3.262
5 away goals: prob: 0.850
6 away goals: prob: 0.185
7 away goals: prob: 0.034
8 away goals: prob: 0.006
9 away goals: prob: 0.001




In [242]:
for hg, ho in enumerate(handy_row):
    for ag, ao in enumerate(handy_row_a):
        if hg <= 3 and ag <= 3:
            print("{0} - {1}: prob: {2:.3f}".format(hg, ag, 100.*(ho / sum(handy_row)) * (ao/sum(handy_row_a))))

0 - 0: prob: 7.884
0 - 1: prob: 10.271
0 - 2: prob: 6.691
0 - 3: prob: 2.906
1 - 0: prob: 9.756
1 - 1: prob: 12.711
1 - 2: prob: 8.280
1 - 3: prob: 3.596
2 - 0: prob: 6.037
2 - 1: prob: 7.865
2 - 2: prob: 5.124
2 - 3: prob: 2.225
3 - 0: prob: 2.490
3 - 1: prob: 3.245
3 - 2: prob: 2.114
3 - 3: prob: 0.918


In [243]:
test_row

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,away_team_goal,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals
317,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,3,<goal><value><event_incident_typefk>20</event_...,935.646356,1016.288978,1.444335,1.390078,1.112862,1.769227,1.659474,1.322002


## Testing

In [244]:
conn = psycopg2.connect("dbname=football")
cur = conn.cursor()

sql = """
SELECT
    a.event_name,
    a.name,
    a.ltp AS lay_ltp,
    a.odds AS lay_odds,
    a.vol AS lay_vol,
    a.pct_chance AS lay_pct_chance,
    b.ltp AS back_ltp,
    b.odds AS back_odds,
    b.vol AS back_vol,
    b.pct_chance AS back_pct_chance
FROM
last_correct_score_lay_odds a
INNER JOIN
last_correct_score_back_odds b
ON a.event_name = b.event_name
AND a.mc_id = b.mc_id
AND a.open_date = b.open_date
AND a.rc_id = b.rc_id
"""

cur.execute(sql)

recent_odds = cur.fetchall()
recent_colnames = [desc[0] for desc in cur.description]
conn.close()

In [245]:
odds_df = pd.DataFrame(recent_odds, columns=recent_colnames)
odds_df.head()

Unnamed: 0,event_name,name,lay_ltp,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance
0,18746827,0 - 0,0.0,60,9.66,1.6666666666666667,0.0,30.0,6.9,3.333333333333333
1,18746827,1 - 0,0.0,24,21.11,4.166666666666666,0.0,13.5,16.0,7.4074074074074066
2,18746827,1 - 1,0.0,17,26.53,5.882352941176471,0.0,12.5,4.0,8.0
3,18746827,0 - 1,0.0,46,1.57,2.1739130434782608,0.0,27.0,7.69,3.703703703703704
4,18746827,2 - 0,0.0,18,10.2,5.555555555555556,0.0,12.5,21.38,8.0


In [371]:
odds_df.columns = [
    'event_name',
    'score_name',
    'lay_ltp',
    'lay_odds',
    'lay_vol',
    'lay_pct_chance',
    'back_ltp',
    'back_odds',
    'back_vol',
    'back_pct_chance'
]

In [372]:
new_df.loc[:, 'joiny_string'] = new_df.home_team_long_name + ' v ' + new_df.away_team_long_name
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,goal,home_elo,away_elo,home_attack_elo,home_defence_elo,away_attack_elo,away_defence_elo,expected_home_goals,expected_away_goals,joiny_string
0,7753,6,6,2013/2014,2014-03-29 00:00:00,8472,Sunderland,8654,West Ham,1,...,<goal><value><event_incident_typefk>406</event...,981.965594,972.570717,1.40645,1.131463,1.022914,1.28823,1.17662,0.989079,Sunderland v West Ham
1,7766,6,6,2013/2014,2014-04-12 00:00:00,10194,Stoke,10261,Newcastle United,1,...,<goal><value><event_incident_typefk>407</event...,977.639499,970.314256,1.462535,1.224341,1.299059,1.593193,1.51319,1.3592,Stoke v Newcastle United
2,7765,6,6,2013/2014,2014-05-04 00:00:00,10260,Manchester United,8667,Hull,3,...,<goal><value><event_incident_typefk>411</event...,1076.975539,971.063257,1.593608,1.154404,0.85083,1.487305,1.539219,0.839368,Manchester United v Hull
3,7754,6,6,2013/2014,2014-03-29 00:00:00,10194,Stoke,8667,Hull,1,...,<goal><value><event_incident_typefk>80</event_...,961.061327,973.621643,1.48537,1.273296,0.899784,1.51014,1.456702,0.979082,Stoke v Hull
4,8472,6,6,2012/2013,2012-12-01 00:00:00,9850,Norwich,8472,Sunderland,2,...,<goal><value><event_incident_typefk>393</event...,1015.063435,985.278237,1.427784,0.989749,1.154784,1.508813,1.398997,0.976737,Norwich v Sunderland


In [373]:
new_df.merge(odds_df, how='inner', left_on='joiny_string', right_on='event_name')[['home_team_long_name', 'away_team_long_name']].drop_duplicates()

Unnamed: 0,home_team_long_name,away_team_long_name
0,Norwich,Stoke
19,Cardiff,Fulham
57,Everton,Southampton
133,Arsenal,Leicester
152,Fulham,Bournemouth
171,West Ham,Tottenham
249,Leicester,West Ham
287,Burnley,Chelsea
363,Leicester,Everton
420,Bournemouth,Norwich


In [374]:
current = new_df[(new_df.date >= '2018-10-01') & (new_df.date < '2018-11-01')]
merged = current.merge(odds_df, how='inner', left_on='joiny_string', right_on='event_name')
merged.head(20)

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,event_name,score_name,lay_ltp,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance
0,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,0 - 0,22.0,22.0,190.91,4.545454545454546,21.0,21.0,2.73,4.761904761904762
1,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,1 - 0,16.5,17.5,316.17,5.7142857142857135,16.5,16.5,11.54,6.0606060606060606
2,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,1 - 1,8.2,8.6,241.97,11.627906976744185,8.2,8.2,327.31,12.195121951219512
3,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,0 - 1,14.5,15.0,7.91,6.666666666666666,14.5,14.5,26.88,6.8965517241379315
4,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,2 - 0,22.0,23.0,121.25,4.3478260869565215,22.0,22.0,144.31,4.545454545454546
5,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,2 - 1,12.0,12.5,374.39,8.0,12.0,12.0,158.85,8.333333333333332
6,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,2 - 2,14.0,14.0,153.03,7.142857142857143,13.5,13.5,131.84,7.4074074074074066
7,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,1 - 2,11.0,11.5,213.23,8.695652173913043,11.0,11.0,191.21,9.090909090909092
8,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,0 - 2,18.5,19.0,109.54,5.263157894736842,18.5,18.5,99.55,5.405405405405404
9,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,Fulham v Bournemouth,3 - 0,46.0,48.0,16.18,2.083333333333333,46.0,46.0,76.92,2.1739130434782608


In [375]:
def calculate_home_score_probabilities(test_row_values):
    country_test_row = ohe_country.transform(test_row_values[:, 0].reshape(1,-1))
    league_test_row = ohe_league.transform(test_row_values[:, 1].reshape(1, -1))
    season_test_row = ohe_season.transform(test_row_values[:, 2].reshape(1, -1))
    modified_test_row = np.hstack((league_test_row, country_test_row, season_test_row, test_row_values[:, 3:]))
    y_pred_row = CV.predict(modified_test_row)
    handy_row = [poisson.pmf(x, e) for e in y_pred_row][0]
    return handy_row

def calculate_away_score_probabilities(test_row_values):
    country_test_row = ohe_country_a.transform(test_row_values[:, 0].reshape(1, -1))
    league_test_row = ohe_league_a.transform(test_row_values[:, 1].reshape(1, -1))
    season_test_row = ohe_season_a.transform(test_row_values[:, 2].reshape(1, -1))

    modified_test_row = np.hstack((league_test_row, country_test_row, season_test_row, test_row_values[:, 3:]))

    y_pred_row = CV_a.predict(modified_test_row)
    handy_row = [poisson.pmf(x, e) for e in y_pred_row][0]
    return handy_row
    
def calculate_score_probabilities(row):
    prob = None
    test_row = row[['country_id', 'league_id', 'season', 'home_elo', 'away_elo', 'home_attack_elo', 'home_defence_elo', 'away_attack_elo', 'away_defence_elo', 'expected_home_goals', 'expected_away_goals']].values
    home_probs = calculate_home_score_probabilities(test_row.reshape(1, -1))
    away_probs = calculate_away_score_probabilities(test_row.reshape(1, -1))
    for hg, ho in enumerate(home_probs):
        for ag, ao in enumerate(away_probs):
            if hg <= 3 and ag <= 3:
                name = '{0} - {1}'.format(hg, ag)
                if name == row.score_name:
                    prob = 100.*(ho / sum(home_probs)) * (ao / sum(away_probs))
    return prob
                

In [377]:
merged.loc[:, 'calculated_prob'] = merged.apply(lambda x: calculate_score_probabilities(x), axis=1, raw=True)
merged.head()













Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,score_name,lay_ltp,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance,calculated_prob
0,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,0 - 0,22.0,22.0,190.91,4.545454545454546,21.0,21.0,2.73,4.761904761904762,7.883663
1,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,1 - 0,16.5,17.5,316.17,5.7142857142857135,16.5,16.5,11.54,6.0606060606060606,9.756488
2,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,1 - 1,8.2,8.6,241.97,11.627906976744185,8.2,8.2,327.31,12.195121951219512,12.710965
3,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,0 - 1,14.5,15.0,7.91,6.666666666666666,14.5,14.5,26.88,6.8965517241379315,10.271007
4,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,2 - 0,22.0,23.0,121.25,4.3478260869565215,22.0,22.0,144.31,4.545454545454546,6.037109


In [378]:
merged.head()

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,score_name,lay_ltp,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance,calculated_prob
0,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,0 - 0,22.0,22.0,190.91,4.545454545454546,21.0,21.0,2.73,4.761904761904762,7.883663
1,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,1 - 0,16.5,17.5,316.17,5.7142857142857135,16.5,16.5,11.54,6.0606060606060606,9.756488
2,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,1 - 1,8.2,8.6,241.97,11.627906976744185,8.2,8.2,327.31,12.195121951219512,12.710965
3,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,0 - 1,14.5,15.0,7.91,6.666666666666666,14.5,14.5,26.88,6.8965517241379315,10.271007
4,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,2 - 0,22.0,23.0,121.25,4.3478260869565215,22.0,22.0,144.31,4.545454545454546,6.037109


In [389]:
import decimal
merged.loc[:, 'prob_diff'] = merged[['back_pct_chance', 'calculated_prob']].apply(lambda x: x.calculated_prob - float(x.back_pct_chance), axis=1)
merged.sort_values(by='prob_diff').head(10)

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,lay_ltp,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance,calculated_prob,prob_diff
129,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,10.5,10.5,89.45,9.523809523809524,10.5,10.0,819.05,10.0,5.38908,-4.61092
133,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,7.8,8.0,754.03,12.5,7.8,7.8,51.7,12.82051282051282,8.71668,-4.103833
40,8736,6,6,2018/2019,2018-10-01 00:00:00,8197,Leicester,8668,Everton,1,...,7.0,7.0,12.51,14.285714285714285,7.0,6.8,194.83,14.705882352941176,10.989,-3.716858
122,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,7.8,8.0,82.69,12.5,7.8,7.8,22.64,12.82051282051282,9.25409,-3.56642
38,8736,6,6,2018/2019,2018-10-01 00:00:00,8197,Leicester,8668,Everton,1,...,12.5,13.0,299.99,7.6923076923076925,12.5,12.5,16.99,8.0,4.88058,-3.11942
39,8736,6,6,2018/2019,2018-10-01 00:00:00,8197,Leicester,8668,Everton,1,...,9.2,9.2,130.97,10.869565217391305,9.0,9.0,71.17,11.111111111111112,8.1919,-2.919215
136,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,9.8,10.0,2.64,10.0,9.8,9.8,349.17,10.20408163265306,7.31282,-2.891262
135,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,7.0,7.2,754.93,13.888888888888888,7.0,7.0,56.0,14.285714285714285,11.7078,-2.577938
166,8734,6,6,2018/2019,2018-10-07 00:00:00,9879,Fulham,9825,Arsenal,1,...,12.5,12.5,272.96,8.0,12.5,12.0,223.01,8.333333333333332,5.98601,-2.347321
6,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,14.0,14.0,153.03,7.142857142857143,13.5,13.5,131.84,7.4074074074074066,5.12353,-2.283881


In [390]:
merged.sort_values(by='prob_diff', ascending=False).head(10)

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,lay_ltp,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance,calculated_prob,prob_diff
1,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,16.5,17.5,316.17,5.7142857142857135,16.5,16.5,11.54,6.0606060606060606,9.75649,3.695882
3,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,14.5,15.0,7.91,6.666666666666666,14.5,14.5,26.88,6.8965517241379315,10.271,3.374456
119,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,36.0,38.0,31.58,2.631578947368421,36.0,36.0,100.82,2.7777777777777777,5.90441,3.126636
0,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,22.0,22.0,190.91,4.545454545454546,21.0,21.0,2.73,4.761904761904762,7.88366,3.121758
115,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,32.0,32.0,86.94,3.125,32.0,30.0,229.94,3.333333333333333,6.40224,3.06891
155,8734,6,6,2018/2019,2018-10-07 00:00:00,9879,Fulham,9825,Arsenal,1,...,14.0,14.5,264.85,6.8965517241379315,14.0,14.0,576.26,7.142857142857143,10.0491,2.90625
153,8734,6,6,2018/2019,2018-10-07 00:00:00,9879,Fulham,9825,Arsenal,1,...,32.0,34.0,108.66,2.9411764705882355,32.0,32.0,77.62,3.125,5.92489,2.799895
142,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,30.0,32.0,269.1,3.125,30.0,30.0,103.48,3.333333333333333,5.96168,2.628344
79,8704,6,6,2018/2019,2018-10-20 00:00:00,8678,Bournemouth,8466,Southampton,0,...,15.5,15.5,126.91,6.451612903225807,15.0,15.0,52.19,6.666666666666666,9.2801,2.613435
137,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,11.5,12.0,103.59,8.333333333333332,11.5,11.5,186.08,8.695652173913043,11.1712,2.475556


In [396]:
def calculate_return(row, stake=1):
    try:
        predicted_home_goals = int(row.score_name.split('-')[0].strip())
        predicted_away_goals = int(row.score_name.split('-')[1].strip())
        if predicted_home_goals == row.home_team_goal and predicted_away_goals == row.away_team_goal:
            return stake * row.back_odds
        else:
            return -stake
    except:
        return None
    
merged.loc[:, 'bet_return'] = merged.apply(lambda x: calculate_return(x), axis=1)
merged.sort_values(by='bet_return', ascending=False).head(10)

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance,calculated_prob,prob_diff,bet_return
15,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,36.0,83.62,2.7777777777777777,34.0,34.0,103.28,2.9411764705882355,2.90557,-0.035603,34.0
76,8704,6,6,2018/2019,2018-10-20 00:00:00,8678,Bournemouth,8466,Southampton,0,...,16.0,10.44,6.25,16.0,15.0,170.41,6.666666666666666,7.44988,0.783213,15.0
45,8736,6,6,2018/2019,2018-10-01 00:00:00,8197,Leicester,8668,Everton,1,...,15.5,57.89,6.451612903225807,14.5,14.5,324.23,6.8965517241379315,7.37062,0.474065,14.5
21,8731,6,6,2018/2019,2018-10-27 00:00:00,8197,Leicester,8654,West Ham,1,...,7.6,1379.14,13.157894736842104,7.4,7.4,3307.12,13.513513513513512,12.5117,-1.001849,7.4
98,8707,6,6,2018/2019,2018-10-20 00:00:00,9796,Huddersfield,8650,Liverpool,0,...,7.8,531.07,12.82051282051282,7.4,7.4,1135.1,13.513513513513512,11.608,-1.905529,7.4
135,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,7.2,754.93,13.888888888888888,7.0,7.0,56.0,14.285714285714285,11.7078,-2.577938,7.0
117,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,8.8,12.11,11.363636363636363,8.8,8.4,774.92,11.904761904761903,10.594,-1.310719,-1.0
110,8707,6,6,2018/2019,2018-10-20 00:00:00,9796,Huddersfield,8650,Liverpool,0,...,9.4,169.03,10.638297872340424,9.4,9.2,3.0,10.869565217391305,9.57172,-1.297848,-1.0
114,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,17.0,80.73,5.882352941176471,16.5,16.5,33.1,6.0606060606060606,6.06401,0.0034,-1.0
115,8730,6,6,2018/2019,2018-10-28 00:00:00,8191,Burnley,8455,Chelsea,0,...,32.0,86.94,3.125,32.0,30.0,229.94,3.333333333333333,6.40224,3.06891,-1.0


In [405]:
merged[merged.bet_return > 0]

Unnamed: 0,id,country_id,league_id,season,date,home_team_api_id,home_team_long_name,away_team_api_id,away_team_long_name,home_team_goal,...,lay_odds,lay_vol,lay_pct_chance,back_ltp,back_odds,back_vol,back_pct_chance,calculated_prob,prob_diff,bet_return
15,8727,6,6,2018/2019,2018-10-22 00:00:00,9879,Fulham,8678,Bournemouth,0,...,36.0,83.62,2.7777777777777777,34.0,34.0,103.28,2.9411764705882355,2.90557,-0.035603,34.0
21,8731,6,6,2018/2019,2018-10-27 00:00:00,8197,Leicester,8654,West Ham,1,...,7.6,1379.14,13.157894736842104,7.4,7.4,3307.12,13.513513513513512,12.5117,-1.001849,7.4
45,8736,6,6,2018/2019,2018-10-01 00:00:00,8197,Leicester,8668,Everton,1,...,15.5,57.89,6.451612903225807,14.5,14.5,324.23,6.8965517241379315,7.37062,0.474065,14.5
76,8704,6,6,2018/2019,2018-10-20 00:00:00,8678,Bournemouth,8466,Southampton,0,...,16.0,10.44,6.25,16.0,15.0,170.41,6.666666666666666,7.44988,0.783213,15.0
98,8707,6,6,2018/2019,2018-10-20 00:00:00,9796,Huddersfield,8650,Liverpool,0,...,7.8,531.07,12.82051282051282,7.4,7.4,1135.1,13.513513513513512,11.608,-1.905529,7.4
135,8733,6,6,2018/2019,2018-10-06 00:00:00,8191,Burnley,9796,Huddersfield,1,...,7.2,754.93,13.888888888888888,7.0,7.0,56.0,14.285714285714285,11.7078,-2.577938,7.0


In [408]:
merged[['home_team_long_name', 'away_team_long_name', 'home_team_goal', 'away_team_goal']].drop_duplicates()

Unnamed: 0,home_team_long_name,away_team_long_name,home_team_goal,away_team_goal
0,Fulham,Bournemouth,0,3
19,Leicester,West Ham,1,1
38,Leicester,Everton,1,2
57,Cardiff,Fulham,4,2
76,Bournemouth,Southampton,0,0
95,Huddersfield,Liverpool,0,1
114,Burnley,Chelsea,0,4
133,Burnley,Huddersfield,1,1
152,Fulham,Arsenal,1,5
