# Bet on numbers of corners

Three parts:
- given data, create model
- given model, calculate probabilities
- given probabilities, decide bets and stakes


In [1]:
import os, math, datetime
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels
import statsmodels.api
import scipy

# scipy bugfix https://github.com/statsmodels/statsmodels/issues/3931
scipy.stats.chisqprob = lambda chisq, df: scipy.stats.chi2.sf(chisq, df)

  from pandas.core import datetools


# Part I

Given data, create model

## Load all data in data/ folder (for a single division)

- `data_dir` is the folder which keeps all data zip files.
- `cols` is an array of the column names that we want to load into memory
- `csv_file` is the name of the CSV file within each zip that we will load

In [2]:
csv_file = "SP1.csv"
data_dir = "data/"
cols = ["Date", "HomeTeam", "AwayTeam", "HC", "AC"]
# Date can be useful to keep in order to organize cross-validation (TODO)

df = pd.DataFrame()

for datafilename in os.listdir(data_dir):
    with zipfile.ZipFile(os.path.join(data_dir,datafilename), mode='r') as archive:
        t = pd.read_csv(archive.open(csv_file),
                        usecols = cols,
                        encoding='ISO-8859-1')
        df = df.append(t, ignore_index=True)

# careful, don't use pd.to_datetime here!
# pd.to_datetime reads "23/08/14" into "2014-08-23" (correct) but "01/11/14" into "2014-01-11" (wrong)
# dates should be parsed explicitely, not guessed.
# df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))

df.sort_values("Date", ascending=True, inplace=True)
df.index.name = "MatchId"
df.head()

Unnamed: 0_level_0,Date,HomeTeam,AwayTeam,HC,AC
MatchId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2014-08-23,Almeria,Espanol,11,7
1,2014-08-23,Granada,La Coruna,5,3
2,2014-08-23,Malaga,Ath Bilbao,5,4
3,2014-08-23,Sevilla,Valencia,3,3
4,2014-08-24,Barcelona,Elche,3,1


### Some stats:

In [3]:
display(len(df))
display(len(set(df['HomeTeam'])))
display(len(set(df['AwayTeam'])))

1140

26

26

## Description of the model

To predict number of corners we will predict Home Corners and Away Corners separately, and then sum them.

Each of these will be a Generalized Linear Model with Poisson errors and log link function, fitted using Maximum Likelihood Estimation (also known as [Poisson regression](https://en.wikipedia.org/wiki/Poisson_regression)).

- The first model will predict Home Corners and the link function factors will be `average number of corners that the home team scores at home` and `average number of corners that the away team suffers away`.

- The second model will predict Away Corners and the link function factors will be `average number of corners that the home team suffers at home` and `average number of corners that the away team scores away`.

So lets start by adding these features to the dataset

In [4]:
t = pd.concat([df.groupby("HomeTeam").agg({'HC': np.mean})
               .rename(columns={'HC':'corners_scored_home'}),
               df.groupby("AwayTeam").agg({'AC': np.mean})
               .rename(columns={'AC':'corners_scored_away'}),
               df.groupby("HomeTeam").agg({'AC': np.mean})
               .rename(columns={'AC':'corners_suffered_home'}),
               df.groupby("AwayTeam").agg({'HC': np.mean})
               .rename(columns={'HC':'corners_suffered_away'}),
              ], axis=1)
t.index.name = "Team"
t.head()

Unnamed: 0_level_0,corners_scored_home,corners_scored_away,corners_suffered_home,corners_suffered_away
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alaves,4.052632,3.210526,4.947368,7.631579
Almeria,5.421053,4.631579,6.210526,7.947368
Ath Bilbao,6.508772,4.649123,3.684211,5.087719
Ath Madrid,6.438596,4.175439,3.54386,4.614035
Barcelona,6.807018,5.403509,2.929825,4.0


(There must be a better way to do the steps below...)

In [5]:
df = pd.merge(df.reset_index(),
              t.reset_index()[["Team",'corners_scored_home']],
              left_on="HomeTeam",
              right_on="Team")\
.set_index("MatchId")\
.sort_index()\
.rename(columns={"corners_scored_home":'HomeTeam_AvgCornersScored'})
del df['Team']

df = pd.merge(df.reset_index(),
              t.reset_index()[["Team",'corners_suffered_away']],
              left_on="AwayTeam",
              right_on="Team")\
.set_index("MatchId")\
.sort_index()\
.rename(columns={"corners_suffered_away":'AwayTeam_AvgCornersSuffered'})
del df['Team']

df = pd.merge(df.reset_index(),
              t.reset_index()[["Team",'corners_scored_away']],
              left_on="AwayTeam",
              right_on="Team")\
.set_index("MatchId")\
.sort_index()\
.rename(columns={"corners_scored_away":'AwayTeam_AvgCornersScored'})
del df['Team']

df = pd.merge(df.reset_index(),
              t.reset_index()[["Team",'corners_suffered_home']],
              left_on="HomeTeam",
              right_on="Team")\
.set_index("MatchId")\
.sort_index()\
.rename(columns={"corners_suffered_home":'HomeTeam_AvgCornersSuffered'})
del df['Team']

df.head()

Unnamed: 0_level_0,Date,HomeTeam,AwayTeam,HC,AC,HomeTeam_AvgCornersScored,AwayTeam_AvgCornersSuffered,AwayTeam_AvgCornersScored,HomeTeam_AvgCornersSuffered
MatchId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2014-08-23,Almeria,Espanol,11,7,5.421053,6.070175,3.508772,6.210526
1,2014-08-23,Granada,La Coruna,5,3,5.070175,6.333333,4.122807,4.719298
2,2014-08-23,Malaga,Ath Bilbao,5,4,6.017544,5.087719,4.649123,3.894737
3,2014-08-23,Sevilla,Valencia,3,3,6.403509,6.22807,3.842105,3.754386
4,2014-08-24,Barcelona,Elche,3,1,6.807018,6.736842,3.894737,2.929825


# Fitting the models

Again, `HomeTeam_AvgCornersScored` and `AwayTeam_AvgCornersSuffered` will be used to predict `HomeCorners` and `AwayTeam_AvgCornersScored` and `HomeTeam_AvgCornersSuffered` will be used to predict `AwayCorners`

In [6]:
m = statsmodels.api.Poisson(df['HC'], 
                            df[['HomeTeam_AvgCornersScored','AwayTeam_AvgCornersSuffered']])\
                            .fit()
home_corners_model = m
m.summary()

Optimization terminated successfully.
         Current function value: 2.386274
         Iterations 9


0,1,2,3
Dep. Variable:,HC,No. Observations:,1140.0
Model:,Poisson,Df Residuals:,1138.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 04 May 2018",Pseudo R-squ.:,0.04165
Time:,11:39:21,Log-Likelihood:,-2720.4
converged:,True,LL-Null:,-2838.6
,,LLR p-value:,2.3039999999999996e-53

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
HomeTeam_AvgCornersScored,0.1495,0.011,13.009,0.000,0.127,0.172
AwayTeam_AvgCornersSuffered,0.1536,0.011,13.428,0.000,0.131,0.176


In [7]:
m = statsmodels.api.Poisson(df['AC'], 
                            df[['HomeTeam_AvgCornersSuffered','AwayTeam_AvgCornersScored']])\
                            .fit()
away_corners_model = m
m.summary()

Optimization terminated successfully.
         Current function value: 2.228510
         Iterations 8


0,1,2,3
Dep. Variable:,AC,No. Observations:,1140.0
Model:,Poisson,Df Residuals:,1138.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 04 May 2018",Pseudo R-squ.:,0.03517
Time:,11:39:21,Log-Likelihood:,-2540.5
converged:,True,LL-Null:,-2633.1
,,LLR p-value:,3.533e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
HomeTeam_AvgCornersSuffered,0.1616,0.016,9.837,0.000,0.129,0.194
AwayTeam_AvgCornersScored,0.1772,0.016,10.877,0.000,0.145,0.209


It seems we found some signal. In both cases the p-value is tiny and definitely allows us to reject the null hypothesis (the null hypotheses here is that those dimensions don't predict anything). The generalized $R^2$ is around 4% for the Home Corners model and around 3.5% for the Away Corners model.

# Summing our predictions for HomeCorners and AwayCorners to obtain a prediction for Corners

In [8]:
predicted_corners = home_corners_model.predict(df[['HomeTeam_AvgCornersScored', 'AwayTeam_AvgCornersSuffered']]) +\
                    away_corners_model.predict(df[['AwayTeam_AvgCornersScored', 'HomeTeam_AvgCornersSuffered']])
df['predicted_corners'] = predicted_corners


df.head()

Unnamed: 0_level_0,Date,HomeTeam,AwayTeam,HC,AC,HomeTeam_AvgCornersScored,AwayTeam_AvgCornersSuffered,AwayTeam_AvgCornersScored,HomeTeam_AvgCornersSuffered,predicted_corners
MatchId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2014-08-23,Almeria,Espanol,11,7,5.421053,6.070175,3.508772,6.210526,11.014422
1,2014-08-23,Granada,La Coruna,5,3,5.070175,6.333333,4.122807,4.719298,10.139984
2,2014-08-23,Malaga,Ath Bilbao,5,4,6.017544,5.087719,4.649123,3.894737,9.60034
3,2014-08-23,Sevilla,Valencia,3,3,6.403509,6.22807,3.842105,3.754386,10.401136
4,2014-08-24,Barcelona,Elche,3,1,6.807018,6.736842,3.894737,2.929825,10.943228


# Part II

Given model, calculate probabilities

# Part III

Given probabilities, decide bets and stakes

# TODO:

- cross-validation
- model should take into account that for some teams we have less data
- bet sizing according to the kelly criterion