In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn import linear_model
from scipy.stats import poisson
import matplotlib.pyplot as plt

In [2]:
# Default folder holding the pickled data sets. Both helpers fall back to it,
# so existing calls keep working; pass `data_dir` to read/write elsewhere.
DATA_DIR = 'C:/Users/Konny/DataScience/SpicedAcademy/fussball_vorhersagen/data/'


def save_obj(obj, name, data_dir=None):
    """
    Pickle `obj` to '<data_dir>/<name>.pkl'.

    obj      -> any picklable Python object
    name     -> file name without the '.pkl' extension
    data_dir -> target folder; defaults to DATA_DIR when None
    """
    folder = DATA_DIR if data_dir is None else data_dir
    with open(os.path.join(folder, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name, data_dir=None):
    """
    Load and return the object pickled in '<data_dir>/<name>.pkl'.

    name     -> file name without the '.pkl' extension
    data_dir -> source folder; defaults to DATA_DIR when None
    """
    folder = DATA_DIR if data_dir is None else data_dir
    with open(os.path.join(folder, name + '.pkl'), 'rb') as f:
        # NOTE(security): pickle.load can execute arbitrary code; only use it
        # on files that were written by save_obj or another trusted source.
        return pickle.load(f)

In [3]:
# Load the object pickled as "Eng17.pkl" via load_obj.
data = load_obj("Eng17")

In [4]:
def get_team_encoding(df):
    """
    Add integer team ids, the goal difference and an integer round to `df`.

    Every distinct name found in "HomeTeam"/"AwayTeam" gets an id in order of
    first appearance (home column first, then away column). The columns
    "HomeId", "AwayId" and "Result" (FTHG - FTAG) are written onto `df`
    itself, "round" is converted with int(), and the same frame is returned.
    """
    every_side = pd.concat([df["HomeTeam"], df["AwayTeam"]])
    _, clubs = pd.factorize(every_side)
    id_by_club = {club: idx for idx, club in enumerate(clubs)}
    for side in ("Home", "Away"):
        df[side + "Id"] = df[side + "Team"].map(id_by_club)
    df["Result"] = df["FTHG"] - df["FTAG"]
    df["round"] = df["round"].apply(int)
    return df

In [5]:
def buildGamesMatrix3(games, num_teams):
    """
    Build the design matrix M and target vector R for the rating regression.

    games     -> sequence of rows (home_id, away_id, home_goals, away_goals);
                 ids may be stored as ints or as whole-number floats
    num_teams -> number of teams; ids must lie in [0, num_teams)

    For n games, M has 2*n rows and 2*num_teams + 1 columns:
      columns [0, num_teams)            -> offense of the scoring team (+1)
      columns [num_teams, 2*num_teams)  -> defense of the conceding team (-1)
      column  2*num_teams               -> home advantage (+1, home rows only)
    Rows 0..n-1 describe the home side's goals, rows n..2n-1 the away side's.
    R holds the matching goal counts.

    Raises ValueError when an id is missing (NaN) or outside [0, num_teams).
    """
    # A float cast lets rows coming from a float or object `.values` array be
    # handled; the id columns are turned back into integer indices below.
    games = np.asarray(games, dtype=float)
    l = len(games)
    M = np.zeros([l * 2, num_teams * 2 + 1])
    R = np.zeros([l * 2])
    if l == 0:
        return M, R

    if not np.isfinite(games[:, :2]).all():
        raise ValueError("team ids must not be NaN or infinite")
    home = games[:, 0].astype(int)
    away = games[:, 1].astype(int)
    if min(home.min(), away.min()) < 0 or max(home.max(), away.max()) >= num_teams:
        # Without this check an oversized id would silently land in a defense
        # or home-advantage column instead of failing.
        raise ValueError("team ids must lie in [0, num_teams)")

    rows = np.arange(l)
    # Home side's goals: own offense, opponent's defense, home advantage.
    M[rows, home] = 1
    M[rows, away + num_teams] = -1
    M[rows, num_teams * 2] = 1
    # Away side's goals: own offense, opponent's defense, no home advantage.
    M[l + rows, away] = 1
    M[l + rows, home + num_teams] = -1
    R[:l] = games[:, 2]
    R[l:] = games[:, 3]
    return M, R

In [6]:
def get_ratings(df, num_teams=20, alpha=0.1):
    """
    Fit offense/defense ratings and a home-advantage term with ridge regression.

    df        -> frame with the columns "HomeId", "AwayId", "FTHG", "FTAG"
    num_teams -> number of teams the ids were encoded over (default 20)
    alpha     -> ridge regularisation strength (default 0.1)

    Returns (offense, defense, hfa): two dicts mapping team id -> rating
    rounded to 4 decimals, and the rounded home-advantage coefficient.
    """
    df2 = df[["HomeId", "AwayId", "FTHG", "FTAG"]]
    M, R = buildGamesMatrix3(df2.values, num_teams)
    clf = linear_model.Ridge(fit_intercept=False, alpha=alpha)
    clf.fit(M, R)
    # Work on a copy so the fitted estimator's coefficients stay untouched.
    ratings = clf.coef_.copy()
    # Shift all team ratings so the smallest defense rating becomes 0. Offense
    # and defense move by the same amount, so offense - defense is unchanged.
    correction = np.min(ratings[num_teams:2 * num_teams])
    ratings[:2 * num_teams] -= correction
    rat_off_dict = dict(zip(range(num_teams), np.round(ratings[:num_teams], 4)))
    rat_def_dict = dict(zip(range(num_teams), np.round(ratings[num_teams:2 * num_teams], 4)))
    rat_hfa = np.round(ratings[2 * num_teams], 4)
    return rat_off_dict, rat_def_dict, rat_hfa

In [7]:
def get_all_matchday_ratings(df):
    """
    Compute ratings after every round from all games played up to that round.

    For each round r from 1 to the largest value in df["round"], get_ratings
    is called on a copy of the rows with round <= r. Returns three dicts keyed
    by r: offense ratings, defense ratings and the home-advantage value.
    """
    offense_by_round, defense_by_round, hfa_by_round = {}, {}, {}
    last_round = df["round"].max()
    for current_round in range(1, last_round + 1):
        played_so_far = df.loc[df["round"] <= current_round].copy()
        (
            offense_by_round[current_round],
            defense_by_round[current_round],
            hfa_by_round[current_round],
        ) = get_ratings(played_so_far)
    return offense_by_round, defense_by_round, hfa_by_round

In [8]:
def get_exp_goals(df):
    """
    Add expected goals for both sides and their difference to `df`.

    exp_HG = Home_Off_Rat - Away_Def_Rat + HFA, floored at 0
    exp_AG = Away_Off_Rat - Home_Def_Rat, floored at 0
    exp_GD = exp_HG - exp_AG

    The columns are written onto `df` itself, which is also returned.
    """
    raw_home = df["Home_Off_Rat"] - df["Away_Def_Rat"] + df["HFA"]
    raw_away = df["Away_Off_Rat"] - df["Home_Def_Rat"]
    # Anything failing the `> 0` test (negative, zero or NaN) is replaced by 0.
    df["exp_HG"] = np.where(raw_home > 0, raw_home, 0)
    df["exp_AG"] = np.where(raw_away > 0, raw_away, 0)
    df["exp_GD"] = df["exp_HG"] - df["exp_AG"]
    return df

In [9]:
def get_massey(df):
    """
    Attach pre-match ratings and expected goals to every fixture.

    Steps: encode teams (get_team_encoding), fit ratings after every round
    (get_all_matchday_ratings), give each fixture of round r + 1 the ratings
    fitted through round r, add the home advantage shifted by one round, and
    finish with get_exp_goals. Fixtures without an earlier round (round 1)
    receive no ratings from the left merges.
    """
    encoded = get_team_encoding(df)
    offense_by_round, defense_by_round, hfa_by_round = get_all_matchday_ratings(encoded)

    def lagged_rating(id_column, tables, label):
        # Ratings keyed by round r are looked up for the fixtures of round r + 1.
        per_round = [
            encoded.loc[encoded["round"] == fitted_round + 1, id_column].map(table)
            for fitted_round, table in tables.items()
        ]
        return pd.concat(per_round).to_frame(label)

    rating_frames = [
        lagged_rating("HomeId", offense_by_round, "Home_Off_Rat"),
        lagged_rating("AwayId", offense_by_round, "Away_Off_Rat"),
        lagged_rating("HomeId", defense_by_round, "Home_Def_Rat"),
        lagged_rating("AwayId", defense_by_round, "Away_Def_Rat"),
    ]
    # shift() moves each round's home advantage down to the following round.
    hfa = pd.DataFrame(pd.Series(hfa_by_round), columns=["HFA"]).shift()

    result = encoded
    for frame in rating_frames:
        result = result.merge(frame, left_index=True, right_index=True, how="left")
    result = result.merge(hfa, how="left", left_on="round", right_index=True)
    return get_exp_goals(result)

In [11]:
# Add team ids, lagged ratings, home advantage and expected goals to the data.
test = get_massey(data)

In [12]:
# Preview the first fixtures only instead of rendering the whole frame.
test.head(10)

Unnamed: 0,_id,away_player1,away_player10,away_player11,away_player2,away_player3,away_player4,away_player5,away_player6,away_player7,...,AwayId,Result,Home_Off_Rat,Away_Off_Rat,Home_Def_Rat,Away_Def_Rat,HFA,exp_HG,exp_AG,exp_GD
0,1125,Kasper Schmeichel,Ahmed Musa,Jamie Vardy,Luis Hernandez,Wes Morgan,Danny Simpson,Christian Fuchs,Danny Drinkwater,Andy King,...,16,1,,,,,,0.0000,0.0000,0.0000
1,1079,Heurelho Gomes,Troy Deeney,Odion Ighalo,Miguel Britos,Sebastian Proedl,Craig Cathcart,Nordin Amrabat,Valon Behrami,Adlene Guedioura,...,12,0,,,,,,0.0000,0.0000,0.0000
2,1057,Ben Foster,Jose Salomon Rondon,Saido Berahino,Jonas Olsson,Jonny Evans,Gareth McAuley,Craig Dawson,Claudio Yacob,Craig Gardner,...,14,-1,,,,,,0.0000,0.0000,0.0000
3,1064,Shay Given,Bojan Krkic,Mame Biram Diouf,Phil Bardsley,Erik Pieters,Ryan Shawcross,Philipp Wollscheid,Glenn Whelan,Marko Arnautovic,...,11,0,,,,,,0.0000,0.0000,0.0000
4,1123,Lukasz Fabianski,Wayne Routledge,Modou Barrow,Jordi Amat,Kyle Naughton,Federico Fernandez,Stephen Kingsley,Leon Britton,Leroy Fer,...,15,-1,,,,,,0.0000,0.0000,0.0000
5,1140,Hugo Lloris,Christian Eriksen,Harry Kane,Kyle Walker,Danny Rose,Toby Alderweireld,Jan Vertonghen,Erik Lamela,Victor Wanyama,...,13,0,,,,,,0.0000,0.0000,0.0000
6,1076,Vito Mannone,Duncan Watmore,Jermain Defoe,Patrick van Aanholt,Younes Kaboul,Donald Love,Lamine Kone,Jack Rodwell,John O'Shea,...,17,1,,,,,,0.0000,0.0000,0.0000
7,1061,David De Gea,Marouane Fellaini,Zlatan Ibrahimovic,Eric Bailly,Daley Blind,Luke Shaw,Antonio Valencia,Juan Mata,Wayne Rooney,...,10,-2,,,,,,0.0000,0.0000,0.0000
8,1139,Simon Mignolet,Roberto Firmino,Sadio Mane,Nathaniel Clyne,Dejan Lovren,Ragnar Klavan,Alberto Moreno,Georginio Wijnaldum,Jordan Henderson,...,19,-1,,,,,,0.0000,0.0000,0.0000
9,1053,Adrian,Enner Valencia,Andre Ayew,Winston Reid,James Collins,Arthur Masuaku,Michail Antonio,Haavard Nordtveit,Cheikhou Kouyate,...,18,1,,,,,,0.0000,0.0000,0.0000


In [13]:
def get_probs(H_xG, A_xG, max_goals=8):
    """
    Takes expected goals for home and away team, and calculates
    probabilities for home win, draw and away win by using a poisson
    distribution

    H_xG, A_xG -> expected goals; scalars or array-likes (e.g. DataFrame
                  columns) that broadcast against each other
    max_goals  -> score lines 0..max_goals-1 are enumerated per team (default 8)

    Returns (home, draw, away), each rounded to 4 decimals: numpy scalars for
    scalar input, arrays in input order for array input (a pandas index is
    not carried over). The draw value is 1 - home - away, so it also contains
    the probability mass of score lines beyond max_goals - 1.
    """
    goals = np.arange(max_goals)
    home_xg = np.asarray(H_xG, dtype=float)
    away_xg = np.asarray(A_xG, dtype=float)

    # A trailing axis enumerates the goal counts, so any input shape works.
    pmf_home = poisson.pmf(goals, home_xg[..., np.newaxis])
    pmf_away = poisson.pmf(goals, away_xg[..., np.newaxis])

    # joint[..., i, j] = P(home scores i) * P(away scores j)
    joint = pmf_home[..., :, np.newaxis] * pmf_away[..., np.newaxis, :]

    # Below the diagonal the home side scored more, above it the away side.
    home_mask = np.tril(np.ones((max_goals, max_goals)), -1)
    away_mask = np.triu(np.ones((max_goals, max_goals)), 1)
    home = (joint * home_mask).sum(axis=(-2, -1))
    away = (joint * away_mask).sum(axis=(-2, -1))
    draw = 1 - home - away
    return np.round(home, 4), np.round(draw, 4), np.round(away, 4)

In [14]:
# Evaluate get_probs once per fixture so every row gets its own
# home/draw/away probabilities, then unzip the triples into three columns.
test["H_pred_Poi"], test["D_pred_Poi"], test["A_pred_Poi"] = zip(
    *(get_probs(h_xg, a_xg) for h_xg, a_xg in zip(test["exp_HG"], test["exp_AG"]))
)

In [15]:
# Show the last rows to check the newly added probability columns.
test.tail()

Unnamed: 0,_id,away_player1,away_player10,away_player11,away_player2,away_player3,away_player4,away_player5,away_player6,away_player7,...,Away_Off_Rat,Home_Def_Rat,Away_Def_Rat,HFA,exp_HG,exp_AG,exp_GD,H_pred_Poi,D_pred_Poi,A_pred_Poi
375,843,Hugo Lloris,Christian Eriksen,Harry Kane,Toby Alderweireld,Jan Vertonghen,Kieran Trippier,Ben Davies,Heung-Min Son,Victor Wanyama,...,2.4743,0.0,1.2526,0.4135,0.5211,2.4743,-1.9532,0.0589,0.1425,0.7985
376,764,Adrian,Robert Snodgrass,Andre Ayew,James Collins,Angelo Obinze Ogbonna,Jose Fonte,Aaron Cresswell,Manuel Lanzini,Sam Byram,...,1.5955,0.5601,0.2744,0.4135,1.5494,1.0354,0.514,0.4919,0.2547,0.2534
377,769,Jordan Pickford,Adnan Januzaj,Fabio Borini,Billy Jones,Joleon Lescott,John O'Shea,Bryan Oviedo,Lee Cattermole,Sebastian Larsson,...,1.125,1.0792,0.243,0.4135,2.6857,0.0458,2.6399,0.917,0.0798,0.0032
378,859,Wilfredo Caballero,Yaya Toure,Sergio Aguero,Vincent Kompany,Gael Clichy,Fernandinho,Nicolas Otamendi,Kevin De Bruyne,Leroy Sane,...,2.3795,0.2665,0.8776,0.4135,0.9996,2.113,-1.1134,0.1695,0.2029,0.6276
379,762,Brad Guzan,Adam Forshaw,Rudy Gestede,Fabio,George Friend,Ben Gibson,Calum Chambers,Grant Leadbitter,Adam Clayton,...,1.0844,0.8178,0.6261,0.4135,2.1716,0.2666,1.905,0.8177,0.1475,0.0348


In [17]:
# One 0/1 indicator column per actual outcome, derived from the full-time goals.
test["Home"] = (test["FTHG"] > test["FTAG"]).astype(int)
test["Draw"] = (test["FTHG"] == test["FTAG"]).astype(int)
test["Away"] = (test["FTHG"] < test["FTAG"]).astype(int)

In [18]:
def rps(prob_h, prob_d, prob_a, home, draw, away):
    """
    Rank probability score for a home/draw/away forecast (lower is better).

    prob_h, prob_d, prob_a -> predicted probabilities
    home, draw, away       -> 0 or 1 flags for the actual result

    The score is the mean of two squared cumulative errors: "home" and
    "home or draw". prob_a and away are accepted but not used in the formula.
    """
    home_error = prob_h - home
    home_or_draw_error = home_error + (prob_d - draw)
    return (home_error ** 2 + home_or_draw_error ** 2) / 2

In [19]:
# Score every fixture's Poisson forecast against the actual outcome flags.
test["rps_poi"] = rps(
    prob_h=test["H_pred_Poi"],
    prob_d=test["D_pred_Poi"],
    prob_a=test["A_pred_Poi"],
    home=test["Home"],
    draw=test["Draw"],
    away=test["Away"],
)

In [20]:
# Show the last rows to check the outcome flags and the score column.
test.tail()

Unnamed: 0,_id,away_player1,away_player10,away_player11,away_player2,away_player3,away_player4,away_player5,away_player6,away_player7,...,exp_HG,exp_AG,exp_GD,H_pred_Poi,D_pred_Poi,A_pred_Poi,Home,Draw,Away,rps
375,843,Hugo Lloris,Christian Eriksen,Harry Kane,Toby Alderweireld,Jan Vertonghen,Kieran Trippier,Ben Davies,Heung-Min Son,Victor Wanyama,...,0.5211,2.4743,-1.9532,0.0589,0.1425,0.7985,0,0,1,0.022016
376,764,Adrian,Robert Snodgrass,Andre Ayew,James Collins,Angelo Obinze Ogbonna,Jose Fonte,Aaron Cresswell,Manuel Lanzini,Sam Byram,...,1.5494,1.0354,0.514,0.4919,0.2547,0.2534,0,0,1,0.399689
377,769,Jordan Pickford,Adnan Januzaj,Fabio Borini,Billy Jones,Joleon Lescott,John O'Shea,Bryan Oviedo,Lee Cattermole,Sebastian Larsson,...,2.6857,0.0458,2.6399,0.917,0.0798,0.0032,1,0,0,0.00345
378,859,Wilfredo Caballero,Yaya Toure,Sergio Aguero,Vincent Kompany,Gael Clichy,Fernandinho,Nicolas Otamendi,Kevin De Bruyne,Leroy Sane,...,0.9996,2.113,-1.1134,0.1695,0.2029,0.6276,0,0,1,0.083706
379,762,Brad Guzan,Adam Forshaw,Rudy Gestede,Fabio,George Friend,Ben Gibson,Calum Chambers,Grant Leadbitter,Adam Clayton,...,2.1716,0.2666,1.905,0.8177,0.1475,0.0348,1,0,0,0.017222


In [22]:
# Mean rank probability score over rounds 7 to 34 (both inclusive).
# The score column is the one created above as "rps_poi".
test.loc[test["round"].between(7, 34), "rps_poi"].mean()

0.1889678365714286