In [2]:
import os
import pickle
import pandas as pd
import numpy as np
from scipy.stats import poisson

In [43]:
def save_obj(obj, name):
    """
    Pickles ``obj`` to ``Pickles/<name>.pkl`` (relative to the current
    working directory) with the highest available pickle protocol.

    The target directory is created first if it is missing, so saving
    also works when the ``Pickles`` folder does not exist yet.
    """
    path = 'Pickles/' + name + '.pkl'
    # open(..., 'wb') does not create missing parent folders on its own.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """
    Reads ``Pickles/<name>.pkl`` (relative to the current working
    directory) and returns the unpickled object.

    Unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    pickle_path = 'Pickles/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [44]:
# Load the object pickled as Pickles/Eng17.pkl; the cells below use it as a
# DataFrame (.columns, .loc).
data2 = load_obj("Eng17")

In [45]:
# Show every column name of the loaded data.
list(data2.columns)

['_id',
 'away_player1',
 'away_player10',
 'away_player11',
 'away_player2',
 'away_player3',
 'away_player4',
 'away_player5',
 'away_player6',
 'away_player7',
 'away_player8',
 'away_player9',
 'first_half_ended_at',
 'fixture_status',
 'fixture_status_short',
 'game_ended_at',
 'game_started_at',
 'home_player1',
 'home_player10',
 'home_player11',
 'home_player2',
 'home_player3',
 'home_player4',
 'home_player5',
 'home_player6',
 'home_player7',
 'home_player8',
 'home_player9',
 'id_country',
 'id_league',
 'id_referee',
 'id_season',
 'id_stage',
 'id_team_season_away',
 'id_team_season_home',
 'lineup_confirmed',
 'number_goal_team_away',
 'number_goal_team_home',
 'referee_name',
 'round',
 'schedule_date',
 'second_half_ended_at',
 'second_half_started_at',
 'spectators',
 'stadium',
 'team_season_away_name',
 'team_season_home_name',
 'home_id',
 'away_id',
 'h_poss',
 'h_shot_on',
 'h_shot_off',
 'h_corner',
 'h_cross',
 'h_offs',
 'h_fouls',
 'h_card',
 'a_poss',
 'a_sh

In [46]:
# Columns used by the model cells below: round, the two team names, the
# goal columns (FTHG / FTAG), the result code (FTR) and the three columns
# that are later passed to convert_odds as odds (PSCH / PSCD / PSCA).
# NOTE(review): apart from "round" these names are not in the visible
# (truncated) column listing above — confirm they exist in data2.
columns = ["round", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR", "PSCH", "PSCD", "PSCA"]

In [47]:
# Restrict the loaded frame to the selected columns; later cells add the
# model columns to this frame.
data = data2.loc[:,columns]

## Implementation 1

In [48]:
# League-wide average of home / away goals over all earlier rows: the
# expanding mean is shifted by one row so the current match is excluded.
data["LgHG"] = data["FTHG"].expanding().mean().shift()
data["LgAG"] = data["FTAG"].expanding().mean().shift()

In [49]:
# Per-team running averages over each team's earlier home (or away) matches;
# the shift excludes the current match from its own average.
# transform() is used instead of apply(): it always returns a result aligned
# with data's index, whereas apply() may prepend the group key to the index
# (the default since pandas 2.0), which breaks the column assignment.
# Goals scored at home / away:
data["TeamHG"] = data.groupby("HomeTeam")["FTHG"].transform(lambda x: x.expanding().mean().shift())
data["TeamAG"] = data.groupby("AwayTeam")["FTAG"].transform(lambda x: x.expanding().mean().shift())
# Goals conceded at home / away:
data["Team_c_HG"] = data.groupby("HomeTeam")["FTAG"].transform(lambda x: x.expanding().mean().shift())
data["Team_c_AG"] = data.groupby("AwayTeam")["FTHG"].transform(lambda x: x.expanding().mean().shift())

In [50]:
# Attack strength: the team's average goals scored relative to the league
# average for the same venue (home or away).
data["H_att"] = data["TeamHG"] / data["LgHG"]
data["A_att"] = data["TeamAG"] / data["LgAG"]

In [51]:
# Defence strength: the team's average goals conceded relative to the league
# average scored by the opposing side (away goals for the home team and
# vice versa).
data["H_def"] = data["Team_c_HG"] / data["LgAG"]
data["A_def"] = data["Team_c_AG"] / data["LgHG"]

In [52]:
# Expected goals: own attack strength times the opponent's defence strength,
# scaled by the league average for the venue.
data["H_xG"] = data["H_att"] * data["A_def"] * data["LgHG"]
data["A_xG"] = data["A_att"] * data["H_def"] * data["LgAG"]

In [53]:
# Inspect the last rows, including the derived model columns.
data.tail()

Unnamed: 0,round,HomeTeam,AwayTeam,FTHG,FTAG,FTR,PSCH,PSCD,PSCA,LgHG,...,TeamHG,TeamAG,Team_c_HG,Team_c_AG,H_att,A_att,H_def,A_def,H_xG,A_xG
375,38,Hull,Tottenham,1,7,A,9.74,6.48,1.31,1.592,...,1.5,1.777778,1.555556,0.888889,0.942211,1.508296,1.319759,0.558347,0.837521,2.346238
376,38,Burnley,West Ham,1,2,A,2.31,3.58,3.24,1.590426,...,1.388889,1.444444,1.0,1.777778,0.873281,1.209602,0.837416,1.1178,1.5525,1.209602
377,38,Chelsea,Sunderland,5,1,H,1.12,11.5,25.5,1.588859,...,2.777778,0.666667,0.888889,1.666667,1.748284,0.55728,0.74304,1.048971,2.913807,0.49536
378,38,Watford,Manchester City,0,5,A,18.0,9.7,1.16,1.597884,...,1.388889,2.111111,1.333333,1.222222,0.869205,1.765487,1.115044,0.764901,1.062362,2.353982
379,38,Liverpool,Middlesbrough,3,0,H,1.15,10.5,19.09,1.593668,...,2.333333,0.555556,1.0,1.5,1.464128,0.460734,0.829322,0.941225,2.196192,0.460734


In [54]:
def get_probs(H_xG, A_xG, max_goals=8):
    """
    Converts expected goals into home win / draw / away win probabilities,
    treating both teams' goal counts as independent Poisson variables.

    H_xG / A_xG -> expected goals of the home / away team; scalars or
        array-likes of matching length (e.g. two DataFrame columns)
    max_goals -> number of goal counts (0 .. max_goals - 1) enumerated per
        team; the default 8 keeps the original 8 x 8 score grid

    Returns (home, draw, away), each rounded to 4 decimals: scalars for
    scalar input, pandas Series sharing the input's index when a Series is
    passed, numpy arrays otherwise. Missing expected goals give NaN.

    Note: draw is computed as 1 - home - away, so the probability of
    scorelines outside the enumerated grid ends up in the draw.
    """
    # Remember the index so column-wise results stay aligned with the input.
    index = next((x.index for x in (H_xG, A_xG) if isinstance(x, pd.Series)), None)

    home_mu, away_mu = np.broadcast_arrays(
        np.asarray(H_xG, dtype=float), np.asarray(A_xG, dtype=float)
    )
    goals = np.arange(max_goals)
    # NaN expected goals (no history yet) are expected here; they simply
    # propagate to NaN probabilities, so the invalid-value warning is noise.
    with np.errstate(invalid="ignore"):
        home_pmf = poisson.pmf(goals, home_mu[..., np.newaxis])
        away_pmf = poisson.pmf(goals, away_mu[..., np.newaxis])

    # joint[..., i, j] = P(home scores i) * P(away scores j)
    joint = home_pmf[..., :, np.newaxis] * away_pmf[..., np.newaxis, :]
    # Below the diagonal the home side scored more, above it the away side.
    home = np.tril(joint, -1).sum(axis=(-2, -1))
    away = np.triu(joint, 1).sum(axis=(-2, -1))
    draw = 1 - home - away

    home, draw, away = np.round(home, 4), np.round(draw, 4), np.round(away, 4)
    if index is not None:
        return (pd.Series(home, index=index),
                pd.Series(draw, index=index),
                pd.Series(away, index=index))
    return home, draw, away

In [55]:
# Turn the expected-goals columns into home / draw / away probabilities.
data["H_pred"], data["D_pred"], data["A_pred"] = get_probs(data["H_xG"], data["A_xG"])

  return mu >= 0


In [56]:
def convert_odds(odds_h, odds_d, odds_a):
    """
    Turns home / draw / away odds into probabilities.

    Each odd is inverted and the three inverses are divided by their sum,
    so the returned (home, draw, away) values add up to 1.
    """
    implied = (1 / odds_h, 1 / odds_d, 1 / odds_a)
    overround = implied[0] + implied[1] + implied[2]
    return tuple(share / overround for share in implied)

In [57]:
# Normalised probabilities derived from the PSCH / PSCD / PSCA odds columns.
data["H_prob_odds"], data["D_prob_odds"], data["A_prob_odds"] = convert_odds(data["PSCH"], data["PSCD"], data["PSCA"])

In [58]:
# One-hot encoding of the actual result (1 in exactly one of the columns),
# derived from the goal columns.
data["Home"] = np.where(data["FTHG"] > data["FTAG"], 1, 0)
data["Draw"] = np.where(data["FTHG"] == data["FTAG"], 1, 0)
data["Away"] = np.where(data["FTHG"] < data["FTAG"], 1, 0)

In [None]:
# NOTE(review): this cell has no execution count and looks like a leftover:
# `test` and its "*_pred_mix" columns are not created in any visible cell,
# and `rps` is only defined further down the notebook. Confirm whether it
# can be removed.
test["rps_mix"] = rps(test["H_pred_mix"], test["D_pred_mix"], test["A_pred_mix"], test["Home"],test["Draw"],test["Away"])

In [59]:
def RPS(prob_h, prob_d, prob_a, home, draw, away):
    """
    Calculates the ranked probability score
    prob_h/d/a -> predicted probability (scalars or equally long columns)
    home/draw/away -> 0 or 1 for actual result
    Returns one score per match; the lower the better.
    """
    probs = np.array([prob_h, prob_d, prob_a], dtype=float)
    outcome = np.array([home, draw, away], dtype=float)
    # Accumulate along the outcome axis only. Without axis=0 numpy flattens
    # a (3, n_matches) array and mixes the matches together.
    cum_error = np.cumsum(probs - outcome, axis=0)
    # The third cumulative term is always 0 when both triples sum to 1.
    return (cum_error[0] ** 2 + cum_error[1] ** 2) / 2

In [60]:
def RPS2(prob_h, prob_d, prob_a, home, draw, away):
    """
    Ranked probability score from the home and draw terms.

    prob_a and away are accepted for a uniform signature but are not used
    in the computation.
    """
    home_error = prob_h - home
    draw_error = prob_d - draw
    cumulative_error = home_error + draw_error
    return (home_error ** 2 + cumulative_error ** 2) / 2

In [61]:
# Score the odds-derived probabilities against the actual results.
data["RPS_Bookie"] = RPS2(data["H_prob_odds"], data["D_prob_odds"], data["A_prob_odds"], data["Home"], data["Draw"], data["Away"])

In [63]:
# Mean RPS of the odds-derived probabilities over all rows.
data["RPS_Bookie"].mean()

0.1808690846218266

In [64]:
# Score the Poisson model's probabilities against the actual results.
data["RPS_Poisson"] = RPS2(data["H_pred"], data["D_pred"], data["A_pred"], data["Home"], data["Draw"], data["Away"])

In [66]:
# Convert the round values to int so they can be compared numerically below.
data["round"] = data["round"].apply(int)

In [67]:
# Keep only matches after round 6 — presumably a burn-in so that the team
# averages rest on a few earlier matches; TODO confirm the cut-off.
data_p = data[data["round"] > 6]

In [68]:
# Mean RPS of the odds-derived probabilities on the filtered rounds.
data_p["RPS_Bookie"].mean()

0.17996031045168587

In [69]:
# Mean RPS of the Poisson model on the same filtered rounds (NaN rows are
# skipped by mean()).
data_p["RPS_Poisson"].mean()

0.19408855043749998

## Implementation 2

In [28]:
def get_avg_goaldiff(df):
    """
    Calculates average goal difference per team at home and away

    Adds "H_avg_GoalDiff" and "A_avg_GoalDiff" to ``df`` in place and
    returns it. Each value is the expanding mean of "GoalDiff" over the
    team's earlier home (resp. away) rows; the away value is negated.
    """
    def lagged_mean(values):
        # shift(1) keeps the current match out of its own average
        return values.expanding().mean().shift(1)

    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    df["H_avg_GoalDiff"] = df.groupby("HomeTeam")["GoalDiff"].transform(lagged_mean)
    df["A_avg_GoalDiff"] = -df.groupby("AwayTeam")["GoalDiff"].transform(lagged_mean)
    return df


def get_avg_goals(df):
    """
    Adds lagged goal averages to ``df`` in place and returns it.

    LgHG / LgAG -> expanding mean of FTHG / FTAG over all earlier rows
    H_avgG / A_avgG -> expanding mean of the goals a team scored in its
        earlier home / away rows
    H_avgG_c / A_avgG_c -> expanding mean of the goals a team conceded in
        its earlier home / away rows
    """
    def lagged_mean(values):
        # shift() keeps the current match out of its own average
        return values.expanding().mean().shift()

    df["LgHG"] = lagged_mean(df["FTHG"])
    df["LgAG"] = lagged_mean(df["FTAG"])
    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    df["H_avgG"] = df.groupby("HomeTeam")["FTHG"].transform(lagged_mean)
    df["A_avgG"] = df.groupby("AwayTeam")["FTAG"].transform(lagged_mean)
    df["H_avgG_c"] = df.groupby("HomeTeam")["FTAG"].transform(lagged_mean)
    df["A_avgG_c"] = df.groupby("AwayTeam")["FTHG"].transform(lagged_mean)
    return df


def get_avg_shots(df):
    """
    Adds lagged averages of the HS / AS / HST / AST columns to ``df`` in
    place and returns it.

    LgHS, LgAS, LgHST, LgAST -> expanding mean over all earlier rows
    H_avgS / A_avgS (and ...ST) -> expanding mean of a team's own values in
        its earlier home / away rows
    H_avgS_c / A_avgS_c (and ...ST_c) -> expanding mean of the opponents'
        values in the team's earlier home / away rows
    """
    def lagged_mean(values):
        # shift() keeps the current match out of its own average
        return values.expanding().mean().shift()

    for stat in ("S", "ST"):
        df["LgH" + stat] = lagged_mean(df["H" + stat])
        df["LgA" + stat] = lagged_mean(df["A" + stat])

    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    for stat in ("S", "ST"):
        df["H_avg" + stat] = df.groupby("HomeTeam")["H" + stat].transform(lagged_mean)
        df["A_avg" + stat] = df.groupby("AwayTeam")["A" + stat].transform(lagged_mean)
        df["H_avg" + stat + "_c"] = df.groupby("HomeTeam")["A" + stat].transform(lagged_mean)
        df["A_avg" + stat + "_c"] = df.groupby("AwayTeam")["H" + stat].transform(lagged_mean)
    return df


def get_stats(df):
    """
    Runs the goal-difference, goal and shot averaging steps on ``df`` in
    that order and returns the frame handed back by the last step.
    """
    with_goaldiff = get_avg_goaldiff(df)
    with_goals = get_avg_goals(with_goaldiff)
    return get_avg_shots(with_goals)

In [375]:
# Load the match file data2/E18.csv (path relative to the notebook's
# working directory).
df = pd.read_csv("data2/E18.csv")

In [9]:
# Peek at the team and shot columns (HS / HST / AS / AST) used by the shot
# averages.
df[["HomeTeam", "AwayTeam", "HS", "HST", "AS", "AST"]].head()

Unnamed: 0,HomeTeam,AwayTeam,HS,HST,AS,AST
0,Aston Villa,Wigan,11,5,14,7
1,Blackburn,Man City,17,9,8,5
2,Bolton,Sunderland,11,3,20,13
3,Chelsea,Hull,26,12,7,3
4,Everton,Arsenal,8,5,15,9


In [211]:
# Goal difference from the home side's point of view.
df["GoalDiff"] = df["FTHG"] - df["FTAG"]

In [56]:
# NOTE(review): get_stats is defined twice in this notebook with different
# signatures; which one runs here depends on the cell execution order. The
# one-argument version also adds its columns to `df` itself — confirm that
# is intended.
df_old = get_stats(df)

In [364]:
def get_avg_goaldiff_rolling(df, window=6, min_periods=4):
    """
    Calculates average goal difference per team at home and away

    Uses a rolling mean of "GoalDiff" over the team's last ``window`` home
    (resp. away) rows, requiring ``min_periods`` observations, shifted by
    one row so the current match is excluded. Returns (home, away) Series
    aligned with ``df``; the away Series is negated.
    """
    def lagged_form(values):
        return values.rolling(window=window, min_periods=min_periods).mean().shift(1)

    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    home = df.groupby("HomeTeam")["GoalDiff"].transform(lagged_form)
    away = -df.groupby("AwayTeam")["GoalDiff"].transform(lagged_form)
    return home, away

def get_avg_goaldiff_expanding(df):
    """
    Calculates average goal difference per team at home and away

    Uses the expanding mean of "GoalDiff" over all of the team's earlier
    home (resp. away) rows. Returns (home, away) Series aligned with
    ``df``; the away Series is negated.
    """
    def lagged_mean(values):
        # shift(1) keeps the current match out of its own average
        return values.expanding().mean().shift(1)

    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    home = df.groupby("HomeTeam")["GoalDiff"].transform(lagged_mean)
    away = -df.groupby("AwayTeam")["GoalDiff"].transform(lagged_mean)
    return home, away


def get_avg_goals_rolling(df, window=6, min_periods=4):
    """
    Lagged goal averages with rolling team form.

    Returns (Lg_HG, Lg_AG, H_avgG, A_avgG, H_avgG_c, A_avgG_c), all aligned
    with ``df``. The league averages are expanding means over all earlier
    rows; the team values are rolling means (``window``, ``min_periods``)
    over the team's earlier home / away rows — goals scored, then goals
    conceded ("_c").
    """
    def lagged_form(values):
        return values.rolling(window=window, min_periods=min_periods).mean().shift()

    Lg_HG = df["FTHG"].expanding().mean().shift()
    Lg_AG = df["FTAG"].expanding().mean().shift()
    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    H_avgG = df.groupby("HomeTeam")["FTHG"].transform(lagged_form)
    A_avgG = df.groupby("AwayTeam")["FTAG"].transform(lagged_form)
    H_avgG_c = df.groupby("HomeTeam")["FTAG"].transform(lagged_form)
    A_avgG_c = df.groupby("AwayTeam")["FTHG"].transform(lagged_form)
    return Lg_HG, Lg_AG, H_avgG, A_avgG, H_avgG_c, A_avgG_c

def get_avg_goals_expanding(df):
    """
    Lagged goal averages over each team's full earlier history.

    Returns (Lg_HG, Lg_AG, H_avgG, A_avgG, H_avgG_c, A_avgG_c), all aligned
    with ``df``. Every value is an expanding mean shifted by one row: the
    league values over all earlier rows, the team values over the team's
    earlier home / away rows — goals scored, then goals conceded ("_c").
    """
    def lagged_mean(values):
        # shift() keeps the current match out of its own average
        return values.expanding().mean().shift()

    Lg_HG = lagged_mean(df["FTHG"])
    Lg_AG = lagged_mean(df["FTAG"])
    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    H_avgG = df.groupby("HomeTeam")["FTHG"].transform(lagged_mean)
    A_avgG = df.groupby("AwayTeam")["FTAG"].transform(lagged_mean)
    H_avgG_c = df.groupby("HomeTeam")["FTAG"].transform(lagged_mean)
    A_avgG_c = df.groupby("AwayTeam")["FTHG"].transform(lagged_mean)
    return Lg_HG, Lg_AG, H_avgG, A_avgG, H_avgG_c, A_avgG_c


def get_avg_shots_rolling(df, window=6, min_periods=4):
    """
    Lagged averages of the HS / AS / HST / AST columns with rolling team
    form.

    Returns, aligned with ``df`` and in this order:
    Lg_HS, Lg_AS, Lg_HST, Lg_AST, H_avgS, A_avgS, H_avgS_c, A_avgS_c,
    H_avgST, A_avgST, H_avgST_c, A_avgST_c.
    The league values are expanding means over all earlier rows; the team
    values are rolling means (``window``, ``min_periods``) over the team's
    earlier home / away rows ("_c" = the opponents' values in those rows).
    """
    def lagged_form(values):
        return values.rolling(window=window, min_periods=min_periods).mean().shift()

    league = [df[col].expanding().mean().shift() for col in ("HS", "AS", "HST", "AST")]

    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    by_home = df.groupby("HomeTeam")
    by_away = df.groupby("AwayTeam")
    team = []
    for stat in ("S", "ST"):
        team.append(by_home["H" + stat].transform(lagged_form))
        team.append(by_away["A" + stat].transform(lagged_form))
        team.append(by_home["A" + stat].transform(lagged_form))
        team.append(by_away["H" + stat].transform(lagged_form))
    return tuple(league + team)


def get_avg_shots_expanding(df):
    """
    Lagged averages of the HS / AS / HST / AST columns over each team's
    full earlier history.

    Returns, aligned with ``df`` and in this order:
    Lg_HS, Lg_AS, Lg_HST, Lg_AST, H_avgS, A_avgS, H_avgS_c, A_avgS_c,
    H_avgST, A_avgST, H_avgST_c, A_avgST_c.
    Every value is an expanding mean shifted by one row: league values over
    all earlier rows, team values over the team's earlier home / away rows
    ("_c" = the opponents' values in those rows).
    """
    def lagged_mean(values):
        # shift() keeps the current match out of its own average
        return values.expanding().mean().shift()

    league = [lagged_mean(df[col]) for col in ("HS", "AS", "HST", "AST")]

    # transform() always returns a result aligned with df's index, whereas
    # groupby.apply() may prepend the group key (default since pandas 2.0).
    by_home = df.groupby("HomeTeam")
    by_away = df.groupby("AwayTeam")
    team = []
    for stat in ("S", "ST"):
        team.append(by_home["H" + stat].transform(lagged_mean))
        team.append(by_away["A" + stat].transform(lagged_mean))
        team.append(by_home["A" + stat].transform(lagged_mean))
        team.append(by_away["H" + stat].transform(lagged_mean))
    return tuple(league + team)



def get_stats(df2, mode="expanding", window=6, min_periods=4):
    """
    Returns a copy of ``df2`` extended with the lagged goal-difference,
    goal and shot averages.

    mode -> "rolling" uses the rolling helpers with ``window`` and
        ``min_periods``; any other value uses the expanding helpers
    """
    df = df2.copy()

    goaldiff_cols = ["H_avgGD", "A_avgGD"]
    goal_cols = ["Lg_HG", "Lg_AG", "H_avgG", "A_avgG", "H_avgG_c", "A_avgG_c"]
    shot_cols = ["Lg_HS", "Lg_AS", "Lg_HST", "Lg_AST",
                 "H_avgS", "A_avgS", "H_avgS_c", "A_avgS_c",
                 "H_avgST", "A_avgST", "H_avgST_c", "A_avgST_c"]

    if mode == "rolling":
        extra = (window, min_periods)
        steps = ((goaldiff_cols, get_avg_goaldiff_rolling),
                 (goal_cols, get_avg_goals_rolling),
                 (shot_cols, get_avg_shots_rolling))
    else:
        extra = ()
        steps = ((goaldiff_cols, get_avg_goaldiff_expanding),
                 (goal_cols, get_avg_goals_expanding),
                 (shot_cols, get_avg_shots_expanding))

    # Each helper returns its Series in the same order as the column names.
    for names, compute in steps:
        for name, values in zip(names, compute(df, *extra)):
            df[name] = values
    return df


def get_stats2(df2, window=6, min_periods=4):
    """
    Returns a copy of ``df2`` extended with blended averages: every column
    is (rolling value + 2 * expanding value) / 3, using the rolling helpers
    with ``window`` / ``min_periods`` and their expanding counterparts.
    """
    df = df2.copy()

    stages = (
        (("H_avgGD", "A_avgGD"),
         get_avg_goaldiff_rolling, get_avg_goaldiff_expanding),
        (("Lg_HG", "Lg_AG", "H_avgG", "A_avgG", "H_avgG_c", "A_avgG_c"),
         get_avg_goals_rolling, get_avg_goals_expanding),
        (("Lg_HS", "Lg_AS", "Lg_HST", "Lg_AST",
          "H_avgS", "A_avgS", "H_avgS_c", "A_avgS_c",
          "H_avgST", "A_avgST", "H_avgST_c", "A_avgST_c"),
         get_avg_shots_rolling, get_avg_shots_expanding),
    )

    for names, rolling_fn, expanding_fn in stages:
        recent = rolling_fn(df, window, min_periods)
        overall = expanding_fn(df)
        # Both helpers return their Series in the order given by `names`.
        for name, short_run, long_run in zip(names, recent, overall):
            df[name] = (short_run + 2 * long_run) / 3

    return df

In [376]:
# Add the result one-hot, goal-difference and handicap columns.
# NOTE(review): get_result_encoding is defined in a later cell, so this only
# works if that cell has been run first.
df = get_result_encoding(df)

In [380]:
# Lagged averages with rolling team form (default window / min_periods).
test6 = get_stats(df, "rolling")

In [122]:
def get_probs(H_xG, A_xG, max_goals=8):
    """
    Takes expected goals for home and away team, and calculates
    probabilities for home win, draw and away win by using a poisson
    distribution (both goal counts are treated as independent).

    H_xG / A_xG -> expected goals of the home / away team; scalars or
        array-likes of matching length (e.g. two DataFrame columns)
    max_goals -> number of goal counts (0 .. max_goals - 1) enumerated per
        team; the default 8 keeps the original 8 x 8 score grid

    Returns (home, draw, away), each rounded to 4 decimals: scalars for
    scalar input, pandas Series sharing the input's index when a Series is
    passed, numpy arrays otherwise. Missing expected goals give NaN.

    Note: draw is computed as 1 - home - away, so the probability of
    scorelines outside the enumerated grid ends up in the draw.
    """
    # Remember the index so column-wise results stay aligned with the input.
    index = next((x.index for x in (H_xG, A_xG) if isinstance(x, pd.Series)), None)

    home_mu, away_mu = np.broadcast_arrays(
        np.asarray(H_xG, dtype=float), np.asarray(A_xG, dtype=float)
    )
    goals = np.arange(max_goals)
    # NaN expected goals (no history yet) are expected here; they simply
    # propagate to NaN probabilities, so the invalid-value warning is noise.
    with np.errstate(invalid="ignore"):
        home_pmf = poisson.pmf(goals, home_mu[..., np.newaxis])
        away_pmf = poisson.pmf(goals, away_mu[..., np.newaxis])

    # joint[..., i, j] = P(home scores i) * P(away scores j)
    joint = home_pmf[..., :, np.newaxis] * away_pmf[..., np.newaxis, :]
    # Below the diagonal the home side scored more, above it the away side.
    home = np.tril(joint, -1).sum(axis=(-2, -1))
    away = np.triu(joint, 1).sum(axis=(-2, -1))
    draw = 1 - home - away

    home, draw, away = np.round(home, 4), np.round(draw, 4), np.round(away, 4)
    if index is not None:
        return (pd.Series(home, index=index),
                pd.Series(draw, index=index),
                pd.Series(away, index=index))
    return home, draw, away


def rps(prob_h, prob_d, prob_a, home, draw, away):
    """
    Rank probability score of a home/draw/away forecast (lower is better)
    prob_h/d/a -> predicted probability
    home/draw/away -> 0 or 1 for actual result
    Only the home and draw terms enter the computation; prob_a and away
    are accepted but not used.
    """
    error_home = prob_h - home
    error_home_or_draw = error_home + (prob_d - draw)
    return (error_home ** 2 + error_home_or_draw ** 2) / 2


def get_result_encoding(df):
    """
    Adds result columns to ``df`` in place and returns it.

    Home / Draw / Away -> 1 for the actual result, otherwise 0
    GoalDiff -> FTHG minus FTAG
    GoalDiff_Ahc -> GoalDiff plus the "BbAHh" column
    Handicap -> negated "BbAHh" column
    """
    home_goals = df["FTHG"]
    away_goals = df["FTAG"]
    df["Home"] = np.where(home_goals > away_goals, 1, 0)
    df["Draw"] = np.where(home_goals == away_goals, 1, 0)
    df["Away"] = np.where(home_goals < away_goals, 1, 0)
    df["GoalDiff"] = home_goals - away_goals
    handicap_line = df["BbAHh"]
    df["GoalDiff_Ahc"] = df["GoalDiff"] + handicap_line
    df["Handicap"] = -handicap_line
    return df

In [419]:
def prepare_poisson(df, stat):
    """
    Adds the attack / defence strengths and expected goals for one
    statistic to ``df`` in place and returns it.

    stat -> suffix of the averaged statistic (e.g. "G", "S" or "ST");
        the columns H_avg<stat>, A_avg<stat>, their "_c" variants and
        Lg_H<stat> / Lg_A<stat> must exist
    The expected goals are always scaled with Lg_HG / Lg_AG, whatever
    statistic the strengths are based on.
    """
    suffix = "_Poi" + stat
    league_home = df["Lg_H" + stat]
    league_away = df["Lg_A" + stat]

    # Strength = team average relative to the matching league average.
    df["H_att" + suffix] = df["H_avg" + stat] / league_home
    df["A_att" + suffix] = df["A_avg" + stat] / league_away
    df["H_def" + suffix] = df["H_avg" + stat + "_c"] / league_away
    df["A_def" + suffix] = df["A_avg" + stat + "_c"] / league_home

    df["H_xG" + suffix] = df["H_att" + suffix] * df["A_def" + suffix] * df["Lg_HG"]
    df["A_xG" + suffix] = df["A_att" + suffix] * df["H_def" + suffix] * df["Lg_AG"]
    return df


def shot_mix(df1):
    """
    Blends the strengths of the shot (S), shot-on-target (ST) and goal (G)
    poisson models with weights 3 : 3 : 2, then derives expected goals,
    result probabilities and the RPS from the blended strengths.

    Adds the "*_Poi_mix" columns to ``df1`` in place and returns it. The
    "_PoiS", "_PoiST" and "_PoiG" strength columns as well as Lg_HG, Lg_AG,
    Home, Draw and Away must already exist.
    """
    for factor in ("H_att", "A_att", "H_def", "A_def"):
        shots = df1[factor + "_PoiS"]
        on_target = df1[factor + "_PoiST"]
        goals = df1[factor + "_PoiG"]
        # (An unweighted S / ST average without goals was an earlier variant.)
        df1[factor + "_Poi_mix"] = (3*shots + 3*on_target + 2*goals) / 8

    df1["H_xG_Poi_mix"] = df1["H_att_Poi_mix"] * df1["A_def_Poi_mix"] * df1["Lg_HG"]
    df1["A_xG_Poi_mix"] = df1["A_att_Poi_mix"] * df1["H_def_Poi_mix"] * df1["Lg_AG"]

    probs = get_probs(df1["H_xG_Poi_mix"], df1["A_xG_Poi_mix"])
    df1["H_pred_Poi_mix"], df1["D_pred_Poi_mix"], df1["A_pred_Poi_mix"] = probs
    df1["rps_Poi_mix"] = rps(
        df1["H_pred_Poi_mix"], df1["D_pred_Poi_mix"], df1["A_pred_Poi_mix"],
        df1["Home"], df1["Draw"], df1["Away"],
    )
    return df1


def delete_stats(df):
    """
    Removes the intermediate shot ("_PoiS") and shot-on-target ("_PoiST")
    strength and expected-goal columns from ``df`` in place and returns it.
    A missing column raises KeyError.
    """
    for prefix in ("H_att", "A_att", "H_def", "A_def", "H_xG", "A_xG"):
        for stat in ("S", "ST"):
            del df[prefix + "_Poi" + stat]
    return df


def get_poisson(df, stat="MIX"):
    """
    Main function: Takes Dataframe and returns a copy with exp goals,
    winning probabilities and RPS of the prediction
    stat: the statistic that is used to calculate the poisson model
        G(oals), S(hots), ST(arget) -> model based on that statistic only
        MIX -> only the blended model computed by shot_mix; the per-statistic
            intermediate columns are removed again
        ALL -> the G, S and ST models plus the blended model, keeping all
            columns
    """
    assert stat in ["G", "S", "ST", "MIX", "ALL"], "Choose G, S, ST, MIX or ALL as stat"

    if stat == "MIX":
        df1 = df.copy()
        # shot_mix reads the goal-based ("_PoiG") strengths as well as the
        # S / ST ones, so all three have to be prepared; preparing only
        # S and ST made this branch fail with a KeyError.
        for i in ["G", "S", "ST"]:
            df1 = prepare_poisson(df1, i)
        df1 = shot_mix(df1)
        df1 = delete_stats(df1)
        # delete_stats removes the S / ST intermediates only; also drop the
        # goal-based ones created here (but keep any the caller passed in).
        goal_cols = [prefix + "_PoiG"
                     for prefix in ("H_att", "A_att", "H_def", "A_def", "H_xG", "A_xG")]
        created = [col for col in goal_cols if col not in df.columns]
        return df1.drop(columns=created)

    elif stat == "ALL":
        df1 = df.copy()
        for i in ["G", "S", "ST"]:
            df1 = prepare_poisson(df1, i)
            df1["H_pred_Poi" + i], df1["D_pred_Poi" + i], df1["A_pred_Poi" + i] = get_probs(df1["H_xG_Poi" + i], df1["A_xG_Poi" + i])
            df1["rps_Poi" + i] = rps(df1["H_pred_Poi" + i], df1["D_pred_Poi" + i], df1["A_pred_Poi" + i], df1["Home"], df1["Draw"], df1["Away"])
        df1 = shot_mix(df1)
        return df1

    else:
        df1 = df.copy()
        df1 = prepare_poisson(df1, stat)
        df1["H_pred_Poi" + stat], df1["D_pred_Poi" + stat], df1["A_pred_Poi" + stat] = get_probs(df1["H_xG_Poi" + stat], df1["A_xG_Poi" + stat])
        df1["rps_Poi" + stat] = rps(df1["H_pred_Poi" + stat], df1["D_pred_Poi" + stat], df1["A_pred_Poi" + stat], df1["Home"], df1["Draw"], df1["Away"])
        return df1


In [381]:
# All poisson models (G, S, ST and the blend) for the stats in test6.
# NOTE(review): the result is named `expanding`, but test6 was created with
# get_stats(df, "rolling") — confirm which mode is intended here.
expanding = get_poisson(test6, "ALL")

  return mu >= 0


In [383]:
# Average the blended expected goals of the rolling and expanding models.
# NOTE(review): `rolling` is not assigned in any earlier cell of this
# notebook (only inside the season loop further down), so this cell depends
# on leftover kernel state.
test2 = (rolling[["H_xG_Poi_mix", "A_xG_Poi_mix"]] + expanding[["H_xG_Poi_mix", "A_xG_Poi_mix"]]) / 2

In [384]:
# Attach the actual-result one-hot columns (joined on the index).
test2 = pd.concat([test2, df[["Home", "Draw", "Away"]]], axis=1)

In [385]:
# Result probabilities and RPS for the averaged expected goals.
test2["H_pred_Poi"], test2["D_pred_Poi"], test2["A_pred_Poi"] = get_probs(test2["H_xG_Poi_mix"], test2["A_xG_Poi_mix"])
test2["rps_Poi"] = rps(test2["H_pred_Poi"], test2["D_pred_Poi"], test2["A_pred_Poi"], test2["Home"], test2["Draw"], test2["Away"])

  return mu >= 0


In [386]:
# Mean RPS over rows 100-339 only — presumably to skip the early rows that
# have little history and the last rows of the season; TODO confirm.
test2.iloc[100:340]["rps_Poi"].mean()

0.20130085520833346

In [387]:
# Mean RPS over all rows (NaN rows are skipped by mean()).
test2["rps_Poi"].mean()

0.20261220261744972

In [420]:
# Mean RPS per season, keyed by the two-digit season number.
# Suffix "r" = rolling team form, "e" = expanding team form,
# "mixer" = blend of the rolling and expanding mixed models.
rps_Gr, rps_Sr, rps_STr, rps_mixr, rps_mixer = {}, {}, {}, {}, {}
rps_Ge, rps_Se, rps_STe, rps_mixe = {}, {}, {}, {}

for i in range(10, 19):
    df = get_result_encoding(pd.read_csv("data2/SP" + str(i) + ".csv"))

    df10 = get_stats(df, "rolling")
    rolling = get_poisson(df10, "ALL")
    df11 = get_stats(df)
    expanding = get_poisson(df11, "ALL")

    # Blend the mixed models' expected goals: rolling 1 : expanding 2.
    xg_cols = ["H_xG_Poi_mix", "A_xG_Poi_mix"]
    test2 = (rolling[xg_cols] + 2 * expanding[xg_cols]) / 3
    test2 = pd.concat([test2, df[["Home", "Draw", "Away"]]], axis=1)
    test2["H_pred_Poi"], test2["D_pred_Poi"], test2["A_pred_Poi"] = get_probs(test2["H_xG_Poi_mix"], test2["A_xG_Poi_mix"])
    test2["rps_Poi"] = rps(test2["H_pred_Poi"], test2["D_pred_Poi"], test2["A_pred_Poi"], test2["Home"], test2["Draw"], test2["Away"])

    # Every score is averaged over rows 100-339 of the season.
    for store, frame, column in (
        (rps_Gr, rolling, "rps_PoiG"),
        (rps_Sr, rolling, "rps_PoiS"),
        (rps_STr, rolling, "rps_PoiST"),
        (rps_mixr, rolling, "rps_Poi_mix"),
        (rps_mixer, test2, "rps_Poi"),
        (rps_Ge, expanding, "rps_PoiG"),
        (rps_Se, expanding, "rps_PoiS"),
        (rps_STe, expanding, "rps_PoiST"),
        (rps_mixe, expanding, "rps_Poi_mix"),
    ):
        store[i] = frame[column].iloc[100:340].mean()

    print(f"Season 20{i} done!")

  return mu >= 0


Season 2010 done!
Season 2011 done!
Season 2012 done!
Season 2013 done!
Season 2014 done!
Season 2015 done!
Season 2016 done!
Season 2017 done!
Season 2018 done!


In [357]:
# Average RPS of the goal-based model over all stored seasons
# (rolling, expanding). Dividing by the number of entries instead of a fixed
# 9 keeps the mean correct if the season range changes.
sum(rps_Gr.values()) / len(rps_Gr), sum(rps_Ge.values()) / len(rps_Ge)

(0.2195297536111111, 0.21219033930092596)

In [358]:
# Average RPS of the shot-based model over all stored seasons
# (rolling, expanding). Dividing by the number of entries instead of a fixed
# 9 keeps the mean correct if the season range changes.
sum(rps_Sr.values()) / len(rps_Sr), sum(rps_Se.values()) / len(rps_Se)

(0.20583712476388888, 0.2034064886249999)

In [359]:
# Average RPS of the shots-on-target model over all stored seasons
# (rolling, expanding). Dividing by the number of entries instead of a fixed
# 9 keeps the mean correct if the season range changes.
sum(rps_STr.values()) / len(rps_STr), sum(rps_STe.values()) / len(rps_STe)

(0.206387749599537, 0.20288141873148147)

In [417]:
# Average RPS of the mixed model over all stored seasons
# (rolling, expanding). Dividing by the number of entries instead of a fixed
# 9 keeps the mean correct if the season range changes.
sum(rps_mixr.values()) / len(rps_mixr), sum(rps_mixe.values()) / len(rps_mixe)

(0.2028956967893518, 0.19865499008796297)

In [418]:
# Average RPS of the rolling/expanding blend over all stored seasons.
# Dividing by the number of entries instead of a fixed 9 keeps the mean
# correct if the season range changes.
sum(rps_mixer.values()) / len(rps_mixer)

0.19911672328009264

In [421]:
# Average RPS of the mixed model over all stored seasons
# (rolling, expanding), evaluated again after the season loop was re-run.
# Dividing by the number of entries instead of a fixed 9 keeps the mean
# correct if the season range changes.
sum(rps_mixr.values()) / len(rps_mixr), sum(rps_mixe.values()) / len(rps_mixe)

(0.20482475655787036, 0.2017816703495371)

In [422]:
# Average RPS of the rolling/expanding blend over all stored seasons,
# evaluated again after the season loop was re-run. Dividing by the number
# of entries instead of a fixed 9 keeps the mean correct if the season range
# changes.
sum(rps_mixer.values()) / len(rps_mixer)

0.2019460230972222