In [35]:
import pandas as pd
import numpy as np
from trueskill import TrueSkill, Rating, rate_1vs1

In [36]:
def calculate_last_close_feature(df):
  """Add the closing odds each side had in the previous meeting of the same two teams.

  Rows are processed in the order they appear in `df` (the function does not
  sort them). For every match the home and away side receive the `PCH`/`PCA`
  value that team carried the last time these two teams faced each other,
  whichever of them was at home on that occasion. On the first meeting of a
  pair, the current row's own `PCH`/`PCA` are used instead.

  Args:
    df: DataFrame with `Home`, `Away`, `PCH` and `PCA` columns. It is modified
      in place.

  Returns:
    The same `df` object with `H_LastOdds` and `A_LastOdds` columns added.
  """
  # Maps (team, opponent) -> the value `team` had the last time it faced
  # `opponent`. It is filled lazily; pre-seeding every ordered pair of teams
  # grows quadratically with the number of teams and is never needed.
  lastOdds_dict = {}

  # Create lists for the 'to-be-created' columns
  h_lastOdds = []
  a_lastOdds = []

  # Walk the columns in lockstep: df.iloc[i][col] would rebuild a full row
  # Series for every single value that is read.
  for h_team, a_team, hc_pinn, ac_pinn in zip(df['Home'], df['Away'], df['PCH'], df['PCA']):
    h_prev = lastOdds_dict.get((h_team, a_team), 0)
    a_prev = lastOdds_dict.get((a_team, h_team), 0)

    if h_prev == 0 and a_prev == 0:
      # 0 on both sides means "no earlier meeting": use this match's own odds.
      h_lastOdds.append(hc_pinn)
      a_lastOdds.append(ac_pinn)
    else:
      h_lastOdds.append(h_prev)
      a_lastOdds.append(a_prev)

    # Remember this match for the next meeting of the pair.
    lastOdds_dict[(h_team, a_team)] = hc_pinn
    lastOdds_dict[(a_team, h_team)] = ac_pinn

  df['H_LastOdds'] = h_lastOdds
  df['A_LastOdds'] = a_lastOdds

  return df

In [37]:
def calculate_win_streak_feature(df):
  """Add each team's running win/loss streak as it stood before every match.

  A positive value counts consecutive wins, a negative value consecutive
  losses. A match with equal goals leaves both streaks untouched. Rows are
  processed in the order they appear in `df`.

  Args:
    df: DataFrame with `Home`, `Away`, `FTHG` and `FTAG` columns. It is
      modified in place.

  Returns:
    The same `df` object with `H_winstreak` and `A_winstreak` columns added.
  """
  def _after_win(streak):
    # A win restarts a losing run at +1, otherwise it extends the run.
    return 1 if streak < 0 else streak + 1

  def _after_loss(streak):
    # A loss restarts a winning run at -1, otherwise it deepens the run.
    return -1 if streak > 0 else streak - 1

  all_names = df.Home.tolist() + df.Away.tolist()
  streaks = {name: 0 for name in np.unique(np.array(all_names))}

  home_col = []
  away_col = []

  for pos in range(len(df)):
    match = df.iloc[pos]
    home, away = match['Home'], match['Away']

    # The feature is the streak going INTO the match.
    home_before = streaks[home]
    away_before = streaks[away]
    home_col.append(home_before)
    away_col.append(away_before)

    home_goals, away_goals = match['FTHG'], match['FTAG']

    if home_goals > away_goals:
      streaks[away] = _after_loss(away_before)
      streaks[home] = _after_win(home_before)
    elif away_goals > home_goals:
      streaks[home] = _after_loss(home_before)
      streaks[away] = _after_win(away_before)

  df['H_winstreak'] = home_col
  df['A_winstreak'] = away_col

  return df

In [38]:
def calculate_shock_feature(df, num_matches=1, count_type='sum'):
  """Add a rolling "shock" score per team, as it stood before every match.

  After each match both teams get one shock value appended to their history:

  * different goal counts: home gets `FTHG * (1 - PCH) - FTAG * (1 - PCA)` and
    away gets the negation of that;
  * equal goal counts: the side with the larger `PCH`/`PCA` value gets minus
    its own value and the other side gets its own value unchanged (when the
    two values are equal the away side is the negative one).

  The feature for a row aggregates the last `num_matches` entries of each
  team's history. Every history starts with a single 0, so early rows
  aggregate fewer real results.

  NOTE(review): the `1 - PCH` terms read as if `PCH`/`PCA` were probabilities;
  confirm that the columns passed in are not decimal odds.

  Args:
    df: DataFrame with `Home`, `Away`, `FTHG`, `FTAG`, `PCH` and `PCA`
      columns. It is modified in place.
    num_matches: Size of the trailing window; must be at least 1.
    count_type: 'sum' for the window total, or 'mean' for the total divided by
      `num_matches` (the divisor is `num_matches` even when fewer results are
      available).

  Returns:
    The same `df` object with `H_<num_matches>shock` and
    `A_<num_matches>shock` columns added.

  Raises:
    ValueError: if `count_type` is not 'sum' or 'mean', or `num_matches` < 1.
  """
  # Validate before touching any row, so a bad argument fails fast even when
  # the frame is empty.
  if count_type not in ('sum', 'mean'):
    raise ValueError("ERROR, faulty 'count_type' entered!")
  # history[-0:] is the whole list and a negative window skips the head of the
  # list; 'mean' would also divide by zero. Reject those windows.
  if num_matches < 1:
    raise ValueError("'num_matches' must be at least 1, got " + str(num_matches))

  teams = np.unique(np.array(df.Home.tolist() + df.Away.tolist()))
  shock_dict = {team: [0] for team in teams}

  # Create lists for the 'to-be-created' columns
  h_shocks = []
  a_shocks = []

  # Walk the columns in lockstep: df.iloc[i][col] would rebuild a full row
  # Series for every single value that is read.
  for h_team, a_team, hg, ag, hc_pinn, ac_pinn in zip(
      df['Home'], df['Away'], df['FTHG'], df['FTAG'], df['PCH'], df['PCA']):
    h_team_shocks = shock_dict[h_team]
    a_team_shocks = shock_dict[a_team]

    # The feature is the aggregate going INTO the match.
    h_sum = np.sum(h_team_shocks[-num_matches:])
    a_sum = np.sum(a_team_shocks[-num_matches:])
    if count_type == 'sum':
      h_shocks.append(h_sum)
      a_shocks.append(a_sum)
    else:
      h_shocks.append(h_sum/num_matches)
      a_shocks.append(a_sum/num_matches)

    # Record this match in both histories. The lists are mutated in place, so
    # nothing has to be written back into shock_dict.
    if hg == ag:
      if hc_pinn > ac_pinn:
        h_team_shocks.append(-hc_pinn)
        a_team_shocks.append(ac_pinn)
      else:
        h_team_shocks.append(hc_pinn)
        a_team_shocks.append(-ac_pinn)
    else:
      h_team_shocks.append((hg * (1 - hc_pinn)) - (ag * (1 - ac_pinn)))
      a_team_shocks.append((ag * (1 - ac_pinn)) - (hg * (1 - hc_pinn)))

  df['H_'  + str(num_matches) + 'shock'] = h_shocks
  df['A_'  + str(num_matches) + 'shock'] = a_shocks

  return(df)

In [39]:
def calculate_last_ftr_feature(df):
  """Add the goals each side scored in the previous meeting of the same two teams.

  Despite the function name, the values stored are goal counts (`FTHG` /
  `FTAG`), not the `FTR` result letter. Rows are processed in the order they
  appear in `df`. For every match the home and away side receive the number
  of goals that team scored the last time these two teams faced each other,
  whichever of them was at home on that occasion.

  A pair that has not met before gets 0 for both sides, which cannot be told
  apart from a previous goalless meeting.

  Args:
    df: DataFrame with `Home`, `Away`, `FTHG` and `FTAG` columns. It is
      modified in place.

  Returns:
    The same `df` object with `H_LastFTG` and `A_LastFTG` columns added.
  """
  # Maps (team, opponent) -> goals `team` scored the last time it faced
  # `opponent`. It is filled lazily; pre-seeding every ordered pair of teams
  # grows quadratically with the number of teams and is never needed.
  lastFTG_dict = {}

  # Create lists for the 'to-be-created' columns
  h_lastFTG = []
  a_lastFTG = []

  # Walk the columns in lockstep: df.iloc[i][col] would rebuild a full row
  # Series for every single value that is read.
  for h_team, a_team, fthg, ftag in zip(df['Home'], df['Away'], df['FTHG'], df['FTAG']):
    # The feature is what was known going INTO the match (0 = never met).
    h_lastFTG.append(lastFTG_dict.get((h_team, a_team), 0))
    a_lastFTG.append(lastFTG_dict.get((a_team, h_team), 0))

    # Remember this match for the next meeting of the pair.
    lastFTG_dict[(h_team, a_team)] = fthg
    lastFTG_dict[(a_team, h_team)] = ftag

  df['H_LastFTG'] = h_lastFTG
  df['A_LastFTG'] = a_lastFTG

  return df

In [40]:
def calculate_mmr_feature(df, draw_probability=0.265):
  """Add each team's TrueSkill rating mean as it stood before every match.

  Every team starts from the environment's default rating. Rows are processed
  in the order they appear in `df`; after the pre-match `mu` values have been
  recorded, both ratings are updated from the result (more home goals = home
  win, more away goals = away win, anything else is rated as a draw).

  Args:
    df: DataFrame with `Home`, `Away`, `FTHG` and `FTAG` columns. It is
      modified in place.
    draw_probability: Passed to the `TrueSkill` environment used for all
      rating updates.

  Returns:
    The same `df` object with `H_MMR` and `A_MMR` columns added.
  """
  teams = np.unique(np.array(df.Home.tolist() + df.Away.tolist()))

  trueSkill_env = TrueSkill(draw_probability=draw_probability)

  # Add all teams to TrueSkill
  trueSkill_dict = {team: trueSkill_env.create_rating() for team in teams}

  # Create lists for the 'to-be-created' columns
  h_mmr = []
  a_mmr = []

  # Walk the columns in lockstep: df.iloc[i][col] would rebuild a full row
  # Series for every single value that is read.
  for h_team, a_team, fthg, ftag in zip(df['Home'], df['Away'], df['FTHG'], df['FTAG']):
    h_trueskill = trueSkill_dict[h_team]
    a_trueskill = trueSkill_dict[a_team]

    # The feature is the rating going INTO the match.
    h_mmr.append(h_trueskill.mu)
    a_mmr.append(a_trueskill.mu)

    # rate_1vs1 takes the winner first. The module-level function with an
    # explicit env replaces the deprecated TrueSkill.rate_1vs1 method.
    if fthg > ftag:
      h_trueskill, a_trueskill = rate_1vs1(h_trueskill, a_trueskill, drawn=False, env=trueSkill_env)
    elif ftag > fthg:
      a_trueskill, h_trueskill = rate_1vs1(a_trueskill, h_trueskill, drawn=False, env=trueSkill_env)
    else:
      h_trueskill, a_trueskill = rate_1vs1(h_trueskill, a_trueskill, drawn=True, env=trueSkill_env)

    trueSkill_dict[h_team] = h_trueskill
    trueSkill_dict[a_team] = a_trueskill

  df['H_MMR'] = h_mmr
  df['A_MMR'] = a_mmr
  return df

In [41]:
def calculate_points_feature(df, num_matches=15):
  """Add each team's league points over its most recent matches, before every match.

  A win is worth 3 points, a loss 0; any other outcome gives both sides 1.
  Every team's history starts with a single 0 entry. Rows are processed in
  the order they appear in `df`.

  Args:
    df: DataFrame with `Home`, `Away`, `FTHG` and `FTAG` columns. It is
      modified in place.
    num_matches: Number of trailing history entries to sum. The window is
      taken with `history[-num_matches:]`, so 0 selects the whole history.

  Returns:
    The same `df` object with `H_<num_matches>Points` and
    `A_<num_matches>Points` columns added.
  """
  all_names = df.Home.tolist() + df.Away.tolist()
  history = {name: [0] for name in np.unique(np.array(all_names))}

  home_col = []
  away_col = []

  for pos in range(len(df)):
    match = df.iloc[pos]
    home_hist = history[match['Home']]
    away_hist = history[match['Away']]

    # The feature is the points total going INTO the match.
    home_col.append(np.sum(home_hist[-num_matches:]))
    away_col.append(np.sum(away_hist[-num_matches:]))

    home_goals, away_goals = match['FTHG'], match['FTAG']
    if home_goals > away_goals:
      home_pts, away_pts = 3, 0
    elif away_goals > home_goals:
      home_pts, away_pts = 0, 3
    else:
      home_pts, away_pts = 1, 1

    # The lists live inside `history`, so appending is all that is needed.
    home_hist.append(home_pts)
    away_hist.append(away_pts)

  df['H_'  + str(num_matches) + 'Points'] = home_col
  df['A_'  + str(num_matches) + 'Points'] = away_col
  return df

In [42]:
def calculate_realized_ev_feature(df, num_matches=5):
  """Add each team's "realized EV" over its most recent matches, before every match.

  After each match the winner's history gains its own `PCH`/`PCA` value; a
  side that did not win (loss or equal goals) gains `-(1 - value)`. Every
  team's history starts with a single 0 entry. Rows are processed in the
  order they appear in `df`.

  NOTE(review): the `-(1 - value)` term reads as if `PCH`/`PCA` were
  probabilities; with decimal odds above 1 it is positive. Confirm which one
  the columns hold.

  Args:
    df: DataFrame with `Home`, `Away`, `FTHG`, `FTAG`, `PCH` and `PCA`
      columns. It is modified in place.
    num_matches: Number of trailing history entries to sum. The window is
      taken with `history[-num_matches:]`, so 0 selects the whole history.

  Returns:
    The same `df` object with `H_<num_matches>EVs` and `A_<num_matches>EVs`
    columns added.
  """
  all_names = df.Home.tolist() + df.Away.tolist()
  history = {name: [0] for name in np.unique(np.array(all_names))}

  home_col = []
  away_col = []

  for pos in range(len(df)):
    match = df.iloc[pos]
    home_hist = history[match['Home']]
    away_hist = history[match['Away']]

    # The feature is the total going INTO the match.
    home_col.append(np.sum(home_hist[-num_matches:]))
    away_col.append(np.sum(away_hist[-num_matches:]))

    home_goals, away_goals = match['FTHG'], match['FTAG']
    home_value, away_value = match['PCH'], match['PCA']

    home_won = home_goals > away_goals
    away_won = (not home_won) and away_goals > home_goals

    # Only the winner gets its own value; everyone else gets -(1 - value).
    home_hist.append(home_value if home_won else -(1 - home_value))
    away_hist.append(away_value if away_won else -(1 - away_value))

  df['H_' + str(num_matches) + 'EVs'] = home_col
  df['A_' + str(num_matches) + 'EVs'] = away_col
  return df

In [43]:
# Load the two cleaned match tables; the first CSV column becomes the index.
training_df = pd.read_csv('./cleaned_data/cleaned_data.csv', index_col=[0])
england_df = pd.read_csv('./cleaned_data/england_cleaned.csv', index_col=[0])

In [44]:
# Show the cleaned training frame as loaded, before any feature columns are added.
training_df

Unnamed: 0,Home,Away,PH,PD,PA,FTHG,FTAG,FTR,PCH,PCD,PCA
0,Standard,Gent,4.27,3.67,1.89,2.0,2.0,D,4.36,3.58,1.90
1,Charleroi,Eupen,1.68,4.35,4.69,3.0,1.0,H,1.66,4.26,5.01
2,Kortrijk,Oud-Heverlee Leuven,2.47,3.65,2.81,0.0,2.0,A,2.40,3.45,3.06
3,Waregem,Seraing,2.20,3.83,3.14,2.0,0.0,H,1.96,3.54,4.13
4,St Truiden,St. Gilloise,4.13,3.62,1.93,1.0,1.0,D,3.65,3.06,2.31
...,...,...,...,...,...,...,...,...,...,...,...
52113,Orduspor,Akhisar Belediyespor,4.08,3.01,2.23,0.0,2.0,A,3.67,3.13,2.31
52114,Elazigspor,Sivasspor,2.42,3.47,3.02,0.0,0.0,D,2.52,3.32,3.06
52115,Eskisehirspor,Antalyaspor,2.18,3.56,3.44,3.0,1.0,H,1.93,3.60,4.37
52116,Genclerbirligi,Bursaspor,2.48,3.51,2.92,2.0,2.0,D,1.93,3.60,4.37


In [45]:
# Run every feature builder over the training frame, in a fixed order.
# Each builder adds its columns to the frame it is given and returns it.
print('Adding Last Close')
training_df = calculate_last_close_feature(training_df)

print('Adding Last FTR')
training_df = calculate_last_ftr_feature(training_df)

print('Adding MMR')
training_df = calculate_mmr_feature(training_df)

# The rolling features are built once per window size.
print('Adding Points')
for n_matches in (3, 9, 17):
  training_df = calculate_points_feature(training_df, num_matches=n_matches)

print('Adding Realized EV')
for n_matches in (3, 5, 9):
  training_df = calculate_realized_ev_feature(training_df, num_matches=n_matches)

print('Adding Shock')
for n_matches in (1, 3, 7):
  training_df = calculate_shock_feature(training_df, num_matches=n_matches)

print('Adding Winstreak')
training_df = calculate_win_streak_feature(training_df)

training_df

Adding Last Close
Adding Last FTR
Adding MMR
Adding Points
Adding Realized EV
Adding Shock
Adding Winstreak


Unnamed: 0,Home,Away,PH,PD,PA,FTHG,FTAG,FTR,PCH,PCD,...,H_9EVs,A_9EVs,H_1shock,A_1shock,H_3shock,A_3shock,H_7shock,A_7shock,H_winstreak,A_winstreak
0,Standard,Gent,4.27,3.67,1.89,2.0,2.0,D,4.36,3.58,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
1,Charleroi,Eupen,1.68,4.35,4.69,3.0,1.0,H,1.66,4.26,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
2,Kortrijk,Oud-Heverlee Leuven,2.47,3.65,2.81,0.0,2.0,A,2.40,3.45,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
3,Waregem,Seraing,2.20,3.83,3.14,2.0,0.0,H,1.96,3.54,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
4,St Truiden,St. Gilloise,4.13,3.62,1.93,1.0,1.0,D,3.65,3.06,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52113,Orduspor,Akhisar Belediyespor,4.08,3.01,2.23,0.0,2.0,A,3.67,3.13,...,28.79,31.61,0.63,-0.36,-2.61,-5.55,1.96,-15.90,-9,1
52114,Elazigspor,Sivasspor,2.42,3.47,3.02,0.0,0.0,D,2.52,3.32,...,41.63,32.34,-4.26,0.24,-9.01,-29.59,-29.59,-30.63,2,1
52115,Eskisehirspor,Antalyaspor,2.18,3.56,3.44,3.0,1.0,H,1.93,3.60,...,23.00,21.79,-5.42,-0.63,-8.28,0.85,-0.31,6.50,-1,2
52116,Genclerbirligi,Bursaspor,2.48,3.51,2.92,2.0,2.0,D,1.93,3.60,...,30.51,18.05,1.38,1.68,5.53,7.78,-0.81,4.86,-2,4


In [46]:
# Show the cleaned England frame as loaded, before any feature columns are added.
england_df

Unnamed: 0,FTHG,FTAG,FTR,Home,Away,PH,PD,PA,PCH,PCD,PCA
0,0.0,2.0,A,Crystal Palace,Arsenal,4.500,3.65,1.89,4.58,3.63,1.88
1,2.0,2.0,D,Fulham,Liverpool,11.200,6.22,1.28,10.50,6.50,1.29
2,2.0,0.0,H,Bournemouth,Aston Villa,3.930,3.58,2.04,4.09,3.59,2.00
3,2.0,1.0,H,Leeds,Wolves,2.390,3.33,3.30,2.45,3.44,3.09
4,2.0,0.0,H,Newcastle,Nott'm Forest,1.710,3.74,5.83,1.57,4.22,6.60
...,...,...,...,...,...,...,...,...,...,...,...
26540,2.0,1.0,H,Macclesfield,Cambridge,2.390,3.48,3.12,2.15,3.51,3.67
26541,1.0,0.0,H,Mansfield,Wrexham,1.461,4.42,8.70,1.43,4.61,9.00
26542,1.0,3.0,A,Southport,Luton,2.590,3.52,2.82,2.84,3.59,2.54
26543,2.0,1.0,H,Tamworth,Woking,2.230,3.62,3.33,2.31,3.48,3.27


In [47]:
# Run every feature builder over the England frame, in a fixed order.
# Each builder adds its columns to the frame it is given and returns it.
print('Adding Last Close')
england_df = calculate_last_close_feature(england_df)

print('Adding Last FTR')
england_df = calculate_last_ftr_feature(england_df)

print('Adding MMR')
england_df = calculate_mmr_feature(england_df)

# The rolling features are built once per window size.
print('Adding Points')
for n_matches in (3, 9, 17):
  england_df = calculate_points_feature(england_df, num_matches=n_matches)

print('Adding Realized EV')
for n_matches in (3, 5, 9):
  england_df = calculate_realized_ev_feature(england_df, num_matches=n_matches)

print('Adding Shock')
for n_matches in (1, 3, 7):
  england_df = calculate_shock_feature(england_df, num_matches=n_matches)

print('Adding Winstreak')
england_df = calculate_win_streak_feature(england_df)

england_df

Adding Last Close
Adding Last FTR
Adding MMR
Adding Points
Adding Realized EV
Adding Shock
Adding Winstreak


Unnamed: 0,FTHG,FTAG,FTR,Home,Away,PH,PD,PA,PCH,PCD,...,H_9EVs,A_9EVs,H_1shock,A_1shock,H_3shock,A_3shock,H_7shock,A_7shock,H_winstreak,A_winstreak
0,0.0,2.0,A,Crystal Palace,Arsenal,4.500,3.65,1.89,4.58,3.63,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
1,2.0,2.0,D,Fulham,Liverpool,11.200,6.22,1.28,10.50,6.50,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
2,2.0,0.0,H,Bournemouth,Aston Villa,3.930,3.58,2.04,4.09,3.59,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
3,2.0,1.0,H,Leeds,Wolves,2.390,3.33,3.30,2.45,3.44,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
4,2.0,0.0,H,Newcastle,Nott'm Forest,1.710,3.74,5.83,1.57,4.22,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26540,2.0,1.0,H,Macclesfield,Cambridge,2.390,3.48,3.12,2.15,3.51,...,21.75,23.33,-3.42,1.01,-9.24,-1.36,-4.28,-5.12,-6,1
26541,1.0,0.0,H,Mansfield,Wrexham,1.461,4.42,8.70,1.43,4.61,...,13.85,13.17,2.97,1.72,18.05,1.76,15.43,13.88,1,-1
26542,1.0,3.0,A,Southport,Luton,2.590,3.52,2.82,2.84,3.59,...,23.21,15.41,-3.21,2.50,-14.04,0.90,-9.93,15.59,2,1
26543,2.0,1.0,H,Tamworth,Woking,2.230,3.62,3.33,2.31,3.48,...,32.21,23.45,2.31,2.14,4.50,-0.42,-4.72,-4.43,-2,-1


# TODO

Concatenate `england_cleaned` onto `cleaned_data` so that both are engineered together.  
The Premier League data currently lives in the separate `england_df`!

In [None]:
# Write both engineered frames to CSV, index included (it is read back with index_col=[0]).
# NOTE: to_csv does not create missing directories, so ./engineered_data/ must already exist.
training_df.to_csv('./engineered_data/engineered_data.csv')
england_df.to_csv('./engineered_data/england_data.csv')