In [10]:
import numpy as np
import pandas as pd
import csv

In [11]:
# Output CSV that later cells fill with one feature row per match.
# newline='' is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate '\n' emit an extra blank line after
# every row.
# NOTE(review): this handle is not closed anywhere in the cells visible here —
# confirm a later cell calls file.close() once all rows have been written.
file = open('Datasets/feature_dataset.csv', 'w', newline='')
writer = csv.writer(file)

In [12]:
# Header row of the feature CSV; later cells append one row per match.
feature_columns = [
    'Season', 'Home Team', 'Away Team', 'Home Goals', 'Away Goals',
    'Home Team ELO', 'Away Team ELO',
    'Home XG', 'Away XG', 'Home XGA', 'Away XGA',
    'Home Win Percentage', 'Home Draw Percentage',
    'Away Win Percentage', 'Away Draw Percentage',
    'Head2Head Win', 'Head2Head Draw', 'Winner',
]
writer.writerow(feature_columns)

# Source tables.
xg_ratings_dataset = pd.read_csv('Datasets/XG_team_features.csv')
initial_dataset = pd.read_csv('Datasets/updated_training_data.csv')
elo_ratings_dataset = pd.read_csv('Datasets/ELO_ratings.csv')

# Fixtures, restricted to seasons whose end year is 2015 or later.
# The original row labels are kept on purpose (no reset_index): the
# feature-building loop uses them to look up rows in the other tables.
matches_dataset = pd.read_csv('Datasets/premier-league-matches.csv').loc[
    lambda fixtures: fixtures['Season_End_Year'] >= 2015
]


In [13]:
# Show the shape first, then finish on .head() so the notebook actually
# renders the preview (a bare expression is only displayed when it is the
# last statement of the cell).
print(matches_dataset.shape)
matches_dataset.head()

(3420, 8)


In [14]:
# Show the shape first, then finish on .head() so the notebook actually
# renders the preview (a bare expression is only displayed when it is the
# last statement of the cell).
print(initial_dataset.shape)
initial_dataset.head()

(12026, 9)


In [15]:
# Show the shape first, then finish on .head() so the notebook actually
# renders the preview (a bare expression is only displayed when it is the
# last statement of the cell).
print(elo_ratings_dataset.shape)
elo_ratings_dataset.head()

(12026, 8)


In [16]:
# Nested lookup: xg_ratings[season_end_year][team] -> [xG, xGA].
xg_ratings = {}

for _, team_row in xg_ratings_dataset.iterrows():
    # The first four characters of 'Season' are read as a year; adding one
    # gives the key used for that season (matched later against
    # 'Season_End_Year' of the fixtures).
    season_key = int(team_row['Season'][:4]) + 1
    season_table = xg_ratings.setdefault(season_key, {})
    # Stored as a list so later cells can overwrite the two values in place.
    season_table[team_row['Team']] = [team_row['xG'], team_row['xGA']]

print(xg_ratings)

{2015: {'Watford': [55.0, 45.0], 'Chelsea': [68.64, 31.52], 'Manchester City': [75.82, 40.5], 'Arsenal': [69.8, 35.72], 'Manchester Utd': [54.21, 33.84], 'Tottenham': [52.39, 57.04], 'Liverpool': [51.7, 38.25], 'Southampton': [54.97, 39.11], 'Swansea City': [40.9, 55.94], 'Stoke City': [46.26, 47.03], 'Crystal Palace': [44.76, 45.48], 'Everton': [44.89, 46.18], 'West Ham': [46.69, 57.4], 'West Brom': [38.63, 53.75], 'Leicester City': [48.21, 56.39], 'Newcastle Utd': [40.22, 51.01], 'Sunderland': [37.45, 51.46], 'Aston Villa': [33.1, 49.71], 'Hull City': [32.79, 46.99], 'Burnley': [39.37, 57.89], 'QPR': [45.73, 65.26]}, 2021: {'Watford': [50.0, 35.0], 'Manchester City': [77.72, 30.61], 'Manchester Utd': [63.17, 41.92], 'Liverpool': [72.21, 47.3], 'Chelsea': [68.66, 30.9], 'Leicester City': [58.8, 47.07], 'West Ham': [60.34, 49.86], 'Tottenham': [56.68, 52.55], 'Arsenal': [52.25, 43.23], 'Leeds United': [59.26, 63.02], 'Everton': [49.24, 50.16], 'Aston Villa': [56.72, 53.25], 'Newcastle 

In [17]:
# Every club that appears as host or visitor in the filtered fixtures.
teams = set(matches_dataset['Home'].unique()) | set(matches_dataset['Away'].unique())

# head2head[a][b] is the running record of team a against team b; all
# counters start at zero and are updated by the feature-building loop.
head2head = {}
for team in teams:
    head2head[team] = {
        opponent: {'Matches': 0, 'Wins': 0, 'Draws': 0}
        for opponent in teams
    }

In [18]:
# Fraction by which a team's season xG/xGA is nudged after a decisive result
# (scaled further by goals-per-xG and by the goal difference).
XG_ADJUSTMENT_RATE = 0.05


def _adjust_team_xg(rating, goals, goal_difference, won):
    """Update one team's ``[xG, xGA]`` list in place after a decisive match.

    Args:
        rating: two-element list ``[xG, xGA]`` taken from ``xg_ratings``;
            it is mutated in place.
        goals: goals this team scored in the match.
        goal_difference: winner's goals minus loser's goals.
        won: True if this team won the match, False if it lost.

    The winner's xG is scaled up and its xGA scaled down; the loser gets the
    opposite adjustment. Both results are rounded to whole numbers, exactly
    as the values were stored before this refactor.
    """
    xg, xga = rating
    # Goals scored relative to the current xG; falls back to 1 when xG is 0
    # to avoid dividing by zero.
    factor = goals / xg if xg != 0 else 1
    delta = XG_ADJUSTMENT_RATE * factor * goal_difference
    if won:
        rating[0] = round(xg * (1 + delta))
        rating[1] = round(xga * (1 - delta))
    else:
        rating[0] = round(xg * (1 - delta))
        rating[1] = round(xga * (1 + delta))


# Build one feature row per fixture. The row label of each fixture is used to
# fetch the same-labelled rows from initial_dataset and elo_ratings_dataset.
for index, row in matches_dataset.iterrows():

    season_end = row['Season_End_Year']
    season = f'{season_end - 1}-{season_end}'
    home_team = row['Home']
    away_team = row['Away']
    home_goals = row['HomeGoals']
    away_goals = row['AwayGoals']

    row2 = initial_dataset.loc[index]
    row3 = elo_ratings_dataset.loc[index]
    home_team_elo = row3['Home Team Rating']
    away_team_elo = row3['Away Team Rating']

    home_win_percentage = row2['Home Team Win Percentage']
    home_draw_percentage = row2['Home Team Draw Percentage']
    away_win_percentage = row2['Away Team Win Percentage']
    away_draw_percentage = row2['Away Team Draw Percentage']

    # Pre-match xG/xGA: these are the values written to the CSV, captured
    # before the result of this match modifies the running ratings.
    home_rating = xg_ratings[season_end][home_team]
    away_rating = xg_ratings[season_end][away_team]
    home_xg, home_xga = home_rating
    away_xg, away_xga = away_rating

    home_record = head2head[home_team][away_team]
    away_record = head2head[away_team][home_team]
    home_record['Matches'] += 1
    away_record['Matches'] += 1

    # Head-to-head percentages from the home side's point of view, computed
    # before this match's result is added to the win/draw counters.
    # NOTE(review): 'Matches' already includes the current fixture here, so
    # the denominator is one larger than the number of earlier meetings.
    # Behaviour kept as-is; confirm whether that is intended.
    head2head_win = round(home_record['Wins'] / home_record['Matches'] * 100)
    head2head_draw = round(home_record['Draws'] / home_record['Matches'] * 100)

    if row2['Winner'] == 1:
        winner = 1
        goal_difference = home_goals - away_goals
        _adjust_team_xg(home_rating, home_goals, goal_difference, won=True)
        _adjust_team_xg(away_rating, away_goals, goal_difference, won=False)
        home_record['Wins'] += 1
    elif row2['Winner'] == -1:
        winner = -1
        goal_difference = away_goals - home_goals
        _adjust_team_xg(away_rating, away_goals, goal_difference, won=True)
        _adjust_team_xg(home_rating, home_goals, goal_difference, won=False)
        away_record['Wins'] += 1
    else:
        # Any other label is written as a draw; xG ratings are left untouched.
        winner = 0
        home_record['Draws'] += 1
        away_record['Draws'] += 1

    # Columns follow the header order: Home XG, Away XG, Home XGA, Away XGA.
    # (Previously the two decisive-result branches wrote home xG, home xGA,
    # away xG, away xGA, which put values under the wrong column names.)
    writer.writerow([
        season, home_team, away_team, home_goals, away_goals,
        home_team_elo, away_team_elo,
        home_xg, away_xg, home_xga, away_xga,
        home_win_percentage, home_draw_percentage,
        away_win_percentage, away_draw_percentage,
        head2head_win, head2head_draw, winner,
    ])

# Push buffered rows to disk so the CSV is complete for any reader, even if
# the handle is only closed later.
file.flush()
    