In [1]:
import pandas as pd
import glob
import warnings
import pickle


# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning — consider narrowing the filter.
warnings.filterwarnings('ignore')

# glob returns matches in arbitrary, OS-dependent order. Sort them so the row
# order of `df` is reproducible: a later cell keeps only the first row per
# link (drop_duplicates(keep='first')), so row order affects the result.
paths = sorted(glob.glob('./Football-Dataset/*/*'))
if not paths:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No files matched './Football-Dataset/*/*' - check the working directory"
    )

# Read every matched file and stack them into a single frame.
pd_list = [pd.read_csv(path) for path in paths]

df = pd.concat(pd_list)

In [2]:
def clean_link(x):
    """Shorten a match URL to a site-relative path ending in a 4-character stub.

    The text following the first '.com' (up to any later '.com') is taken as
    the path. Its last '/'-separated segment is truncated to its first four
    characters (presumably the season year - confirm against the data) and
    re-attached to the preceding segments.

    Raises IndexError when `x` contains no '.com'.
    """
    path = x.split('.com')[1]
    *leading_segments, last_segment = path.split('/')
    return '/'.join(leading_segments) + '/' + last_segment[0:4]

In [3]:
# Normalise every raw link with clean_link; the result is the join key used
# when merging with the match-info table.
df['New Link'] = df['Link'].map(clean_link)

In [4]:
# Table that is merged onto the scraped results via its 'Link' column.
match_info_path = 'Match_Info.csv'
match_info_df = pd.read_csv(match_info_path)

In [5]:
# Inner-join the results with the match info on the normalised link. Both
# frames carry a 'Link' column, so pandas suffixes them; the left-hand one
# (the original full link) is renamed back to plain 'Link'.
new_df = pd.merge(
    df,
    match_info_df,
    left_on='New Link',
    right_on='Link',
).rename(columns={'Link_x': 'Link'})

In [6]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only load 'elo_dict.pkl' if it comes from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open('elo_dict.pkl', 'rb') as elo_file:
    elo_dict = pickle.load(elo_file)

# from_dict puts the dictionary keys on the columns; transposing turns each
# key into a row, and reset_index exposes those keys as a 'Link' column.
elo_df = pd.DataFrame.from_dict(elo_dict)
elo_df = elo_df.transpose().reset_index().rename(columns={'index': 'Link'})

In [7]:
# Join the Elo table with the match data on 'Link', discard rows that have no
# home Elo value, and keep a single row (the first) per link.
final_df = (
    elo_df
    .merge(new_df, on='Link')
    .dropna(subset=['Elo_home'])
    .drop_duplicates(subset=['Link'], keep='first')
)

In [8]:
# Keep only the columns used from here on. The explicit .copy() makes new_df
# an independent frame: later cells add columns to it, and writing to a
# selection of final_df is what triggers pandas' SettingWithCopyWarning.
new_df = final_df[
    ['League', 'Round', 'Season', 'Home_Team', 'Away_Team', 'Result', 'Elo_home', 'Elo_away']
].copy()

In [9]:
def get_home_result(x):
    """Return the part of a 'home-away' result string before the first '-'.

    The value is returned as a string; no numeric conversion is performed.
    If `x` contains no '-', the whole string is returned.
    """
    home, _, _ = x.partition('-')
    return home

def get_away_result(x):
    """Return the second '-'-separated piece of a 'home-away' result string.

    The value is returned as a string; no numeric conversion is performed.
    Raises IndexError when `x` contains no '-'.
    """
    pieces = x.split('-')
    return pieces[1]

# Split the 'Result' string into one column per side. Both columns hold
# strings at this point, exactly as returned by the two helpers.
new_df['Home Score'] = new_df['Result'].map(get_home_result)
new_df['Away Score'] = new_df['Result'].map(get_away_result)

In [10]:
# Encode the match outcome: 1 = home win, 2 = draw, 3 = away win.
#
# The score columns hold strings, so they must be converted before comparing:
# comparing the raw strings is lexicographic ('10' > '9' is False) and
# mislabels any match with a double-digit score.
home_goals = pd.to_numeric(new_df['Home Score'], errors='coerce')
away_goals = pd.to_numeric(new_df['Away Score'], errors='coerce')

# Start from "draw" and overwrite the rows where one side scored more.
# Floats are used so the column keeps the float dtype the row-by-row
# version produced and can hold NaN.
outcome = pd.Series(2.0, index=new_df.index)
outcome[home_goals > away_goals] = 1.0
outcome[home_goals < away_goals] = 3.0
# A score that could not be parsed has no meaningful outcome: leave it
# missing instead of silently labelling the match a draw.
outcome[home_goals.isna() | away_goals.isna()] = float('nan')

new_df['Result_Num'] = outcome

In [12]:
# Expose the numeric result code under the name 'Outcome'.
new_df = new_df.rename({'Result_Num': 'Outcome'}, axis='columns')

## Feature engineering: goals scored so far (Premier League, 2021 season only)

In [13]:
# Boolean row filter: Premier League matches of the 2021 season.
mask = new_df['League'].eq('premier_league') & new_df['Season'].eq(2021)

In [14]:
# Rows of new_df selected by the boolean mask.
premier_league = new_df.loc[mask]

In [16]:
# Keep only the columns needed for the goals-so-far feature. The explicit
# .copy() makes this an independent frame: later cells add and update columns
# on it, and writing to a selection of another frame is what triggers pandas'
# SettingWithCopyWarning.
premier_league = premier_league[
    ['Round', 'Season', 'Home_Team', 'Away_Team', 'Home Score', 'Away Score']
].copy()
premier_league

Unnamed: 0,Round,Season,Home_Team,Away_Team,Home Score,Away Score
79243,1,2021,Fulham,Arsenal,0,3
79244,1,2021,Crystal Palace,Southampton,1,0
79245,1,2021,Liverpool,Leeds United,4,3
79246,1,2021,West Ham,Newcastle,0,2
79247,1,2021,West Bromwich Albion,Leicester,0,3
...,...,...,...,...,...,...
79495,26,2021,Crystal Palace,Fulham,0,0
79496,26,2021,Leicester,Arsenal,1,3
79497,26,2021,Tottenham Hotspur,Burnley,4,0
79498,26,2021,Chelsea,Man. Utd,0,0


In [17]:
# Create both running-total columns, starting every match at zero.
for running_total_column in ('Total Goals So Far Home', 'Total Goals So Far Away'):
    premier_league[running_total_column] = 0


In [20]:
# Running total of goals scored by each team, keyed by team name.
aux_dict = {}

# Walk the rounds in ascending order, pairing each round with the one that
# follows it. The last round is never a "current" round: there is no later
# round to receive its totals. Sorting removes the dependence on the row
# order of the frame, and pairing consecutive values avoids assuming that
# round numbers have no gaps.
# (The loop variable is deliberately not called `round`, which would shadow
# the builtin for every later cell.)
rounds_in_order = sorted(premier_league['Round'].unique())
for current_round, next_round in zip(rounds_in_order, rounds_in_order[1:]):
    # Add the goals scored in the current round to each team's running total.
    matches_this_round = premier_league[premier_league['Round'] == current_round]
    for _, row in matches_this_round.iterrows():
        aux_dict[row['Home_Team']] = aux_dict.get(row['Home_Team'], 0) + int(row['Home Score'])
        aux_dict[row['Away_Team']] = aux_dict.get(row['Away_Team'], 0) + int(row['Away Score'])

    # Write the totals accumulated so far onto the matches of the next round.
    in_next_round = premier_league['Round'] == next_round
    for team, goals_so_far in aux_dict.items():
        # A team is matched on whichever side(s) it appears; a mask that
        # selects no rows makes the assignment a no-op. Matching both sides
        # unconditionally also covers a team listed as home more than once in
        # the same round. Totals are still per round, so two matches of one
        # team within the same round receive the same value.
        plays_home = in_next_round & (premier_league['Home_Team'] == team)
        plays_away = in_next_round & (premier_league['Away_Team'] == team)
        premier_league.loc[plays_home, 'Total Goals So Far Home'] = goals_so_far
        premier_league.loc[plays_away, 'Total Goals So Far Away'] = goals_so_far

### !! This works because the data is clean. In some seasons and leagues the data is messy, and the same team can appear as the home team more than once within a single round. That will cause unexpected results for this algorithm.