# Feature Engineering

---

# Overview

In this file we create 3 types of features:
- Player based: e.g. average goals scored by player in the past 10 games
- Team based: e.g. team form, average goals conceded by team in the past 10 games
- Game based: e.g. home or away game

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("ready_datasets/data_cleaning_output.csv")

## Player based

**AVG/MAX of main statistics**

Here we create 'trend' variables using the rolling window method of pandas dataframes. Examples of such variables are:

- AVG goals scored in the past 6 games
- MAX assists in the past 3 games
- AVG base score in the past 9 games

For each type of variable (e.g. AVG goals scored) we create 3 versions: one that considers 3 games in the past, one that considers 6 and one that considers 9 games in the past. In the modelling stage we expect to remove some of the less predictive variables (especially since they are havily correlated).

In [3]:
# Impute values as 0 if player was on the bench <- he could have been picked for a game but scored 0 points
data.loc[(data['player_status'] == 'on the bench') & (data['goal_scored'].isna()), "base_score":"goal_decisive_win"] = data.loc[(data['player_status'] == 'on the bench') & (data['goal_scored'].isna()), "base_score":"goal_decisive_win"].fillna(0)

# Penalties scored and goal_scored was counted separately, create column for all goals in a game
data['goals_total'] = data['goal_scored'] + data['penalty_scored']

# Replace 6* with 0 in 'base_score' <- similarly to benched players they could have scored points but didn't, hence, we count them as 0
data['base_score'] = data['base_score'].replace('6*', '0')
data['base_score'] = [float(x) if str(x) != 'None' else x for x in data['base_score']]

In [4]:
# Function that uses rolling windows to calulate the AVG and MAX of a specified variable on a given timeframe.
# The granularity is per player

def avg_max_timex(df, col, timeframe):
    
    #prepare helper datasets. Useful for improving computation time
    output_df = pd.DataFrame()
    df1 = df[['player', 'team', 'season', 'matchweek', col]]
    
    #Variables for checking the runtime progress
    x = len(df1['player'].unique())
    counter = 0
    print('NOW DOING THIS', x)
    
    
    #Populate output dataset with avg and max for a specified rolling window
    for player in df1['player'].unique():
        temp = df1[df1['player'] == player]
        temp["avg_" + col + "_" + str(timeframe)] = temp[col].rolling(timeframe, min_periods=1).mean()
        temp["max_" + col + "_" + str(timeframe)] = temp[col].rolling(timeframe, min_periods=1).max()
        output_df = output_df.append(temp)
        
        #Print runtime progress every 100 players
        counter += 1
        if counter % 100 == 0:
            print(counter/x, '% done')
    
    #Merge with original dataset
    output_df = output_df.drop(columns=[col])
    df = pd.merge(df, output_df,  how='left', left_on=["player", "team", "season", "matchweek"], right_on=["player", "team", "season", "matchweek"])
    
    return df

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,player,season,matchweek,team,match_date,home_team,away_team,score,status,...,assist,goal_decisive_draw,goal_decisive_win,coach,coach_score,price,opponent,elo_rating,elo_rating_opponent,goals_total
0,0,AARONS,2017,23,VERONA,2018-02-04,VERONA,ROMA,0:1,LM,...,0.0,0.0,0.0,PECCHIA,5.0,5.0,ROMA,1771.0,2114.0,0.0
1,1,AARONS,2017,24,VERONA,2018-02-11,SAMPDORIA,VERONA,2:0,RW,...,0.0,0.0,0.0,PECCHIA,6.0,5.0,SAMPDORIA,1767.0,1935.0,0.0
2,2,AARONS,2017,25,VERONA,2018-02-19,LAZIO,VERONA,2:0,,...,0.0,0.0,0.0,PECCHIA,5.0,5.0,LAZIO,1762.0,2032.0,0.0
3,3,AARONS,2017,26,VERONA,2018-02-25,VERONA,TORINO,2:1,on the bench,...,0.0,0.0,0.0,,,4.0,TORINO,1759.0,1975.0,0.0
4,4,AARONS,2017,27,VERONA,2018-04-04,BENEVENTO,VERONA,3:0,on the bench,...,0.0,0.0,0.0,,,5.0,BENEVENTO,1765.0,1628.0,0.0


In [None]:
data = avg_max_timex(data, 'base_score', 10)
data = avg_max_timex(data, 'base_score', 7)
data = avg_max_timex(data, 'base_score', 4)

data = avg_max_timex(data, 'goal_scored', 10)
data = avg_max_timex(data, 'goal_scored', 7)
data = avg_max_timex(data, 'goals_total', 4)

data = avg_max_timex(data, 'assist', 10)
data = avg_max_timex(data, 'assist', 7)
data = avg_max_timex(data, 'assist', 4)

data = avg_max_timex(data, 'goal_conceded', 10)
data = avg_max_timex(data, 'goal_conceded', 7)
data = avg_max_timex(data, 'goal_conceded', 4)

data = avg_max_timex(data, 'yellow_card', 10)
data = avg_max_timex(data, 'yellow_card', 7)
data = avg_max_timex(data, 'yellow_card', 4)

data = avg_max_timex(data, 'penalty_failed', 10)
data = avg_max_timex(data, 'penalty_saved', 10)
data = avg_max_timex(data, 'own_goal', 10)
data = avg_max_timex(data, 'red_card', 10)

**Number of injured/benched in the past x games**

Here we create variables that count the number of absences of each kind on the short term (3 games in the past) and on the long term (10 games in the past). We use the same rolling window method as before. The three kinds are:

- Injuries
- Not in squad
- Benched

In [7]:
data['is_injured']      = [1 if x=='injured'      else 0 for x in data['player_status']]
data['is_not_in_squad'] = [1 if x=='not in squad' else 0 for x in data['player_status']]
data['is_benched']      = [1 if x=='on the bench' else 0 for x in data['player_status']]
data['is_suspended'] = [1 if x=='suspended' else 0 for x in data['player_status']]

In [8]:
def sum_timex_player(df, col, timeframe):
    
    #prepare helper datasets. Useful for improving computation time
    output_df = pd.DataFrame()
    df1 = df[['player', 'team', 'season', 'matchweek', col]]
    
    x = len(df1['player'].unique())
    counter = 0
    print('NOW DOING THIS', x)
    
    #Populate output dataset with avg and max for a specified rolling window
    for player in df1['player'].unique():
        temp = df1[df1['player'] == player]
        temp["sum_" + col + "_" + str(timeframe)] = temp[col].rolling(timeframe, min_periods=1).sum()
        output_df = output_df.append(temp)
        
        counter += 1
        if counter % 100 == 0:
            print(counter/x, '% done')
    
    #Merge with original dataset
    output_df = output_df.drop(columns=[col])
    df = pd.merge(df, output_df,  how='left', left_on=["player", "team", "season", "matchweek"], right_on=["player", "team", "season", "matchweek"])
    
    return df

In [None]:
data = sum_timex_player(data, 'is_injured', 10)
data = sum_timex_player(data, 'is_injured', 7)
data = sum_timex_player(data, 'is_injured', 4)

data = sum_timex_player(data, 'is_not_in_squad', 10)
data = sum_timex_player(data, 'is_not_in_squad', 7)
data = sum_timex_player(data, 'is_not_in_squad', 4)

data = sum_timex_player(data, 'is_benched', 10)
data = sum_timex_player(data, 'is_benched', 7)
data = sum_timex_player(data, 'is_benched', 4)

data = sum_timex_player(data, 'is_suspended', 10)
data = sum_timex_player(data, 'is_suspended', 7)
data = sum_timex_player(data, 'is_suspended', 4)

In [10]:
data.drop(columns=['is_injured', 'is_not_in_squad', 'is_benched', 'is_suspended'], inplace=True)

**Calculate time in days since last league game the player played in**

We expect that players who play regularly could perform better than players who are just coming back after an injury or being benched for a long time. We therefore check the number of days since a players last **played** game in a given season.

- If the player plays regularly the number will ususally be 7 since league games are every week
- If player had a long break the number will be large

In [11]:
days_since_last_game = []

# Iterate through each row of data
for index,row in data.iterrows():
    
    # Treat first row of data separately to avoid out of index error
    # Fill with average: 7 days
    if index == 0:
        days_since_last_game.append(7)
        continue
    
    # All other cases. Find the last 'played game' or the first game of the season (if no played games before)
    for i in range(38):
        
        # When we go from one player to another, we encounter the very first game of that player in the database
        # Wehn we go from one season to another, this is the very first game in that season
        # This will be the stopping point for all games before the first played game of the season
        # We fill with (i+1)*7 which represents time from the beggininig of the season
        if (data.iloc[index,0] != data.iloc[index-(i+1),0]) or (data.iloc[index,1] != data.iloc[index-(i+1),1]):
            days_since_last_game.append((i+1)*7)
            break
        
        # We look for the latest game where player was 'played in the game'
        # Find the difference in days to that game
        elif (data.iloc[index-(i+1),9] == 'played in the game'):
            days_since_last_game.append((data.iloc[index,4] - data.iloc[index-(i+1),4]).days)
            break
    
    # Illustrate progress of calculation
    if index%10000==0:
        print(index/len(data), '% done')

0.10149914233224729 % done
0.20299828466449457 % done
0.30449742699674187 % done
0.40599656932898914 % done
0.5074957116612364 % done
0.6089948539934837 % done
0.7104939963257311 % done
0.8119931386579783 % done
0.9134922809902256 % done


In [12]:
data['days_since_last_game'] = days_since_last_game

In [13]:
# Fill values below 0 with average: 7
# Theres not many negative values (<1000), this is a quick fix
# Negative values represent errors in database and they can impact this variable
data.loc[data['days_since_last_game']<0, 'days_since_last_game'] = 7

## Team based

Here we explore team based variables, such as how well is the team doing in terms of scoring goals, conceding goals and winning games. To the final dataset we add both the given player's team variables such as 'average amount of goals scored in the past 3 games', as well as the opponent's team variables such as 'average amount of goals conceded in the past 3 games'. This way we consider not only how well/bad is the given team doing but also how well/bad is the opposing team doing.

In [16]:
# Create a dataset that includes unique rows for combination [team, season, matchweek]
team_df = data[['team', 'season', 'matchweek', 'home_team', 'away_team', 'score']].drop_duplicates().sort_values(by=['team', 'season', 'matchweek'])

In [17]:
# Extract goals scored, conceded and win/loss from the 'score' variable
team_df['team_goals_scored']   = team_df['score'].apply(lambda x: int(x.split(":")[0]))
team_df['team_goals_conceded'] = team_df['score'].apply(lambda x: int(x.split(":")[1]))
team_df['team_win_loss'] = [ 1 if row['team_goals_scored']>row['team_goals_conceded'] else 0 if row['team_goals_scored']==row['team_goals_conceded'] else -1 for index,row in team_df.iterrows()]

In [18]:
# Deal with cases of duplicate [team, season, matchweek] by removing rows where team is not in away or home
team_df['team_check'] = [ -1 if ((row['team'].lower() == row['home_team'].lower()) | (row['team'].lower() == row['away_team'].lower())) else 0 for index,row in team_df.iterrows()]
team_df = team_df[team_df['team_check'] == -1]
team_df = team_df.drop(columns=['team_check'])
print('Number of duplicates left: ', team_df.duplicated(['team', 'season', 'matchweek']).sum())

Number of duplicates left:  0


**Calculate max, avg of goals scored and conceded**

In [19]:
def avg_max_timex_team(df, col, timeframe):
    
    #prepare helper datasets
    output_df = pd.DataFrame()
    df1 = df[['team', 'season', 'matchweek', col]]
    
    #Populate output dataset with avg and max for a specified rolling window
    for team in df1['team'].unique():
        temp = df1[df1['team'] == team]
        temp["avg_" + col + "_" + str(timeframe)] = temp[col].rolling(timeframe, min_periods=1).mean()
        temp["max_" + col + "_" + str(timeframe)] = temp[col].rolling(timeframe, min_periods=1).max()
        output_df = output_df.append(temp)
    
    #Merge with original dataset
    output_df = output_df.drop(columns=[col])
    df = pd.merge(df, output_df,  how='left', left_on=["team", "season", "matchweek"], right_on=["team", "season", "matchweek"])

    return df

In [None]:
team_df = avg_max_timex_team(team_df, 'team_goals_scored', 10)
team_df = avg_max_timex_team(team_df, 'team_goals_scored', 7)
team_df = avg_max_timex_team(team_df, 'team_goals_scored', 4)

team_df = avg_max_timex_team(team_df, 'team_goals_conceded', 10)
team_df = avg_max_timex_team(team_df, 'team_goals_conceded', 7)
team_df = avg_max_timex_team(team_df, 'team_goals_conceded', 4)

**Calculate team form**

How team form is calculated:
- 1 point for win
- 0 points for a draw
- -1 point for a loss

Then we sum up aver a period of time e.g. 3 games in the past. This variable is another indicator of how well a team is performing.

In [21]:
def team_form_timex(df, col, timeframe):
    
    #prepare helper datasets
    output_df = pd.DataFrame()
    df1 = df[['team', 'season', 'matchweek', col]]
    
    #Populate output dataset with sum for a specified rolling window
    for team in df1['team'].unique():
        temp = df1[df1['team'] == team]
        temp["team_form_" + str(timeframe)] = temp[col].rolling(timeframe, min_periods=1).sum()
        output_df = output_df.append(temp)
    
    #Merge with original dataset
    output_df = output_df.drop(columns=[col])
    df = pd.merge(df, output_df,  how='left', left_on=["team", "season", "matchweek"], right_on=["team", "season", "matchweek"])

    return df

In [None]:
team_df = team_form_timex(team_df, 'team_win_loss', 10)
team_df = team_form_timex(team_df, 'team_win_loss', 7)
team_df = team_form_timex(team_df, 'team_win_loss', 4)

In [23]:
team_df.head(2)

Unnamed: 0,team,season,matchweek,home_team,away_team,score,team_goals_scored,team_goals_conceded,team_win_loss,avg_team_goals_scored_10,...,max_team_goals_scored_4,avg_team_goals_conceded_10,max_team_goals_conceded_10,avg_team_goals_conceded_7,max_team_goals_conceded_7,avg_team_goals_conceded_4,max_team_goals_conceded_4,team_form_10,team_form_7,team_form_4
0,ATALANTA,2016,1,ATALANTA,LAZIO,3:4,3,4,-1,3.0,...,3.0,4.0,4.0,4.0,4.0,4.0,4.0,-1.0,-1.0,-1.0
1,ATALANTA,2016,2,SAMPDORIA,ATALANTA,2:1,2,1,1,2.5,...,3.0,2.5,4.0,2.5,4.0,2.5,4.0,0.0,0.0,0.0


**Merge with main table**

In [24]:
# Check if we are able to assign to every row
data['team_check'] = [ -1 if ((row['team'].lower() == row['home_team'].lower()) | (row['team'].lower() == row['away_team'].lower())) else 0 for index,row in data.iterrows()]

# Some will not match but all of them not so usefull <- about 200 rows will have wrong team forms,
# but all of them for players not in sqad / with NaN id and role and so on
print('Potential incorrect team assignemnts: ', len(data[data['team_check'] == 0]))
data.drop(columns=['team_check'], inplace=True)

Potential incorrect team assignemnts:  417


In [25]:
# Remove some rows to make my life easier in the following cell
team_df.drop(columns=['home_team', 'away_team', 'score','team_goals_scored', 
                      'team_goals_conceded', 'team_win_loss'], inplace=True)

In [26]:
# Create player team table and opponent team table (we will merge then to the main table)
# For each table we need
players_team  = team_df.copy(deep=True)
opposing_team = team_df.copy(deep=True)
opposing_team.rename(columns={'avg_team_goals_scored_15' : 'avg_opponent_goals_scored_15',
                             'avg_team_goals_scored_10' : 'avg_opponent_goals_scored_10',
                             'avg_team_goals_scored_5' : 'avg_opponent_goals_scored_15',
                              
                             'max_team_goals_scored_15' : 'max_opponent_goals_scored_15',
                             'max_team_goals_scored_10' : 'max_opponent_goals_scored_10',
                             'max_team_goals_scored_5' : 'max_opponent_goals_scored_5',
                              
                             'avg_team_goals_conceded_15' : 'avg_opponent_goals_conceded_15',
                             'avg_team_goals_conceded_10' : 'avg_opponent_goals_conceded_10',
                             'avg_team_goals_conceded_5' : 'avg_opponent_goals_conceded_5',
                              
                             'max_team_goals_conceded_15' : 'max_opponent_goals_conceded_15',
                             'max_team_goals_conceded_10' : 'max_opponent_goals_conceded_10',
                             'max_team_goals_conceded_5' : 'max_opponent_goals_conceded_5',
                              
                             'team_form_15' : 'opponent_form_15',
                             'team_form_10' : 'opponent_form_10',
                             'team_form_5' : 'opponent_form_5'}, inplace=True)

In [27]:
# Merge the datasets
data = pd.merge(data, players_team,  how='left', left_on=["team", "season", "matchweek"], right_on=["team", "season", "matchweek"])
data = pd.merge(data, opposing_team,  how='left', left_on=["opponent", "season", "matchweek"], right_on=["team", "season", "matchweek"])
data.drop(columns=['team_y'], inplace=True)
data.rename(columns={'team_x': 'team'}, inplace=True)

## Game based

**Away or home game**

In football playing on home pitch gives a big advantage, therefore, we create home_game variable

In [28]:
data['home_game'] = [ 1 if row['team']==row['home_team'] else 0 for index,row in data.iterrows()]

In [29]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,player,season,matchweek,team,match_date,home_team,away_team,score,status,...,avg_opponent_goals_conceded_10,max_opponent_goals_conceded_10,avg_team_goals_conceded_7_y,max_team_goals_conceded_7_y,avg_team_goals_conceded_4_y,max_team_goals_conceded_4_y,opponent_form_10,team_form_7_y,team_form_4_y,home_game
0,0,AARONS,2017,23,VERONA,2018-02-04,VERONA,ROMA,0:1,LM,...,0.8,2.0,0.857143,2.0,1.25,2.0,0.0,-1.0,-3.0,1
1,1,AARONS,2017,24,VERONA,2018-02-11,SAMPDORIA,VERONA,2:0,RW,...,1.2,2.0,1.0,2.0,0.75,1.0,2.0,4.0,1.0,0
2,2,AARONS,2017,25,VERONA,2018-02-19,LAZIO,VERONA,2:0,,...,1.6,5.0,1.428571,5.0,1.0,2.0,2.0,2.0,2.0,0
3,3,AARONS,2017,26,VERONA,2018-02-25,VERONA,TORINO,2:1,on the bench,...,0.9,3.0,0.571429,1.0,0.75,1.0,2.0,3.0,1.0,1
4,4,AARONS,2017,27,VERONA,2018-04-04,BENEVENTO,VERONA,3:0,on the bench,...,0.8,2.0,0.857143,2.0,1.0,2.0,8.0,5.0,4.0,0
5,5,AARONS,2017,28,VERONA,2018-03-10,VERONA,CHIEVO,1:0,,...,0.7,2.0,0.714286,2.0,0.5,1.0,4.0,2.0,3.0,1
6,6,AARONS,2017,29,VERONA,2018-03-18,VERONA,ATALANTA,0:5,,...,1.6,5.0,1.428571,5.0,2.0,5.0,-4.0,-1.0,-2.0,1
7,7,AARONS,2017,30,VERONA,2018-03-31,INTER,VERONA,3:0,RW,...,0.9,5.0,0.857143,5.0,1.25,5.0,3.0,3.0,0.0,0
8,8,AARONS,2017,31,VERONA,2018-04-08,VERONA,CAGLIARI,1:0,LW,...,1.6,5.0,2.142857,5.0,2.0,4.0,1.0,0.0,-1.0,1
9,9,AARONS,2017,32,VERONA,2018-04-15,BOLOGNA,VERONA,2:0,LW,...,0.7,2.0,0.428571,1.0,0.5,1.0,4.0,3.0,2.0,0


# Target Variable: Final Score

In [30]:
# list(enumerate(data.columns))

In [31]:
data.iloc[:,13:25]  = data.iloc[:,13:25].fillna(0)
data.iloc[:,29:108] = data.iloc[:,29:108].fillna(0)

In [32]:
# COMPUTE OUR TARGET VARIABLE, THE FINAL SCORE OF THE NEXT MATCHWEEK
# the model shall only have information about what preceded the moment it is trying to predict
# of course it is necessary to erase the last row available for each player, as we don't have information about the next score he will receive

def final_score (df1):
    
    df2 = pd.DataFrame()
    
    for player in df1.player.unique():
        
        #divide whole dataset into chunks, one for each player, and operate within one chunk at a time
        temp = df1[df1.player == player]
        
        #compute final score resulting from data on each row
        temp["final_score"] = temp.apply(lambda x: sum([x.base_score, x.goal_scored*(3), x.goal_conceded*(-1), x.penalty_saved*(3), x.penalty_failed*(-3), x.penalty_scored*(3), x.own_goal*(-2), x.yellow_card*(-0.5), x.red_card*(-1), x.assist]), axis=1)
        
        #shift final scores upward by 1, so that the final score at matchweek x is associated with the information available after matchweek x-1
        temp["final_score_next_week"] = temp["final_score"].shift(-1)
        
        #concatenate all chunks again in a single dataset
        df2 = df2.append(temp)
    
    #erase rows for which final score has remained empty i.e. the last observation we have of each player
    df2 = df2[df2["final_score_next_week"].notna()]
    
    return df2

In [None]:
data = final_score(data)

In [None]:
# Create avg max aggregates on the final_score variable
data = avg_max_timex(data, 'final_score', 10)
data = avg_max_timex(data, 'final_score', 7)
data = avg_max_timex(data, 'final_score', 4)

In [35]:
data.drop(columns=['Unnamed: 0'], inplace=True)

In [36]:
data.head()

Unnamed: 0,player,season,matchweek,team,match_date,home_team,away_team,score,status,player_status,...,team_form_4_y,home_game,final_score,final_score_next_week,avg_final_score_10,max_final_score_10,avg_final_score_7,max_final_score_7,avg_final_score_4,max_final_score_4
0,AARONS,2017,23,VERONA,2018-02-04,VERONA,ROMA,0:1,LM,played in the game,...,-3.0,1,5.0,5.5,5.0,5.0,5.0,5.0,5.0,5.0
1,AARONS,2017,24,VERONA,2018-02-11,SAMPDORIA,VERONA,2:0,RW,played in the game,...,1.0,0,5.5,5.5,5.25,5.5,5.25,5.5,5.25,5.5
2,AARONS,2017,25,VERONA,2018-02-19,LAZIO,VERONA,2:0,,error_status,...,2.0,0,5.5,0.0,5.333333,5.5,5.333333,5.5,5.333333,5.5
3,AARONS,2017,26,VERONA,2018-02-25,VERONA,TORINO,2:1,on the bench,on the bench,...,1.0,1,0.0,0.0,4.0,5.5,4.0,5.5,4.0,5.5
4,AARONS,2017,27,VERONA,2018-04-04,BENEVENTO,VERONA,3:0,on the bench,on the bench,...,4.0,0,0.0,0.0,3.2,5.5,3.2,5.5,2.75,5.5


In [37]:
data.to_csv("ready_datasets/final_dataset_for_modeling.csv")