## IMPORTS

#### Standards

In [169]:
from pathlib import Path

#### Externals

In [170]:
import warnings
import pandas as pd

## CONFIGS

#### Folders

In [171]:
ROOT = Path("../")
RAW = ROOT / "data/raw"
PROCESSED = ROOT / "data/processed"
CACHE  = str(ROOT / ".mypycache")

## OPTIONS

#### Warnings

In [172]:
warnings.filterwarnings("ignore")

## DATASETS

In [173]:
matches = pd.read_csv(PROCESSED/'matches.csv')
players_teams = pd.read_csv(PROCESSED/'players_teams.csv')
attendance = pd.read_csv(PROCESSED/'attendance.csv')
stadiums = pd.read_csv(RAW/'stadiums.csv')
players = pd.read_csv(RAW/'players.csv')
squads = pd.read_csv(RAW/'squads.csv')
tournaments = pd.read_csv(RAW/'tournaments.csv')
goals =  pd.read_csv(RAW/'goals.csv')

## FEATURE ENGINEERING

<span style="display:inline-block;">The following features will be created:<br> 
<span style="color:blue; font-style:italic;">- total goals in match</span>, <span style="color:blue; font-style:italic;">match for host</span> (binary feature indicates if the host team is playing) and <span style="color:blue; font-style:italic;">used capacity ratio</span> in the matches table.<br>
<span style="color:blue; font-style:italic;">- attendance category</span> depending on the attendance feature and <span style="color:blue; font-style:italic;">relative attendance category</span> depending on the used capacity ratio feature using Discretization<br>
<span style="color:blue; font-style:italic;">- host country code, tournament year, full name</span> (for players in a readable format).<br>
<span style="color:blue; font-style:italic;">- winner code</span> in tournaments table<br>
<span style="color:blue; font-style:italic;">- short stage name</span> which includes knockout and group stages only.<br>
<span style="color:blue; font-style:italic;">- late goal</span> (binary feature denotes whether a goal was scored late in the match), we'll see that the goal minute and halftime of the match can be helpful for this feature..

#### <span style="color:#007BF1; font-style:italic;">Total goals in match feature</span>

In [174]:
# Calculating the total goals in the match without the penalties

matches['total_goals_in_match'] = matches['home_team_score'] + matches['away_team_score']
matches[['home_team_score', 'away_team_score', 'score','score_penalties','total_goals_in_match']]

Unnamed: 0,home_team_score,away_team_score,score,score_penalties,total_goals_in_match
0,4,1,4–1,0-0,5
1,3,0,3–0,0-0,3
2,2,1,2–1,0-0,3
3,3,1,3–1,0-0,4
4,1,0,1–0,0-0,1
...,...,...,...,...,...
973,1,2,1–2,0-0,3
974,3,0,3–0,0-0,3
975,2,0,2–0,0-0,2
976,2,1,2–1,0-0,3


#### <span style="color:#007BF1; font-style:italic;">Match for host feature</span>

In [175]:
# Create a new column called 'match_for_host' with value 1 if the Host Country plays in the matche and 0 otherwise.

matches['match_for_host'] = ((matches['country_name'] == matches['home_team_name']) |
                             (matches['country_name'] == matches['away_team_name'])).astype(int)
matches[['country_name', 'home_team_name', 'away_team_name','match_for_host']].head()

Unnamed: 0,country_name,home_team_name,away_team_name,match_for_host
0,Uruguay,France,Mexico,0
1,Uruguay,United States,Belgium,0
2,Uruguay,Yugoslavia,Brazil,0
3,Uruguay,Romania,Peru,0
4,Uruguay,Argentina,France,0


#### <span style="color:#007BF1; font-style:italic;">Used capacity ratio feature</span>

In [176]:
# Create a new column called 'used_capacity_ratio' by Dividing 'attendance' on 'stadium_capacity'.
matches['used_capacity_ratio'] = matches['attendance'] / matches['stadium_capacity']
matches[['stadium_name', 'stadium_capacity', 'attendance','used_capacity_ratio']].head()

Unnamed: 0,stadium_name,stadium_capacity,attendance,used_capacity_ratio
0,Estadio Pocitos,10000,4444.0,0.4444
1,Estadio Gran Parque Central,20000,18346.0,0.9173
2,Estadio Gran Parque Central,20000,24059.0,1.20295
3,Estadio Pocitos,10000,2549.0,0.2549
4,Estadio Gran Parque Central,20000,23409.0,1.17045


#### <span style="color:#007BF1; font-style:italic;">Attendance category feature</span>

In [177]:
# Define custom bins for discretization
bins = [0, 25000, 50000, 75000, 100000, float('inf')]
labels = ['1k-25k', '25k-50k', '50k-75k', '75k-100k', '>100k']

# Create 'attendance_category' column based on discretization
matches['attendance_category'] = pd.cut(matches['attendance'], bins=bins, labels=labels, right=False)

# Display the relevant columns
matches[['attendance', 'stadium_capacity', 'used_capacity_ratio', 'attendance_category']].head()

Unnamed: 0,attendance,stadium_capacity,used_capacity_ratio,attendance_category
0,4444.0,10000,0.4444,1k-25k
1,18346.0,20000,0.9173,1k-25k
2,24059.0,20000,1.20295,1k-25k
3,2549.0,10000,0.2549,1k-25k
4,23409.0,20000,1.17045,1k-25k


#### <span style="color:#007BF1; font-style:italic;">Relative attendance category feature</span>

In [178]:
ratio_bins = [0, 0.25, 0.5, 0.75, 1.0, float('inf')]
ratio_labels = ['0%-25%', '25%-50%', '50%-75%', '75%-100%', '>100%']

# Create 'relative_attendance_category' column based on discretization of 'used_capacity_ratio'
matches['relative_attendance_category'] = pd.cut(matches['used_capacity_ratio'], bins=ratio_bins, labels=ratio_labels, right=False)

# Display the relevant columns
matches[['attendance', 'stadium_capacity', 'used_capacity_ratio', 'attendance_category', 'relative_attendance_category']].head()

Unnamed: 0,attendance,stadium_capacity,used_capacity_ratio,attendance_category,relative_attendance_category
0,4444.0,10000,0.4444,1k-25k,25%-50%
1,18346.0,20000,0.9173,1k-25k,75%-100%
2,24059.0,20000,1.20295,1k-25k,>100%
3,2549.0,10000,0.2549,1k-25k,25%-50%
4,23409.0,20000,1.17045,1k-25k,>100%


In [179]:
matches[['match_for_host', 'relative_attendance_category']].dtypes

match_for_host                     int32
relative_attendance_category    category
dtype: object

#### <span style="color:#007BF1; font-style:italic;">Host country code</span>

In [180]:
# create a dictionary that maps each country name to its code
country_codes = {'Algeria': 'DZA', 'Argentina': 'ARG', 'Australia': 'AUS',
                 'Austria': 'AUT', 'Belgium': 'BEL', 'Bosnia and Herzegovina': 'BIH',
                 'Brazil': 'BRA', 'Cameroon': 'CMR', 'Chile': 'CHL', 'China PR': 'CHN',
                 'Colombia': 'COL', 'Costa Rica': 'CRI', 'Croatia': 'HRV', 'Czech Republic': 'CZE',
                 'Denmark': 'DNK', 'Ecuador': 'ECU', 'Egypt': 'EGY', 'England': 'ENG',
                 'France': 'FRA', 'Germany': 'DEU', 'Ghana': 'GHA', 'Greece': 'GRC', 'Honduras': 'HND',
                 'Hungary': 'HUN', 'Iran': 'IRN', 'Iraq': 'IRQ', 'Italy': 'ITA', 'Ivory Coast': 'CIV',
                 'Japan': 'JPN', 'Korea DPR': 'PRK', 'Korea Republic': 'KOR', 'Mexico': 'MEX', 'Morocco': 'MAR',
                 'Netherlands': 'NLD', 'New Zealand': 'NZL', 'Nigeria': 'NGA', 'Northern Ireland': 'NIR',
                 'Norway': 'NOR', 'Paraguay': 'PRY', 'Peru': 'PER', 'Poland': 'POL', 'Portugal': 'PRT',
                 'Republic of Ireland': 'IRL', 'Romania': 'ROU', 'Russia': 'RUS', 'Saudi Arabia': 'SAU',
                 'Scotland': 'SCO', 'Senegal': 'SEN', 'Serbia': 'SRB', 'Slovakia': 'SVK', 'Slovenia': 'SVN',
                 'South Africa': 'ZAF', 'Spain': 'ESP', 'Sweden': 'SWE', 'Switzerland': 'CHE', 'Tunisia': 'TUN',
                 'Turkey': 'TUR', 'Ukraine': 'UKR', 'United Arab Emirates': 'ARE', 'United States': 'USA',
                 'Uruguay': 'URY', 'Wales': 'WAL'}

In [181]:
# applying 'host_country_code' in matches dataframe
matches['host_country_code'] = matches['country_name'].map(country_codes)
matches[['country_name', 'host_country_code']].head()

Unnamed: 0,country_name,host_country_code
0,Uruguay,URY
1,Uruguay,URY
2,Uruguay,URY
3,Uruguay,URY
4,Uruguay,URY


<span style="display:inline-block;">applying <span style="color:#007BF1; font-style:italic;">host_country_code</span> feature for other dataframes</span>

In [182]:
# tournaments dataframe
tournaments['host_country_code'] = tournaments['host_country'].map(country_codes)

# stadiums dataframe
stadiums['host_country_code'] = stadiums['country_name'].map(country_codes)
stadiums[['country_name', 'host_country_code']].head()

Unnamed: 0,country_name,host_country_code
0,Argentina,ARG
1,Argentina,ARG
2,Argentina,ARG
3,Argentina,ARG
4,Argentina,ARG


#### <span style="color:#007BF1;font-style:italic;">Tournament year feature</span>

In [183]:
# Exrecting the year as numerical value from the 'tournament_id' column and creating 'tournament_year'.
matches['tournament_year'] = matches['tournament_id'].str[-4:].astype(int)
matches[['tournament_id','tournament_name','tournament_year']].head()

Unnamed: 0,tournament_id,tournament_name,tournament_year
0,WC-1930,1930 FIFA World Cup,1930
1,WC-1930,1930 FIFA World Cup,1930
2,WC-1930,1930 FIFA World Cup,1930
3,WC-1930,1930 FIFA World Cup,1930
4,WC-1930,1930 FIFA World Cup,1930


<span style="display:inline-block;">applying <span style="color:#007BF1; font-style:italic;">tournament_year</span> feature for other dataframes</span>

In [184]:
# squads dataframe
squads['tournament_year'] = squads['tournament_id'].str[-4:].astype(int)

# attendance dataframe
attendance['tournament_year'] = pd.to_datetime(attendance['match_date']).dt.year

# goals dataframe
goals['tournament_year'] = goals['tournament_id'].str[-4:].astype(int)
goals[['tournament_id','tournament_year']].head()

Unnamed: 0,tournament_id,tournament_year
0,WC-1930,1930
1,WC-1930,1930
2,WC-1930,1930
3,WC-1930,1930
4,WC-1930,1930


#### <span style="color:#007BF1; font-style:italic;">full name feature</span>

In [185]:
# full name is concatination between the gvien name and the family name
players_teams['full_name'] = players_teams['given_name'] + ' ' + players_teams['family_name']
players_teams[['given_name','family_name','full_name']].head()

Unnamed: 0,given_name,family_name,full_name
0,Alan,A'Court,Alan A'Court
1,Brenden,Aaronson,Brenden Aaronson
2,Stefan,Abadzhiev,Stefan Abadzhiev
3,Jean-Paul,Abalo,Jean-Paul Abalo
4,Patrice,Abanda,Patrice Abanda


<span style="display:inline-block;">applying <span style="color:#007BF1; font-style:italic;">full_name</span> feature for other dataframes</span>

In [186]:
# players dataframe
players['full_name'] = players['given_name'] + ' ' + players['family_name']

# squads dataframe
squads['full_name'] = squads['given_name'] + ' ' + squads['family_name']

# goals dataframe
goals['full_name'] = goals['given_name'] + ' ' + goals['family_name']
goals[['given_name','family_name','full_name']].head()

Unnamed: 0,given_name,family_name,full_name
0,Lucien,Laurent,Lucien Laurent
1,Marcel,Langiller,Marcel Langiller
2,André,Maschinot,André Maschinot
3,Juan,Carreño,Juan Carreño
4,André,Maschinot,André Maschinot


#### <span style="color:#007BF1; font-style:italic;">Short stage name feature</span>

In [187]:
matches['short_stage_name'] = matches.apply(lambda x: 'group stage' if x['group_stage'] == 1 
                                            else 'knockout stage' if x['knockout_stage'] == 1 else '', axis=1)
matches[['group_stage','knockout_stage','short_stage_name']]

Unnamed: 0,group_stage,knockout_stage,short_stage_name
0,1,0,group stage
1,1,0,group stage
2,1,0,group stage
3,1,0,group stage
4,1,0,group stage
...,...,...,...
973,0,1,knockout stage
974,0,1,knockout stage
975,0,1,knockout stage
976,0,1,knockout stage


#### <span style="color:#007BF1; font-style:italic;">Winner code feature</span>

In [188]:
tournaments['winner_code'] = tournaments['winner'].map(country_codes)
tournaments[['winner', 'winner_code']].head()

Unnamed: 0,winner,winner_code
0,Uruguay,URY
1,Italy,ITA
2,Italy,ITA
3,Uruguay,URY
4,West Germany,


#### <span style="color:#007BF1; font-style:italic;">Late goal feature</span>

In [189]:
thresholds = {
    'first half': 40,
    'second half': 85,
    'first half, stoppage time': 45,
    'second half, stoppage time': 90,
    'extra time, first half': 97,
    'extra time, second half': 112,
    'extra time, first half, stoppage time': 105,
    'extra time, second half, stoppage time': 120
}

def is_late_goal(row):
    period = row['match_period']
    threshold = thresholds[period]
    minute = row['minute_stoppage'] if row['minute_stoppage'] > 0 else row['minute_regulation']
    return minute >= threshold


goals['late_goal'] = goals.apply(is_late_goal, axis=1)
goals[['minute_label','minute_regulation', 'minute_stoppage','match_period','late_goal']].head()

Unnamed: 0,minute_label,minute_regulation,minute_stoppage,match_period,late_goal
0,19',19,0,first half,False
1,40',40,0,first half,True
2,43',43,0,first half,True
3,70',70,0,second half,False
4,87',87,0,second half,True


### Save the processed data

In [190]:
matches.to_csv(PROCESSED/'matches.csv', index=False)
players_teams.to_csv(PROCESSED/'players_teams.csv', index=False)
attendance.to_csv(PROCESSED/'attendance.csv', index=False)
stadiums.to_csv(PROCESSED/'stadiums.csv', index=False)
players.to_csv(PROCESSED/'players.csv', index=False)
squads.to_csv(PROCESSED/'squads.csv', index=False)
tournaments.to_csv(PROCESSED/'tournaments.csv', index=False)
goals.to_csv(PROCESSED/'goals.csv', index=False)