In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer


In [55]:
# Read the two CSV inputs that every feature below is derived from.
# `teams` is the lookup table providing `team_name` / `team_code`;
# `matches` is the table that the engineered columns are appended to.
teams = pd.read_csv("data/fjelstul/teams.csv")
matches = pd.read_csv("data/matches_combined_data.csv")

### 1. total_goals_in_match

In [56]:
# Total goals in a match: both teams' scores plus both teams'
# penalty-shootout scores.
# The shootout columns are filled with 0 where missing so that a match
# without recorded shootout scores still gets a numeric total instead of NaN
# (NaN would otherwise propagate through the whole sum).
matches["total_goals_in_match"] = (
    matches["home_team_score"]
    + matches["away_team_score"]
    + matches["home_team_score_penalties"].fillna(0)
    + matches["away_team_score_penalties"].fillna(0)
)

### 2. match_for_host

In [57]:
# Flag matches in which `country_name` equals the name of either
# participating team (home or away).
condition1 = matches["home_team_name"].eq(matches["country_name"])
condition2 = matches["away_team_name"].eq(matches["country_name"])
matches["match_for_host"] = condition1 | condition2


### 3. used_capacity_ratio

In [58]:
# Share of the stadium that was filled: attendance divided by capacity,
# computed element-wise per match.
matches["used_capacity_ratio"] = matches["attendance"].div(matches["stadium_capacity"])

### 4. attendance_category 

In [59]:
# Discretise raw attendance into 5 ordinal bins using KBinsDiscretizer's
# default binning strategy.
transformer = KBinsDiscretizer(encode='ordinal', n_bins=5)

# scikit-learn transformers expect a 2-D (n_samples, n_features) input,
# hence the reshape of the single attendance column.
attendance_2d = matches['attendance'].values.reshape(-1, 1)
discrete_attendance = transformer.fit_transform(attendance_2d)

matches["attendance_category"] = discrete_attendance

### 5. relative_attendance_category

In [60]:
# Label each match by how full the stadium was, based on used_capacity_ratio:
#   (0, 0.99]    -> 'underload'
#   (0.99, 1.01] -> 'full'
#   (1.01, inf]  -> 'overload'
# pd.cut uses right-closed intervals by default, so values outside the bins
# (e.g. a ratio of exactly 0, or NaN) end up without a category.
bins = [0, 0.99, 1.01, np.inf]
labels = ['underload', 'full', 'overload']

cut_options = {
    'bins': bins,
    'labels': labels,
    'retbins': False,
}
transformer = FunctionTransformer(pd.cut, kw_args=cut_options)

capacity_ratio = matches['used_capacity_ratio']
matches["relative_attendance_category"] = transformer.fit_transform(capacity_ratio)

### 6. host_country_code

In [61]:
# Attach the team code of the host country by matching `country_name`
# against `team_name` in the teams lookup. A left join keeps every match,
# leaving NaN where the country has no matching team row.
#
# Exact duplicate (team_name, team_code) rows are collapsed first, and the
# merge is validated as many-to-one: a team name that appears more than once
# in the lookup would otherwise silently duplicate match rows. If the same
# name maps to conflicting codes, pandas raises MergeError instead.
matches = pd.merge(
    matches,
    teams[["team_name", "team_code"]].drop_duplicates(),
    left_on=["country_name"],
    right_on=["team_name"],
    how="left",
    validate="many_to_one",
)

In [62]:
# Tidy up after the merge: the joined `team_name` key is redundant with
# `country_name`, and `team_code` is renamed to say whose code it is.
matches = (
    matches
    .drop(columns="team_name")
    .rename(columns={"team_code": "host_country_code"})
)

### 7. tournament_year

In [63]:
# Take the second "-"-separated piece of `tournament_id` as the year.
# The value is left as a string; no numeric conversion is applied here.
tournament_id_parts = matches["tournament_id"].str.split("-")
matches["tournament_year"] = tournament_id_parts.str[1]

### 8. short_stage_name

In [71]:
# Collapse the stage into two labels: 'group' where the `group_stage`
# flag equals 1, 'knockout' for every other row.
is_group_stage = matches['group_stage'].eq(1)
matches["short_stage_name"] = np.where(is_group_stage, 'group', 'knockout')

### 9. winner_code

In [65]:
# Pick the code of the winning team per match. np.select evaluates the
# conditions in order (home win first, then away win); rows where neither
# flag is truthy fall through to the default of None.
home_won = matches['home_team_win'].astype(bool)
away_won = matches['away_team_win'].astype(bool)

conditions = [home_won, away_won]
choices = [matches['home_team_code'], matches['away_team_code']]

matches['winner_code'] = np.select(condlist=conditions, choicelist=choices, default=None)