In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from typing import List
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned per-tournament game tables (2010, 2014, 2018).
game_2010 = pd.read_csv('./Dataset/Cleaned/cleaned_game_2010.csv')
game_2014 = pd.read_csv('./Dataset/Cleaned/cleaned_game_2014.csv')
game_2018 = pd.read_csv('./Dataset/Cleaned/cleaned_game_2018.csv')

# Stack the three tables, then renumber the rows. reset_index() is called
# without drop=True, so the old per-file row labels are kept as a leading
# 'index' column; the positional slices `4:-1` used in later cells rely on
# this column layout.
games = pd.concat([game_2010, game_2014, game_2018]).reset_index()

# NOTE(review): inputs are read from 'Dataset/Cleaned' but this output goes to
# 'Dataset/cleaned'; on a case-sensitive filesystem these are different
# directories — confirm which one is intended.
games.to_csv('./Dataset/cleaned/cleaned_games_total.csv')

### Data Preprocessing
* Replace each column's value with its offset from the opponent's value (team value minus opponent value)  
* Use forward and backward selection for feature selection

In [3]:
# Drop the goals column: the team with more goals always wins, so keeping it
# would leak the outcome directly into the features.
games = games.drop(columns=['goals'])

# The offset below pairs row i with row i + 1, so the table must hold an even
# number of rows. Check this up front instead of failing with an IndexError on
# the last row after every other pair has already been overwritten.
# (Presumably each consecutive pair is the two teams of one game — TODO confirm
# against the cleaned CSVs.)
if len(games) % 2 != 0:
    raise ValueError(
        f"expected an even number of rows (two per game), got {len(games)}"
    )

# Replace every feature (columns 4 .. second-to-last) with its difference from
# the paired row's value.
for i in range(0, len(games), 2):
    # Take explicit copies of both rows before writing anything, so that the
    # second assignment never sees values already overwritten by the first
    # (a bare .iloc row selection may be a view on the frame).
    first = games.iloc[i, 4:-1].copy()
    second = games.iloc[i + 1, 4:-1].copy()
    games.iloc[i, 4:-1] = first - second
    games.iloc[i + 1, 4:-1] = second - first

games.to_csv('./Dataset/cleaned/cleaned_games_offset_total.csv')

In [4]:
# Features are the columns from position 4 up to (not including) the last one;
# the last column is the target.
X, y = games.iloc[:, 4:-1], games.iloc[:, -1]

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [6]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [7]:
# NOTE(review): the name `object` shadows Python's built-in `object` for the
# rest of the kernel session. A name such as `scaler` would be safer, but other
# cells may reference `object`, so it is left unchanged here.
# The scaler is fitted on the training split only; X_test is not transformed
# in this cell.
object = StandardScaler()
X_train = object.fit_transform(X_train)

In [8]:
def forward_selection(
    data: pd.DataFrame, target: pd.Series, significance_level: float = 0.05
) -> List[str]:
    """Greedy forward feature selection based on OLS p-values.

    Starting from an empty model, each round fits one OLS regression (with an
    intercept) per remaining candidate, adding that candidate to the features
    selected so far. The candidate with the smallest p-value is kept if that
    p-value is below ``significance_level``; otherwise the search stops.

    Args:
        data: (pandas data frame) contains the feature matrix
        target: (pandas series) represents target feature to search to generate
            significant features
        significance_level: (float) threshold to reject the null hypothesis
    Return:
        forward_list: (python list) contains significant features, in the order
        they were selected. Each feature name is a string. Empty if ``data``
        has no columns or no feature is significant.
    """
    selected: List[str] = []
    # Keep candidates in column order (not a set) so ties between equal
    # p-values are broken the same way on every run.
    candidates = data.columns.tolist()

    while candidates:
        # Explicit float dtype: an empty Series without dtype is object-typed
        # on current pandas, which makes min()/idxmin() unreliable.
        p_values = pd.Series(index=candidates, dtype=float)
        for name in candidates:
            design = sm.add_constant(data[selected + [name]])
            p_values.loc[name] = sm.OLS(target, design).fit().pvalues.loc[name]

        # A NaN minimum (all fits degenerate) compares False and ends the search.
        if not p_values.min() < significance_level:
            break

        best = p_values.idxmin()
        selected.append(best)
        candidates.remove(best)

    return selected

In [9]:
def backward_elimination(
    data: pd.DataFrame, target: pd.Series, significance_level: float = 0.05
) -> List[str]:
    """Backward feature elimination based on OLS p-values.

    Starting from all columns of ``data``, repeatedly fits an OLS regression
    (with an intercept) and removes the feature with the largest p-value, until
    every remaining feature has a p-value at or below ``significance_level``.

    Args:
        data: (pandas data frame) contains the feature matrix
        target: (pandas series) represents target feature to search to generate
            significant features
        significance_level: (float) threshold to reject the null hypothesis
    Return:
        backward_list: (python list) contains significant features, in their
        original column order. Each feature name is a string. Empty if ``data``
        has no columns or every feature was eliminated.
    """
    remaining = data.columns.tolist()

    # Looping on `remaining` also stops cleanly once every feature has been
    # eliminated, instead of fitting a model on an empty design matrix.
    while remaining:
        design = sm.add_constant(data[remaining])
        fitted = sm.OLS(target, design).fit()

        # Select the feature p-values by name rather than by position:
        # add_constant does not prepend an intercept when the data already
        # holds a constant column, so slicing off element 0 could hide a
        # real feature.
        p_values = fitted.pvalues.loc[remaining]

        # Written as "not >" so that an all-NaN result (degenerate fit) stops
        # the loop rather than failing inside idxmax()/remove().
        if not p_values.max() > significance_level:
            break
        remaining.remove(p_values.idxmax())

    return remaining

In [10]:
# Scaling returned a plain array, so wrap it in a DataFrame again to restore
# the feature names. The new frame already has a fresh 0..n-1 index, so no
# reset_index() is needed — calling it without drop=True would add an 'index'
# column that feature selection would then treat as a real feature.
X_train = pd.DataFrame(X_train, columns=X.columns)

# Give the target the same 0..n-1 index as X_train so the two line up row by
# row; drop=True discards the shuffled labels left over from the split instead
# of turning them into a column.
y_train = y_train.reset_index(drop=True)

In [12]:
# Forward pass: pick features on the training data, then take the same
# columns from the full games table.
forward_selected_list = forward_selection(X_train, y_train, significance_level=0.05)
X_forward = games[forward_selected_list]

# Backward pass: same idea, starting from all features and eliminating.
backward_selected_list = backward_elimination(X_train, y_train, significance_level=0.05)
X_backward = games[backward_selected_list]

# Show both selections, forward first.
for chosen_features in (forward_selected_list, backward_selected_list):
    print(chosen_features)

['FW_on-target', 'FW_time on opposite half', 'DF_mid activity', 'FW_passes_succeed', 'MF_fouls suffered', 'GK_total passes', 'MF_on-target', 'DF_red', 'MF_red', 'GK_low activity', 'DF_avg max speed', 'MF_distance', 'DF_on-target', 'FW_sprints', 'DF_distance in poss', 'DF_total passes', 'MF_total shots', 'GK_mid activity', 'GK_fouls suffered', 'DF_yellow', 'FW_mid activity', 'GK_avg max speed', 'DF_time on opposite pen', 'MF_time on opposite third', 'MF_time on opposite pen', 'MF_time on opposite half', 'FW_tackles gaining ball', 'MF_passes accuracy']
['GK_total passes', 'GK_fouls suffered', 'GK_avg max speed', 'GK_low activity', 'GK_yellow', 'DF_passes_succeed', 'DF_total passes', 'DF_on-target', 'DF_fouls committed', 'DF_distance in poss', 'DF_distance not in poss', 'DF_sprints', 'DF_avg max speed', 'DF_yellow', 'DF_red', 'MF_passes_succeed', 'MF_total passes', 'MF_passes accuracy', 'MF_total shots', 'MF_on-target', 'MF_fouls committed', 'MF_fouls suffered', 'MF_distance', 'MF_time on