# Feature Selection

In [1]:
!pip install boruta



In [2]:
import json
import pickle

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from constants.constants import *
from models import Boruta, PreProcessor # see models.py for implementation

# Notebook-wide pandas display settings.
pd.set_option('display.max_columns', None) # no limit on the number of columns displayed
pd.set_option('display.max_rows', 20) # cap displayed rows at 20; longer frames are shown truncated
pd.set_option('display.max_seq_items', None) # no limit on items shown when printing long sequences

To keep things simple for now, we use only the match outcome as the target when selecting the most important features. Only the data from 2019 to 2022 is used, because 2023 is the year we are predicting.

In [3]:
# Read one outcome CSV per training year (2019 through 2022) and stack them into one frame.
dfs = [
    pd.read_csv(f'./data/machine_learning/outcome/{year}.csv')
    for year in range(2019, 2023)
]

train_df = pd.concat(dfs, ignore_index=True)

In [4]:
# Drop the form columns: form is too complicated to take into account when predicting.
# errors='ignore' keeps this cell idempotent -- train_df is reassigned to itself here, so
# re-running the cell would otherwise raise KeyError because the columns are already gone.
train_df = train_df.drop(columns=['form', 'opponent_form'], errors='ignore')

We define a `PreProcessor` class and a `Boruta` child class in `models.py` so that the `Boruta` class has access to the preprocessor.

Normalize the data

In [5]:
# Work on complete rows only, then pass the predictors and the 'outcome' label to PreProcessor.
train_df_clean = train_df.dropna()
y = train_df_clean['outcome']
X = train_df_clean.drop('outcome', axis=1)
preprocessor = PreProcessor(X, y)

In [6]:
# Serialize the preprocessor object to disk with pickle.
with open('data/machine_learning/pkl/processor.pkl', 'wb') as pkl_file:
    pickle.dump(preprocessor, pkl_file)

## Using Correlation Matrix to drop Highly Related Cols

In [7]:
# Drop one feature out of every highly correlated pair.
CORR_THRESHOLD = 0.8  # absolute correlation above which a feature is treated as redundant

# Absolute pairwise correlations between the preprocessed training columns
corr_matrix = preprocessor.df_train_preprocessed.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair is checked once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when its correlation with any earlier column exceeds CORR_THRESHOLD
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]

# Filter in the original column order instead of subtracting sets: the iteration order of a
# set of strings is arbitrary and can change between kernel restarts, which made the saved
# feature list and the column order passed to later models vary from run to run.
dropped = set(to_drop)
CORR_COLS = [column for column in preprocessor.df_train_preprocessed.columns if column not in dropped]
print(f"{len(CORR_COLS)} features left")

155 features left


In [8]:
# Save the correlation-filtered feature names as an indented JSON array.
with open(UNIQUE_FEATURES_PATH, 'w') as out_file:
    out_file.write(json.dumps(CORR_COLS, indent=4))

# Logistic Regression

We use logistic regression to further reduce the number of features.

In [8]:
# Multinomial logistic regression over the correlation-filtered features.
# multi_class is intentionally not passed: it is deprecated since scikit-learn 1.5, and with
# the lbfgs solver the default already fits a multinomial model when there are 3+ classes.
# LogisticRegression is imported in the notebook's top import cell.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg.fit(preprocessor.df_train_preprocessed[CORR_COLS], preprocessor.y_train_encoded)

In [9]:
# Tabulate the fitted coefficients: one row per feature, one column per outcome class.
class_labels = preprocessor.target_encoder.inverse_transform(log_reg.classes_)
feature_names = preprocessor.df_train_preprocessed[CORR_COLS].columns
coefficients = log_reg.coef_

# Build the frame class-by-feature first, then transpose so that features become the rows.
coef_df = pd.DataFrame(
    coefficients,
    index=[f'Coeff_Class_{cls}' for cls in class_labels],
    columns=feature_names,
).T

coef_df

Unnamed: 0,Coeff_Class_D,Coeff_Class_L,Coeff_Class_W
team_corner_kicks_in_per90,-0.051626,-0.034214,0.085840
team_carries_into_penalty_area_per90,-0.001666,-0.082413,0.084079
team_cards_yellow_per90,-0.016557,0.027901,-0.011344
opponent_sca_take_ons_per90,-0.060198,0.059999,0.000200
opponent_tackles_def_3rd_per90,0.039956,-0.012848,-0.027108
...,...,...,...
team_players_used,-0.009566,0.035006,-0.025440
opponent_take_ons_tackled_pct,-0.004822,-0.008110,0.012931
opponent_challenge_tackles_per90,0.078559,-0.055045,-0.023514
opponent_tackles_interceptions_per90,0.015652,-0.001766,-0.013886


In [10]:
# Pick the features with the largest coefficients for each class column
# (20 for class D, 5 each for classes L and W; ties at the cutoff are kept).
# NOTE(review): class D gets a larger quota than the other two -- confirm this is intended.
top_features_d = coef_df['Coeff_Class_D'].nlargest(20, keep='all').index
top_features_l = coef_df['Coeff_Class_L'].nlargest(5, keep='all').index
top_features_w = coef_df['Coeff_Class_W'].nlargest(5, keep='all').index

# Merge the three selections and show the coefficients of the combined set.
top_features_all = top_features_d.union(top_features_l).union(top_features_w)
coef_df.loc[top_features_all]

Unnamed: 0,Coeff_Class_D,Coeff_Class_L,Coeff_Class_W
is_home,-0.001118,-0.360220,0.361337
opponent_aerials_lost_per90,0.041067,-0.058209,0.017142
opponent_assisted_shots_per90,0.071048,0.019760,-0.090808
opponent_assists_per90,0.014598,-0.238126,0.223528
opponent_challenge_tackles_per90,0.078559,-0.055045,-0.023514
...,...,...,...
team_tackles_def_3rd_per90,0.069956,-0.052033,-0.017923
team_tackles_interceptions_per90,0.035033,-0.031884,-0.003148
team_take_ons_per90,0.038815,0.025862,-0.064676
team_throw_ins_per90,0.038624,-0.222169,0.183545


In [11]:
# Plain Python list of the feature names selected via the logistic-regression coefficients.
LOG_REG_COLS = list(top_features_all)

## Boruta Algorithm

<br>
Reference: https://www.sciencedirect.com/science/article/pii/S1877050922007955 by Fátima Rodrigues
and Ângelo Pinto

- The Boruta algorithm was used. Boruta is a heuristic variable selection algorithm based on the random forest algorithm that aims to find the most relevant variables in a dataset.
- The Boruta algorithm shows which variables are relevant and which are not. It was used because it does not settle for a suboptimal solution; instead, it tries to find all variables that carry relevant information, which makes it possible to eliminate variables that would negatively affect the forecast models.

In [12]:
# Run Boruta on the logistic-regression shortlist and save the features it selects as JSON.
# NOTE(review): this fit uses preprocessor.y_train, while the logistic regression above uses
# preprocessor.y_train_encoded -- confirm which target is intended here.
boruta = Boruta()
boruta.fit(preprocessor.df_train_preprocessed[LOG_REG_COLS], preprocessor.y_train)

BORUTA_FEATURES = boruta.get_selected_features()
with open(BORUTA_FEATURES_PATH, 'w') as out_file:
    out_file.write(json.dumps(BORUTA_FEATURES, indent=4))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	9 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	10 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	11 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	12 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	13 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	14 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	15 / 100
Confirmed: 	18
Tentative: 	5
Rejected: 	4
Iteration: 	16 / 100
Confirmed: 	19
Tentative: 	4
Rejected: 	4
I

In [13]:
# Print the feature names returned by boruta.get_selected_features()
print(BORUTA_FEATURES)

['is_home', 'opponent_aerials_lost_per90', 'opponent_assisted_shots_per90', 'opponent_assists_per90', 'opponent_offsides_per90', 'opponent_passes_dead_per90', 'opponent_squad_value', 'opponent_tackles_def_3rd_per90', 'opponent_throw_ins_per90', 'opponent_touches_def_pen_area_per90', 'team_aerials_lost_per90', 'team_assists_per90', 'team_corner_kicks_per90', 'team_passes_dead_per90', 'team_squad_value', 'team_tackles_def_3rd_per90', 'team_tackles_interceptions_per90', 'team_take_ons_per90', 'team_throw_ins_per90', 'team_touches_def_pen_area_per90']
