# Random Forest Implementation

In [1]:
import numpy as np
import pandas as pd
import os 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE

In [2]:
# The training CSVs are read from a "Train_Data" directory that sits one level
# above the current working directory.
data_folder = os.path.join(os.getcwd(), '..', 'Train_Data')


def _read_training_csv(file_name):
    """Read one CSV from ``data_folder``, using its first column as the row index."""
    csv_path = os.path.join(data_folder, file_name)
    return pd.read_csv(csv_path, index_col=0)


uncorrelated_home_stats = _read_training_csv('train_uncorrelated_home_stats.csv')
uncorrelated_away_stats = _read_training_csv('train_uncorrelated_away_stats.csv')
train_scores = _read_training_csv('Y_train.csv')

Next, I combine everything into one large DataFrame so that all the information about each game sits in a single row.

In [3]:
# Collapse the first three columns of train_scores into a single label per row
# so the forest classifies into exactly three outcomes:
#   1 = win, 0 = draw, -1 = loss.
# A row is labelled by the first of those columns (in order) that equals 1.
is_win = (train_scores.iloc[:, 0] == 1).to_numpy()
is_draw = (train_scores.iloc[:, 1] == 1).to_numpy()
is_loss = (train_scores.iloc[:, 2] == 1).to_numpy()

# Every row must produce a label. Silently skipping a row would shorten the
# label vector and shift every later label against its feature row, so fail
# loudly instead.
has_no_outcome = ~(is_win | is_draw | is_loss)
if has_no_outcome.any():
    raise ValueError(
        f"{int(has_no_outcome.sum())} row(s) in train_scores have no outcome flag set to 1; "
        "cannot build a label for every row."
    )

# np.select takes the first matching condition per row, which mirrors an
# if / elif / elif chain over the three columns.
results = np.select([is_win, is_draw, is_loss], [1, 0, -1]).tolist()
results_df = pd.DataFrame(results, columns=['Score'])

In [4]:
# Prefix every feature column with the side it describes so the home and away
# columns stay distinguishable once the two frames sit side by side.
# NOTE: the columns are renamed in place, so re-running this cell on frames
# that are already prefixed adds the prefix a second time.
uncorrelated_home_stats.columns = 'HOME_' + uncorrelated_home_stats.columns
uncorrelated_away_stats.columns = 'AWAY_' + uncorrelated_away_stats.columns

# axis=1 places the two frames side by side; join='inner' keeps only the row
# labels that appear in both frames.
files = [uncorrelated_home_stats, uncorrelated_away_stats]
train_data = pd.concat(files, join='inner', axis=1)

# results_df was built row-for-row from train_scores but carries a plain
# positional index. Give it train_scores' index so the labels can be looked up
# by the same row labels as the features. set_axis raises if the two lengths
# differ, i.e. if the labels no longer correspond one-to-one to train_scores.
labels_by_game = results_df.set_axis(train_scores.index, axis=0)

# Restrict the scores AND the label vector to the rows present in train_data,
# in train_data's order. results_df is what train_test_split receives, and it
# pairs rows by position, so filtering train_scores alone would leave the
# labels unaligned whenever the indexes differ. .loc raises KeyError if
# train_data contains a row label that train_scores does not.
train_scores = train_scores.loc[train_data.index]
results_df = labels_by_game.loc[train_data.index].reset_index(drop=True)

(12303, 918)

# Recursive Feature Elimination on Training Set

The next process has two steps. First, we separate our data into training, validation and test sets and perform RFE on the training set. Then, with this reduced set of features, we optimise hyperparameters on the validation set and build the forest on the training set. Finally, the resulting model is evaluated on the test set.

In [5]:
# Split twice to obtain three disjoint sets: first hold out the test set, then
# carve the validation set out of the remainder.
X_remaining, X_test, y_remaining, y_test = train_test_split(
    train_data, results_df, test_size=0.2, random_state=42
)
X_train, X_validate, y_train_df, y_validate_df = train_test_split(
    X_remaining, y_remaining, test_size=0.2, random_state=42
)

# Flatten the single-column label frames into 1-D arrays.
# y_test is left as returned by the split.
y_train = y_train_df.to_numpy().ravel()
y_validate = y_validate_df.to_numpy().ravel()

# Resulting shares of the original data (approximately):
#   test       0.20
#   training   0.64  (0.8 * 0.8)
#   validation 0.16  (0.8 * 0.2)

In [None]:
# Base estimator whose feature importances drive the elimination.
# n_jobs=-1 builds the trees on all available cores. RFE refits this forest
# once per elimination round, so fitting single-threaded dominates the runtime.
# With random_state fixed, the parallel fit should produce the same forest as
# the sequential one.
model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)

# Recursive feature elimination: refit the forest and drop the single least
# important feature (step=1) each round until 300 features remain.
# verbose=2 reports progress while fitting.
rfe = RFE(estimator=model, n_features_to_select=300, step=1, verbose=2)
rfe.fit(X_train, y_train)

In [9]:
# Boolean mask over X_train's columns marking the features RFE kept.
selected_features = rfe.support_

# Restrict the training frame to the kept columns and collect their names.
X_train_reduced_df = X_train.loc[:, selected_features]
new_features = X_train_reduced_df.columns.tolist()

# Save the kept feature names to a CSV in the current working directory.
new_features_df = pd.DataFrame(new_features)
new_features_df.to_csv('Final_features_list.csv')

print(f"Selected {X_train_reduced_df.shape[1]} features using RFE.")

Selected 910 features using RFE.
