# Random Forest Implementation

In [1]:
import numpy as np
import pandas as pd
import os 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

These are the 4 dfs which I have discussed at length in 'investigating_data.ipynb'

In [10]:
data_folder = os.path.join(os.getcwd(), '..', 'Train_Data')
train_home_team_statistics_df   = pd.read_csv(os.path.join(data_folder, 'train_home_team_statistics_df.csv'), index_col=0)
train_away_team_statistics_df   = pd.read_csv(os.path.join(data_folder, 'train_away_team_statistics_df.csv'), index_col=0)
train_home_player_statistics_df = pd.read_csv(os.path.join(data_folder, 'selected_averaged_away_player_stats.csv'), index_col=0)
train_away_player_statistics_df = pd.read_csv(os.path.join(data_folder, 'selected_averaged_home_player_stats.csv'), index_col=0)
train_scores = pd.read_csv(os.path.join(data_folder, 'Y_train.csv'), index_col=0)

In [13]:
# Here i am turning results into a column vector, where 1 is a win, 0 is a draw, and -1 is a loss. This is again so our forest
# just classifies into outcomes of 3 classes
# It is defined so 1 = win, 0 = draw, -1 = loss.
results = []
for index, row in train_scores.iterrows():
    if row.iloc[0] == 1:
        results.append(1)
    elif row.iloc[1] == 1:
        results.append(0)
    elif row.iloc[2] == 1:
        results.append(-1)
results_df = pd.DataFrame(results, columns=['Score'])

Next, I make one big df and have it so all the information about every game is in one row

In [14]:
train_home_team    = train_home_team_statistics_df.iloc[:,2:]
train_away_team   = train_away_team_statistics_df.iloc[:,2:]
train_home_players = train_home_player_statistics_df
train_away_players   = train_away_player_statistics_df
# adding a home vs away prefix for all of the columns. 
train_home_team.columns = 'HOME_' + train_home_team.columns
train_away_team.columns = 'AWAY_' + train_away_team.columns
# join = inner just slots these 2 arrays side by side
files = [train_home_team,train_away_team,train_home_players, train_away_players]
train_data =  pd.concat(files,join='inner',axis=1)
# this last line is a bit unnecessary, but it just fixes the scores to only include games that we have.
train_scores = train_scores.loc[train_data.index] 
train_data.shape # This is (140 + 47) *2 as expected from earlier shapes. 

(12303, 374)

In [15]:
# We need a training, testing and validation set 
X_train, X_test, y_train, y_test = train_test_split(train_data, results_df, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
y_train = np.ravel(y_train)
y_validate = np.ravel(y_validate)
# This looks odd I assume but what we now have is 3 sets of data. The testing set is .2 of the original, testing set is .64 of 
# the orignial and validation set is 0.16 of the original. 

In [16]:
# This uses the results of the next section. Basically, we are just telling the random forest how many trees we want it to have, 
# how big we want the trees to be, and how many features to consider in each tree, etc. I sent you a video which I found very helpful 
# for getting to grips with this.

# These were chosen by running parameter optimisaion grid search on the validation section of the data. 
best_params = {
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 500
}
# Initialize the Random Forest classifier with the best parameters
rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42
)

In [41]:
rf.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = rf.predict(X_test)

In [42]:
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.4900446972775295


# Hyperparameter OPtimisation

In [39]:
from sklearn.model_selection import GridSearchCV
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data

grid_search.fit(X_validate, y_validate)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator to make predictions
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

print("Tuned Accuracy:", accuracy_score(y_test, y_pred_best))
print("Tuned Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("Tuned Classification Report:\n", classification_report(y_test, y_pred_best))

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   4.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=



[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=20, max_features=log2

In [18]:
y_valid_pred = rf.predict(X_validate)

In [21]:
print('Accuracy:', accuracy_score(y_validate, y_valid_pred))

Accuracy: 0.494667343829355
