# Random Forest Implementation 2

This is approach 2 of 3. In this method, I discussed with chat GPT4s 'Football Betting' GPT to narrow down the list of player stats to the most relevant ones to match outcome. These are then averaged over a team and added to the other team wide stats. I then removed ones already covered by the averaged team stats/ highly correlated with things we already take account of. This results in: 
- Accuracy on training data: 0.7782 (intentionally low, any higher leads to overfitting)
- Accuracy on validation data: 0.5089
- Accuracy on testing data: 0.4961

In [1]:
import numpy as np
import pandas as pd
import os 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

These are the 4 dfs which I have discussed at length in 'investigating_data.ipynb'

In [2]:
train_data_folder = os.path.join(os.getcwd(), '..', 'Train_Data')
train_home_team_statistics_df   = pd.read_csv(os.path.join(train_data_folder, 'train_home_team_statistics_df.csv'), index_col=0)
train_away_team_statistics_df   = pd.read_csv(os.path.join(train_data_folder, 'train_away_team_statistics_df.csv'), index_col=0)
train_home_player_statistics_df = pd.read_csv(os.path.join(train_data_folder, 'train_home_player_statistics_df.csv'), index_col=0)
train_away_player_statistics_df = pd.read_csv(os.path.join(train_data_folder, 'train_away_player_statistics_df.csv'), index_col=0)
important_player_stats = pd.read_csv(os.path.join(train_data_folder, 'important_player_stats.csv'), index_col=0)
train_scores = pd.read_csv(os.path.join(train_data_folder, 'Y_train.csv'), index_col=0)

In [3]:
# First, we trim our player stats dataset based on the columns we have inferred are the most 
# important from the football betting GPT
# select columns required
features = important_player_stats['0'].to_list()
essential_training_data_home = train_home_player_statistics_df[features]
essential_training_data_away = train_away_player_statistics_df[features]
# average over team and positons
averaged_training_data_home = essential_training_data_home.groupby(['ID']).mean()
averaged_training_data_away = essential_training_data_away.groupby(['ID']).mean()

In [4]:
# Here i am turning results into a column vector, where 1 is a win, 0 is a draw, and -1 is a loss. This is again so our forest
# just classifies into outcomes of 3 classes
# It is defined so 1 = win, 0 = draw, -1 = loss.
results = []
for index, row in train_scores.iterrows():
    if row.iloc[0] == 1:
        results.append(1)
    elif row.iloc[1] == 1:
        results.append(0)
    elif row.iloc[2] == 1:
        results.append(-1)
results_df = pd.DataFrame(results, columns=['Score'])

Next, I make one big df and have it so all the information about every game is in one row

In [5]:
train_home_team    = train_home_team_statistics_df.iloc[:,2:]
train_away_team   = train_away_team_statistics_df.iloc[:,2:]
# adding a home vs away prefix for all of the columns. 
train_home_team.columns = 'HOME_' + train_home_team.columns
averaged_training_data_home.columns = 'HOME_' + averaged_training_data_home.columns
train_away_team.columns = 'AWAY_' + train_away_team.columns
averaged_training_data_away.columns = 'AWAY_' + averaged_training_data_away.columns
# join = inner just slots these 2 arrays side by side
files = [train_home_team,train_away_team,averaged_training_data_home, averaged_training_data_away]
train_data =  pd.concat(files,join='inner',axis=1)
# this last line is a bit unnecessary, but it just fixes the scores to only include games that we have.
train_scores = train_scores.loc[train_data.index] 
train_data.shape # This is (140 + 47) *2 as expected from earlier shapes. 

(12303, 374)

In [18]:
# We need a training, testing and validation set 
X_train, X_test, y_train, y_test = train_test_split(train_data, results_df, test_size=0.01, random_state=42)
#X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
y_train = np.ravel(y_train)
#y_validate = np.ravel(y_validate)
# What we now have is 3 sets of data. The testing set is .2 of the original, testing set is .64 of 
# the orignial and validation set is 0.16 of the original. 

In [19]:
# This uses the results of the next section. Basically, we are just telling the random forest how many trees we want it to have, 
# how big we want the trees to be, and how many features to consider in each tree, etc. 

# These were chosen by running parameter optimisaion grid search on the validation section of the data. 
best_params = {
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 300
}
# Initialize the Random Forest classifier with the best parameters
rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42
)

In [20]:
rf.fit(X_train, y_train)
# Make predictions on the testing data
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)
# y_pred_validate = rf.predict(X_validate)

In [21]:
print('Accuracy on training data:',   accuracy_score(y_train, y_pred_train))
# print('Accuracy on validation data:', accuracy_score(y_validate, y_pred_validate))
print('Accuracy on testing data:',    accuracy_score(y_test, y_pred_test))

Accuracy on training data: 0.7958781509155103
Accuracy on testing data: 0.5241935483870968


# Hyperparameter Optimisation

In [39]:
from sklearn.model_selection import GridSearchCV
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data

grid_search.fit(X_validate, y_validate)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)

# Use the best estimator to make predictions
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

print("Tuned Accuracy:", accuracy_score(y_test, y_pred_best))
print("Tuned Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("Tuned Classification Report:\n", classification_report(y_test, y_pred_best))

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   2.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   4.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=



[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   1.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=20, max_features=log2

In [18]:
y_valid_pred = rf.predict(X_validate)

In [21]:
print('Accuracy:', accuracy_score(y_validate, y_valid_pred))

Accuracy: 0.494667343829355


# Submission 

We import the hidden testing data to generate the submission.

In [22]:
test_data_folder = os.path.join(os.getcwd(), '..', 'Test_Data')
test_home_team_statistics_df   = pd.read_csv(os.path.join(test_data_folder, 'test_home_team_statistics_df.csv'), index_col=0)
test_away_team_statistics_df   = pd.read_csv(os.path.join(test_data_folder, 'test_away_team_statistics_df.csv'), index_col=0)
test_home_player_statistics_df = pd.read_csv(os.path.join(test_data_folder, 'test_home_player_statistics_df.csv'), index_col=0)
test_away_player_statistics_df = pd.read_csv(os.path.join(test_data_folder, 'test_away_player_statistics_df.csv'), index_col=0)

In [23]:
essential_testing_data_home = test_home_player_statistics_df[features]
essential_testing_data_away = test_away_player_statistics_df[features]
# average over team and positons
averaged_testing_data_home = essential_testing_data_home.groupby(['ID']).mean()
averaged_testing_data_away = essential_testing_data_away.groupby(['ID']).mean()

In [24]:
test_home_team   = test_home_team_statistics_df
test_away_team   = test_away_team_statistics_df
# adding a home vs away prefix for all of the columns. 
test_home_team.columns = 'HOME_' + test_home_team.columns
averaged_testing_data_home.columns = 'HOME_' + averaged_testing_data_home.columns
test_away_team.columns = 'AWAY_' + test_away_team.columns
averaged_testing_data_away.columns = 'AWAY_' + averaged_testing_data_away.columns
# join = inner just slots these 2 arrays side by side
files = [test_home_team,test_away_team,averaged_testing_data_home, averaged_testing_data_away]
test_data =  pd.concat(files,join='inner',axis=1)
test_data.columns

Index(['HOME_TEAM_SHOTS_TOTAL_season_sum',
       'HOME_TEAM_SHOTS_INSIDEBOX_season_sum',
       'HOME_TEAM_SHOTS_OFF_TARGET_season_sum',
       'HOME_TEAM_SHOTS_ON_TARGET_season_sum',
       'HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum', 'HOME_TEAM_PASSES_season_sum',
       'HOME_TEAM_SUCCESSFUL_PASSES_season_sum', 'HOME_TEAM_SAVES_season_sum',
       'HOME_TEAM_CORNERS_season_sum', 'HOME_TEAM_FOULS_season_sum',
       ...
       'AWAY_PLAYER_ASSISTS_5_last_match_average',
       'AWAY_PLAYER_KEY_PASSES_5_last_match_average',
       'AWAY_PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_average',
       'AWAY_PLAYER_INTERCEPTIONS_5_last_match_average',
       'AWAY_PLAYER_CLEARANCES_5_last_match_average',
       'AWAY_PLAYER_RATING_5_last_match_average',
       'AWAY_PLAYER_GOALS_CONCEDED_5_last_match_sum',
       'AWAY_PLAYER_GOALS_CONCEDED_5_last_match_average',
       'AWAY_PLAYER_TACKLES_season_sum', 'AWAY_PLAYER_FOULS_5_last_match_sum'],
      dtype='object', length=374)

In [25]:
# quick check for column agreement
counter = 0
for column in train_data.columns:
    if column not in test_data.columns:
        print(column)
        counter += 1
print(counter)

0


In [26]:
test_predicted_outcomes = rf.predict(test_data)

In [50]:
test_predicted_outcomes

array([ 1, -1,  1, ...,  1,  1,  1])

In [27]:
start_id = 12303
ids = np.arange(start_id, start_id + len(test_predicted_outcomes))

# Initialize columns for HOME_WINS, DRAW, and AWAY_WINS
home_wins = (test_predicted_outcomes == 1).astype(int)
draw = (test_predicted_outcomes == 0).astype(int)
away_wins = (test_predicted_outcomes == -1).astype(int)

# Create a DataFrame
df = pd.DataFrame({
    'ID': ids,
    'HOME_WINS': home_wins,
    'DRAW': draw,
    'AWAY_WINS': away_wins
})

# Print the DataFrame
df.to_csv('Final_Submission.csv', index = False)
print(df)

          ID  HOME_WINS  DRAW  AWAY_WINS
0      12303          1     0          0
1      12304          0     0          1
2      12305          1     0          0
3      12306          1     0          0
4      12307          1     0          0
...      ...        ...   ...        ...
25363  37666          0     0          1
25364  37667          1     0          0
25365  37668          1     0          0
25366  37669          1     0          0
25367  37670          1     0          0

[25368 rows x 4 columns]
