In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer

sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = DataProcessing('../Data/train.csv')

In [3]:
df = df.loc[df['Lap_Time'] != 0]

y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])

obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Number')
obj_columns.append('Lap_Improvement')

num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')

# Scalers

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

In [5]:
ColumnTransformer = joblib.load('../Models/Column_Transformer.pkl')
#PowerTransformer = joblib.load('../Models/Power_Transformer.pkl')

trans_X = ColumnTransformer.transform(X)

#features_index = [72, 3, 12, 11, 13, 5, 17, 1, 14, 7, 8, 16, 15, 0, 9]
#trans_X = trans_X[:,features_index]
trans_X

<10059x114 sparse matrix of type '<class 'numpy.float64'>'
	with 185545 stored elements in Compressed Sparse Row format>

In [6]:
#features_index = [72, 3, 12, 11, 13, 5, 17, 1, 14, 7, 8, 16, 15, 0, 9]
#trans_X = trans_X[:,features_index]
#trans_X

In [7]:
#trans_X = trans_X.toarray()
y = np.asarray(y)

# Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=11, test_size=0.2)

# RF Model

In [10]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    bootstrap=True)

In [11]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                      n_estimators=1600)

In [12]:
results = pd.DataFrame()
results['Predicted'] = rf.predict(X_test)
results['Actual']= y_test
results['Difference'] = abs(results['Predicted'] - results['Actual'])

In [13]:
results['Difference'].mean()

16.508316535015116

In [14]:
from sklearn.metrics import mean_squared_error
mean_squared_error(results['Actual'], results['Predicted'], squared=False)

22.53624210167731

In [16]:
joblib.dump(rf, '../Models/RF_Model.h5')

['../Models/RF_Model.h5']

# New Features

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
trans_X = trans_X[:,indexes]
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=11, test_size=0.3)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    bootstrap=True)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_predicted = rf.predict(X_test)

root_mean_squared_log_error(y_test, y_predicted)

In [None]:
joblib.dump(rf, '../Models/RF_Model.h5')

# Output

In [None]:
results = pd.DataFrame()
results['Predicted'] = y_predicted
results['Actual'] = y_test
results

In [None]:
columns = column_transformer.get_feature_names_out()
importances = rf.feature_importances_

features = pd.DataFrame()
features['Column'] = columns
features['Importance'] = importances
features.sort_values(by='Importance', ascending=False).to_csv('../Data/Feature_Importances.csv', index=False)

# Grid Search

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from keras import backend as K

In [12]:
def root_mean_squared_log_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(K.log(1+y_pred) - K.log(1+y_true))))

In [None]:
rf = RandomForestRegressor(random_state=42)

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
rf_random = RandomizedSearchCV(
    estimator = rf, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 3, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_