In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer

sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint

In [2]:
df = DataProcessing('../Data/train.csv')

In [4]:
df = df.loc[df['Lap_Time'] != 0]

y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])

obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Number')
obj_columns.append('Lap_Improvement')

num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')

# Scalers

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

In [6]:
#pt = PowerTransformer(method='yeo-johnson')
#X[num_columns] = pt.fit_transform(X[num_columns])
X

Unnamed: 0,Lap_Number,S1,S1_Improvement,S2,S2_Improvement,S3,S3_Improvement,Kph,Elapsed,Driver_Name,Pit_Time,Team,Power,Event,Air_Temp,Track_Temp,Humidity,Pressure,Wind_Speed,Wind_Direction,Rain
0,1,343.300,0,35.427,0,43.313,0,28.8,422.0,SB,104.900000,JR,250,Free Practice 2,15.0556,18.6,60.0,1018.25,3.18280,175.0,-1.0
1,2,25.674,2,33.399,2,41.922,2,120.5,523.0,SB,104.900000,JR,250,Free Practice 2,15.0556,18.7,60.0,1018.25,4.24374,161.0,-1.0
2,3,28.129,0,34.091,0,57.248,0,101.9,642.5,SB,104.900000,JR,250,Free Practice 2,15.0556,18.7,60.0,1018.22,3.18280,148.0,-1.0
3,1,65.000,0,38.416,0,56.833,0,75.9,160.2,LGRA,8.250000,AD,250,Free Practice 2,15.0556,18.5,60.0,1018.12,2.12187,157.0,-1.0
4,2,28.013,0,36.743,0,44.716,0,111.2,269.7,LGRA,8.250000,AD,250,Free Practice 2,15.1111,18.5,60.0,1018.15,3.18280,149.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10267,17,22.936,0,21.231,0,23.701,0,124.9,1434.7,PWEHRL,12.580952,TAG,250,Free Practice 1,2973.0000,36.0,3847.0,101409.00,45.00000,171.0,0.0
10268,18,23.610,0,22.432,0,30.281,0,111.1,1511.0,PWEHRL,12.580952,TAG,235,Free Practice 1,2986.0000,361.0,3834.0,101413.00,0.00000,202.0,0.0
10269,19,137.100,0,22.681,0,24.308,0,46.1,1695.1,PWEHRL,12.580952,TAG,250,Free Practice 1,3005.0000,362.0,383.0,101419.00,1391.00000,203.0,0.0
10270,20,22.539,2,21.057,2,23.548,2,126.3,1762.3,PWEHRL,12.580952,TAG,250,Free Practice 1,2983.0000,362.0,3859.0,101416.00,254.00000,273.0,0.0


In [7]:
column_transformer = ColumnTransformer(
[('num', StandardScaler(), num_columns),
('obj', OneHotEncoder(), obj_columns)],
remainder='passthrough')

trans_X = column_transformer.fit_transform(X)

In [8]:
joblib.dump(column_transformer, '../Models/Column_Transformer.pkl')
#joblib.dump(pt, '../Models/Power_Transformer.pkl')

['../Models/Column_Transformer.pkl']

In [9]:
#trans_X = trans_X.toarray()
y = np.asarray(y)

In [10]:
test_x = trans_X[:1000,]
test_y = y[:1000,]

trans_X = trans_X[1000:,]
y = y[1000:,]

In [11]:
test_y.shape

(1000,)

# Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=16, test_size=0.2)

# Gradient Boost Grid Search

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_absolute_percentage_error
from keras import backend as K

In [15]:
def root_mean_squared_log_error(y_true, y_pred):
        return np.sqrt(np.mean(np.square(np.log(1+y_pred) - np.log(1+y_true))))

In [16]:
gb = GradientBoostingRegressor(random_state=42)

In [17]:
scoring = {'MSLE': make_scorer(mean_squared_log_error),
           'MAPE': make_scorer(mean_absolute_percentage_error)}

In [18]:
random_grid = {
    "loss":['squared_error', 'absolute_error', 'huber'],
    "learning_rate": [0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(1, 200, 10, dtype=int),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8,10,12],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    }

In [19]:
gb_random = RandomizedSearchCV(
    estimator = gb, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 5, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1)

In [20]:
gb_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


35 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mcand\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mcand\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 586, in fit
    n_stages = self._fit_stages(
  File "C:\Users\mcand\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 663, in _fit_stages
    raw_predictions = self._fit_stage(
  File "C:\Users\mcand\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 246, in _fit_stage
    tree.fit(X, residual, sample_weight=sample

RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'criterion': ['friedman_mse', 'mae'],
                                        'learning_rate': [0.001, 0.01, 0.025,
                                                          0.05, 0.075, 0.1,
                                                          0.15, 0.2],
                                        'loss': ['squared_error',
                                                 'absolute_error', 'huber'],
                                        'max_depth': [3, 5, 8, 10, 12],
                                        'max_features': ['log2', 'sqrt'],
                                        'min_samples_leaf': array([0.1       , 0.13636364, 0.17272727, 0.20909091, 0.24545455,
       0.28181818, 0.31818182, 0.35454545, 0.39090909, 0.42727273,
       0.46363636, 0.5       ]),
                                        'min_samples_split': 

In [None]:
params = gb_random.best_params_

In [None]:
gb = GradientBoostingRegressor(**params)

In [None]:
gb.fit(X_train, y_train)

In [None]:
predictions = gb.predict(X_test)

In [None]:
root_mean_squared_log_error(y_test, predictions)

In [None]:
predictions_test_x = gb.predict(test_x)

In [None]:
root_mean_squared_log_error(test_y, predictions_test_x)

In [None]:
joblib.dump(gb, '../Models/Gradient_Boost_Model.h5')