In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer

sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = DataProcessing('../Data/train.csv')

In [3]:
df = df.loc[df['Lap_Time'] != 0]

y = df['Lap_Time']
X = df.drop(columns=['Lap_Time', 'Lap_Improvement', 'S1_Improvement', 'S2_Improvement', 'S3_Improvement'])

obj_columns = list(X.select_dtypes(include=object).columns)

#obj_columns.append('Lap_Improvement')
#obj_columns.append('Lap_Number')
#obj_columns.append('S1_Improvement')
#obj_columns.append('S2_Improvement')
#obj_columns.append('S3_Improvement')

num_columns = list(X.select_dtypes(include='number').columns)
#num_columns.remove('Lap_Number')
#num_columns.remove('Lap_Improvement')
#num_columns.remove('S1_Improvement')
#num_columns.remove('S2_Improvement')
#num_columns.remove('S3_Improvement')

# Scalers

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import joblib

In [5]:
column_transformer = ColumnTransformer(
[('num', MinMaxScaler(), num_columns),
('obj', OneHotEncoder(), obj_columns)],
remainder='passthrough')

trans_X = column_transformer.fit_transform(X)

In [6]:
joblib.dump(column_transformer, '../Models/Column_Transformer.pkl')
#joblib.dump(pt, '../Models/Power_Transformer.pkl')

['../Models/Column_Transformer.pkl']

In [7]:
#trans_X = trans_X.toarray()
y = np.asarray(y)

# Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=16, test_size=0.2)

# Gradient Boost Grid Search

In [10]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_absolute_percentage_error
from keras import backend as K

In [11]:
def root_mean_squared_log_error(y_true, y_pred):
        return np.sqrt(np.mean(np.square(np.log(1+y_pred) - np.log(1+y_true))))

In [None]:
gb = GradientBoostingRegressor(random_state=42)

In [None]:
scoring = {'MSLE': make_scorer(mean_squared_log_error),
           'MAPE': make_scorer(mean_absolute_percentage_error)}

In [None]:
random_grid = {
    "loss":['squared_error', 'absolute_error', 'huber'],
    "learning_rate": [0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(1, 200, 10, dtype=int),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8,10,12],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "absolute_error"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    }

In [None]:
gb_random = RandomizedSearchCV(
    estimator = gb, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 5, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1)

In [None]:
gb_random.fit(X_train, y_train)

In [None]:
params = gb_random.best_params_
params

### GB Best Params

In [12]:
gb = GradientBoostingRegressor(
subsample=1.0,
n_estimators=80,
min_samples_split=23,
min_samples_leaf=0.13636363636363638,
max_features='sqrt',
max_depth=8,
loss='huber',
learning_rate=0.1,
criterion='absolute_error')

In [13]:
gb.fit(X_train, y_train)

GradientBoostingRegressor(criterion='absolute_error', loss='huber', max_depth=8,
                          max_features='sqrt',
                          min_samples_leaf=0.13636363636363638,
                          min_samples_split=23, n_estimators=80)

In [20]:
results = pd.DataFrame()
results['Predicted'] = (1 / gb.predict(X_test)) - 1
results['Actual']= (1 / y_test) - 1
results['Difference'] = abs(results['Predicted'] - results['Actual'])

In [21]:
results

Unnamed: 0,Predicted,Actual,Difference
0,91.947827,73.0,18.947827
1,92.187727,88.0,4.187727
2,92.487113,82.0,10.487113
3,93.306111,95.0,1.693889
4,92.576327,78.0,14.576327
...,...,...,...
2007,93.005412,93.0,0.005412
2008,93.193905,114.0,20.806095
2009,77.722796,139.0,61.277204
2010,93.033387,111.0,17.966613


In [22]:
from sklearn.metrics import mean_squared_error
mean_squared_error(results['Actual'], results['Predicted'], squared=False)

22.264603251326864

In [23]:
results['Difference'].mean()

15.498294459306972

In [None]:
joblib.dump(gb, '../Models/Gradient_Boost_Model.h5')

In [16]:
(1 / gb.predict(X_test)) -1

array([91.9478268 , 92.18772714, 92.48711291, ..., 77.72279614,
       93.03338724, 92.43910168])