Import Libraries

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

Import Train and Test Files

In [2]:
train = pd.read_csv('train.csv',parse_dates=["date"])

In [3]:
test = pd.read_csv('test.csv',parse_dates=["date"])

In [4]:
data = train.append(test)

Data Formatting and Feature Engineering

In [5]:
data = data.reset_index().set_index('date')

In [6]:
data = data.drop(columns="index")

In [7]:
train_pct_index = int(0.79989 * len(data))
train = data[:train_pct_index]
test = data[train_pct_index:]

In [8]:
def create_features(df):
    """
    Creates time series features from datetime index
    """
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [9]:
X_train, y_train = create_features(train), train['speed']
X_test, y_test   = create_features(test), test['speed']

X_train.shape, y_train.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = df.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['date'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dayofweek'] = df['date'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

((14006, 8), (14006,))

Hyper Parameter Optimization

In [10]:
booster=['gbtree','gblinear']
base_score=[0.25,0.5,0.75,1]

In [11]:
n_estimators = [195, 200, 203, 205, 210, 215, 220]
max_depth = [7, 8, 9, 10, 11, 12]
booster=['gbtree']
learning_rate=[0.30, 0.35, 0.40, 0.45, 0.5]
min_child_weight=[1, 1.5, 2, 3, 4]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

Set up the random search with 5-fold cross validation

In [12]:
regressor=xgb.XGBRegressor()

In [13]:
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=100,
            scoring = 'neg_mean_absolute_error',n_jobs = 5,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [14]:
random_cv.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   14.1s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  3.4min
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  6.0min
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  9.3min
[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed: 10.6min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                          validate_parameters=None,
                                          verbosity=None),
                   n_iter=100, n_jobs=5,
                   par

In [15]:
random_cv.best_estimator_

XGBRegressor(base_score=0.75, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.35, max_delta_step=0, max_depth=12,
             min_child_weight=1.5, missing=nan, monotone_constraints='()',
             n_estimators=203, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

Setting up model and fitting the data

In [16]:
#I did several rounds of submissions by playing around with the parameters that I extracted from the cross validation 
#My best submission is below

In [17]:
model = xgb.XGBRegressor(base_score=1, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.35, max_delta_step=0, max_depth=10,
             min_child_weight=3, missing=None, monotone_constraints='()',
             n_estimators=215, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
model.fit(X_train,y_train)

XGBRegressor(base_score=1, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.35, max_delta_step=0, max_depth=10,
             min_child_weight=3, missing=None, monotone_constraints='()',
             n_estimators=215, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

Predictions

In [26]:
#Train Data

In [30]:
y_pred_train = model.predict(X_train)

In [32]:
mean_squared_error(y_train, y_pred_train)

0.3475791307427396

In [33]:
#Test Data

In [34]:
y_pred_test = model.predict(X_test)

In [36]:
y_pred_test

array([47.18398 , 48.33232 , 40.753437, ..., 46.2421  , 40.99009 ,
       43.78659 ], dtype=float32)

In [37]:
import pandas as pd 
pd.DataFrame(y_pred_test).to_csv("Test Results")