#### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
%matplotlib inline

##### Suppressing FutureWarning messages for the rest of the notebook

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Loading the Dataset

In [3]:
# Read the training data, the test data and the sample-submission template
# from CSV files located in the working directory.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_aWnotuB.csv',
        'test_BdBKkAj_L87Nc3S.csv',
        'sample_submission_KVKNmI7.csv',
    )
)

In [4]:
# (rows, columns) of each loaded frame, in the order train, test, sample.
tuple(frame.shape for frame in (train, test, sample))

((48120, 4), (11808, 3), (11808, 2))

In [5]:
import datetime

In [6]:
# Stack the train and test rows into a single frame (row-wise concatenation).
combine = pd.concat(objs=[train, test], axis='index')

In [7]:
# Show the dtype of every column in the combined train+test frame.
combine.dtypes

DateTime     object
Junction      int64
Vehicles    float64
ID            int64
dtype: object

In [8]:
# Derive calendar features from the raw 'DateTime' strings.
# The timestamps are parsed once and read through the `.dt` accessor so that
# Year/Month/Day are integers; plain string splitting left them as text
# (object dtype), which forced the estimators to coerce them implicitly.
train_dt = pd.to_datetime(train['DateTime'])
train['Date'] = train['DateTime'].str.split().str[0]  # date part, kept as a string
train['Time'] = train_dt.dt.hour.astype('int')        # hour of day
train['Year'] = train_dt.dt.year
train['Month'] = train_dt.dt.month
train['Day'] = train_dt.dt.day

In [9]:
# Derive calendar features for the combined frame from its 'DateTime' strings.
# Parsing once and using the `.dt` accessor yields integer Year/Month/Day
# instead of the text values produced by string splitting.
combine_dt = pd.to_datetime(combine['DateTime'])
combine['Date'] = combine['DateTime'].str.split().str[0]  # date part, kept as a string
combine['Time'] = combine_dt.dt.hour.astype('int')        # hour of day
combine['Year'] = combine_dt.dt.year
combine['Month'] = combine_dt.dt.month
combine['Day'] = combine_dt.dt.day

In [10]:
# Derive calendar features for the test frame from its 'DateTime' strings.
# Parsing once and using the `.dt` accessor yields integer Year/Month/Day
# instead of the text values produced by string splitting.
test_dt = pd.to_datetime(test['DateTime'])
test['Date'] = test['DateTime'].str.split().str[0]  # date part, kept as a string
test['Time'] = test_dt.dt.hour.astype('int')        # hour of day
test['Year'] = test_dt.dt.year
test['Month'] = test_dt.dt.month
test['Day'] = test_dt.dt.day

In [11]:
# Preview the first rows of the test frame, including the derived date/time columns.
test.head()

Unnamed: 0,DateTime,Junction,ID,Date,Time,Year,Month,Day
0,2017-07-01 00:00:00,1,20170701001,2017-07-01,0,2017,7,1
1,2017-07-01 01:00:00,1,20170701011,2017-07-01,1,2017,7,1
2,2017-07-01 02:00:00,1,20170701021,2017-07-01,2,2017,7,1
3,2017-07-01 03:00:00,1,20170701031,2017-07-01,3,2017,7,1
4,2017-07-01 04:00:00,1,20170701041,2017-07-01,4,2017,7,1


In [12]:
# Same preview of the test frame as in the previous cell (duplicate display).
test.head()

Unnamed: 0,DateTime,Junction,ID,Date,Time,Year,Month,Day
0,2017-07-01 00:00:00,1,20170701001,2017-07-01,0,2017,7,1
1,2017-07-01 01:00:00,1,20170701011,2017-07-01,1,2017,7,1
2,2017-07-01 02:00:00,1,20170701021,2017-07-01,2,2017,7,1
3,2017-07-01 03:00:00,1,20170701031,2017-07-01,3,2017,7,1
4,2017-07-01 04:00:00,1,20170701041,2017-07-01,4,2017,7,1


In [13]:
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

# Build a seeded random forest and pretty-print the hyperparameters it uses.
rf = RandomForestRegressor(random_state=42)
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyperparameter.
# Tree counts: 10 evenly spaced values from 200 to 2000.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Feature-subset strategies tried at each split.
max_features = ['auto', 'sqrt']
# Depth limits: 10..110 in 11 steps, plus None (no limit).
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Minimum samples needed to split an internal node.
min_samples_split = [2, 5, 10]
# Minimum samples required in a leaf.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the search space consumed by RandomizedSearchCV.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [14]:
# Feature matrix: every training column except the raw timestamp, the row ID,
# the target and the date string.
X = train.drop(columns=['DateTime', 'ID', 'Vehicles', 'Date'])
# Target: the vehicle count.
y = train['Vehicles']

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so the split
# (and every score computed from it) is the same on each run.
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Base model to tune; seeded so the fitted forests are reproducible.
rf = RandomForestRegressor(random_state=42)
# Randomized search over `random_grid`: 10 sampled combinations (n_iter),
# 3-fold cross-validation, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the training split.
rf_random.fit(train_features, train_labels)

def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(features)``.
    test_features : array-like
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values matching ``test_features`` row by row.

    Returns
    -------
    float
        ``100 - MAPE`` in percent. Rows whose true label is zero are left
        out of the MAPE (their percentage error is undefined); the result is
        NaN when every label is zero.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float)
    actual = np.asarray(test_labels, dtype=float)
    errors = np.abs(predictions - actual)

    # A zero label would make the percentage error infinite and poison the mean.
    nonzero = actual != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / actual[nonzero])
    else:
        mape = float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    print('Average Error: {:0.4f} vehicles.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Baseline: a seeded 100-tree forest fitted on the training split and scored
# on the held-out split.
base_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    train_features, train_labels
)
base_accuracy = evaluate(base_model, test_features, test_labels)


# Score the best estimator found by the randomized search on the same held-out split.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(
    model=best_random, test_features=test_features, test_labels=test_labels
)

# Relative change in accuracy versus the baseline, in percent.
print(
    'Improvement of {:0.2f}%.'.format(
        100 * (random_accuracy - base_accuracy) / base_accuracy
    )
)

In [16]:
# Refit the forest with 500 trees (same seed) on the training split;
# this model is the one used for the submission predictions.
base_model = RandomForestRegressor(random_state=42, n_estimators=500)
base_model.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [17]:
# Select exactly the training feature columns, in training order.
# Dropping a fixed column list would let any column added to `test` later
# (e.g. a 'Vehicles' prediction column) leak into the features when this
# cell is re-run.
test_Data = test[X.columns]

In [18]:
# Predict vehicle counts for the test features, passed as a NumPy array via `.values`.
pred=base_model.predict(test_Data.values)

In [19]:
# Round to the nearest whole vehicle before casting; a bare astype(int)
# truncates toward zero and biases every prediction downward.
sample['Vehicles'] = np.rint(pred).astype(int)

In [20]:
# Write the random-forest submission without the index column.
sample.to_csv(path_or_buf='RF.csv', index=False)

#### XGB

In [21]:
import xgboost

In [22]:
# Gradient-boosted regressor: 1000 trees of depth 5, learning rate 0.05,
# squared-error objective. Fitted on the full training features and target.
base_xgb_model = xgboost.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
)
base_xgb_model.fit(X.values, y.values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [23]:
# Select exactly the training feature columns, in training order.
# Dropping a fixed column list breaks on re-run once the next cell has added
# a 'Vehicles' column to `test`: that column would be fed to the model as a feature.
test_Data = test[X.columns]

In [24]:
# Predict with the XGBoost model on the test features (as a NumPy array).
pred = base_xgb_model.predict(test_Data.values)
# Round to the nearest whole vehicle before casting; a bare astype(int)
# truncates toward zero and biases every prediction downward.
test['Vehicles'] = np.rint(pred).astype(int)
# Copy the predictions into the submission template and write it without the index.
sample['Vehicles'] = test['Vehicles']
sample.to_csv('xgb.csv', index=False)