In [20]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

import pymongo
from pymongo import MongoClient

In [2]:
# Load in our parameters file
with open('../parameters.json') as f:
    params = json.load(f)

In [3]:
# Get relevant parameters
database = params['database']
two_chunk_collection = params['chunk_2_collection']

# Connect to our database/collection
client = MongoClient('localhost', 27017)
db = client[database]
two_chunk_coll = db[two_chunk_collection]

In [4]:
chunk_df = pd.DataFrame(list(two_chunk_coll.find({},{'_id':0})))
chunk_df.head()

Unnamed: 0,avg_speed_chnk_1,avg_speed_chnk_2,mfn_sq_chnk_1,mfn_sq_chnk_2,seconds_chnk_1,seconds_chnk_2,start_timestamp,trip_id_iso
0,5.563081,5.117909,481636,452929,1250.0,1027.0,1475912000.0,7253717_2016-10-08_4489K
1,5.05178,5.112077,12321,7396,1438.0,1428.0,1475947000.0,7253845_2016-10-08_GHNYG
2,4.878015,3.558431,729,2916,1560.0,1806.0,1475955000.0,7253837_2016-10-08_8Q6DP
3,4.676721,3.959559,28900,39204,1682.0,1550.0,1475963000.0,7253830_2016-10-08_SR1KP
4,4.587672,3.931596,96100,114244,1678.0,1534.0,1475972000.0,7253823_2016-10-08_TDW9M


In [5]:
sort_df = chunk_df.sort_values('start_timestamp')
sort_df.reset_index(drop=True, inplace=True)

data_with_priors = []

for idx, row in sort_df.iterrows():

    if idx != 0:
        
        data_dict = {}
        
        prior = sort_df.iloc[idx-1]
        
        current_ts = row['start_timestamp']
        prior_ts = prior['start_timestamp']
        
        diff =  current_ts - prior_ts
        
        if diff < 1800:
            
            data_dict['seconds_chnk_1'] = row['seconds_chnk_1']
            data_dict['seconds_chnk_2'] = row['seconds_chnk_2']
            data_dict['mfn_sq_chnk_1'] = row['mfn_sq_chnk_1']
            data_dict['prior_seconds_chnk_2'] = prior['seconds_chnk_2']
            
            data_with_priors.append(data_dict)

In [6]:
prior_df = pd.DataFrame(data_with_priors)

mask = (prior_df['seconds_chnk_2'] < 2500) & (prior_df['seconds_chnk_1'] < 2500)
prior_trimed_df = prior_df[mask]

y = prior_trimed_df['seconds_chnk_2'].values.reshape(-1,1)
X = prior_trimed_df[['seconds_chnk_1', 'mfn_sq_chnk_1', 'prior_seconds_chnk_2']].values

scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
scaler_y = StandardScaler()
y_norm = scaler_y.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_norm, y_norm)

In [7]:
# Build our model
ridge_cv = RidgeCV(alphas=[5,10,20,30], cv=20)
ridge_cv.fit(X_train, y_train)

#Score our model, R-Squared
r_squared = ridge_cv.score(X_test, y_test)

#Score our model, Root Mean Squared Error
y_predict = ridge_cv.predict(X_test)
rmse_norm = (mean_squared_error(y_test, y_predict))**.5
rmse_dev = scaler_y.inverse_transform([rmse_norm])
diff = rmse_dev[0] - scaler_y.mean_[0]

print ("R-squared: ", "{0:.2f}".format(r_squared))
print ('RMSE: ', "{0:.2f}".format(diff))

R-squared:  0.51
RMSE:  148.25


In [18]:
regr = RandomForestRegressor(n_estimators=100, max_leaf_nodes=20)

crossed = cross_validate(regr, X_train, y_train.flatten(), cv=20, 
                         scoring=['neg_mean_squared_error','r2'], return_train_score=False)

r_squared = crossed['test_neg_mean_squared_error'].mean()

#Score our model, Root Mean Squared Error
rmse_norm = (abs(crossed['test_neg_mean_squared_error'].mean()))**.5
rmse_dev = scaler_y.inverse_transform([rmse_norm])
diff = rmse_dev[0] - scaler_y.mean_[0]

print ("R-squared: ", "{0:.2f}".format(r_squared))
print ('RMSE: ', "{0:.2f}".format(diff))

R-squared:  -0.46
RMSE:  142.68


In [19]:
grd_boost = GradientBoostingRegressor(learning_rate=.01, max_depth=2, n_estimators=500)

crossed = cross_validate(grd_boost, X_train, y_train.flatten(), cv=20, 
                         scoring=['neg_mean_squared_error','r2'], return_train_score=False)

r_squared = crossed['test_neg_mean_squared_error'].mean()

#Score our model, Root Mean Squared Error
rmse_norm = (abs(crossed['test_neg_mean_squared_error'].mean()))**.5
rmse_dev = scaler_y.inverse_transform([rmse_norm])
diff = rmse_dev[0] - scaler_y.mean_[0]

print ("R-squared: ", "{0:.2f}".format(r_squared))
print ('RMSE: ', "{0:.2f}".format(diff))

R-squared:  -0.46
RMSE:  142.79


In [27]:
regr = RandomForestRegressor()

params = {
    'max_leaf_nodes': [2,5,10,15,20,25,30,50],
    'max_depth': [1,2,3,4,5,10,None],
    'max_features': ['auto', 1, 2]
}

clf = GridSearchCV(regr, params)

clf.fit(X_train, y_train.flatten())
clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features=1, max_leaf_nodes=30, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [28]:
regr = RandomForestRegressor(max_features=1, max_depth=4, max_leaf_nodes=30)

crossed = cross_validate(regr, X_train, y_train.flatten(), cv=20, 
                         scoring=['neg_mean_squared_error','r2'], return_train_score=False)

r_squared = crossed['test_neg_mean_squared_error'].mean()

#Score our model, Root Mean Squared Error
rmse_norm = (abs(crossed['test_neg_mean_squared_error'].mean()))**.5
rmse_dev = scaler_y.inverse_transform([rmse_norm])
diff = rmse_dev[0] - scaler_y.mean_[0]

print ("R-squared: ", "{0:.2f}".format(r_squared))
print ('RMSE: ', "{0:.2f}".format(diff))

R-squared:  -0.46
RMSE:  142.57
