In [None]:
import pandas as pd
import seaborn as sns
import numpy as np 
import math
import time
import os

from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%load_ext autoreload
%autoreload 2

import data_gestion
import stacking

In [None]:
# Read the preprocessed training table through the project helper, then
# detach the 'Unnamed: 0' column (presumably a saved index -- confirm);
# the detached column stays available as `tmp`.
data = data_gestion.open_data('preprocessed_train_data.csv')
tmp = data['Unnamed: 0']
del data['Unnamed: 0']

In [None]:
# Train on log(duration + 1); predictions are mapped back with exp(x) - 1
# before a submission is written. Running this cell twice applies the
# transform twice.
shifted_duration = data['trip_duration'] + 1
data['trip_duration'] = np.log(shifted_duration)

In [None]:
# Sanity check: (rows, columns) of the training table after loading.
print(data.shape)

In [None]:
# Store the 'blizzard' column as integers.
data = data.astype({'blizzard': int})

In [None]:
# Missing-value count per column.
missing_per_column = data.isna().sum(axis=0)
print(missing_per_column)

In [None]:
# Treat +/-inf as missing, then drop every row that has a missing value.
without_inf = data.replace([np.inf, -np.inf], np.nan)
data = without_inf.dropna()

In [None]:
# Split into train / hold-out features and labels.
# NOTE(review): 0.2 and 0.1 are presumably the train and test fractions --
# confirm against data_gestion.create_train_test_set.
split = data_gestion.create_train_test_set(data, 0.2, 0.1)
train_data, train_labels, test_data, test_labels = split

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline k-nearest-neighbours regressor.
# The parameter dict is the single source of truth for the model: it holds
# the value the estimator is actually built with (20 neighbours).
knn_params = {
    'n_neighbors': 20
}
taxi_knn = KNeighborsRegressor(**knn_params)
taxi_knn.fit(train_data,train_labels)

In [None]:
# Predict the (log-transformed) trip duration for the hold-out rows.
knn_predictions = taxi_knn.predict(test_data)

In [None]:
from sklearn.metrics import mean_squared_log_error
# Labels are already log(duration + 1), so the RMSE computed here is the
# RMSLE of the raw durations. Arguments follow the (y_true, y_pred) order;
# the squared error is symmetric, so the value is unchanged.
knn_mse = mean_squared_error(test_labels, knn_predictions)
rmsle = np.sqrt(knn_mse)
print(rmsle)

In [None]:
# Sweep the neighbour count from 21 to 40 and record the hold-out error
# obtained for each value.
rmsle_val = []
for K in range(21, 41):
    # fit() returns the estimator itself, so build-and-fit is one expression
    model = KNeighborsRegressor(n_neighbors=K).fit(train_data, train_labels)
    pred = model.predict(test_data)
    error = np.sqrt(mean_squared_error(test_labels, pred))
    rmsle_val.append(error)
    print('RMLSE value for k= ' , K , 'is:', error)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

# Candidate values for the randomized search over DecisionTreeRegressor
# hyper-parameters. Each list is sampled independently by the search.

# Function to measure the quality of a split.
# NOTE(review): 'mse' (and max_features='auto' below) are only accepted by
# older scikit-learn releases -- confirm against the installed version.
criterion = ['friedman_mse','mse']

# Strategy used to choose the split at each node
splitter = ['best','random']

# Maximum depth: 60 evenly spaced values in [10, 300] truncated to int,
# plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 300, num = 60)]
max_depth.append(None)

# Minimum number of samples required to split a node. As an integer this
# must be at least 2, so the candidates run from 2 to 50 (a value of 1 makes
# the corresponding fits fail).
min_sample_split = list(range(2, 51))

# Minimum number of samples required at a leaf: 1 to 50
min_sample_leaf = list(range(1, 51))

min_weight_fraction_leaf = [0.0,0.1,0.2]

# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']

max_leaf_nodes = [10,20,30,40,50,None]

min_impurity_decrease = [0.0,0.1,0.2]

ccp_alpha = [0.0,0.1,0.2]

random_grid = {'criterion': criterion,
               'splitter': splitter,
               'max_depth': max_depth,
               'min_samples_split': min_sample_split,
               'min_samples_leaf': min_sample_leaf,
               'min_weight_fraction_leaf': min_weight_fraction_leaf,
               'max_features' : max_features,
               'max_leaf_nodes': max_leaf_nodes,
               'min_impurity_decrease': min_impurity_decrease,
               'ccp_alpha': ccp_alpha}

print(random_grid)


In [None]:
# Randomized hyper-parameter search for a single decision tree:
# 300 sampled configurations, each scored with 3-fold cross-validation,
# running on all available cores.
tree = DecisionTreeRegressor()
tree_random = RandomizedSearchCV(
    estimator=tree,
    param_distributions=random_grid,
    n_iter=300,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
tree_random.fit(train_data, train_labels)


In [None]:
# Display the best decision tree found by the randomized search.
tree_random.best_estimator_


In [None]:
# Compare an untuned decision tree with the best tree from the randomized
# search, both scored on the hold-out set. The error is the RMSE of the
# log-transformed duration, so it carries no physical unit.
base_model = DecisionTreeRegressor(random_state = 42)
base_model.fit(train_data, train_labels)
pes_predictions_base = base_model.predict(test_data)
base_accuracy = np.sqrt(mean_squared_error(test_labels,pes_predictions_base))

print("Root mean squared error: %.4f " % base_accuracy)


best_random = tree_random.best_estimator_
pes_predictions_best = best_random.predict(test_data)
random_accuracy = np.sqrt(mean_squared_error(test_labels,pes_predictions_best))

print("Root mean squared error: %.4f " % random_accuracy)

# Relative reduction of the error (positive means the tuned tree is better)
print('Improvement of {:0.4f}%.'.format( 100 * (base_accuracy - random_accuracy) / base_accuracy))

In [None]:
from sklearn.tree import DecisionTreeRegressor


# Score the tuned tree on the hold-out set, then chart its feature
# importances from most to least important.
tree_reg = tree_random.best_estimator_
predictions = tree_reg.predict(test_data)
print(np.sqrt(mean_squared_error(test_labels,predictions)))

importances = tree_reg.feature_importances_

# Feature positions ordered by decreasing importance
indices = np.argsort(importances)[::-1]

# Column names in that same order
feature_columns = list(train_data)
names = [feature_columns[i] for i in indices]

# One bar per feature, labelled with the feature name
bar_positions = range(train_data.shape[1])
plt.figure()
plt.title("Feature Importance")
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, names, rotation=90)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Refit a random forest on the split produced with (1., 0.) -- presumably the
# whole training table with no hold-out; confirm in data_gestion.
# The helper lives in the data_gestion module, so it is called through the
# module name (a bare create_train_test_set is not defined in this notebook).
train_data, train_labels, test_data, test_labels = data_gestion.create_train_test_set(data,1.,0.)

tree_reg = RandomForestRegressor(n_jobs = -1,
                                n_estimators = 56,
                                min_samples_split = 2,
                                min_samples_leaf = 1,
                                max_features = 'auto',
                                max_depth = 50,
                                bootstrap = True,
                                verbose = 0)

tree_reg.fit(train_data,train_labels)

# Load the competition test features and drop the 'Unnamed: 0' column
test = pd.read_csv('preprocessed_test_data.csv')
#test = test.replace([np.inf, -np.inf], np.nan)
test.pop('Unnamed: 0')

# Missing values per column of the test features (shown as the cell output)
np.sum(test.isna())

In [None]:
# Predict with the current `tree_reg`, undo the log(x + 1) transform and
# write the submission file (id + predicted trip_duration).
#tree_reg = tree_random.best_estimator_
log_scale_predictions = tree_reg.predict(test)
test_predictions = np.exp(log_scale_predictions) - 1
test_sample = pd.read_csv('nyc-taxi-trip-duration/test.csv')
df = pd.DataFrame(data=test_predictions, columns=['trip_duration'])
submission_columns = {'id' : test_sample['id'], 'trip_duration' : df['trip_duration']}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('first_submission.csv',index=False)

In [None]:
# Fit a decision tree with default settings, print its hold-out error and
# chart its feature importances in decreasing order.
# NOTE(review): the most recent split before this cell was made with
# (1., 0.), so test_data may be empty here -- confirm.
tree_reg = DecisionTreeRegressor()
tree_reg = tree_reg.fit(train_data,train_labels)
predictions = tree_reg.predict(test_data)
holdout_mse = mean_squared_error(test_labels,predictions)
print(np.sqrt(holdout_mse))
importances = tree_reg.feature_importances_

# Positions of the features, most important first
indices = np.argsort(importances)[::-1]

# Matching column names
all_columns = list(train_data)
names = [all_columns[position] for position in indices]

# Bar chart with one labelled bar per feature
x_positions = range(train_data.shape[1])
plt.figure()
plt.title("Feature Importance")
plt.bar(x_positions, importances[indices])
plt.xticks(x_positions, names, rotation=90)
plt.show()

In [None]:
# k-nearest-neighbours regressor with 30 neighbours, built from its
# parameter dictionary.
knn_params = dict(n_neighbors=30)
taxi_knn = KNeighborsRegressor(**knn_params)
taxi_knn.fit(train_data,train_labels)

In [None]:
from sklearn.metrics import mean_squared_log_error
# Hold-out error of the 30-neighbour model (RMSE on the log-scale labels).
knn_predictions = taxi_knn.predict(test_data)
knn_mse = mean_squared_error(test_labels, knn_predictions)
rmsle = np.sqrt(knn_mse)
print(rmsle)

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV

# Candidate values of the SVR parameter C: ten evenly spaced values between
# 1 and 100, truncated to integers.
C = np.linspace(start = 1, stop = 100, num = 10).astype(int).tolist()
# Candidate values of the SVR parameter epsilon
epsilon = [.2,.3,.4,.5,.6]

random_grid = dict(C=C, epsilon=epsilon)

print(random_grid)

In [None]:
# Randomized search over the SVR grid: 10 sampled configurations, 3-fold
# cross-validation, all available cores.
svr = SVR()
svr_random = RandomizedSearchCV(
    estimator=svr,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
svr_random.fit(train_data, train_labels)


In [None]:
# Display the best SVR found by the randomized search.
svr_random.best_estimator_

In [None]:
# Compare a hand-picked SVR (C=900) with the best SVR from the randomized
# search, both scored on the hold-out set. The error is the RMSE of the
# log-transformed duration, so it carries no physical unit.
base_model = SVR(C=900)
base_model.fit(train_data, train_labels)
pes_predictions_base = base_model.predict(test_data)
base_accuracy = np.sqrt(mean_squared_error(test_labels,pes_predictions_base))

print("Root mean squared error: %.2f " % base_accuracy)


best_random = svr_random.best_estimator_
pes_predictions_best = best_random.predict(test_data)
random_accuracy = np.sqrt(mean_squared_error(test_labels,pes_predictions_best))

print("Root mean squared error: %.2f " % random_accuracy)

# Relative reduction of the error (positive means the searched model is better)
print('Improvement of {:0.2f}%.'.format( 100 * (base_accuracy - random_accuracy) / base_accuracy))

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

                                                              
# New split with (0.8, 0.2), then a stacked ensemble of three randomised
# decision trees and one random forest, evaluated by cross-validation.
train_data, train_labels, test_data, test_labels = data_gestion.create_train_test_set(data,0.8,0.2)

# Settings shared by the three decision trees; they differ only in
# criterion and max_depth.
shared_tree_settings = dict(max_features='auto',
                            max_leaf_nodes=None,
                            min_samples_leaf=20,
                            min_samples_split=10,
                            splitter='random')

base_estimators = [
    DecisionTreeRegressor(criterion='friedman_mse', max_depth=41, **shared_tree_settings),
    DecisionTreeRegressor(criterion='mse', max_depth=30, **shared_tree_settings),
    DecisionTreeRegressor(criterion='friedman_mse', max_depth=60, **shared_tree_settings),
    RandomForestRegressor(n_jobs=-1,
                          n_estimators=56,
                          min_samples_split=2,
                          min_samples_leaf=1,
                          max_features='auto',
                          max_depth=50,
                          bootstrap=True,
                          verbose=0),
]
model = stacking.StackedRegressor(base_estimators)

# Performance of the ensemble: 3-fold cross-validated RMSE on the training split
stacking.cross_val_metric(model, train_data, train_labels,metric=stacking.root_mean_squared_error,cv=3, display='RMSE')

In [None]:
# Candidate values for the randomized search over RandomForestRegressor
# hyper-parameters.

# Number of trees in the forest: 50 evenly spaced values from 2 to 100
n_estimators = [int(x) for x in np.linspace(start = 2, stop = 100, num = 50)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree (None = unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

# Forest configurations noted for reference (not instantiated here):
#   n_estimators=56,  min_samples_split=2, min_samples_leaf=1,
#   max_features='auto', max_depth=50, bootstrap=True
#   n_estimators=100, min_samples_split=5, min_samples_leaf=1,
#   max_features='auto', max_depth=30, bootstrap=True

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Randomized search over the forest grid, starting from a default
# RandomForestRegressor: 50 sampled configurations, 3-fold cross-validation,
# all available cores.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(train_data, train_labels)

In [None]:
# Display the best random forest found by the randomized search.
rf_random.best_estimator_

In [None]:
# Compare a hand-configured random forest with the best forest from the
# randomized search, both scored on the hold-out set. The error is the RMSE
# of the log-transformed duration, so it carries no physical unit.
base_model = RandomForestRegressor(n_jobs = -1,
                                n_estimators = 56,
                                min_samples_split = 2,
                                min_samples_leaf = 1,
                                max_features = 'auto',
                                max_depth = 50,
                                bootstrap = True,
                                verbose = 0)
base_model.fit(train_data, train_labels)
pes_predictions_base = base_model.predict(test_data)
base_accuracy = np.sqrt(mean_squared_error(test_labels,pes_predictions_base))

print("Root mean squared error: %.5f " % base_accuracy)


best_random = rf_random.best_estimator_
pes_predictions_best = best_random.predict(test_data)
random_accuracy = np.sqrt(mean_squared_error(test_labels,pes_predictions_best))

print("Root mean squared error: %.5f " % random_accuracy)

# Relative reduction of the error (positive means the searched model is better)
print('Improvement of {:0.5f}%.'.format( 100 * (base_accuracy - random_accuracy) / base_accuracy))

In [None]:
# Load the competition test features without the 'Unnamed: 0' column, then
# show the number of missing values per column as the cell output.
test = pd.read_csv('preprocessed_test_data.csv').drop(columns='Unnamed: 0')
#test = test.replace([np.inf, -np.inf], np.nan)

test.isna().sum(axis=0)

In [None]:
# Predict the log-scale trip duration for the competition test set with the
# stacked ensemble `model`.
# NOTE(review): no explicit model.fit(...) is visible in this notebook --
# confirm that stacking.cross_val_metric leaves `model` fitted.
#test = test.replace([np.inf, -np.inf], np.nan).dropna()
test_predictions = model.predict(test)

In [None]:
# Predict with the stacked ensemble, map the log-scale output back to
# durations via exp(x) - 1, and write the submission file.
log_scale_output = model.predict(test)
test_predictions = np.exp(log_scale_output) - 1
test_sample = pd.read_csv('nyc-taxi-trip-duration/test.csv')
df = pd.DataFrame(data=test_predictions, columns=['trip_duration'])
my_submission = pd.DataFrame({
    'id' : test_sample['id'],
    'trip_duration' : df['trip_duration'],
})
my_submission.to_csv('first_submission.csv',index=False)

In [None]:
# Raw competition test file; its 'id' column is used to build the submission.
test_sample = pd.read_csv('nyc-taxi-trip-duration/test.csv')

In [None]:

# Pair each test id with its predicted trip duration.
df = pd.DataFrame(data=test_predictions, columns=['trip_duration'])
submission_columns = {'id' : test_sample['id'], 'trip_duration' : df['trip_duration']}
my_submission = pd.DataFrame(submission_columns)

In [None]:
# Preview the first rows of the submission table.
my_submission.head()

In [None]:
# Write the submission without the DataFrame index column.
my_submission.to_csv('first_submission.csv',index=False)

In [None]:
# Preview the sample submission shipped with the competition data, for
# comparison with the format of my_submission.
pd.read_csv('nyc-taxi-trip-duration/sample_submission.csv').head()

In [None]:
from sklearn.model_selection import KFold
import xgboost as xgb

# Row counts of the current split and the fold configuration used for
# out-of-fold prediction.
ntrain, ntest = train_data.shape[0], test_data.shape[0]
SEED = 0    # for reproducibility
NFOLDS = 5  # number of folds for out-of-fold prediction
kf = KFold(n_splits=NFOLDS)

In [None]:
# Keyword arguments for each base regressor of the stacking experiment.

# Random forest (reference configuration)
rf_params = dict(
    n_jobs=-1,
    n_estimators=56,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='auto',
    max_depth=50,
    bootstrap=True,
    verbose=0,
)

# Extra trees ('max_features': 0.5 is left out on purpose)
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

# AdaBoost
ada_params = dict(n_estimators=500, learning_rate=0.7)

# Gradient boosting: library defaults
gb_params = dict()

# Random forest variant 1: depth 40, 102 trees
rf_1_params = dict(
    bootstrap=True,
    max_depth=40,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=102,
    n_jobs=-1,
)

# Random forest variant 2: depth 50, 56 trees
rf_2_params = dict(
    bootstrap=True,
    max_depth=50,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=56,
    n_jobs=-1,
)

# Single randomised decision tree
tree_1_params = dict(
    criterion='friedman_mse',
    max_depth=41,
    max_features='auto',
    max_leaf_nodes=None,
    min_samples_leaf=20,
    min_samples_split=10,
    splitter='random',
)

# Support vector model
svc_params = dict(epsilon=0.7, C=1000)

# k-nearest neighbours
knn_params = dict(n_neighbors=4)


In [None]:
start_time = time.time()
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Wrap the four active base models (three random forests and one decision
# tree) in the project's SklearnHelper; all of them share the same seed.
forest_helpers = [
    stacking.SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=forest_params)
    for forest_params in (rf_params, rf_1_params, rf_2_params)
]
rf, rf1, rf2 = forest_helpers
tree = stacking.SklearnHelper(clf=DecisionTreeRegressor, seed=SEED,params=tree_1_params)
# Disabled candidates: ExtraTrees (et_params), AdaBoost (ada_params),
# GradientBoosting (gb_params), SVR (svc_params), kNN (knn_params)

# Time spent building the helpers
time5 = time.time() - start_time
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
# New split with (0.9, 0.1). The helper lives in the data_gestion module, so
# it is called through the module name (a bare create_train_test_set is not
# defined in this notebook).
train_data, train_labels, test_data, test_labels = data_gestion.create_train_test_set(data,0.9,0.1)
# Plain NumPy copies of the features and labels for the out-of-fold helpers.
# NOTE(review): despite the `normed_` prefix no normalisation is applied in
# this cell -- confirm whether the data is scaled during preprocessing.
normed_train_features = train_data.to_numpy()
normed_test_features = test_data.to_numpy()
train_labels = train_labels.to_numpy()
test_labels = test_labels.to_numpy()
time6 = time.time() - start_time
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
# Out-of-fold train and test predictions of every active base model; these
# base results are used as the features of the second-level model.
# (The ExtraTrees, AdaBoost, GradientBoosting, SVR and kNN helpers are
# disabled.)
oof_by_model = {}
for model_key, helper in (('rf', rf), ('rf1', rf1), ('rf2', rf2), ('tree', tree)):
    oof_by_model[model_key] = stacking.get_oof(helper, normed_train_features, train_labels, normed_test_features)
    print('OK')
rf_oof_train, rf_oof_test = oof_by_model['rf']        # Random Forest
rf1_oof_train, rf1_oof_test = oof_by_model['rf1']     # Random Forest, variant 1
rf2_oof_train, rf2_oof_test = oof_by_model['rf2']     # Random Forest, variant 2
tree_oof_train, tree_oof_test = oof_by_model['tree']  # Decision Tree
print("Training is complete")
time7 = time.time() - start_time
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# One column per base model holding its out-of-fold training predictions.
# np.ravel flattens each array to 1-D (assumes each array carries one
# prediction per training row -- TODO confirm against stacking.get_oof).
base_predictions_train = pd.DataFrame({
    'RandomForest': np.ravel(rf_oof_train),
    'RandomForest1': np.ravel(rf1_oof_train),
    'RandomForest2': np.ravel(rf2_oof_train),
    'DecisionTree': np.ravel(tree_oof_train),
})

# Pairwise correlation between the base models' predictions
prediction_corr = base_predictions_train.astype(float).corr()

# The traces get their own name so that `data` keeps referring to the
# training DataFrame, which later cells still pass to the split helper.
heatmap_traces = [
    go.Heatmap(
        z=prediction_corr.values,
        x=base_predictions_train.columns.values,
        y=base_predictions_train.columns.values,
        colorscale='Viridis',
        showscale=True,
        reversescale=True
    )
]
py.iplot(heatmap_traces, filename='labelled-heatmap')


In [None]:
start_time = time.time()
# Second-level feature matrices: the base models' out-of-fold predictions
# placed side by side, one block of columns per model.
train_blocks = (rf_oof_train, rf1_oof_train, rf2_oof_train, tree_oof_train)
test_blocks = (rf_oof_test, rf1_oof_test, rf2_oof_test, tree_oof_test)
x_train = np.concatenate(train_blocks, axis=1)
x_test = np.concatenate(test_blocks, axis=1)
time8 = time.time() - start_time
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Load the competition test features and drop the 'Unnamed: 0' column
test = pd.read_csv('preprocessed_test_data.csv')
#test = test.replace([np.inf, -np.inf], np.nan)
test.pop('Unnamed: 0')

# BayesianRidge is otherwise only imported further down the notebook;
# importing it here lets this cell run in top-to-bottom order.
from sklearn.linear_model import BayesianRidge

# Earlier meta-learner experiment, kept for reference:
# start_time = time.time()
# gbm = xgb.XGBRegressor(
#      learning_rate = 0.01,
#      n_estimators= 1000,
#      max_depth= 110,
#      min_child_weight= 5,
#      gamma=.5,
#      subsample=1.,
#      colsample_bytree=1.,
#      nthread= -1,
#      scale_pos_weight=1).fit(x_train, train_labels)
#
# predictions = gbm.predict(x_test)
# time9 = time.time() - start_time
# print("--- %s seconds ---" % (time.time() - start_time))

# Second-level model trained on the base models' out-of-fold predictions
br = BayesianRidge().fit(x_train,train_labels)

# Hold-out predictions of the stacked model (x_test is built from the base
# models' test-side outputs); these are scored against test_labels below.
predictions = br.predict(x_test)

# NOTE(review): `br` is fitted on the stacked base-model outputs (x_train),
# whereas `test` holds the raw preprocessed features, so the column counts
# presumably differ. The base models' predictions for `test` are probably
# what should be passed here -- confirm before using this submission.
test_predictions = br.predict(test)

In [None]:
# Map the log-scale predictions back to durations with exp(x) - 1 and write
# the submission file. Running this cell twice applies the inverse twice.
#test_predictions = model.predict(test)
test_predictions = np.exp(test_predictions) - 1
test_sample = pd.read_csv('nyc-taxi-trip-duration/test.csv')
df = pd.DataFrame(data=test_predictions, columns=['trip_duration'])
submission_columns = {'id' : test_sample['id'], 'trip_duration' : df['trip_duration']}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('first_submission.csv',index=False)

In [None]:
# Hold-out error of `predictions` (RMSE on the log-scale labels).
holdout_mse = mean_squared_error(test_labels, predictions)
rmsle = np.sqrt(holdout_mse)
print(rmsle)


In [None]:
# Reload the competition test features, remove the 'Unnamed: 0' column and
# show the number of missing values per column as the cell output.
test = pd.read_csv('preprocessed_test_data.csv')
#test = test.replace([np.inf, -np.inf], np.nan)
del test['Unnamed: 0']

test.isna().sum(axis=0)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge


# Final run: split with (0.9, 0.1) and let the project's stacking routine
# produce log-scale predictions for the competition test features.
# The split helper lives in the data_gestion module, so it is called through
# the module name (a bare create_train_test_set is not defined in this
# notebook). `data` must be the training DataFrame at this point.
train_data, train_labels, test_data, test_labels = data_gestion.create_train_test_set(data,0.9,0.1)
test_predictions = stacking.stacking(train_data, train_labels, test)

In [None]:
# Convert the log-scale predictions back to durations (exp(x) - 1) and write
# the submission file: one row per test id with its predicted trip_duration.
#test_predictions = model.predict(test)
test_predictions = np.exp(test_predictions) - 1
test_sample = pd.read_csv('nyc-taxi-trip-duration/test.csv')
df = pd.DataFrame(data=test_predictions, columns=['trip_duration'])
my_submission = pd.DataFrame({
    'id' : test_sample['id'],
    'trip_duration' : df['trip_duration'],
})
my_submission.to_csv('first_submission.csv',index=False)