In [None]:
import copy
import datetime
import itertools
import pickle
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Map WorkDays

In [None]:
# Load the work-day calendar and keep only its first 365 rows
# (presumably one row per calendar day -- confirm against WorkDays.csv).
# .copy() makes the slice an independent frame, so the column assignment
# below never writes into a view of the full table.
workdays = pd.read_csv('WorkDays.csv').iloc[:365].copy()

# Reduce every date to a zero-padded 'MM-DD' string; this is the key the bus
# data is merged on later. pd.to_datetime is used rather than
# astype('datetime64') because casting to the unit-less 'datetime64' dtype is
# rejected by pandas >= 2.0.
workdays['Date'] = pd.to_datetime(workdays['Date']).dt.strftime('%m-%d')

In [None]:
# Read '46A_1.csv', discard the columns that are not used afterwards, and
# report how long the load (read + column drop) took.
start = datetime.datetime.now()

df = pd.read_csv('46A_1.csv').drop(
    columns=['dt', 'Unnamed: 0', 'tripid', 'stop_id', 'weather_main']
)

end = datetime.datetime.now()
print('The time to read in the largest file is', end - start)

# Work on a copy so the freshly loaded frame `df` stays untouched.
dataset = df.copy()

In [None]:
# Build a 'month-day' text key for every row (e.g. '3-17'; not zero-padded
# at this point). `month` and `day` are stored back as strings because the
# key is produced by string concatenation.
for part in ('month', 'day'):
    dataset[part] = dataset[part].astype('str')

dataset['Date'] = dataset['month'] + '-' + dataset['day']


In [None]:
# Normalise the key to zero-padded 'MM-DD' text by parsing it as a date and
# formatting it back, so it matches the format used for workdays['Date'].
# NOTE(review): no year is supplied when parsing, so a '2-29' key may fail to
# parse -- confirm the data contains no leap day.
parsed_dates = pd.to_datetime(dataset['Date'], format='%m-%d')
dataset['Date'] = parsed_dates.dt.strftime('%m-%d')


In [None]:
# Attach the WorkDays columns to every record; the left join keeps all rows
# of `dataset`.
mappeddf = dataset.merge(workdays, how='left', on=['Date'])

# Keep a reproducible random subset (70% of the dataset's row count, drawn
# without replacement) to speed up the training process.
sample_size = int(dataset.shape[0] * 0.7)
mappeddf = mappeddf.sample(n=sample_size, replace=False, random_state=10)

# The merge key and the 'Unnamed: 0' column are not model features.
mappeddf = mappeddf.drop(columns=['Date', 'Unnamed: 0'])

# Split Dataset

In [None]:
# Cast month/day back to integers so they can be used as numeric features.
mappeddf['month'] = mappeddf['month'].astype('int')
mappeddf['day'] = mappeddf['day'].astype('int')

# Re-code the value 0 in these columns to a non-zero number
# (presumably so the feature never contributes an exact zero -- TODO confirm
# the intent behind 0.1 / 0.5).
# The result is assigned back to the column: calling
# replace(..., inplace=True) on a column selection is chained in-place
# mutation, which pandas warns about and which does not update the frame
# when Copy-on-Write is enabled.
mappeddf['dayofweek'] = mappeddf['dayofweek'].replace({0: 0.1})
mappeddf['rush_hour'] = mappeddf['rush_hour'].replace({0: 0.5})
mappeddf['SchoolHoliday'] = mappeddf['SchoolHoliday'].replace({0: 0.5})

# Reproducible 70/30 train/test split.
train, test = train_test_split(mappeddf, test_size=0.3, random_state=33)

# `cum_duration` is the regression target; every other column is a feature.
x_train = train.drop(columns=['cum_duration'])
y_train = train['cum_duration']
x_test = test.drop(columns=['cum_duration'])
y_test = test['cum_duration']

# Plain NumPy copies of the same data, used by the model cells below.
X_train = np.array(x_train)
Y_train = np.array(y_train)

X_test = np.array(x_test)
Y_test = np.array(y_test)

In [None]:
# Display the training feature matrix (the NumPy array built from x_train).
X_train

In [None]:
# Display the training target vector (the NumPy array built from y_train).
Y_train

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a multiple linear regression on the training set; only the fit itself
# is timed.
start = datetime.datetime.now()
multiple_linreg = LinearRegression()
multiple_linreg.fit(X_train, Y_train)
end = datetime.datetime.now()
print("The Runing Time：", end - start)

# Predictions on the held-out test set, evaluated in the next cell.
pre_lr = multiple_linreg.predict(X_test)

In [None]:
# Report test-set MSE and R^2 for the linear regression predictions.
banner = '=' * 20
print(banner, 'Evaluate Results', banner)
print("Testing Set MSE：", metrics.mean_squared_error(Y_test, pre_lr))
print("Testing Set R^2：", metrics.r2_score(Y_test, pre_lr))

# Polynomial Regression

In [None]:
# Fit a polynomial regression for each degree in `n_degree` and collect the
# train / test R^2 so the degrees can be compared. The whole sweep is timed.
start = datetime.datetime.now()

n_degree = [2, 3, 4, 5, 6]
score = []  # one [train R^2, test R^2] pair per degree, in n_degree order
for degree in n_degree:

    print('The degree is', degree)
    pol = PolynomialFeatures(degree=degree)
    xtrain_pol = pol.fit_transform(X_train)
    lr_pol = LinearRegression()
    lr_pol.fit(xtrain_pol, Y_train)
    pre_train = lr_pol.predict(xtrain_pol)

    # Expand the test set with the transformer fitted on the training data;
    # a preprocessing step must never be re-fitted on held-out data.
    xtest_pol = pol.transform(X_test)
    pre_test = lr_pol.predict(xtest_pol)

    score.append([
        metrics.r2_score(y_pred=pre_train, y_true=Y_train),
        metrics.r2_score(y_pred=pre_test, y_true=Y_test),
    ])


end = datetime.datetime.now()
print("The Runing Time：", end - start)

score

In [None]:
# Fit the degree-6 polynomial regression and time the feature expansion,
# the fit, and the prediction on the training set.
start = datetime.datetime.now()

pol = PolynomialFeatures(degree=6)
xtrain_pol = pol.fit_transform(X_train)
lr_pol = LinearRegression()
lr_pol.fit(xtrain_pol, Y_train)
pre_train = lr_pol.predict(xtrain_pol)

end = datetime.datetime.now()
print("The Runing Time：", end - start)

# Expand the test set with the transformer fitted on the training data;
# a preprocessing step must never be re-fitted on held-out data.
xtest_pol = pol.transform(X_test)
pre_pr = lr_pol.predict(xtest_pol)

print('=' * 20, 'Evaluate Results', '=' * 20)
print("Testing Set MSE：", metrics.mean_squared_error(y_pred=pre_pr, y_true=Y_test))
print("Testing Set R^2：", metrics.r2_score(y_pred=pre_pr, y_true=Y_test))


# XGBoost

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import sklearn

In [None]:
# Baseline: XGBRegressor constructed without any arguments.
# Only the fit is timed; prediction happens after the timer stops.
start = datetime.datetime.now()

xg_reg = xgb.XGBRegressor()
xg_reg.fit(X_train, Y_train)

end = datetime.datetime.now()
print("The Runing Time：", end - start)

pre_test = xg_reg.predict(X_test)

# Test-set metrics for the baseline model.
banner = '=' * 20
print(banner, 'Evaluate Results', banner)
print("Testing Set MSE：", metrics.mean_squared_error(Y_test, pre_test))
print("Testing Set R^2：", metrics.r2_score(Y_test, pre_test))

In [None]:
# Display the parameter dictionary of `xg_reg`. When this runs right after
# the cell that builds XGBRegressor() with no arguments, these are the
# settings of that baseline model.
xg_reg.get_params()

In [None]:
# Exhaustive grid search over three XGBoost hyper-parameters
# (3 x 4 x 4 = 48 combinations), scored by R^2 with 2-fold cross-validation.
# The whole search is timed.
start = datetime.datetime.now()

param_test1 = dict(
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.1, 0.3],
    n_estimators=[50, 100, 150, 200],
)
gsearch1 = GridSearchCV(
    xgb.XGBRegressor(),
    param_grid=param_test1,
    scoring='r2',
    cv=2,
)
gsearch1.fit(X_train, Y_train)

# Best parameter combination and its mean cross-validated R^2.
print(gsearch1.best_params_, gsearch1.best_score_)

end = datetime.datetime.now()
print("The Runing Time：", end - start)


In [None]:
# Refit XGBoost with the best parameters recorded from the grid search:
# {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 200}
# Unlike the baseline cell, the timer here also covers the test-set prediction.
start = datetime.datetime.now()

best_xgb_params = {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 200}
xg_reg = xgb.XGBRegressor(**best_xgb_params)
xg_reg.fit(X_train, Y_train)

pre_test = xg_reg.predict(X_test)

end = datetime.datetime.now()
print("The Runing Time：", end - start)

# Test-set metrics for the tuned model.
banner = '=' * 20
print(banner, 'Evaluate Results', banner)
print("Testing Set MSE：", metrics.mean_squared_error(Y_test, pre_test))
print("Testing Set R^2：", metrics.r2_score(Y_test, pre_test))


# ANN -- MLPRegressor

In [None]:
from sklearn import neural_network

In [None]:
# Baseline MLPRegressor with default hyper-parameters.
# random_state is fixed because weight initialisation and batch shuffling are
# random; without a seed the reported metrics change on every run, unlike the
# seeded sampling and train/test split earlier in the notebook.
start = datetime.datetime.now()

mlp_reg = neural_network.MLPRegressor(random_state=33)
mlp_reg.fit(X_train, Y_train)
end = datetime.datetime.now()
print("The Runing Time：", end - start)

# Prediction is done after the timer stops, so only the fit is timed.
pre_test = mlp_reg.predict(X_test)

print('=' * 20, 'Evaluate Results', '=' * 20)
print("Testing Set MSE：", metrics.mean_squared_error(y_pred=pre_test, y_true=Y_test))
print("Testing Set R^2：", metrics.r2_score(y_pred=pre_test, y_true=Y_test))

In [None]:
# Candidate hidden-layer layouts: every combination of layer width
# (5, 8, 11) and depth (3, 5, 7 layers), with all layers of a network sharing
# the same width. Widths vary slowest, so the list starts with
# (5, 5, 5), (5, 5, 5, 5, 5), ...
mytuple = [
    (width,) * depth
    for width in (5, 8, 11)
    for depth in (3, 5, 7)
]

param_grid = {'hidden_layer_sizes': mytuple}
print(param_grid)

In [None]:
# Record the current wall-clock time in `start` for an elapsed-time report.
start = datetime.datetime.now()

In [None]:

# Grid search over the MLP hidden-layer layouts in `param_grid`, scored by
# R^2 with 2-fold cross-validation.
# The timer starts inside this cell so the reported time covers the search
# only, not any pause between running separate cells.
start = datetime.datetime.now()

# The estimator must be MLPRegressor: both `learning_rate_init` and the
# searched `hidden_layer_sizes` are MLP parameters, not XGBoost ones.
# random_state makes the stochastic training reproducible across runs.
gsearch2 = GridSearchCV(
    neural_network.MLPRegressor(learning_rate_init=0.005, random_state=33),
    param_grid=param_grid,
    scoring='r2',
    cv=2,
)

gsearch2.fit(X_train, Y_train)
print(gsearch2.best_params_, gsearch2.best_score_)

end = datetime.datetime.now()
print("The Runing Time：", end - start)