In [2]:
from datetime import datetime
import pandas as pd
from geopy import distance
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import normalize

import statsmodels.api as sm


import pymongo
from pymongo import MongoClient

import matplotlib.pyplot as plt

plt.style.use("ggplot")

%matplotlib inline

In [3]:
client = MongoClient('localhost', 27017)

db = client['avl_pipeline_test']
in_coll = db['edges_normed']

In [4]:
cursor = in_coll.find()

In [5]:
df =  pd.DataFrame(list(cursor))

In [6]:
df.head()

Unnamed: 0,_id,edge_id,first_prior,midday,normed_edge,rush_hour,second_prior,sequence,weekend
0,5ae7ccf03ad39e78321c4444,7253717_2016-10-28_HKH3O_1,0.0,0,0.165269,0,0.0,1,0
1,5ae7ccf03ad39e78321c4445,7253717_2016-10-28_HKH3O_2,0.165269,0,-0.061655,0,0.0,2,0
2,5ae7ccf03ad39e78321c4446,7253717_2016-10-28_HKH3O_3,-0.061655,0,-0.542964,0,0.165269,3,0
3,5ae7ccf03ad39e78321c4447,7253717_2016-10-28_HKH3O_4,-0.542964,0,0.301291,0,-0.061655,4,0
4,5ae7ccf03ad39e78321c4448,7253717_2016-10-28_HKH3O_5,0.301291,0,-0.0596,0,-0.542964,5,0


In [13]:
y = df['normed_edge'].values.reshape(-1,1)
X = df[['first_prior', 'second_prior', 'midday', 'rush_hour', 'weekend']].values

In [15]:
y.shape

(53561, 1)

In [16]:
X.shape

(53561, 5)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [18]:
# fit with statsmodels
model1 = sm.OLS(y_train,X_train)
results = model1.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,232.6
Date:,"Mon, 30 Apr 2018",Prob (F-statistic):,1.08e-245
Time:,19:25:45,Log-Likelihood:,-56662.0
No. Observations:,40170,AIC:,113300.0
Df Residuals:,40165,BIC:,113400.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1446,0.005,-28.964,0.000,-0.154,-0.135
x2,-0.0281,0.005,-5.537,0.000,-0.038,-0.018
x3,0.1520,0.009,16.823,0.000,0.134,0.170
x4,0.1267,0.009,14.320,0.000,0.109,0.144
x5,-0.0939,0.010,-9.016,0.000,-0.114,-0.073

0,1,2,3
Omnibus:,37824.714,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5379951.2
Skew:,4.13,Prob(JB):,0.0
Kurtosis:,59.09,Cond. No.,2.53


In [20]:
# fit with sklearn OLS
ols = LinearRegression()
ols.fit(X_train, y_train)
ols.score(X_test, y_test)

0.04986341995087151

In [21]:
# With RF
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train.flatten())
regr.score(X_test, y_test.flatten())

0.037839429839948036

What about just weekend data?

In [24]:
wk_df = df[df['weekend'] == 1]
y = wk_df['normed_edge'].values.reshape(-1,1)
X = wk_df[['first_prior', 'second_prior', 'midday', 'rush_hour']].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
# fit with sklearn OLS
ols = LinearRegression()
ols.fit(X_train, y_train)
ols.score(X_test, y_test)

0.03648310841755453

In [27]:
# With RF
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train.flatten())
regr.score(X_test, y_test.flatten())

0.028506925814792194

# What about non-normalized?

In [30]:
stand_coll = db['edges_standardized']
std_cursor = stand_coll.find()
std_df =  pd.DataFrame(list(std_cursor))

In [33]:
y = std_df['standard_edge'].values.reshape(-1,1)
X = std_df[['first_prior', 'second_prior', 'midday', 'rush_hour', 'weekend']].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [34]:
# fit with sklearn OLS
ols = LinearRegression()
ols.fit(X_train, y_train)
ols.score(X_test, y_test)

0.01391835062696567

In [35]:
# With RF
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train.flatten())
regr.score(X_test, y_test.flatten())

0.02045907040064232