In [1]:
# OLS (using statsmodels) was used to compute the coefficients theta and produce predictions for ENTRIESn_hourly.

# Features considered in the model include:
# 1) 'hour': Expectation that ridership varies based on the time of day (peak vs. non-peak transit hours).
# 2) 'weekday': Expectation that ridership varies based on weekdays vs. weekends (business vs. leisure transit).
# 3) 'UNIT': Encoded as dummy (one-hot) variables to account for units with higher or lower traffic.

# Note that 'rain', 'fog' and 'tempi' were also considered; however, they were not found to significantly increase the
# predictive power of the model.

In [90]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [91]:
# Location of the turnstile/weather CSV.
# Set the TURNSTILE_DATA_PATH environment variable to point at your own copy of
# the file; when it is not set, the original author's local path is used so the
# notebook keeps working unchanged on that machine.
path = os.environ.get(
    'TURNSTILE_DATA_PATH',
    r'C:\Users\darry\Documents\Projects\Udacity\Data Analyst Nanodegree\P2 Analyzing the NYC Subway Dataset\data\turnstile_weather_v2.csv',
)
dataFrame = pd.read_csv(path)

# Preview only the first rows rather than rendering the entire frame.
dataFrame.head(10)

Unnamed: 0,UNIT,DATEn,TIMEn,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,datetime,hour,day_week,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
0,R003,05-01-11,00:00:00,4388333,2911002,0,0,2011-05-01 00:00:00,0,6,...,30.22,0,55.9,3.5,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
1,R003,05-01-11,04:00:00,4388333,2911002,0,0,2011-05-01 04:00:00,4,6,...,30.25,0,52.0,3.5,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
2,R003,05-01-11,12:00:00,4388333,2911002,0,0,2011-05-01 12:00:00,12,6,...,30.28,0,62.1,6.9,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
3,R003,05-01-11,16:00:00,4388333,2911002,0,0,2011-05-01 16:00:00,16,6,...,30.26,0,57.9,15.0,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
4,R003,05-01-11,20:00:00,4388333,2911002,0,0,2011-05-01 20:00:00,20,6,...,30.28,0,52.0,10.4,0.000000,30.258000,55.980000,7.860000,40.700348,-73.887177
5,R003,05-02-11,00:00:00,4388348,2911036,15,34,2011-05-02 00:00:00,0,0,...,30.31,0,50.0,6.9,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
6,R003,05-02-11,04:00:00,5818689,3874767,19,40,2011-05-02 04:00:00,4,0,...,30.27,0,50.0,4.6,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
7,R003,05-02-11,08:00:00,4388855,2911194,488,118,2011-05-02 08:00:00,8,0,...,30.30,0,53.1,10.4,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
8,R003,05-02-11,12:00:00,4389345,2911326,490,132,2011-05-02 12:00:00,12,0,...,30.24,0,57.0,11.5,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177
9,R003,05-02-11,16:00:00,4389576,2911558,231,232,2011-05-02 16:00:00,16,0,...,30.16,0,59.0,11.5,0.000000,30.238333,54.166667,8.250000,40.700348,-73.887177


In [92]:
# Response variable: hourly turnstile entries.
y = dataFrame['ENTRIESn_hourly']

# Numeric predictors taken directly from the data set.
x = dataFrame[['hour', 'weekday']]

# One indicator column per turnstile unit (columns are named 'unit_<UNIT>').
# The indicators are cast to float explicitly: recent pandas versions return
# boolean dummies, and mixing bool with integer columns produces an object-dtype
# design matrix that statsmodels refuses to fit. On older pandas the cast does
# not change the fitted values.
unit_dum = pd.get_dummies(dataFrame['UNIT'], prefix='unit').astype(float)
x = x.join(unit_dum)

In [93]:
# Specify the ordinary least squares model (response y, design matrix x),
# then estimate its coefficients.
ols_model = sm.OLS(endog=y, exog=x)
result = ols_model.fit()

In [94]:
# Display the R-squared of the fitted model (the cell's last expression is its output).
result.rsquared
# NOTE(review): a full regression table is presumably available via result.summary().

0.48136953933969318

In [81]:
# In-sample predictions from the fitted model on the same design matrix.
pred = result.predict(x)

# Residuals (observed minus predicted); show their mean as the cell output.
residuals = y - pred
residuals.mean()

-3.814367338504984e-11

In [82]:
# Hourly entries against hour of day.
# plot() takes its data positionally; passing it as x=/y= keywords draws nothing
# (or raises TypeError on recent matplotlib versions). Markers without a
# connecting line are used because the rows are not ordered by hour.
fig, ax = plt.subplots()
ax.plot(x['hour'], y, linestyle='none', marker='.', alpha=0.3)
ax.set_xlabel('hour')
ax.set_ylabel('ENTRIESn_hourly')
ax.set_title('Hourly entries by hour of day')
plt.show()