In [27]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dateutil.parser
import matplotlib.patches as mpatches
import re
import operator
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

%matplotlib inline

## Load the Data

In [28]:
first_part = "2015_data/2015"
second_part = "-citibike-tripdata.csv"
months = ['01','02','03','04','05','06','07','08','09','10','11','12']

In [30]:
citi = pd.DataFrame()
for i in months:
    file = first_part+i+second_part
    data = pd.read_csv(file)
    citi = pd.concat([citi,data])

In [31]:
trips = pd.DataFrame(citi["starttime"], columns = ["starttime"])

In [32]:
days = list(range(1,32))
months = list(range(1,13))

all_days = []
for m in months:
    for d in days:
        all_days.append(str(m)+'/'+str(d)+'/'+str(2015))

In [33]:
invalid_days = ['2/29/2015','2/30/2015','2/31/2015','4/31/2015','6/31/2015','9/31/2015','11/31/2015']

In [34]:
indexes_to_remove = []
for i in invalid_days:
    indexes_to_remove.append(all_days.index(i))
    
for i in indexes_to_remove[::-1]:
    all_days.pop(i)

In [35]:
def fix_starttime(date):
    date = date.split(" ")
    return date[0]

In [36]:
trips["Date"] = trips["starttime"].apply(fix_starttime) 
trips = trips.drop("starttime",1)
trips["total_trips"] = 1

In [37]:
trips = trips.groupby("Date",as_index = False).sum()

In [38]:
## The first day of 2015 is Thursday so the pattern to follow is weekday, weekday, weekend,
## weekend, weekday, weekday, weekday

weekend_pattern = ["0", "0", "1", "1", "0", "0", "0"]
## Where weekday = 0 and weekend = 1
all_year_pattern = weekend_pattern * 52
all_year_pattern = all_year_pattern + ["0"]

In [39]:
date_weekday = {}
for a, b in zip(all_days, all_year_pattern):
    date_weekday[a] = b
    
def is_weekend(date):
    separate = date.split(" ")
    return int(date_weekday[separate[0]])

In [40]:
trips["is_weekend"] = trips["Date"].apply(is_weekend)

In [41]:
def seasons(date):
    date = date.split('/')
    month = int(date[0])
    
    winter = [1,2,3]
    spring = [4,5,6]
    summer = [7,8,9]
    fall = [10,11,12]
    
    if month in winter:
        return 1
    elif month in spring:
        return 2
    elif month in summer:
        return 3
    else:
        return 4

In [42]:
trips["season"] = trips['Date'].apply(seasons)

In [43]:
weather = pd.read_csv("weather_data.csv") ## read in weather data
weather = weather.drop("Unnamed: 0",1)

In [44]:
weather.head()

Unnamed: 0,Date,mean_temp,max_temp,min_temp,average_humid,max_humid,min_humid
0,1/1/2015,33.0,39.0,27.0,37.0,46.0,27.0
1,1/2/2015,39.0,42.0,35.0,43.0,52.0,33.0
2,1/3/2015,38.0,42.0,33.0,68.0,92.0,44.0
3,1/4/2015,49.0,56.0,41.0,88.0,93.0,83.0
4,1/5/2015,35.0,49.0,21.0,38.0,49.0,26.0


In [45]:
data = trips.merge(weather,how = "inner")

In [46]:
data.head()

Unnamed: 0,Date,total_trips,is_weekend,season,mean_temp,max_temp,min_temp,average_humid,max_humid,min_humid
0,1/1/2015,5317,0,1,33.0,39.0,27.0,37.0,46.0,27.0
1,1/10/2015,6109,1,1,20.0,23.0,16.0,41.0,47.0,34.0
2,1/11/2015,7467,1,1,28.0,37.0,18.0,38.0,47.0,29.0
3,1/12/2015,8645,0,1,37.0,39.0,35.0,64.0,92.0,35.0
4,1/13/2015,12797,0,1,27.0,36.0,17.0,61.0,82.0,39.0


In [47]:
data["total_trips"].describe()

count      365.000000
mean     27227.312329
std      13470.980912
min       1107.000000
25%      15234.000000
50%      30295.000000
75%      38285.000000
max      52706.000000
Name: total_trips, dtype: float64

In [48]:
X = data.loc[:,["season","is_weekend","mean_temp","average_humid"]]
y = data.loc[:,'total_trips']

model = sm.OLS(y,X)
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,total_trips,R-squared:,0.964
Model:,OLS,Adj. R-squared:,0.964
Method:,Least Squares,F-statistic:,2421.0
Date:,"Thu, 12 Jan 2017",Prob (F-statistic):,3.29e-259
Time:,04:15:39,Log-Likelihood:,-3678.2
No. Observations:,365,AIC:,7364.0
Df Residuals:,361,BIC:,7380.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
season,5116.0685,309.754,16.517,0.000,4506.920 5725.217
is_weekend,-6225.7085,662.218,-9.401,0.000,-7527.998 -4923.420
mean_temp,459.6265,17.020,27.006,0.000,426.156 493.097
average_humid,-178.8363,15.541,-11.508,0.000,-209.398 -148.275

0,1,2,3
Omnibus:,31.847,Durbin-Watson:,1.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.856
Skew:,-0.586,Prob(JB):,2.46e-11
Kurtosis:,4.356,Cond. No.,181.0


In [49]:
results.params

season           5116.068537
is_weekend      -6225.708531
mean_temp         459.626498
average_humid    -178.836254
dtype: float64

In [50]:
x = data.drop(["total_trips","Date"],1)
y = data["total_trips"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state = 123)

In [51]:
model = LinearRegression()
model.fit(x_train,y_train)
# The coefficients
print('Coefficients: \n', model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((model.predict(x_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % model.score(x_test, y_test))

Coefficients: 
 [-6486.67587866  5213.2062354   -454.53297717   616.84075156   265.05695269
  -670.70265248   217.41091731   235.01834995]
Mean squared error: 32417035.54
Variance score: 0.83


In [52]:
model = Ridge()
model.fit(x_train,y_train)
# The coefficients
print('Coefficients: \n', model.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((model.predict(x_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % model.score(x_test, y_test))

Coefficients: 
 [-6377.33839248  5191.31491724  -406.55097119   592.22601058   242.17894047
  -634.66295835   199.05332967   217.64986648]
Mean squared error: 32459440.60
Variance score: 0.83
