In [157]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 2000)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set(style="whitegrid")


In [158]:
df = pd.read_csv('/Users/jmirabito/Desktop/Learn.co/mod_2_final_project/citibike_modeling_dataset.csv')

In [159]:
df.set_index(['date','station_id'], inplace=True)

In [160]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rider_count,TAVG,PRCP,dist_PATH,dist_landmark,median_inc,mean_inc,population_zip,station_zip,weekend,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
date,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-08-01,3184,86,79.0,0.0,0.010906,0.043861,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0
2019-08-01,3185,91,79.0,0.0,0.01027,0.04078,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0
2019-08-01,3186,501,79.0,0.0,0.008522,0.038837,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0
2019-08-01,3187,121,79.0,0.0,0.003982,0.036882,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0
2019-08-01,3191,22,79.0,0.0,0.025381,0.061002,40861.4099,56704.7387,41745.0,7304,0,0,0,0,1,0,0,0


In [161]:
# Convert zipcode to string
df.station_zip = df.station_zip.astype(str)
df.station_zip = df.station_zip.apply(lambda x: x[0:5])
df.station_zip = df.station_zip.apply(lambda x: x.strip('.'))
df.station_zip = df.station_zip.apply(lambda x: '0'+ x if x.startswith('7') else x)

In [162]:
df = pd.concat([df, pd.get_dummies(df['station_zip'])], 1)

In [163]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rider_count,TAVG,PRCP,dist_PATH,dist_landmark,median_inc,mean_inc,population_zip,station_zip,weekend,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,07087,07302,07304,07305,07306,07307,07310,07311
date,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
2019-08-01,3184,86,79.0,0.0,0.010906,0.043861,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2019-08-01,3185,91,79.0,0.0,0.01027,0.04078,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2019-08-01,3186,501,79.0,0.0,0.008522,0.038837,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2019-08-01,3187,121,79.0,0.0,0.003982,0.036882,76967.05,98752.3753,31104.0,7302,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2019-08-01,3191,22,79.0,0.0,0.025381,0.061002,40861.4099,56704.7387,41745.0,7304,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


In [164]:
df.drop(['station_zip'], axis=1, inplace=True)

In [165]:
df.reset_index(inplace=True)

In [166]:
# Creating a column for month to then create dummy variables for season
df['month'] = df.date.apply(lambda x: x[5:7])

# Creating a column for season
conditions = [
    df['month'].isin(['12', '01', '02']),
    df['month'].isin(['03', '04', '05']),
    df['month'].isin(['06', '07', '08']),
    df['month'].isin(['09', '10', '11'])
]

choices = [
    'winter',
    'spring',
    'summer',
    'fall'
]

df['season'] = np.select(conditions, choices, default=np.nan)

In [167]:
# Creating the season dummy variables
df = pd.concat([df, pd.get_dummies(df['season'])], 1)

# Dropping season and month categoricals since season is now a dummy
df.drop(['season', 'month'], axis=1, inplace=True)

# Resetting the index
df.set_index(['date', 'station_id'], inplace=True)

In [105]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rider_count,TAVG,PRCP,dist_PATH,dist_landmark,median_inc,mean_inc,population_zip,weekend,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,07087,07302,07304,07305,07306,07307,07310,07311,fall,spring,summer,winter
date,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2019-08-01,3184,86,79.0,0.0,0.010906,0.043861,76967.05,98752.3753,31104.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2019-08-01,3185,91,79.0,0.0,0.01027,0.04078,76967.05,98752.3753,31104.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2019-08-01,3186,501,79.0,0.0,0.008522,0.038837,76967.05,98752.3753,31104.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2019-08-01,3187,121,79.0,0.0,0.003982,0.036882,76967.05,98752.3753,31104.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2019-08-01,3191,22,79.0,0.0,0.025381,0.061002,40861.4099,56704.7387,41745.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [106]:
# dropping the first of each dummy category
df.drop(['Monday', '07087', 'fall'], axis=1, inplace=True)

In [107]:
# Removing features that are highly correlated
corr_matrix = df.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.shape

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

df.drop(columns=to_drop, inplace=True)

## Creating a Polynomial Regression Model

In [109]:
from sklearn.preprocessing import PolynomialFeatures

#Second degree polynomial
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(df_features)
poly2_columns = poly_2.get_feature_names(df_features.columns)
df_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)
print(df_poly2.shape)
df_poly2.head()

(20064, 230)


Unnamed: 0,TAVG,PRCP,dist_PATH,dist_landmark,median_inc,mean_inc,Wednesday,Thursday,Friday,Saturday,Sunday,07302,07304,07305,07306,07310,07311,spring,summer,winter,TAVG^2,TAVG PRCP,TAVG dist_PATH,TAVG dist_landmark,TAVG median_inc,TAVG mean_inc,TAVG Wednesday,TAVG Thursday,TAVG Friday,TAVG Saturday,TAVG Sunday,TAVG 07302,TAVG 07304,TAVG 07305,TAVG 07306,TAVG 07310,TAVG 07311,TAVG spring,TAVG summer,TAVG winter,PRCP^2,PRCP dist_PATH,PRCP dist_landmark,PRCP median_inc,PRCP mean_inc,PRCP Wednesday,PRCP Thursday,PRCP Friday,PRCP Saturday,PRCP Sunday,PRCP 07302,PRCP 07304,PRCP 07305,PRCP 07306,PRCP 07310,PRCP 07311,PRCP spring,PRCP summer,PRCP winter,dist_PATH^2,dist_PATH dist_landmark,dist_PATH median_inc,dist_PATH mean_inc,dist_PATH Wednesday,dist_PATH Thursday,dist_PATH Friday,dist_PATH Saturday,dist_PATH Sunday,dist_PATH 07302,dist_PATH 07304,dist_PATH 07305,dist_PATH 07306,dist_PATH 07310,dist_PATH 07311,dist_PATH spring,dist_PATH summer,dist_PATH winter,dist_landmark^2,dist_landmark median_inc,dist_landmark mean_inc,dist_landmark Wednesday,dist_landmark Thursday,dist_landmark Friday,dist_landmark Saturday,dist_landmark Sunday,dist_landmark 07302,dist_landmark 07304,dist_landmark 07305,dist_landmark 07306,dist_landmark 07310,dist_landmark 07311,dist_landmark spring,dist_landmark summer,dist_landmark winter,median_inc^2,median_inc mean_inc,median_inc Wednesday,median_inc Thursday,median_inc Friday,median_inc Saturday,median_inc Sunday,median_inc 07302,median_inc 07304,median_inc 07305,median_inc 07306,median_inc 07310,median_inc 07311,median_inc spring,median_inc summer,median_inc winter,mean_inc^2,mean_inc Wednesday,mean_inc Thursday,mean_inc Friday,mean_inc Saturday,mean_inc Sunday,mean_inc 07302,mean_inc 07304,mean_inc 07305,mean_inc 07306,mean_inc 07310,mean_inc 07311,mean_inc spring,mean_inc summer,mean_inc winter,Wednesday^2,Wednesday Thursday,Wednesday Friday,Wednesday Saturday,Wednesday Sunday,Wednesday 07302,Wednesday 07304,Wednesday 07305,Wednesday 07306,Wednesday 07310,Wednesday 07311,Wednesday spring,Wednesday summer,Wednesday winter,Thursday^2,Thursday Friday,Thursday Saturday,Thursday Sunday,Thursday 07302,Thursday 07304,Thursday 07305,Thursday 07306,Thursday 07310,Thursday 07311,Thursday spring,Thursday summer,Thursday winter,Friday^2,Friday Saturday,Friday Sunday,Friday 07302,Friday 07304,Friday 07305,Friday 07306,Friday 07310,Friday 07311,Friday spring,Friday summer,Friday winter,Saturday^2,Saturday Sunday,Saturday 07302,Saturday 07304,Saturday 07305,Saturday 07306,Saturday 07310,Saturday 07311,Saturday spring,Saturday summer,Saturday winter,Sunday^2,Sunday 07302,Sunday 07304,Sunday 07305,Sunday 07306,Sunday 07310,Sunday 07311,Sunday spring,Sunday summer,Sunday winter,07302^2,07302 07304,07302 07305,07302 07306,07302 07310,07302 07311,07302 spring,07302 summer,07302 winter,07304^2,07304 07305,07304 07306,07304 07310,07304 07311,07304 spring,07304 summer,07304 winter,07305^2,07305 07306,07305 07310,07305 07311,07305 spring,07305 summer,07305 winter,07306^2,07306 07310,07306 07311,07306 spring,07306 summer,07306 winter,07310^2,07310 07311,07310 spring,07310 summer,07310 winter,07311^2,07311 spring,07311 summer,07311 winter,spring^2,spring summer,spring winter,summer^2,summer winter,winter^2
0,79.0,0.0,0.010906,0.043861,76967.05,98752.3753,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6241.0,0.0,0.861613,3.465017,6080397.0,7801438.0,0.0,79.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000119,0.000478,839.440752,1077.042295,0.0,0.010906,0.0,0.0,0.0,0.010906,0.0,0.0,0.0,0.0,0.0,0.0,0.010906,0.0,0.001924,3375.850265,4331.37599,0.0,0.043861,0.0,0.0,0.0,0.043861,0.0,0.0,0.0,0.0,0.0,0.0,0.043861,0.0,5923927000.0,7600679000.0,0.0,76967.05,0.0,0.0,0.0,76967.05,0.0,0.0,0.0,0.0,0.0,0.0,76967.05,0.0,9752032000.0,0.0,98752.3753,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,79.0,0.0,0.01027,0.04078,76967.05,98752.3753,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6241.0,0.0,0.811331,3.221621,6080397.0,7801438.0,0.0,79.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000105,0.000419,790.452283,1014.187766,0.0,0.01027,0.0,0.0,0.0,0.01027,0.0,0.0,0.0,0.0,0.0,0.0,0.01027,0.0,0.001663,3138.717414,4027.123295,0.0,0.04078,0.0,0.0,0.0,0.04078,0.0,0.0,0.0,0.0,0.0,0.0,0.04078,0.0,5923927000.0,7600679000.0,0.0,76967.05,0.0,0.0,0.0,76967.05,0.0,0.0,0.0,0.0,0.0,0.0,76967.05,0.0,9752032000.0,0.0,98752.3753,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,79.0,0.0,0.008522,0.038837,76967.05,98752.3753,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6241.0,0.0,0.673262,3.068126,6080397.0,7801438.0,0.0,79.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3e-05,0.000331,655.936368,841.597468,0.0,0.008522,0.0,0.0,0.0,0.008522,0.0,0.0,0.0,0.0,0.0,0.0,0.008522,0.0,0.001508,2989.17256,3835.250155,0.0,0.038837,0.0,0.0,0.0,0.038837,0.0,0.0,0.0,0.0,0.0,0.0,0.038837,0.0,5923927000.0,7600679000.0,0.0,76967.05,0.0,0.0,0.0,76967.05,0.0,0.0,0.0,0.0,0.0,0.0,76967.05,0.0,9752032000.0,0.0,98752.3753,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,79.0,0.0,0.003982,0.036882,76967.05,98752.3753,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6241.0,0.0,0.314579,2.913689,6080397.0,7801438.0,0.0,79.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6e-05,0.000147,306.484135,393.23368,0.0,0.003982,0.0,0.0,0.0,0.003982,0.0,0.0,0.0,0.0,0.0,0.0,0.003982,0.0,0.00136,2838.709891,3642.199416,0.0,0.036882,0.0,0.0,0.0,0.036882,0.0,0.0,0.0,0.0,0.0,0.0,0.036882,0.0,5923927000.0,7600679000.0,0.0,76967.05,0.0,0.0,0.0,76967.05,0.0,0.0,0.0,0.0,0.0,0.0,76967.05,0.0,9752032000.0,0.0,98752.3753,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,98752.3753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,79.0,0.0,0.025381,0.061002,40861.4099,56704.7387,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6241.0,0.0,2.005111,4.819144,3228051.0,4479674.0,0.0,79.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000644,0.001548,1037.109771,1439.231752,0.0,0.025381,0.0,0.0,0.0,0.0,0.025381,0.0,0.0,0.0,0.0,0.0,0.025381,0.0,0.003721,2492.620612,3459.092596,0.0,0.061002,0.0,0.0,0.0,0.0,0.061002,0.0,0.0,0.0,0.0,0.0,0.061002,0.0,1669655000.0,2317036000.0,0.0,40861.4099,0.0,0.0,0.0,0.0,40861.4099,0.0,0.0,0.0,0.0,0.0,40861.4099,0.0,3215427000.0,0.0,56704.7387,0.0,0.0,0.0,0.0,56704.7387,0.0,0.0,0.0,0.0,0.0,56704.7387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [110]:
# Call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(df_poly2, target, random_state=20200912,test_size=0.2)


In [111]:
scaler = StandardScaler()

# Fit the scaler to the training data only
scaler.fit(X_train)

# Transform the training data
X_train = pd.DataFrame(data=scaler.transform(X_train), columns=df_poly2.columns)

# Transform the testing dat
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=df_poly2.columns)

In [170]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Instantiate a linear regression object
lm = LinearRegression()

# Fit the linear regression to the data
lm = lm.fit(X_train, y_train)

# Create predictions for the training data
y_train_pred = lm.predict(X_train)

# Evaluate the model
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))

ridership_std = target.std()

In [171]:
# Use fitted model to predict on test data
y_pred = lm.predict(X_test)

test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

print('Training: ', int(train_rmse)/ridership_std, "vs. Testing: ", int(test_rmse)/ridership_std)

Training:  0.7830890279154448 vs. Testing:  0.7212662099221202


Our RMSE values are fairly high and our training model seems to underfit the data. We'll conduct some feature selecion methods to see if we can remove any features that may help the model to better fit the data. 

## Filter method for feature selection

In [172]:
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression


# Running a for loop to determine which number of features in a polynomial model will minimize the difference 
# between the training and testing RMSE values.

min_diff = []

for i in range(1,len(df_features.columns)):
    
    selector = SelectKBest(f_regression, k=i)
    selector.fit(X_train, y_train)

    selected_columns = X_train.columns[selector.get_support()]
    removed_columns = X_train.columns[~selector.get_support()]

    #instantiate a linear regression object
    lm_kbest = LinearRegression()

    #fit the linear regression to the data
    lm_kbest = lm_kbest.fit(X_train[selected_columns], y_train)

    y_train_kbest = lm_kbest.predict(X_train[selected_columns])


    trainK_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_kbest))


    y_kbest = lm_kbest.predict(X_test[selected_columns])

    testK_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_kbest))

    min_diff.append((i, abs(trainK_rmse/ridership_std - testK_rmse/ridership_std)))

  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)
  corr /= X_norms
  cond2 = cond0 & (x <= _a)


In [173]:
# Create a df that displays the abs difference between training and test sets for a given numer of features, k.
opt_num_feat = pd.DataFrame(min_diff, columns=["k", "abs_difference"])

opt_num_feat.sort_values(by='abs_difference')

Unnamed: 0,k,abs_difference
13,14,0.04295
14,15,0.0433
18,19,0.043379
17,18,0.043379
15,16,0.043379
16,17,0.043379
12,13,0.044585
2,3,0.045815
1,2,0.046355
0,1,0.046596


In [174]:
# Re-run the filter method by replacing k with the best performing number of features
selector = SelectKBest(f_regression, k=14)
selector.fit(X_train, y_train)

selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]

#instantiate a linear regression object
lm_kbest = LinearRegression()

#fit the linear regression to the data
lm_kbest = lm_kbest.fit(X_train[selected_columns], y_train)

y_train_kbest = lm_kbest.predict(X_train[selected_columns])


trainK_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_kbest))


y_kbest = lm_kbest.predict(X_test[selected_columns])

testK_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_kbest))


print('Training Root Mean Squared Error:' , trainK_rmse/ridership_std)

print('Testing Root Mean Squared Error:' , testK_rmse/ridership_std)


Training Root Mean Squared Error: 0.8636037897522917
Testing Root Mean Squared Error: 0.8206534526367342


  corr /= X_norms
  cond2 = cond0 & (x <= _a)


Our R^2 values seem to have increased slightly but have moved slightly closer. We'll next try a lasso method to see if we can achieve better model performance.

## Lasso method

In [124]:
## training the model
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1, normalize=False)

lasso.fit(X_train,y_train)

y_train_lasso01 = lasso.predict(X_train)
y_test_lasso01 = lasso.predict(X_test)

train_rmse_lasso01 = metrics.mean_absolute_error(y_train, y_train_lasso01)
test_rmse_lasso01 = np.sqrt(metrics.mean_squared_error(y_test, y_test_lasso01))
print('Training Error: '+ str(train_rmse_lasso01/ridership_std) )
print('Testing Error: '+ str(test_rmse_lasso01/ridership_std) )



Training Error: 0.4211518329842482
Testing Error: 0.7525279838161044


In [129]:
coeff_df = lasso_coef01.T

coeff_df[coeff_df[0]!=0].count()

0    93
dtype: int64

The lasso method seems to really overfit the model to training data resulting in a very large discrepancy between the RMSE values of the the model's performance on the training data and the testing data.

## Summary
Our polynomial model seems to perform better than our linear regression predictive model, but still does not seem to be too predictive of ridership. For this reason, we will conduct a traditional OLS regression and exploratory data analysis to better understand the individual relationships between each feature and our target variable, ridership.