In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import datetime

# Import from sklearn.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# Set a random seed.
np.random.seed(42)

In [30]:
train = pd.read_csv('datasets/clean_daily-totals-L-station-ridership.csv')
train.head()

Unnamed: 0,station_id,stationname,date,daytype,daily_total_riders
0,41280,Jefferson Park,2017-12-22,weekday,6104
1,41000,Cermak-Chinatown,2017-12-18,weekday,3636
2,40280,Central-Lake,2017-12-02,saturday,1270
3,40140,Dempster-Skokie,2017-12-19,weekday,1759
4,40690,Dempster,2017-12-03,sunday-holiday,499


In [31]:
train.shape

(1049633, 5)

In [32]:
oh = OneHotEncoder(categories="auto", sparse=False)
train['stationname'] = oh.fit_transform(np.array(train['stationname']).reshape(-1,1))
train['daytype'] = oh.fit_transform(np.array(train['daytype']).reshape(-1,1))
train['station_id'] = oh.fit_transform(np.array(train['station_id']).reshape(-1,1))

In [22]:
train['year'] = pd.to_datetime(train['date']).dt.year
train['month'] = pd.to_datetime(train['date']).dt.month
train['year'] = oh.fit_transform(np.array(train['year']).reshape(-1,1))
train['month'] = oh.fit_transform(np.array(train['month']).reshape(-1,1))

In [23]:
X = train.drop(columns=['daily_total_riders', 'date'])
y = train['daily_total_riders']

#### **Null Model/Baseline**

In [24]:
np.mean(y)

3195.5388578674642

### **Linear Regression**

In [25]:
X_train,X_test, y_train,y_test = train_test_split(X,y, test_size = 0.33,random_state=42)

# instantiate linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score on training and testing sets.
print(f'Training Score: {round(lr.score(X_train, y_train),4)}')
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}')

Training Score: 0.0227
Testing Score: 0.0221


### **Principal Component Analysis**

In [26]:
pf = PolynomialFeatures(degree = 3)
X_poly = pf.fit_transform(X)

In [27]:
X_train,X_test, y_train,y_test = train_test_split(X_poly,y, test_size = 0.33,random_state=42)

# Instantiate our StandardScaler.
xc = StandardScaler()
# Standardize X_train.
X_train_xc = xc.fit_transform(X_train)
# Standardize X_test.
X_test_xc = xc.transform(X_test)

In [28]:
# instantiate PCA
pca = PCA(n_components = 10, random_state=42)
# fit PCA to training data
pca.fit(X_train)
#transform X_train and X_test
Z_train = pca.transform(X_train_xc)
Z_test = pca.transform(X_test_xc)

# instantiate linear regression
lr = LinearRegression()
# fit on Z_train
lr.fit(Z_train, y_train)

# Score on training and testing sets.
print(f'Training Score: {lr.score(Z_train, y_train)}')
print(f'Testing Score: {lr.score(Z_test, y_test)}')

Training Score: 0.022754847076801088
Testing Score: 0.022162484954760475


#### **Decision to drop this dataset since we have datasets with averages and lack of time for amopunt of time to model on 1 million rows**