# Dependencies

In [102]:
import pandas as pd

# Load the Data

In [103]:
fremont_bridge = 'https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD'

bicycle_weather = 'https://raw.githubusercontent.com/jakevdp/PythonDataScienceHandbook/master/notebooks/data/BicycleWeather.csv'

counts = pd.read_csv(fremont_bridge, index_col='Date', parse_dates=True, 
                     infer_datetime_format=True)

weather = pd.read_csv(bicycle_weather, index_col='DATE', parse_dates=True, 
                      infer_datetime_format=True)

daily = counts.resample('d').sum()
daily['Total'] = daily.sum(axis=1)
daily = daily[['Total']] # remove other columns

weather_columns = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'AWND']
daily = daily.join(weather[weather_columns], how='inner')

# Make a feature for yesterday's total
daily['Total_yesterday'] = daily.Total.shift(1)
daily = daily.drop(index=daily.index[0])

daily.head()

Unnamed: 0,Total,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,Total_yesterday
2012-10-04,3475.0,0,0,0,189,83,65,3521.0
2012-10-05,3148.0,0,0,0,217,89,57,3475.0
2012-10-06,2006.0,0,0,0,239,78,51,3148.0
2012-10-07,2142.0,0,0,0,239,78,13,2006.0
2012-10-08,3537.0,0,0,0,211,78,19,2142.0


# Create Test/Train Sets

In [104]:
X_train = daily.drop(columns='Total')[:-100]
X_test = daily.drop(columns='Total')[-100:]

y_train = daily['Total'][:-100]
y_test = daily['Total'][-100:]

In [105]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((963, 7), (100, 7), (963,), (100,))

# Detect & Replace Incorrect  Data

In [106]:
daily.describe()

Unnamed: 0,Total,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,Total_yesterday
count,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0,1063.0
mean,2632.449671,29.350894,-37.496707,0.098777,166.863594,84.472248,22.338664,2633.056444
std,1252.86402,65.813053,612.512583,2.570041,74.779734,50.916006,307.984292,1253.138245
min,98.0,0.0,-9999.0,0.0,-16.0,-71.0,-9999.0,98.0
25%,1806.0,0.0,0.0,0.0,111.0,44.0,22.0,1806.0
50%,2435.0,0.0,0.0,0.0,150.0,83.0,29.0,2435.0
75%,3574.5,26.5,0.0,0.0,222.0,128.0,40.0,3574.5
max,6088.0,559.0,74.0,80.0,356.0,183.0,95.0,6088.0


In [107]:
def clean(X):
    AWND_mean = X_train['AWND'].mean()
    SNOW_mean = X_train['SNOW'].mean()

    X['AWND'] = X['AWND'].replace({-9999:AWND_mean})
    X['SNOW'] = X['SNOW'].replace({-9999:SNOW_mean})
    
    return X

In [108]:
X_train = clean(X_train)
X_test = clean(X_test)

# Create Features

In [109]:
import numpy as np

In [110]:
def make_features(X):
    X = X.copy()

    # patterns of use generally vary from day to day; 
    # let's add binary columns that indicate the day of the week:
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    for i, day in enumerate(days):
        X[day] = (X.index.dayofweek == i).astype(float)


    # we might expect riders to behave differently on holidays; 
    # let's add an indicator of this as well:
    from pandas.tseries.holiday import USFederalHolidayCalendar
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays('2012', '2016')
    X = X.join(pd.Series(1, index=holidays, name='holiday'))
    X['holiday'].fillna(0, inplace=True)
    
    # We also might suspect that the hours of daylight would affect 
    # how many people ride; let's use the standard astronomical calculation 
    # to add this information:
    def hours_of_daylight(date, axis=23.44, latitude=47.61):
        """Compute the hours of daylight for the given date"""
        days = (date - pd.datetime(2000, 12, 21)).days
        m = (1. - np.tan(np.radians(latitude))
             * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
        return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.

    X['daylight_hrs'] = list(map(hours_of_daylight, X.index))
    
    # temperatures are in 1/10 deg C; convert to C
    X['TMIN'] /= 10
    X['TMAX'] /= 10
    
    # We can also calcuate the average temperature.
    X['Temp (C)'] = 0.5 * (X['TMIN'] + X['TMAX'])
    
    # Convert Temperatures to Farenheit
    X['TMIN'] = X['TMIN'] * 1.8 + 32
    X['TMAX'] = X['TMAX'] * 1.8 + 32
    X['Temp (C)'] = X['Temp (C)'] * 1.8 + 32
    X = X.rename(columns={'Temp (C)':'Temp (F)'})
    
    # precip is in 1/10 mm; convert to inches
    X['PRCP'] /= 254

    # In addition to the inches of precipitation, let's add a flag that 
    # indicates whether a day is dry (has zero precipitation):
    X['dry day'] = (X['PRCP'] == 0).astype(int)

    # Let's add a counter that increases from day 1, and measures how many 
    # years have passed. This will let us measure any observed annual increase 
    # or decrease in daily crossings:
    X['annual'] = (X.index - X.index[0]).days / 365.
    
    # Create feature to indicate how many standardeviations from the mean the data is 
    temp_mean = X['Temp (F)'].mean()
    temp_std = X['Temp (F)'].std()
    X['Temp_STD_from_mean'] = abs(X['Temp (F)'] - temp_mean) / temp_std
    
    # Create feature that record's the temperate from yesterday
    X['Yesterdays Temp'] = X.tshift(periods=1)['Temp (F)'] 
    X['Yesterdays Temp'] = X['Yesterdays Temp'].fillna(method='bfill')
    
    # Create feature that records the total number of crossings from a week ago
    X['Total_yesterday'] = X.shift(periods=7)['Total_yesterday']
    X['Total_yesterday'] = X['Total_yesterday'].fillna(method='bfill')
    
    return X

In [111]:
X_train = make_features(X_train)
X_test = make_features(X_test)

In [112]:
X_train

Unnamed: 0,PRCP,SNOW,SNWD,TMAX,TMIN,AWND,Total_yesterday,Mon,Tue,Wed,...,Fri,Sat,Sun,holiday,daylight_hrs,Temp (F),dry day,annual,Temp_STD_from_mean,Yesterdays Temp
2012-10-04,0.000000,0.0,0,66.02,46.94,65,3521.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.219142,56.48,1,0.000000,0.321047,56.48
2012-10-05,0.000000,0.0,0,71.06,48.02,57,3521.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,11.161038,59.54,1,0.002740,0.616379,56.48
2012-10-06,0.000000,0.0,0,75.02,46.04,51,3521.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,11.103056,60.53,1,0.005479,0.711927,59.54
2012-10-07,0.000000,0.0,0,75.02,46.04,13,3521.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,11.045208,60.53,1,0.008219,0.711927,60.53
2012-10-08,0.000000,0.0,0,69.98,46.04,19,3521.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,10.987503,58.01,1,0.010959,0.468713,60.53
2012-10-09,0.000000,0.0,0,60.98,48.02,16,3521.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,10.929950,54.50,1,0.013699,0.129951,58.01
2012-10-10,0.000000,0.0,0,53.96,46.94,14,3521.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,10.872560,50.45,1,0.016438,0.260929,54.50
2012-10-11,0.000000,0.0,0,57.02,44.96,13,3521.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.815345,50.99,1,0.019178,0.208812,50.45
2012-10-12,0.078740,0.0,0,57.02,48.02,46,3475.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,10.758314,52.52,0,0.021918,0.061146,50.99
2012-10-13,0.188976,0.0,0,60.08,53.96,39,3148.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,10.701479,57.02,0,0.024658,0.373165,52.52


## Polynomial Features

# Preprocessing

In [113]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

In [114]:
cont_cols = ['PRCP', 'SNWD', 'TMAX', 'TMIN', 'AWND', 'Total_yesterday', 'daylight_hrs', 'Temp (F)', 'annual', 'Temp_STD_from_mean', 'Yesterdays Temp']
preprocess = make_column_transformer(
    (StandardScaler(), cont_cols)
)

# Random Forest Modeling

In [115]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [116]:
build_forest = Pipeline(steps=[
    ('model', RandomForestRegressor(n_jobs=-1))
])

param_grid = {
    'model__max_depth':[6, 8, 10, 12],
    'model__n_estimators':[100, 250, 500]
}

In [117]:
search = GridSearchCV(build_forest, 
                      param_grid=param_grid, 
                      return_train_score=True, 
                      scoring='neg_mean_absolute_error', 
                      cv=3,
                      n_jobs=-1)
search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__max_depth': [6, 8, 10, 12], 'model__n_estimators': [100, 250, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

In [118]:
print('Best Score:', search.best_score_)
print('Best Params:', search.best_params_)

Best Score: -350.1218942620433
Best Params: {'model__max_depth': 10, 'model__n_estimators': 100}


# Logistic Regression Modeling

In [119]:
from sklearn.linear_model import LinearRegression
build_lr = Pipeline(steps=[
    ('model', LinearRegression(n_jobs=-1))
])

param_grid = {
    'model__fit_intercept':[True, False],
}

In [120]:
search = GridSearchCV(build_lr, 
                      param_grid=param_grid, 
                      return_train_score=True, 
                      scoring='neg_mean_absolute_error', 
                      cv=3,
                      n_jobs=-1)
search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

In [121]:
print('Best Score:', search.best_score_)
print('Best Params:', search.best_params_)

Best Score: -340.2948517001636
Best Params: {'model__fit_intercept': False}


# XGBoost Modeling

In [122]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [123]:
from xgboost import XGBRegressor

build_boost = Pipeline(steps=[
       ('model', XGBRegressor(n_jobs=-1))
])

param_grid = {
    'model__n_estimators':[100, 250, 500],
    'model__learning_rate':[.03, .05, .07, .09],
    'model__max_depth':[1, 5, 10, 15]
}

In [124]:
search = GridSearchCV(build_boost, 
                      param_grid=param_grid, 
                      return_train_score=True, 
                      scoring='neg_mean_absolute_error', 
                      cv=3,
                      n_jobs=-1)
search.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('model', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'model__n_estimators': [100, 250, 500], 'model__learning_rate': [0.03, 0.05, 0.07, 0.09], 'model__max_depth': [1, 5, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

In [125]:
print('Best Score:', search.best_score_)
print('Best Params:', search.best_params_)

Best Score: -313.24819791007513
Best Params: {'model__learning_rate': 0.07, 'model__max_depth': 5, 'model__n_estimators': 500}


# Test With Polynomial Features

In [126]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = ['PRCP', 'SNOW', 'TMAX', 'TMIN', 'AWND', 'Total_yesterday', 'daylight_hrs', 'Temp (F)', 'Temp_STD_from_mean', 'Yesterdays Temp']

poly_train = X_train[poly_features]
poly_test = X_test[poly_features]

poly_transformer = PolynomialFeatures(degree=3)

poly_transformer.fit(poly_train)

poly_train = poly_transformer.transform(poly_train)

poly_train = pd.DataFrame(poly_train, columns=poly_transformer.get_feature_names(poly_features))

In [127]:
poly_test = poly_transformer.transform(poly_test)

poly_test = pd.DataFrame(poly_test, columns=poly_transformer.get_feature_names(poly_features))

In [128]:
poly_test.shape, poly_train.shape

((100, 286), (963, 286))

In [129]:
poly_test.head()

Unnamed: 0,1,PRCP,SNOW,TMAX,TMIN,AWND,Total_yesterday,daylight_hrs,Temp (F),Temp_STD_from_mean,...,Temp (F)^3,Temp (F)^2 Temp_STD_from_mean,Temp (F)^2 Yesterdays Temp,Temp (F) Temp_STD_from_mean^2,Temp (F) Temp_STD_from_mean Yesterdays Temp,Temp (F) Yesterdays Temp^2,Temp_STD_from_mean^3,Temp_STD_from_mean^2 Yesterdays Temp,Temp_STD_from_mean Yesterdays Temp^2,Yesterdays Temp^3
0,1.0,0.0,0.0,60.08,51.98,27.0,1776.0,15.269661,56.03,2.473126,...,175898.391227,7764.036236,175898.391227,342.699318,7764.036236,175898.391227,15.126517,342.699318,7764.036236,175898.391227
1,1.0,0.0,0.0,71.06,53.06,21.0,1776.0,15.303607,62.06,1.300199,...,239020.589816,5007.642653,215796.384908,104.913493,4521.079888,194828.737454,2.198008,94.719675,4081.793525,175898.391227
2,1.0,0.0,0.0,75.92,53.06,18.0,1776.0,15.336561,64.49,0.827527,...,268211.336849,3441.650137,258105.063806,44.16277,3311.967863,248379.597764,0.56669,42.498706,3187.172051,239020.589816
3,1.0,0.0,0.0,82.04,53.96,21.0,1776.0,15.368497,68.0,0.144778,...,314432.0,669.452393,298201.76,1.425321,634.896836,282809.2868,0.003035,1.351749,602.124955,268211.336849
4,1.0,0.0,0.0,78.98,55.04,25.0,1776.0,15.399396,67.01,0.337348,...,300897.690101,1514.807037,305343.1268,7.625982,1537.186667,309854.24,0.038391,7.738648,1559.896931,314432.0


In [130]:
search.fit(poly_train, y_train)

KeyboardInterrupt: 

In [None]:
print('Best Score:', search.best_score_)
print('Best Params:', search.best_params_)