In [86]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append(os.path.abspath('..'))

from src.features.dates import convert_to_date
from src.data.sets import split_sets_by_time, save_sets
from src.models.performance import print_reg_perf

In [49]:
df = pd.read_csv('../data/raw/day.csv')

In [5]:
df.shape

(731, 16)

In [53]:
df_cleaned = df.copy()
df.head()
df_cleaned.drop('instant', axis=1, inplace=True)

In [88]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
df_cleaned = convert_to_date(df_cleaned, ['dteday'])
df_cleaned.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [55]:
df_cleaned['yr'] = df_cleaned['dteday'].dt.year
df_cleaned['mnth'] = df_cleaned['dteday'].dt.month_name()
df_cleaned['weekday'] = df_cleaned['dteday'].dt.day_name()

season_mapping = {1: 'winter',
                 2: 'spring',
                 3: 'summer',
                 4: 'autumn'
                 }
df_cleaned['season'] = df_cleaned['season'].map(season_mapping)
df_cleaned['weathersit'] = df_cleaned['weathersit'].map(season_mapping)

df_cleaned.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2011-01-01,winter,2011,January,0,Saturday,0,spring,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,winter,2011,January,0,Sunday,0,spring,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,winter,2011,January,0,Monday,1,winter,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,winter,2011,January,0,Tuesday,1,winter,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,winter,2011,January,0,Wednesday,1,winter,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [59]:
df_cleaned['holidaydate'] = np.nan

df_cleaned = convert_to_date(df_cleaned, ['holidaydate'])
df_cleaned.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,holidaydate
0,2011-01-01,winter,2011,January,0,Saturday,0,spring,0.344167,0.363625,0.805833,0.160446,331,654,985,NaT
1,2011-01-02,winter,2011,January,0,Sunday,0,spring,0.363478,0.353739,0.696087,0.248539,131,670,801,NaT
2,2011-01-03,winter,2011,January,0,Monday,1,winter,0.196364,0.189405,0.437273,0.248309,120,1229,1349,NaT
3,2011-01-04,winter,2011,January,0,Tuesday,1,winter,0.2,0.212122,0.590435,0.160296,108,1454,1562,NaT
4,2011-01-05,winter,2011,January,0,Wednesday,1,winter,0.226957,0.22927,0.436957,0.1869,82,1518,1600,NaT


In [61]:
holiday_mask = df_cleaned['holiday'] == 1


df_cleaned.loc[holiday_mask, 'holidaydate'] = df_cleaned.loc[holiday_mask, 'dteday']
df_cleaned.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,holidaydate
0,2011-01-01,winter,2011,January,0,Saturday,0,spring,0.344167,0.363625,0.805833,0.160446,331,654,985,NaT
1,2011-01-02,winter,2011,January,0,Sunday,0,spring,0.363478,0.353739,0.696087,0.248539,131,670,801,NaT
2,2011-01-03,winter,2011,January,0,Monday,1,winter,0.196364,0.189405,0.437273,0.248309,120,1229,1349,NaT
3,2011-01-04,winter,2011,January,0,Tuesday,1,winter,0.2,0.212122,0.590435,0.160296,108,1454,1562,NaT
4,2011-01-05,winter,2011,January,0,Wednesday,1,winter,0.226957,0.22927,0.436957,0.1869,82,1518,1600,NaT


In [63]:
df_cleaned['last_holiday'] = df_cleaned['holidaydate'].fillna(method='ffill')

In [67]:
df_cleaned['next_holiday'] = df_cleaned['holidaydate'].fillna(method='ffill')
df_cleaned['next_holiday'].fillna(pd.Timestamp('2013-01-01'), inplace=True)
df_cleaned.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,holidaydate,last_holiday,next_holiday
0,2011-01-01,winter,2011,January,0,Saturday,0,spring,0.344167,0.363625,0.805833,0.160446,331,654,985,NaT,NaT,2013-01-01
1,2011-01-02,winter,2011,January,0,Sunday,0,spring,0.363478,0.353739,0.696087,0.248539,131,670,801,NaT,NaT,2013-01-01
2,2011-01-03,winter,2011,January,0,Monday,1,winter,0.196364,0.189405,0.437273,0.248309,120,1229,1349,NaT,NaT,2013-01-01
3,2011-01-04,winter,2011,January,0,Tuesday,1,winter,0.2,0.212122,0.590435,0.160296,108,1454,1562,NaT,NaT,2013-01-01
4,2011-01-05,winter,2011,January,0,Wednesday,1,winter,0.226957,0.22927,0.436957,0.1869,82,1518,1600,NaT,NaT,2013-01-01


In [69]:
df_cleaned['days_last_holiday'] = (df_cleaned['dteday'] - df_cleaned['last_holiday']).dt.days
df_cleaned['days_next_holiday'] = (df_cleaned['last_holiday'] - df_cleaned['dteday']).dt.days
df_cleaned.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,holidaydate,last_holiday,next_holiday,days_last_holiday,days_next_holiday
0,2011-01-01,winter,2011,January,0,Saturday,0,spring,0.344167,0.363625,0.805833,0.160446,331,654,985,NaT,NaT,2013-01-01,,
1,2011-01-02,winter,2011,January,0,Sunday,0,spring,0.363478,0.353739,0.696087,0.248539,131,670,801,NaT,NaT,2013-01-01,,
2,2011-01-03,winter,2011,January,0,Monday,1,winter,0.196364,0.189405,0.437273,0.248309,120,1229,1349,NaT,NaT,2013-01-01,,
3,2011-01-04,winter,2011,January,0,Tuesday,1,winter,0.2,0.212122,0.590435,0.160296,108,1454,1562,NaT,NaT,2013-01-01,,
4,2011-01-05,winter,2011,January,0,Wednesday,1,winter,0.226957,0.22927,0.436957,0.1869,82,1518,1600,NaT,NaT,2013-01-01,,


In [70]:
cat_cols = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

In [73]:
#df_cleaned = pd.get_dummies(df_cleaned, columns=cat_cols)
df_cleaned.head()

Unnamed: 0,dteday,yr,temp,atemp,hum,windspeed,casual,registered,cnt,holidaydate,...,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,workingday_0,workingday_1,weathersit_spring,weathersit_summer,weathersit_winter
0,2011-01-01,2011,0.344167,0.363625,0.805833,0.160446,331,654,985,NaT,...,1,0,0,0,0,1,0,1,0,0
1,2011-01-02,2011,0.363478,0.353739,0.696087,0.248539,131,670,801,NaT,...,0,1,0,0,0,1,0,1,0,0
2,2011-01-03,2011,0.196364,0.189405,0.437273,0.248309,120,1229,1349,NaT,...,0,0,0,0,0,0,1,0,0,1
3,2011-01-04,2011,0.2,0.212122,0.590435,0.160296,108,1454,1562,NaT,...,0,0,0,1,0,0,1,0,0,1
4,2011-01-05,2011,0.226957,0.22927,0.436957,0.1869,82,1518,1600,NaT,...,0,0,0,0,1,0,1,0,0,1


In [74]:
df_cleaned.to_csv('../data/interim/day.csv', index=False)

In [75]:
df_cleaned.drop(['dteday', 'holidaydate', 'last_holiday', 'next_holiday'], axis=1, inplace=True)

In [81]:
X_train, y_train, X_val, y_val, X_test, y_test  = split_sets_by_time(df_cleaned, 'cnt', test_ratio=0.2)
save_sets(X_train, y_train, X_val, y_val, X_test, y_test, path='../data/processed/')

### Baseline Model

In [83]:
y_mean = y_train.mean()
y_mean

3433.48291571754

In [89]:
y_base = np.full((len(y_train), 1), y_mean)

print_reg_perf(y_preds=y_base, y_actuals=y_train, set_name='Training')

RMSE Training: 1755060.1768151887
MAE Training: 1132.7364532147508
