In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
import datetime

In [None]:
# Columns passed to the model as dynamic features.
dynamic_features = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
]
# Columns treated as prediction targets; one time series is built per entry.
target_values = ['count', 'registered', 'casual']

In [None]:
# Copy over from biketrain_data_preparation

# Hourly frequency: the rental count is predicted per hour.
freq = 'H'

# Forecast horizon: 12 days of hourly steps.
prediction_length = 12 * 24

# Same number of hourly steps as prediction_length.
context_length = 12 * 24

In [None]:
# pd.Timestamp no longer accepts a `freq` argument (removed in pandas 2.0), so the
# timestamps are built without it; they are only used as slice bounds below.
dt_predict_max = pd.Timestamp("2012-12-31 23:00:00")  # last hour to be predicted

dt_dataset_start_time = pd.Timestamp("2011-01-01 00:00:00")
dt_dataset_end_time = pd.Timestamp("2012-12-19 23:00:00")

# Training range: from the first hour up to 12 days (12 * 24 hours) before the
# end of the data set.
dt_train_range = (dt_dataset_start_time,
                  dt_dataset_end_time - datetime.timedelta(hours=12*24))

# Test range: the full span of the data set.
dt_test_range = (dt_dataset_start_time,
                 dt_dataset_end_time)

In [None]:
# if there are gaps in timesteps
def is_missing_steps(df,start,end,freq='D'):
    """Return True when df's index differs from the regular grid between start and end.

    Parameters
    ----------
    df : DataFrame (or Series) indexed by a DatetimeIndex.
    start, end : inclusive bounds; anything pd.Timestamp accepts.
    freq : pandas frequency string used to build the expected grid.

    Both the expected grid and the rows taken from ``df`` are bounded by the same
    exact timestamps. A date-only ``end`` therefore means midnight of that date;
    pass a full timestamp (e.g. '2011-01-31 23:00') to include the final day.
    Previously the rows were selected with a string slice, which covers the whole
    last day while the grid stopped at midnight, so complete sub-daily data was
    reported as having missing steps.
    """
    expected = pd.date_range(start=start, end=end, freq=freq)
    # Timestamp bounds make the row selection exact instead of partial-string based.
    window = df.loc[pd.Timestamp(start):pd.Timestamp(end)]
    return not expected.equals(window.index)

def get_missing_steps(df,start,end,freq='D'):
    """Return the timestamps of the regular start..end grid that are absent from df.

    The grid is ``pd.date_range(start, end, freq)``; the result is the part of that
    grid not found in the index of ``df`` between ``start`` and ``end``.
    Note: pd.date_range reads a date-only ``end`` as midnight of that date, so with
    a sub-daily ``freq`` later hours of that day are not part of the grid.
    """
    expected = pd.date_range(start=start, end=end, freq=freq)
    observed = df.loc[start:end].index
    return expected.difference(observed)


def timeseries_with_only_nans(df):
    """Return the names of the columns of df whose minimum is NaN.

    ``Series.min`` skips NaN values, so the minimum is NaN only when the column
    holds no non-NaN value.
    """
    return [name for name in df.columns if pd.isna(df[name].min())]

In [None]:
# Load both CSV files the same way: parse the 'datetime' column as dates and use
# the first column as the index.
csv_options = dict(parse_dates=['datetime'], index_col=0)
df = pd.read_csv('train.csv', **csv_options)
df_test = pd.read_csv('test.csv', **csv_options)

In [None]:
# Select the January 2011 rows with .loc: string row lookup via df['2011-01'] was
# removed in pandas 2.0 and raises KeyError there.
df.loc['2011-01', 'temp'].plot(title='Temperature upto 19th day - train.csv')

In [None]:
# Select the January 2011 rows with .loc: string row lookup via df_test['2011-01']
# was removed in pandas 2.0 and raises KeyError there.
df_test.loc['2011-01', 'temp'].plot(title='Temperature from 20th day to end of month - test.csv')

In [None]:
# First and last timestamp of the January 2011 rows in test.csv.
# .loc replaces df_test['2011-01'], whose string row lookup was removed in pandas 2.0.
jan_test_index = df_test.loc['2011-01'].index
jan_test_index.min(), jan_test_index.max()

In [None]:
# Check missing time steps in test.csv
# The end bound names the last hour explicitly: pd.date_range reads a date-only end
# as midnight, which would leave hours 01:00-23:00 of Jan 31 unchecked.
# 'h' is the hourly alias; the uppercase 'H' is deprecated since pandas 2.2.
get_missing_steps(df_test,'2011-01-20','2011-01-31 23:00:00','h')

In [None]:
# Append an all-NaN row at the max predicted date.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# The membership check keeps the cell safe to re-run: the row is added only once.
if dt_predict_max not in df.index:
    placeholder = pd.DataFrame(np.nan, index=[dt_predict_max], columns=df.columns)
    df = pd.concat([df, placeholder])

In [None]:
# Resample entire data at 1 hour frequency.
# Each hourly bin takes the mean of the rows falling into it; hours with no rows
# become rows of NaN, so the index has one entry for every hour.
df = df.resample('1h').mean()

In [None]:
# Inspect the last rows of df.
df.tail()

In [None]:
# Inspect the last rows of df_test.
df_test.tail()

In [None]:
# update data from test
# DataFrame.update modifies df in place: values from df_test overwrite df wherever
# index and column labels match (NaN values in df_test are not copied).
df.update(df_test)

In [None]:
# Inspect the last rows of df again, now after the update from df_test.
df.tail()

In [None]:
# Per column: True if the column contains at least one NaN.
df.isna().any()

In [None]:
# look at some missing steps
# Jan 2011
# The end bound names the last hour explicitly: pd.date_range reads a date-only end
# as midnight, which would leave hours 01:00-23:00 of Jan 31 unchecked.
# 'h' is the hourly alias; the uppercase 'H' is deprecated since pandas 2.2.
get_missing_steps(df_test,'2011-01-20','2011-01-31 23:00:00','h')

In [None]:
# Dec 2012
# The end bound names the last hour explicitly: pd.date_range reads a date-only end
# as midnight, which would leave hours 01:00-23:00 of Dec 31 unchecked.
# 'h' is the hourly alias; the uppercase 'H' is deprecated since pandas 2.2.
get_missing_steps(df_test,'2012-12-20','2012-12-31 23:00:00','h')

In [None]:
# Rows of 2011-01-26. .loc is required: string row lookup via df['2011-01-26'] was
# removed in pandas 2.0 and raises KeyError there.
df.loc['2011-01-26']

In [None]:
# Rows of 2012-12-24. .loc is required: string row lookup via df['2012-12-24'] was
# removed in pandas 2.0 and raises KeyError there.
df.loc['2012-12-24']

In [None]:
# 'season' values for January 2011.
df.loc['2011-01', 'season'].plot()

In [None]:
# 'season' values from 2012-12-17 through 2012-12-25.
df.loc['2012-12-17':'2012-12-25', 'season'].plot()

In [None]:
# Fill gaps in the per-day columns: within each calendar day (year, month, day),
# NaN entries take that day's maximum value of the same column.
day_level_cols = ['holiday', 'workingday', 'season']
group_ymd = df[day_level_cols].groupby([df.index.year, df.index.month, df.index.day])

for col in day_level_cols:
    print(col)
    daily_max = group_ymd[col].transform('max')
    df[col] = df[col].fillna(daily_max)

In [None]:
# 'season' values for January 2011, after the per-day fill.
df.loc['2011-01', 'season'].plot()

In [None]:
# 'season' values from 2012-12-17 through 2012-12-25, after the per-day fill.
df.loc['2012-12-17':'2012-12-25', 'season'].plot()

In [None]:
# 'season' over the whole index.
df['season'].plot()

In [None]:
# Forward-fill the remaining gaps in the hourly measurements: each NaN takes the
# last preceding non-NaN value of its column.
# .ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
hourly_cols = ['weather', 'temp', 'atemp', 'humidity', 'windspeed']
df[hourly_cols] = df[hourly_cols].ffill()

In [None]:
# Rows of 2011-01-26 after filling. .loc is required: string row lookup via
# df['2011-01-26'] was removed in pandas 2.0 and raises KeyError there.
df.loc['2011-01-26']

In [None]:
# Rows of 2012-12-24 after filling. .loc is required: string row lookup via
# df['2012-12-24'] was removed in pandas 2.0 and raises KeyError there.
df.loc['2012-12-24']

In [None]:
# Per column: True if the column still contains at least one NaN.
df.isna().any()

In [None]:
# Keep only the columns listed in dynamic_features.
df_dynamic_feat = df[dynamic_features]

In [None]:
# Inspect the first rows of the dynamic-feature frame.
df_dynamic_feat.head()

In [None]:
# Inspect the last rows of the dynamic-feature frame.
df_dynamic_feat.tail()

In [None]:
# Missing target values that we need to predict and fill
# ('count' for January through February 2012).
df.loc['2012-01':'2012-02', 'count'].plot()

In [None]:
# One series per target column, cut to the test range and to the training range.
test_start, test_end = dt_test_range
train_start, train_end = dt_train_range

time_series_test = [df.loc[test_start:test_end, t] for t in target_values]
time_series_training = [df.loc[train_start:train_end, t] for t in target_values]

In [None]:
# Dynamic features are the same for count, registered, casual
feat_test_start, feat_test_end = dt_test_range
feat_train_start, feat_train_end = dt_train_range

dynamic_features_test = df_dynamic_feat.loc[feat_test_start:feat_test_end]
dynamic_features_training = df_dynamic_feat.loc[feat_train_start:feat_train_end]

In [None]:
# Overlay the first target series of the test and training sets on one axes.
fig, ax = plt.subplots()
time_series_test[0].plot(ax=ax, label='test')
time_series_training[0].plot(ax=ax, label='train')
ax.legend()
plt.show()

In [None]:
def encode_target(ts):
    """Return the values of ts as a list, with non-finite values replaced by "NaN".

    Finite values are kept unchanged; NaN and +/-infinity become the string "NaN".
    """
    encoded = []
    for value in ts:
        encoded.append(value if np.isfinite(value) else "NaN")
    return encoded

def encode_dynamic_feat(dynamic_feat):
    """Return the columns of dynamic_feat as a list of value lists, in column order.

    Raises AssertionError ("<column> has NaN") if any column contains a NaN.
    """
    encoded = []
    for name in dynamic_feat.columns:
        column = dynamic_feat[name]
        assert (not column.isna().any()), name + ' has NaN'
        encoded.append(column.tolist())
    return encoded

def series_to_obj(ts, cat=None, dynamic_feat=None):
    """Build the dict for one time series.

    Always contains "start" (string form of the first index entry of ts) and
    "target" (ts passed through encode_target). "cat" and "dynamic_feat" are added
    only when the corresponding argument is not None; dynamic_feat is passed
    through encode_dynamic_feat.
    """
    record = {}
    record["start"] = str(ts.index[0])
    record["target"] = encode_target(ts)
    if cat is not None:
        record["cat"] = cat
    if dynamic_feat is None:
        return record
    record["dynamic_feat"] = encode_dynamic_feat(dynamic_feat)
    return record

def series_to_jsonline(ts, cat=None, dynamic_feat=None):
    """Return the series_to_obj dict for the given arguments as a JSON string."""
    record = series_to_obj(ts, cat=cat, dynamic_feat=dynamic_feat)
    return json.dumps(record)

In [None]:
# Preview the JSON line for the first five steps of the first training series.
series_to_jsonline(
    time_series_training[0].iloc[:5],
    dynamic_feat=dynamic_features_training.iloc[:5],
)

In [None]:
# Write the training series as JSON lines: one UTF-8 encoded JSON object per line.
encoding = "utf-8"
with open("train_dynamic_feat.json", 'wb') as fp:
    for series in time_series_training:
        line = series_to_jsonline(series, dynamic_feat=dynamic_features_training)
        fp.write((line + '\n').encode(encoding))

In [None]:
# Write the test series as JSON lines: one encoded JSON object per line.
with open("test_dynamic_feat.json", 'wb') as fp:
    for series in time_series_test:
        line = series_to_jsonline(series, dynamic_feat=dynamic_features_test)
        fp.write((line + '\n').encode(encoding))

In [None]:
# Save the complete frame, writing the index as a column labelled 'datetime'.
df.to_csv(
    'all_data_dynamic_feat.csv',
    index=True,
    index_label='datetime',
)

In [None]:
# Length and name of every test series.
for series in time_series_test:
    print(len(series), series.name)

In [None]:
# Length and name of every training series.
for series in time_series_training:
    print(len(series), series.name)