In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
import datetime

In [None]:
# Columns that are later exported as separate time series
target_values = ['count', 'registered', 'casual']

# Category ids are not included in the generated JSON
with_categories = False

# Parse 'datetime' as timestamps and use the first CSV column as the index
# so that rows can be selected by date/time
df = pd.read_csv(
    'train.csv',
    parse_dates=['datetime'],
    index_col=0,
)
df_test = pd.read_csv(
    'test.csv',
    parse_dates=['datetime'],
    index_col=0,
)

In [None]:
# Smallest and largest timestamps in the training index
(df.index.min(), df.index.max())

In [None]:
# Smallest and largest timestamps in the test index
(df_test.index.min(), df_test.index.max())

In [None]:
# First 25 rows of the test frame
df_test.iloc[:25]

In [None]:
# Plot the 'count' column for January 2011.
# Row selection by partial date string must go through .loc: the
# df['2011-01'] form was deprecated and then removed in pandas 2.0, and the
# single .loc call also avoids chained indexing.
df.loc['2011-01', 'count'].plot()

In [None]:
# Find out how many hourly steps have to be predicted in each month of test.csv
hours_to_predict = []
print ('Check maximum hours we need to predict')

# One group per (year, month) present in the test index
predict_window = df_test.groupby([df_test.index.year,df_test.index.month])
for i,x in predict_window:
    delta = x.index.max() - x.index.min()
    # Both the first and the last timestamp have to be predicted, so the
    # number of hourly steps is the elapsed hours plus one
    # (e.g. 00:00 .. 23:00 is 24 steps, not 23).
    hours = int(np.ceil(delta.total_seconds()/3600)) + 1
    hours_to_predict.append(hours)
    print ("{0}, Hours:{1}".format(i, hours))

print ("Maximum Prediction Length in Hours: ", np.max(hours_to_predict))

In [None]:
freq = 'H'  # predict hourly rental count
# 12 days of hourly forecast
prediction_length = 288

# Use a context window equal to the prediction length as a starting point;
# it controls how far into the past the network can see
context_length = 288

In [None]:
# Latest timestamp of interest: 2012-12-31 23:00
# (equivalently pd.Timestamp(2012, 12, 31, 23)).
# pd.Timestamp no longer accepts a `freq` argument (deprecated in pandas 1.4,
# removed in 2.0), so the timestamps are built without it.
dt_predict_max = pd.Timestamp("2012-12-31 23:00:00")

# First and last timestamps taken from the dataset
dt_dataset_start_time = pd.Timestamp("2011-01-01 00:00:00")
dt_dataset_end_time = pd.Timestamp("2012-12-19 23:00:00")

# Training stops 12 days (12*24 hours) before the end of the dataset
dt_train_range = (dt_dataset_start_time,
                  dt_dataset_end_time - datetime.timedelta(hours=12*24))

# Use entire data for testing
dt_test_range = (dt_dataset_start_time,
                 dt_dataset_end_time)

In [None]:
# Show the last timestamp and the hourly step that follows it.
# Adding a bare integer to a Timestamp raises TypeError on pandas >= 1.0,
# so the one-hour step is spelled out explicitly.
dt_predict_max, dt_predict_max + pd.Timedelta(hours=1)

In [None]:
# if there are gaps in timesteps
def is_missing_steps(df,start,end,freq='D'):
    """Tell whether `df` lacks any expected timestep between `start` and `end`.

    The complete range of timestamps from `start` to `end` at frequency
    `freq` is compared with the index of `df[start:end]`.

    Returns:
        bool: False when the sliced index equals the complete range,
        True otherwise.
    """
    expected_index = pd.date_range(start=start, end=end, freq=freq)
    actual_index = df[start:end].index
    if expected_index.equals(actual_index):
        return False
    return True

def get_missing_steps(df,start,end,freq='D'):
    """Return the expected timesteps that are absent from `df`.

    The complete range of timestamps from `start` to `end` at frequency
    `freq` is built, and the entries not found in the index of
    `df[start:end]` are returned.
    """
    expected_index = pd.date_range(start=start, end=end, freq=freq)
    present_index = df[start:end].index
    return expected_index.difference(present_index)

# List timeseries with only NaNs

def timeseries_with_only_nans(df):
    """Return the names of the columns of `df` whose minimum is NaN.

    A column's `min()` is NaN when it holds no non-missing value, so the
    result lists the columns that contain only NaNs.
    """
    return [name for name in df.columns if pd.isna(df[name].min())]

In [None]:
# Are any hourly steps absent between 2011-01-01 00:00 and 2011-01-19 23:00?
is_missing_steps(
    df,
    start='2011-01-01 00:00:00',
    end='2011-01-19 23:00:00',
    freq='H',
)

In [None]:
# Which hourly steps are absent between 2011-01-01 00:00 and 2011-01-19 23:00?
get_missing_steps(
    df,
    start='2011-01-01 00:00:00',
    end='2011-01-19 23:00:00',
    freq='H',
)

In [None]:
# Rows from 00:00 to 14:00 on 2011-01-02, as loaded from the CSV
df.loc['2011-01-02 00:00:00':'2011-01-02 14:00:00']

In [None]:
# Plot 'count' from 00:00 to 14:00 on 2011-01-02, as loaded from the CSV
df.loc['2011-01-02 00:00:00':'2011-01-02 14:00:00', 'count'].plot()

In [None]:
# Put the frame on a regular hourly grid, taking the mean of each hourly bin
df = df.resample(rule='1h').mean()

In [None]:
# The same 00:00 to 14:00 window on 2011-01-02, now on the hourly grid
df.loc['2011-01-02 00:00:00':'2011-01-02 14:00:00']

In [None]:
# Plot 'count' for the same window after resampling
df.loc['2011-01-02 00:00:00':'2011-01-02 14:00:00', 'count'].plot(
    title='Missing values in training data'
)

In [None]:
# Plot 'count' for January through February 2012
df.loc['2012-01':'2012-02', 'count'].plot()

In [None]:
# Display the list of target column names
target_values

In [None]:
# Last rows of 'count' inside the test range
df.loc[dt_test_range[0]:dt_test_range[1], 'count'].tail()

In [None]:
# Display the (start, end) timestamps of the test range
dt_test_range

In [None]:
# Display the (start, end) timestamps of the training range
dt_train_range

In [None]:
# Slice each date window once, then take one series per target column
test_window = df[dt_test_range[0]:dt_test_range[1]]
training_window = df[dt_train_range[0]:dt_train_range[1]]

time_series_test = []
time_series_training = []
for target in target_values:
    time_series_test.append(test_window[target])
    time_series_training.append(training_window[target])

In [None]:
# First five observations of each of the three test series
(
    time_series_test[0].iloc[:5],
    time_series_test[1].iloc[:5],
    time_series_test[2].iloc[:5],
)

In [None]:
# First five observations of each of the three training series
(
    time_series_training[0].iloc[:5],
    time_series_training[1].iloc[:5],
    time_series_training[2].iloc[:5],
)

In [None]:
# Overlay the first test series and the first training series on one axes
ax = time_series_test[0].plot(label='test')
time_series_training[0].plot(ax=ax, label='train')
ax.legend()
plt.show()

In [None]:
def encode_target(ts):
    """Return the values of `ts` as a list, with non-finite entries replaced.

    Every value for which `np.isfinite` is false (NaN or +/-infinity) is
    replaced by the string "NaN"; finite values are kept unchanged.
    """
    encoded = []
    for value in ts:
        if np.isfinite(value):
            encoded.append(value)
        else:
            encoded.append("NaN")
    return encoded

def encode_dynamic_feat(dynamic_feat):  
    """Convert every column of `dynamic_feat` into a plain list.

    Iterates over the column labels of `dynamic_feat` and returns one list
    of values per column, in iteration order.

    Raises:
        AssertionError: if a column contains a missing value; the message
        is the column label followed by ' has NaN'.
    """
    encoded = []
    for name in dynamic_feat:
        column = dynamic_feat[name]
        assert (not column.isna().any()), name + ' has NaN'
        encoded.append(column.tolist())
    return encoded

def series_to_obj(ts, cat=None, dynamic_feat=None):
    """Build the dictionary representation of one time series.

    The result always holds "start" (the first index entry of `ts` as a
    string) and "target" (the values encoded by `encode_target`).
    "cat" and "dynamic_feat" are added only when the matching argument
    is not None; `dynamic_feat` is passed through `encode_dynamic_feat`.
    """
    obj = {}
    obj["start"] = str(ts.index[0])
    obj["target"] = encode_target(ts)
    if cat is not None:
        obj["cat"] = cat
    if dynamic_feat is not None:
        obj["dynamic_feat"] = encode_dynamic_feat(dynamic_feat)
    return obj

def series_to_jsonline(ts, cat=None, dynamic_feat=None):
    """Serialize one time series to a JSON string.

    The series is converted with `series_to_obj` and the resulting
    dictionary is dumped with `json.dumps`.
    """
    obj = series_to_obj(ts, cat, dynamic_feat)
    return json.dumps(obj)

In [None]:
# First five values of the first training series as a plain list
time_series_training[0].iloc[:5].tolist()

In [None]:
# Dictionary built from the first five points of the first training series
series_to_obj(
    time_series_training[0].iloc[:5],
    cat=[0] if with_categories else None,
)

In [None]:
# JSON string built from the first five points of the first training series
series_to_jsonline(
    time_series_training[0].iloc[:5],
    cat=[0] if with_categories else None,
)

In [None]:
# Settings used when writing the JSON files
encoding = "utf-8"
cat_idx = 0

# File names depend on whether category ids are written
if with_categories:
    train_file_name = "train_with_categories.json"
    test_file_name = "test_with_categories.json"
else:
    train_file_name = "train.json"
    test_file_name = "test.json"

# One JSON object per line, one line per training series; when categories
# are enabled each series gets the next integer as its category id
with open(train_file_name, 'wb') as fp:
    for ts in time_series_training:
        series_cat = [cat_idx] if with_categories else None
        json_line = series_to_jsonline(ts, series_cat) + '\n'
        fp.write(json_line.encode(encoding))
        cat_idx += 1

In [None]:
# Restart the category counter, then write one JSON object per line,
# one line per test series
cat_idx = 0
with open(test_file_name, 'wb') as fp:
    for ts in time_series_test:
        series_cat = [cat_idx] if with_categories else None
        json_line = series_to_jsonline(ts, series_cat) + '\n'
        fp.write(json_line.encode(encoding))
        cat_idx += 1

In [None]:
# Save the frame to CSV, writing the index under a 'datetime' header
df.to_csv(
    'all_data.csv',
    index=True,
    index_label='datetime',
)

In [None]:
# Length and name of every test series
for ts in time_series_test:
    print(f"{len(ts)} {ts.name}")

In [None]:
# Length and name of every training series
for ts in time_series_training:
    print(f"{len(ts)} {ts.name}")