### Imports, datasets and helper functions

In [None]:
import pickle, json
from datetime import date, timedelta, datetime

import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import AxesGrid

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from functools import partial
import multiprocessing
from multiprocessing import Pool
num_core = max(multiprocessing.cpu_count()-1,1)

import gc

import autoreload
%load_ext autoreload
%autoreload 2

In [None]:
# to do: change const to CAPS
from utils import prep_holdout_set, get_timespan, get_timespan_15, gen_features, gen_test_features, prep_dataset, extract_time
from utils import time_features, num_labels, num_req

In [None]:
# to do: gen or parse datetime for train set
# do same for holdout set in final notebook

# Transformed demand table: one row per geohash6, one column per timestamp.
df = pd.read_csv("./data/df_transformed_seasonal_10.csv") # "data/df_transformed_simple_fill_noise.csv"
df = df.set_index('geohash6')
df.columns = pd.DatetimeIndex(df.columns)

cluster_df = pd.read_csv("./data/cluster_df.csv")
cluster_df = cluster_df.set_index('geohash6')

df_raw = pd.read_csv("./data/datetime_coords_dow.csv")
df_raw['datetime'] = pd.to_datetime(df_raw['datetime'])

# Holdout window (exclusive on both ends). Use the stdlib `datetime` imported
# at the top of the notebook: the `pd.datetime` alias is deprecated and no
# longer exists in recent pandas releases.
holdout_start = datetime(2019, 5, 17, 12, 0)
holdout_end = datetime(2019, 5, 24, 12, 0)
df_holdout_raw = df_raw.loc[(df_raw.datetime > holdout_start) & (df_raw.datetime < holdout_end)]

### Create holdout set 

In [None]:
# Work on a 10% sample to speed things up; tests show not much difference in performance
df_holdout_raw = df_holdout_raw.sample(frac=0.1, random_state=8)

# Cut the sample into `num_core` equally sized, consecutive slices so each
# worker process receives one partition.
n_rows = len(df_holdout_raw.index)
partition_size = n_rows // num_core

holdout_partitions = []
for core in range(num_core):
    start = core * partition_size
    holdout_partitions.append(df_holdout_raw.iloc[start:start + partition_size, :])

# Rows that did not divide evenly are appended to the last partition.
covered = num_core * partition_size
if covered < n_rows:
    leftover = df_holdout_raw[covered:]
    holdout_partitions[-1] = pd.concat([holdout_partitions[-1], leftover], axis=0)

In [None]:
# Build the "transformed" holdout set in parallel (one partition per worker);
# the output is later used to generate the test set.

results = []
if __name__ == '__main__':
    # Bind the shared arguments once; each worker only receives its partition.
    holdout_worker = partial(prep_holdout_set, df=df, cluster_df=cluster_df, groupby_sets={})
    with Pool(num_core) as p:
        # `results` is an AsyncResult; the next cell collects it with .get()
        results = p.map_async(holdout_worker, holdout_partitions)
        p.close()
        p.join()

In [None]:
# to do: change time to be part of feature generation

dt_holdout = datetime(2019, 5, 20) # arbitrary, value does not matter

# Collect the per-partition outputs WITHOUT rebinding `results`: the original
# overwrote the AsyncResult with a list, so re-running this cell failed on
# `.get()`.
holdout_parts = results.get()

# Each part is indexed as (rows, geohashes). Flatten both in partition order
# with a single pass instead of the quadratic `sum(list_of_lists, [])`.
geohash_holdout = [geohash for part in holdout_parts for geohash in part[1]]
df_holdout = pd.DataFrame([row for part in holdout_parts for row in part[0]])
df_holdout.columns = pd.date_range(dt_holdout - timedelta(minutes=15 * num_req), dt_holdout + timedelta(minutes=15 * num_labels),
                         freq="15min")

df_holdout['geohash6'] = geohash_holdout
df_holdout = df_holdout.set_index('geohash6')

# Real time features of every sampled holdout row.
# NOTE(review): this assumes prep_holdout_set keeps the rows of df_holdout_raw
# in their original order and drops none of them -- confirm against utils.
time_holdout = df_holdout_raw['datetime'].apply(extract_time)
time_holdout = pd.DataFrame(time_holdout.tolist(), columns=time_features)

#### Init vars

In [None]:
# Rolling-forecast step counter, read by the training/prediction cells.
# NOTE: this name shadows the builtin `round` for the rest of the notebook.
round = 0
model_list = []

# Predictions and labels accumulated over the forecast steps: validation set
y_pred = []
labels_list = []
# ... and test (holdout) set
y_pred2 = []
labels_list2 = []

dt_train = datetime(2019, 5, 17, 11, 45)
dt_val = dt_train + timedelta(weeks=1)  # beyond 1 week

dt_holdout = datetime(2019, 5, 20)  # arbitrary # Make sure same as above

#### Generate train, val, test sets

In [None]:
num_samples = 65 # to use: 400

X_train, y_train = prep_dataset(dt_train, num_samples, df, cluster_df)
X_val, y_val = prep_dataset(dt_val, 1, df, cluster_df)

X_holdout, y_holdout = gen_test_features(dt_holdout, df_holdout, geohash_holdout, cluster_df)

# replace with real time features
X_holdout = X_holdout.drop(time_features, axis=1)
X_holdout = pd.concat([X_holdout.reset_index(),time_holdout], axis=1).set_index('geohash6')
# Align the column order with the training matrix. `DataFrame.reindex_axis`
# no longer exists in current pandas; `reindex(columns=...)` is the equivalent
# call and is also available in older releases.
X_holdout = X_holdout.reindex(columns=X_train.columns)

In [None]:
# to do: pickle fitted scaler for production

# Fit the scaling statistics on the training set only. Fitting on the
# concatenation of train, validation and holdout (as before) lets information
# from the evaluation sets leak into preprocessing, and such a scaler could
# not be reproduced in production where only training data is available.
scaler = StandardScaler()
scaler.fit(X_train)

# Slice-assignment keeps each object a DataFrame with its index and columns.
X_train[:] = scaler.transform(X_train)
X_val[:] = scaler.transform(X_val)
X_holdout[:] = scaler.transform(X_holdout)

#### Train model

In [None]:
MAX_ROUNDS = 200 # to use 2000
params = {
    'learning_rate': 0.1,
    'metric': 'rmse',
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'feature_fraction': 0.5,
    'max_bin': 800,
    'min_split_gain': 0.03,
    'subsample_for_bin': 3000,
    # for speed, doesn't affect score
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
    # takes effect when bagging_freq > 0; no bagging_freq is set here, so this
    # is presumably a no-op -- confirm before relying on it.
    'bagging_fraction': 0.5,
}

cate_vars = ['demand-3500', 'demand-3000', 'demand-2800', 'demand-2500', 'demand-2000',
            'lat', 'lon', 'grid_id_by_lat', 'grid_id_by_lon']

# Filled by the record_evaluation callback with the per-iteration metrics.
evals_result = {}

print('Model {}'.format(round))

dtrain = lgb.Dataset(
            X_train, label=y_train,
            categorical_feature=cate_vars)

# NOTE(review): despite its name, `dval` wraps the holdout set, so early
# stopping is driven by the same data the holdout rmse is reported on.
dval = lgb.Dataset(
            X_holdout, label=y_holdout, reference=dtrain,
            categorical_feature=cate_vars)

# Recent LightGBM releases removed the `early_stopping_rounds`,
# `verbose_eval` and `evals_result` keyword arguments of lgb.train(); the
# equivalent callbacks work on both old and new releases. The logging
# callback is called `log_evaluation` in current releases and
# `print_evaluation` in older ones.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

model = lgb.train(
                params, dtrain, num_boost_round=MAX_ROUNDS,
                valid_sets=[dtrain, dval],
                callbacks=[
                    lgb.early_stopping(20),
                    log_evaluation(50),
                    lgb.record_evaluation(evals_result),
                ])

model_list.append(model)
# to do: save model to drive

### Handle predictions

In [None]:
# Handle prediction

# The original passed the step as a second print() argument, so "%s" was
# never substituted; format the message explicitly.
print("Predictions for T+%s" % (round + 1))

# Accumulate this step's predictions and labels (validation, then holdout).
y_pred.append(model.predict(X_val).tolist())
labels_list.append(y_val.tolist())

y_pred2.append(model.predict(X_holdout).tolist())
labels_list2.append(y_holdout.tolist())

# RMSE over every step collected so far, not just the current one.
val_preds = [item for items in y_pred for item in items]
val_labels = [item for items in labels_list for item in items]
val_rms = math.sqrt(mean_squared_error(val_labels, val_preds))
print("Validation rmse:", val_rms)

holdout_preds = [item for items in y_pred2 for item in items]
holdout_labels = [item for items in labels_list2 for item in items]
holdout_rms = math.sqrt(mean_squared_error(holdout_labels, holdout_preds))
print("Holdout rmse:", holdout_rms)

In [None]:
# Rolling forecast: advance every reference time by one 15-minute step.
# Rerun all cells starting from "Generate train, val, test sets" to get the next set of predictions
# Repeat until T+5 to get the overall holdout rmse

step = timedelta(minutes=15)

round = round + 1
dt_train += step
dt_val = dt_train + timedelta(weeks=1)  # validation stays one week after training
dt_holdout += step

In [None]:
# to do: save predictions
# save models here too if possible