#### Imports and datasets

In [1]:
from datetime import date, timedelta, datetime
import math
import pickle
import gc

from functools import partial
import multiprocessing
from multiprocessing import Pool
# Worker count for the multiprocessing Pool: all CPU cores but one, with a floor of one.
num_core = max(multiprocessing.cpu_count()-1,1)

import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from utils import parse_datetime, transform_df
from utils import prep_holdout_set, get_timespan_15, gen_features, gen_test_features, prep_dataset
from utils import TIME_FEATURES, NUM_LABELS, NUM_REQ

In [3]:
# Raw training and (mock) holdout data
df_train_raw = pd.read_csv('./data_raw/training.csv')
df_holdout_raw = pd.read_csv('./data_raw/mock_holdout.csv')

# Cluster table, keyed by geohash6
cluster_df = pd.read_csv("./data_temp/cluster_df.csv").set_index('geohash6')

In [4]:
# Combine the training and holdout sets into one ground-truth frame from which
# features and labels are extracted.
# NOTE(review): the train/test split happens later; the intent is presumably that
# test samples are NOT leaked into the train set — confirm against the split logic.

df_raw = pd.concat([df_train_raw, df_holdout_raw], axis=0)

In [5]:
# Preprocessing

# Largest value found in the 'day' column of the combined data
NUM_DAYS = max(df_raw['day'])
START_DATE = date(2019, 4, 1) # arbitrary value chosen for first day of dataset
# April has 30 days, May 31, making up the 61 days in train set

# Called for its side effect only (return value is unused); presumably adds a
# 'datetime' column to df_raw in place — confirm in utils.parse_datetime.
parse_datetime(df_raw, START_DATE) # takes a few mins
df = transform_df(df_raw, START_DATE, NUM_DAYS)

In [6]:
# Assuming T is random and multiple for each location.
# Remove samples from the holdout set that cannot be used as T
# (i.e. last 5 time stamps from holdout set).

# Called for its side effect; the 'datetime' column read below is presumably
# added in place by this call — confirm in utils.parse_datetime.
parse_datetime(df_holdout_raw, START_DATE)

# Vectorised Series.max() instead of the builtin max(), which walks the column
# element by element in Python.
last_timestamp = df_holdout_raw['datetime'].max()

# Keep only rows strictly earlier than 60 minutes before the latest timestamp.
cutoff = last_timestamp - timedelta(minutes=60)
df_holdout_raw = df_holdout_raw[df_holdout_raw['datetime'] < cutoff]

In [7]:
# Drop names that are no longer needed, then force a garbage-collection pass.
# gc.collect() is the cell's last expression, so its return value is displayed as output.
del df_train_raw, df_raw, last_timestamp
gc.collect()

114

#### Create intermediate holdout set from which to extract holdout features

In [8]:
# Prepare the inputs for the parallel step that generates the "transformed"
# holdout set, which is then used to generate the test set.
# That step takes quite some time, even with multiprocessing:
# 3 cores: 17 min for 300k rows in holdout set.

# Subsample to speed things up (tests show not much difference in performance).
# Comment this line out to run on the full holdout set.
# NOTE: re-running this cell without re-running the earlier cells samples the
# already-sampled frame again.
df_holdout_raw = df_holdout_raw.sample(frac=0.1, random_state=8)

# Split into num_core contiguous, positional partitions for multiprocessing.
# Every partition holds partition_size rows except the last one, which also
# absorbs the remainder rows left over by the integer division.
n_rows = len(df_holdout_raw.index)
partition_size = n_rows // num_core
bounds = [k * partition_size for k in range(num_core)] + [n_rows]
holdout_partitions = [
    df_holdout_raw.iloc[start:stop, :]
    for start, stop in zip(bounds[:-1], bounds[1:])
]

In [9]:
# Run prep_holdout_set over every partition in parallel, then stitch the
# per-partition outputs back together in partition order.
results = []
if __name__ == '__main__':
    with Pool(num_core) as p:
        # Pool.map blocks until all partitions are processed and returns the
        # results in the same order as holdout_partitions.
        results = p.map(partial(prep_holdout_set, df=df), holdout_partitions)

dt_holdout = datetime(2019, 5, 20) # arbitrary

# Each result is indexed as result[0] (rows) and result[1] (geohashes);
# flatten both across partitions without repeated list concatenation.
geohash_holdout = [geohash for part in results for geohash in part[1]]
df_holdout = pd.DataFrame([row for part in results for row in part[0]])

# Label the columns with 15-minute steps spanning NUM_REQ steps before
# dt_holdout through NUM_LABELS steps after it.
df_holdout.columns = pd.date_range(
    dt_holdout - timedelta(minutes=15 * NUM_REQ),
    dt_holdout + timedelta(minutes=15 * NUM_LABELS),
    freq="15min",
)

df_holdout['geohash6'] = geohash_holdout
df_holdout = df_holdout.set_index('geohash6')

# Timestamps of the holdout rows, plus a 15-minute offset with one entry per row
dt_holdout_list = df_holdout_raw['datetime']
time_inc = pd.to_timedelta([timedelta(minutes=15)] * len(dt_holdout_list), unit='minutes')

del results

#### Load models, prepare test feature set and predict

In [43]:
y_pred, labels_list = [], []

# Load the pickled scaler. The `with` block closes the file even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code — only load scaler files
# that come from a trusted source.
with open('./models/scaler.bin', 'rb') as myFile:
    scaler = pickle.load(myFile)

In [44]:
# Build the holdout features and labels, then scale the features with the loaded scaler
X_holdout, y_holdout = gen_test_features(
    dt_holdout - timedelta(minutes=15),
    df_holdout,
    geohash_holdout,
    dt_holdout_list - time_inc,
    cluster_df,
)
# Slice assignment overwrites the values while keeping the X_holdout object itself
X_holdout[:] = scaler.transform(X_holdout)

# Load the trained LightGBM model from disk and predict
model = lgb.Booster(model_file='./models/model_1.txt')

print("Predicting...")

y_pred = model.predict(X_holdout).tolist()
labels_list = y_holdout.tolist()

# Overall RMSE on the holdout set (arguments passed as y_true, y_pred)
rms = math.sqrt(mean_squared_error(labels_list, y_pred))
print("Overall holdout rmse:", rms)

Predicting...
Overall holdout rmse: 0.03260220766803412


In [45]:
# Attach the predictions to the holdout rows and write the selected columns to CSV

df_holdout_raw['prediction'] = y_pred
output_columns = ['geohash6', 'timestamp', 'demand', 'prediction']
df_holdout_raw.to_csv('./predictions.csv', columns=output_columns, index=False)