In [0]:
#Input Files
import pandas as pd
import time
import numpy as np
from multiprocessing import Pool, Manager
import random

# Path to the CSV with the demand records that the prediction is based on
test_csv_path = "gdrive/My Drive/GrabAI/training.csv"
test_df = pd.read_csv(test_csv_path)
# Path to the CSV listing the geohashes (ships in the same directory as this notebook)
geohash_csv_path = "gdrive/My Drive/GrabAI/geohash_df.csv"
geohash_df = pd.read_csv(geohash_csv_path)

In [0]:
# Keep only the rows from the latest day in the data and the two days before it,
# i.e. the window of records leading up to T
max_day = test_df['day'].max()
test_df = test_df.loc[test_df['day'].ge(max_day - 2)]

In [0]:
#Convert each (day, timestamp) pair to minutes elapsed since 00:00 of day 1

tic = time.time()
def parallelize_dataframe_time(df, func, n_workers=20):
  """Apply ``func`` to row-wise fragments of ``df`` in parallel and re-join them.

  Args:
    df: DataFrame to process.
    func: Picklable callable that takes one DataFrame fragment and returns a
      DataFrame; it runs in a worker process.
    n_workers: Number of fragments and of worker processes (default 20, the
      value that used to be hard-coded).

  Returns:
    The concatenation of the per-fragment results, in fragment order.
  """
  # Split the row *positions* and slice with iloc. Passing the DataFrame itself
  # to np.array_split goes through DataFrame.swapaxes, which pandas deprecated.
  row_chunks = np.array_split(np.arange(len(df)), n_workers)
  fragment_array = [df.iloc[rows] for rows in row_chunks]
  # The context manager tears the workers down even when func raises;
  # on the success path the pool is still closed and joined gracefully.
  with Pool(n_workers) as pool:
    results = pool.map(func, fragment_array)
    pool.close()
    pool.join()
  return pd.concat(results)

def get_time(x):
  """Convert a (day, 'hour:minute') pair to minutes elapsed since 00:00 of day 1.

  Args:
    x: Sequence-like whose first item is the day number (day 1 maps to
      offset 0) and whose second item is a string of the form 'H:M'.
      A DataFrame row (Series) or a plain tuple/list both work.

  Returns:
    int: ``(day - 1) * 24 * 60 + hour * 60 + minute``.

  Raises:
    ValueError: if the timestamp does not split into exactly two integer parts.
  """
  # Unpack by iteration instead of x[0] / x[1]: on a Series with string labels,
  # integer keys only work through a deprecated positional fallback.
  day, stamp, *_ = x
  hour, minute = stamp.split(':')
  return int(day - 1) * 24 * 60 + int(hour) * 60 + int(minute)

def test_func_time(data):
  fragment_id = random.randint(0,999)
  print('Running: {}'.format(fragment_id))
  data['time'] = data[['day','timestamp']].apply(get_time, axis=1)
  print('Done: {}'.format(fragment_id))
  return data

# Run the conversion over the worker pool, then report the elapsed seconds since tic
test_df = parallelize_dataframe_time(df=test_df, func=test_func_time)
toc = time.time()
print(toc - tic)

In [0]:
#Get the demand snapshot for each of the last 97 time values (15-minute steps back from the latest time)
import itertools

tic = time.time()
# Latest time value in the data; the snapshots count backwards from it
max_time = test_df['time'].max()
# 97 time values spaced 15 apart, newest first, stored as floats
time_list = (max_time - 15 * np.arange(97)).astype(float)

# Flatten the geohash table row by row and map each geohash to its position
geohash_rows = geohash_df.values.tolist()
n_unique_geohash = len(geohash_rows)
unique_geohash_list = list(itertools.chain.from_iterable(geohash_rows))
geohash_dict = {
  geohash: position
  for position, geohash in enumerate(unique_geohash_list[:n_unique_geohash])
}
demand_df = pd.DataFrame({'time': time_list})
# Number of fragments / worker processes used by parallelize_dataframe
processes = 40

def parallelize_dataframe(df, func, n_workers=None):
  """Apply ``func`` to row-wise fragments of ``df`` in parallel and re-join them.

  Args:
    df: DataFrame to process.
    func: Picklable callable that takes one DataFrame fragment and returns a
      DataFrame; it runs in a worker process.
    n_workers: Number of fragments and of worker processes. When omitted, the
      module-level ``processes`` value is used, as before.

  Returns:
    The concatenation of the per-fragment results, in fragment order.
  """
  if n_workers is None:
    n_workers = processes
  # Split the row *positions* and slice with iloc. Passing the DataFrame itself
  # to np.array_split goes through DataFrame.swapaxes, which pandas deprecated.
  row_chunks = np.array_split(np.arange(len(df)), n_workers)
  fragment_array = [df.iloc[rows] for rows in row_chunks]
  # The context manager tears the workers down even when func raises;
  # on the success path the pool is still closed and joined gracefully.
  with Pool(n_workers) as pool:
    results = pool.map(func, fragment_array)
    pool.close()
    pool.join()
  return pd.concat(results)

def get_demand_snapshot(x):
  """Build the demand vector over all known geohashes for one time value.

  Reads the module-level ``test_df``, ``geohash_dict`` and ``n_unique_geohash``.

  Args:
    x: Time value to select; rows of ``test_df`` whose 'time' equals ``x``
      are used.

  Returns:
    numpy array of length ``n_unique_geohash``. Position ``geohash_dict[g]``
    holds the 'demand' of geohash ``g`` at time ``x``; positions with no
    matching row stay 0. If a geohash occurs more than once, the last row wins.

  Raises:
    KeyError: if a selected row has a 'geohash6' missing from ``geohash_dict``.
  """
  demand_snapshot = np.zeros(n_unique_geohash)
  df_snapshot = test_df[test_df['time'] == x]
  # Walk just the two needed columns. iterrows() would build a full Series for
  # every row only to read these two fields.
  for geohash, demand in zip(df_snapshot['geohash6'], df_snapshot['demand']):
    demand_snapshot[geohash_dict[geohash]] = demand
  return demand_snapshot

def test_func_demand(data):
  """Attach a 'demand_snapshot' column to a fragment of the time table.

  Each value in ``data['time']`` is passed to ``get_demand_snapshot``. The
  fragment is modified in place and returned. Progress is printed at start and
  finish, and the shared ``n_done`` counter is incremented once per fragment.
  """
  # Random tag, used only to pair up the start/finish progress messages
  tag = random.randint(0,999)
  print('Running: {}'.format(tag))
  snapshots = data['time'].apply(get_demand_snapshot)
  data['demand_snapshot'] = snapshots
  # NOTE(review): read-modify-write on a shared list with no lock; the printed
  # count may be off when fragments finish at the same moment -- confirm if it matters
  n_done[0] += 1
  print('Done: {}, {}/{}'.format(tag, n_done[0], processes))
  return data

# Shared single-slot counter, starting at zero, that test_func_demand increments
n_done = Manager().list([0])
demand_df = parallelize_dataframe(df=demand_df, func=test_func_demand)


# Report the elapsed seconds since tic
toc = time.time()
print(toc - tic)

In [0]:
#create array for model prediction
# Order the snapshots by ascending time, stack them into a 2-D array
# (one row per time value), then add a leading axis of size 1
demand_df.sort_values(by='time', inplace=True)
data_list = demand_df['demand_snapshot'].tolist()
data = pd.DataFrame(data_list)
test_X = np.expand_dims(data.to_numpy(), axis=0)

In [0]:
#Prediction Using Model
import keras

# Path to the saved model (ships in the same directory as this notebook)
model_path = 'gdrive/My Drive/GrabAI/model_30_96_25.h5'
new_model = keras.models.load_model(model_path)
yhat = new_model.predict(test_X)
names = []
predictions_df = pd.DataFrame()
predictions_df['geohash'] = unique_geohash_list
# yhat[0] is read as five consecutive runs of n_unique_geohash values,
# one run per column 'T + 1' .. 'T + 5'
for horizon in range(1, 6):
  start = (horizon - 1) * n_unique_geohash
  end = horizon * n_unique_geohash
  demand = yhat[0][start:end]
  predictions_df['T + {}'.format(horizon)] = demand

In [0]:
# Display the predictions table: one row per geohash, columns 'geohash' and 'T + 1' .. 'T + 5'
predictions_df