In [None]:
from HTML import config as Config
from HTML.dataset import *
from HTML.config import ratio, nominal, ordinal, meaningless
from HTML.preprocessing import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load training set

In [None]:
# Read the raw training table from the path configured in HTML.config and
# preview the first rows.
df_train = pd.read_csv(Config.train_path)
df_train.head()

In [None]:
# Project helper (not defined in this notebook). Presumably adds the
# 'arrival_date' column that later cells read for the time index -- confirm in
# HTML.dataset.
df_train = add_arrival_date(df_train)

In [None]:
# Shape before preprocessing (compare with the shape displayed after it).
df_train.shape

In [None]:
# Run the project preprocessing function on the training frame; its steps are
# not visible from this notebook.
# NOTE: df_train is rebound to the result, so re-running this cell applies
# preprocessing to already-preprocessed data.
df_train = preprocessing(df_train)

In [None]:
# Shape after preprocessing.
df_train.shape

In [None]:
# Targets for the two tasks.
# Cancellation labels use every training row; ADR targets keep only rows whose
# 'adr' is below 5000 (the same filter is applied to the ADR feature matrix).
y_train_is_canceled = df_train['is_canceled'].to_numpy()
adr_below_cap = df_train['adr'] < 5000
y_train_adr = df_train.loc[adr_below_cap, 'adr'].to_numpy()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Dense one-hot encoder; categories unseen during fit are encoded as all-zero
# rows instead of raising (handle_unknown='ignore').
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed `sparse`
# in 1.4, so try the current keyword first and fall back for older releases.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
# Categorical feature block for the training set.
# Columns: every nominal/ordinal column declared in HTML.config, minus the
# identifiers, targets and date/agent/company fields listed below, in sorted
# order so the column layout is deterministic.
non_feature_cols = {
    'arrival_date', 'agent', 'company', 'adr', 'reservation_status',
    'reservation_status_date', 'is_canceled', 'ID', 'index',
}
cat_cols = sorted(set(nominal + ordinal) - non_feature_cols)
# Values are cast to str before encoding; the encoder is fitted here and only
# applied (not refitted) to the test set later.
categories = df_train[cat_cols].astype(str).to_numpy()
cat = enc.fit_transform(categories)

In [None]:
# (rows, one-hot columns) of the encoded training categoricals.
cat.shape

In [None]:
# Numeric feature block for the training set: every remaining column that is
# neither declared nominal/ordinal nor in the excluded set, in sorted order.
non_feature_cols = {
    'arrival_date', 'agent', 'company', 'adr', 'reservation_status',
    'reservation_status_date', 'is_canceled', 'ID', 'index',
}
num_cols = sorted(set(df_train.columns) - set(nominal + ordinal) - non_feature_cols)
num = df_train[num_cols].to_numpy()

In [None]:
# Full training design matrix: one-hot categoricals followed by numerics.
tmp = np.concatenate((cat, num), axis=1)
# The cancellation task uses every row.
x_train_is_canceled = tmp.copy()
# The ADR task keeps only rows with adr < 5000, matching y_train_adr.
adr_row_mask = (df_train['adr'] < 5000).to_numpy()
x_train_adr = tmp[adr_row_mask, :].copy()

In [None]:
# Sanity check: features and targets of each task should have the same row count.
print('adr:', x_train_adr.shape, y_train_adr.shape)
print('res:', x_train_is_canceled.shape, y_train_is_canceled.shape)

## Load testing set

In [None]:
# Read the raw test table from the path configured in HTML.config and preview
# the first rows.
df_test = pd.read_csv(Config.test_path)
df_test.head()

In [None]:
# Same project helper as applied to the training frame; 'arrival_date' is read
# from df_test later for the test time index.
df_test = add_arrival_date(df_test)

In [None]:
# Shape of the test frame.
df_test.shape

In [None]:
# Categorical feature block for the test set, using the same column rule as
# for training (nominal/ordinal columns minus the excluded set, sorted).
# NOTE(review): unlike df_train, df_test is not passed through preprocessing()
# in the visible cells -- confirm that is intended.
non_feature_cols = {
    'arrival_date', 'agent', 'company', 'adr', 'reservation_status',
    'reservation_status_date', 'is_canceled', 'ID', 'index',
}
cat_cols = sorted(set(nominal + ordinal) - non_feature_cols)
categories = df_test[cat_cols].astype(str).to_numpy()
# transform only: reuse the categories learned from the training set.
cat = enc.transform(categories)

In [None]:
# (rows, one-hot columns) of the encoded test categoricals; the column count
# should match the training encoding.
cat.shape

In [None]:
# Numeric feature block for the test set: remaining columns of df_test that are
# neither nominal/ordinal nor excluded, in sorted order.
# NOTE(review): the column list is derived from df_test.columns, not from
# df_train.columns; both must yield the same columns for x_test to line up
# with the training matrix -- confirm.
non_feature_cols = {
    'arrival_date', 'agent', 'company', 'adr', 'reservation_status',
    'reservation_status_date', 'is_canceled', 'ID', 'index',
}
num_cols = sorted(set(df_test.columns) - set(nominal + ordinal) - non_feature_cols)
num = df_test[num_cols].to_numpy()

In [None]:
# (rows, numeric columns) of the test numerics.
num.shape

In [None]:
# Test design matrix: one-hot categoricals followed by numerics, same layout
# as the training matrix.
tmp = np.concatenate((cat, num), axis=1)
x_test = tmp.copy()

In [None]:
# Sanity check: the column count should equal that of the training matrices.
print('test:', x_test.shape)

## Training adr

In [None]:
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Hyper-parameters of the ADR regressor, collected in one place so they are
# easy to inspect and tweak.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' request GPU execution, so this
# presumably needs a CUDA-enabled XGBoost build -- confirm for the target
# environment and XGBoost version.
xgb_adr_params = dict(
    # ensemble size / shrinkage / tree shape
    n_estimators=100,
    learning_rate=.3,
    max_depth=6,
    gamma=10,
    # row / column sampling
    subsample=1,
    colsample_bytree=.7,
    # reproducibility and execution backend
    random_state=1126,
    n_jobs=16,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    deterministic_histogram=False,
)
regr = XGBRegressor(**xgb_adr_params)

In [None]:
# Show the full effective parameter set of the regressor (including defaults).
regr.get_params()

In [None]:
# Arrival date of every row used by the ADR task. The training side applies the
# same adr < 5000 filter as x_train_adr / y_train_adr so the arrays stay
# row-aligned.
train_times = df_train.loc[df_train['adr'] < 5000, 'arrival_date'].to_numpy()
test_times = df_test['arrival_date'].to_numpy()

In [None]:
# Bind the generic names used by the walk-forward cells below to the ADR task.
# x_test, train_times and test_times already carry the right values and are
# used as-is.
model = regr
x_train = x_train_adr
y_train = y_train_adr
# Number of preceding arrival dates whose mean ADR is fed in as lag features.
window_size = 10

In [None]:
# Build lag features for the training rows: for each arrival date after the
# first `window_size` dates, every booking on that date gets the mean ADR of
# the `window_size` preceding dates (oldest first) as extra columns. Rows on
# the first `window_size` dates keep all-zero lag columns.
train_dates = np.unique(train_times)  # sorted distinct arrival dates
test_dates = np.unique(test_times)
time_x_train = np.zeros((x_train_adr.shape[0], window_size))
# Seed the rolling window with the daily mean ADR of the first `window_size` dates.
pred_y = np.array(
    [np.mean(y_train_adr[train_times == d]) for d in train_dates[:window_size]]
).reshape(1, -1)
for d in train_dates[window_size:]:
    on_date = train_times == d
    # The (1, window_size) row is broadcast to every booking on date d.
    time_x_train[on_date] = pred_y
    # Slide the window: drop the oldest mean, append the mean of date d.
    pred_y = np.roll(pred_y, -1, axis=1)
    pred_y[0, -1] = np.mean(y_train_adr[on_date])
# Always extend the untouched base matrix (x_train_adr) rather than x_train,
# so re-running this cell does not append a second set of lag columns.
# After the loop, pred_y holds the means of the last `window_size` training
# dates; the test-prediction cell continues from it.
x_train = np.concatenate((x_train_adr, time_x_train), axis=1)

In [None]:
# Walk-forward evaluation on the training period.
# The initial training pool is the second block of `window_size` dates (the
# first block has all-zero lag features). Then, for each later date: refit on
# the pool, predict that date's bookings, score them, and add the date's rows
# with their TRUE ADR to the pool.
result = []
warmup_dates = train_dates[window_size:2*window_size]
eval_dates = train_dates[2*window_size:]
selected = np.isin(train_times, warmup_dates)
cur_x_train = x_train[selected]
cur_y_train = y_train[selected]
N = len(eval_dates)
# Row-weighted running averages of the per-date MAE / MSE.
amae = 0
amse = 0
size = 0
for i, d in enumerate(eval_dates):
    model.fit(cur_x_train, cur_y_train)
    selected = (train_times == d)
    cur_x_test = x_train[selected]
    truth = y_train[selected]
    cur_y_test = model.predict(cur_x_test)
    result.append(cur_y_test)
    mae = np.mean(np.abs(cur_y_test - truth))
    mse = np.mean((cur_y_test - truth) ** 2)
    cur_size = np.sum(selected)
    seen = size + cur_size
    amae = (amae * size + mae * cur_size) / seen
    amse = (amse * size + mse * cur_size) / seen
    size += cur_size
    # Expanding window: keep everything seen so far and add today's rows.
    cur_x_train = np.concatenate((cur_x_train, cur_x_test), axis=0)
    cur_y_train = np.concatenate((cur_y_train, truth), axis=0)
    print(f'[{i+1}/{N}] Evaluating: MAE: {mae} / Avg. MAE: {amae} / MSE: {mse} / Avg. MSE {amse}', end='\r')

In [None]:
# Walk-forward prediction on the test period, continuing from the state left by
# the evaluation cell (cur_x_train / cur_y_train) and the lag-feature cell
# (pred_y).
# For each test date: refit, attach the current lag window to that date's rows,
# predict, then add the rows to the training pool with the PREDICTED ADR as
# target and push the date's mean prediction into the lag window.
# NOTE: this cell mutates cur_x_train, cur_y_train and pred_y, so re-running it
# without re-running the cells above continues from the mutated state.
pred = []
N = len(test_dates)
size = 0
for i, d in enumerate(test_dates):
    print(f'[{i+1}/{N}] Predicting', end='\r')
    model.fit(cur_x_train, cur_y_train)
    selected = (test_times == d)
    base_rows = x_test[selected]
    # Same lag window for every booking on this date.
    lag_rows = np.repeat(pred_y, base_rows.shape[0], axis=0)
    cur_x_test = np.concatenate((base_rows, lag_rows), axis=1)
    cur_y_test = model.predict(cur_x_test)
    pred.append(cur_y_test)
    cur_size = np.sum(selected)
    size += cur_size
    # Grow the pool with today's rows and their predicted targets.
    cur_x_train = np.concatenate((cur_x_train, cur_x_test), axis=0)
    cur_y_train = np.concatenate((cur_y_train, cur_y_test), axis=0)
    # Slide the lag window forward by one date.
    pred_y = np.roll(pred_y, -1, axis=1)
    pred_y[0,-1] = np.mean(cur_y_test)

In [None]:
# Ground-truth ADR for the evaluated rows, in the SAME order as
# np.concatenate(result): `result` is built date by date (ascending), keeping
# row order within each date. Selecting rows in plain DataFrame order only
# matches that when the rows are already sorted by arrival date, so order the
# evaluated rows by date with a stable sort (ties keep their row order).
eval_mask = np.isin(train_times, train_dates[2*window_size:])
eval_order = np.argsort(train_times[eval_mask], kind='stable')
real_y = y_train_adr[eval_mask][eval_order]

In [None]:
# Overall mean absolute error of the walk-forward predictions.
# Assumes np.concatenate(result) and real_y list the rows in the same order.
np.mean(np.abs(np.concatenate(result, axis=None) - real_y))

In [None]:
# Overall mean squared error of the walk-forward predictions.
# Assumes np.concatenate(result) and real_y list the rows in the same order.
np.mean((np.concatenate(result, axis=None) - real_y)**2)

In [None]:
# Histogram of the residuals (prediction minus true ADR) over the evaluated rows.
residuals = np.concatenate(result, axis=None) - real_y
fig, ax = plt.subplots()
ax.hist(residuals)
plt.show()

In [None]:
# Save the walk-forward training predictions.
# `result` is grouped by ascending arrival date, which matches the row order of
# the evaluated rows only if the data is already sorted by date. Scatter the
# predictions back so that entry k belongs to the k-th row (in DataFrame order)
# among the ADR-training rows whose date is in train_dates[2*window_size:].
# When the rows are already date-sorted this is a no-op.
train_pred_by_date = np.concatenate(result, axis=None)
eval_mask = np.isin(train_times, train_dates[2*window_size:])
eval_order = np.argsort(train_times[eval_mask], kind='stable')
train_pred_by_row = np.empty_like(train_pred_by_date)
train_pred_by_row[eval_order] = train_pred_by_date
np.save('./time_series_adr_train.npy', train_pred_by_row)

In [None]:
# Save the test predictions aligned with the rows of df_test.
# `pred` is grouped by ascending arrival date (row order kept within a date),
# which equals df_test row order only if the test set is sorted by date.
# A stable argsort of the dates gives, for each position in that date-grouped
# sequence, the row it came from; use it to scatter predictions back to row
# order. When the rows are already date-sorted this is a no-op.
test_pred_by_date = np.concatenate(pred, axis=None)
test_order = np.argsort(test_times, kind='stable')
test_pred_by_row = np.empty_like(test_pred_by_date)
test_pred_by_row[test_order] = test_pred_by_date
np.save('./time_series_adr_test.npy', test_pred_by_row)