In [None]:
from HTML import config as Config
from HTML.dataset import *
from HTML.config import ratio, nominal, ordinal, meaningless
from HTML.preprocessing import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw training bookings from the path configured in HTML.config.
df_train = pd.read_csv(Config.train_path)
df_train.head()

In [None]:
# Shape before preprocessing, for comparison with the shape shown after it.
df_train.shape

In [None]:
# Run the project preprocessing step. df_train is rebound to the result, so
# re-running this cell on its own feeds already-preprocessed data back in.
df_train = preprocessing(df_train)

In [None]:
# Shape after preprocessing.
df_train.shape

In [None]:
# Classification target: integer codes of reservation_status.
# `fact` keeps the (codes, unique values) pair so codes can be mapped back.
status_codes, status_values = pd.factorize(df_train['reservation_status'])
fact = (status_codes, status_values)
y_train_res = status_codes
# Regression target: adr of the rows whose adr is below 5000.
adr_below_cap = df_train['adr'] < 5000
y_train_adr = df_train.loc[adr_below_cap, 'adr'].to_numpy()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Dense one-hot encoder that ignores categories unseen during fit.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
categories = df_train[sorted(list(set(nominal+ordinal) - {'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].astype(str).to_numpy()
cat = enc.fit_transform(categories)

In [None]:
cat.shape

In [None]:
num = df_train[sorted(list(set(df_train.columns) - set(nominal+ordinal) - {'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].to_numpy()

In [None]:
num.shape

In [None]:
# Full feature matrix: one-hot columns followed by the pass-through columns.
tmp = np.concatenate([cat, num], axis=1)
# The classifier trains on every row ...
x_train_res = tmp.copy()
# ... the regressor only on rows whose adr is below 5000 (same filter as y_train_adr).
adr_row_mask = (df_train['adr'] < 5000).to_numpy()
x_train_adr = tmp[adr_row_mask, :].copy()

In [None]:
# dummies = pd.get_dummies(
#         data=df_train[set(df_train.columns) - {'reservation_status', 'reservation_status_date', 'is_canceled'}],
#         columns=set(nominal+ordinal) - {'reservation_status', 'reservation_status_date', 'is_canceled'},
#         drop_first=True,
#         dummy_na=True)
# dummies = dummies[sorted(set(dummies.columns) - {'index', 'ID'} - set(meaningless))]
# x_train_res = dummies[sorted(list(set(dummies.columns) - {'adr'}))].to_numpy()
# x_train_adr = dummies[dummies['adr'] < 5000][sorted(list(set(dummies.columns) - {'adr'}))].to_numpy()

In [None]:
# Sanity check: each feature matrix should have as many rows as its target.
print('adr:', x_train_adr.shape, y_train_adr.shape)
print('res:', x_train_res.shape, y_train_res.shape)

In [None]:
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Hyper-parameters of the adr regressor, gathered in one place.
regr_params = dict(
    n_estimators=100,
    learning_rate=.3,
    max_depth=6,
    subsample=1,
    n_jobs=6,
    colsample_bytree=.7,
    random_state=1126,
    gamma=10,
)
regr = XGBRegressor(**regr_params)
# Fit on the adr-filtered rows only.
regr.fit(x_train_adr, y_train_adr)

In [None]:
# Classifier for the factorized reservation_status codes.
# 'multi:softprob' is the XGBoost objective that outputs one probability per
# class; 'multi:prob' is not a registered objective. num_class is derived from
# the factorized labels (fact[1] holds the unique status values) instead of
# being hard-coded, so it always matches the data.
clf = XGBClassifier(
    objective='multi:softprob',
    n_estimators=10,
    learning_rate=.3,
    max_depth=6,
    subsample=.3,
    n_jobs=6,
    colsample_bytree=.9,
    random_state=1126,
    gamma=10,
    num_class=len(fact[1])
)
clf.fit(x_train_res, y_train_res)

In [None]:
# In-sample predictions on the adr-filtered rows, so both arrays line up
# row-for-row with the adr-filtered frame they are attached to further down.
pred_adr = regr.predict(x_train_adr)
prob_res = clf.predict_proba(x_train_adr)

In [None]:
# Predicted cancellation probability, taken as 1 - P(class 0).
# NOTE(review): class 0 is whichever reservation_status value pd.factorize
# encountered first in df_train; presumably that is the non-cancelled status —
# confirm by inspecting fact[1][0].
pred_res = 1 - prob_res[:,0].copy()

In [None]:
pred_res

In [None]:
# Hard class predictions; in the visible cells they are only referenced by a
# commented-out line.
tmp = clf.predict(x_train_adr)

In [None]:
# Keep only the rows the adr model was trained on and renumber them so the
# positional prediction arrays can be attached as columns.
new_df_train = df_train[df_train['adr'] < 5000].copy().reset_index()
# Total nights per booking (weekend + week nights).
train_nights = new_df_train[['stays_in_weekend_nights', 'stays_in_week_nights']].sum(axis=1)
# Actual revenue: adr * nights, zeroed for cancelled bookings.
new_df_train['revenue'] = new_df_train['adr'] * train_nights * (1 - new_df_train['is_canceled'])
new_df_train['pred_adr'] = pred_adr
# new_df_train['pred_is_canceled'] = (tmp != 0).astype(float)
new_df_train['pred_is_canceled'] = pred_res
# Expected revenue: predicted adr * nights * predicted probability of not cancelling.
new_df_train['pred_revenue'] = new_df_train['pred_adr'] * train_nights * (1 - new_df_train['pred_is_canceled'])
new_df_train[['revenue', 'pred_revenue']].head()
def calculate_revenue_label(x):
    """Aggregate predicted booking revenue per arrival date and bucket it.

    Parameters
    ----------
    x : pandas.DataFrame
        Bookings with the columns ``pred_adr``, ``pred_is_canceled``,
        ``stays_in_weekend_nights`` and ``stays_in_week_nights``, plus
        whatever ``add_arrival_date`` needs to provide ``arrival_date``.
        As a side effect, ``x`` gains (or has overwritten) a
        ``pred_revenue`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``arrival_date`` with two columns: ``pred_revenue`` (sum
        of the per-booking predicted revenue) and ``pred_label``
        (``floor(pred_revenue / 10000)`` as an integer).
    """
    # Use the frame returned by the helper for everything that needs
    # arrival_date, so this works whether the helper mutates x or returns a copy.
    dated = add_arrival_date(x)
    nights = dated[['stays_in_weekend_nights', 'stays_in_week_nights']].sum(axis=1)
    revenue = dated['pred_adr'] * nights * (1 - dated['pred_is_canceled'])
    # Preserve the original side effect on the caller's frame.
    x['pred_revenue'] = revenue
    predict_revenue = revenue.groupby(dated['arrival_date']).sum().to_frame('pred_revenue')
    # Floor (not truncate toward zero) so negative daily totals are bucketed
    # the same way as the np.floor-based labels used elsewhere in the notebook.
    predict_revenue['pred_label'] = np.floor(predict_revenue['pred_revenue'] / 10000).astype(int)
    return predict_revenue

calculate_revenue_label(new_df_train)

In [None]:
plt.scatter(new_df_train['pred_revenue'], new_df_train['revenue'])
plt.show()

In [None]:
np.mean((new_df_train['revenue'] - new_df_train['pred_revenue']).to_numpy() ** 2)

In [None]:
np.mean(np.abs((new_df_train['revenue'] - new_df_train['pred_revenue']).to_numpy()))

In [None]:
new_df_train = add_arrival_date(new_df_train)

In [None]:
new_df_train_sum = new_df_train.groupby('arrival_date').sum()

In [None]:
new_df_train_sum.head()

In [None]:
plt.scatter(new_df_train_sum['pred_revenue'], new_df_train_sum['revenue'])
plt.show()

In [None]:
np.mean((new_df_train_sum['revenue'] - new_df_train_sum['pred_revenue']).to_numpy() ** 2)

In [None]:
np.mean(np.abs((new_df_train_sum['revenue'] - new_df_train_sum['pred_revenue']).to_numpy()))

In [None]:
y_train = pd.read_csv(Config.train_label_path)
train = new_df_train_sum.merge(y_train, left_index=True, right_on='arrival_date')

In [None]:
plt.scatter(train['pred_revenue'], train['label'], alpha=.5)
plt.scatter(train['revenue'], train['label'], alpha=.5)
plt.show()

In [None]:
np.mean(np.abs(np.floor(train['revenue'] / 10000) - train['label']))

In [None]:
np.mean(np.abs(np.floor(train['pred_revenue'] / 10000) - train['label']))

In [None]:
np.mean(np.abs(train['pred_revenue'] / 10000 - train['label']))

In [None]:
# Load the test bookings from the path configured in HTML.config.
# NOTE(review): unlike df_train, df_test is not passed through preprocessing()
# in the visible cells — confirm that the test features match what the models
# were trained on.
df_test = pd.read_csv(Config.test_path)
df_test.head()

In [None]:
df_test.shape

In [None]:
categories = df_test[sorted(list(set(nominal+ordinal) - {'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].astype(str).to_numpy()
cat = enc.transform(categories)

In [None]:
cat.shape

In [None]:
num = df_test[sorted(list(set(df_test.columns) - set(nominal+ordinal) - {'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].to_numpy()

In [None]:
num.shape

In [None]:
tmp = np.concatenate((cat, num), axis=1)
x_test = tmp.copy()

In [None]:
print('test:', x_test.shape)

In [None]:
# Predictions for the test bookings; these rebind the names used earlier for
# the training predictions.
pred_adr = regr.predict(x_test)
prob_res = clf.predict_proba(x_test)

In [None]:
# Predicted cancellation probability, taken as 1 - P(class 0).
# NOTE(review): class 0 is the first reservation_status value seen by
# pd.factorize on the training data; presumably the non-cancelled status — confirm.
pred_res = 1- prob_res[:,0].copy()

In [None]:
pred_res

In [None]:
# Hard class predictions; in the visible cells they are only referenced by a
# commented-out line.
tmp = clf.predict(x_test)

In [None]:
new_df_test = df_test.copy().reset_index()
new_df_test['pred_adr'] = pred_adr
# new_df_test['pred_is_canceled'] = (tmp != 0).astype(float)
new_df_test['pred_is_canceled'] = pred_res
new_df_test['pred_revenue'] = new_df_test['pred_adr'] * np.sum(new_df_test[['stays_in_weekend_nights', 'stays_in_week_nights']], axis=1) * (1-new_df_test['pred_is_canceled'])
new_df_test[['pred_adr', 'pred_is_canceled', 'pred_revenue']].head()

In [None]:
new_df_test[['pred_adr', 'pred_is_canceled', 'pred_revenue']].describe()

In [None]:
new_df_test = add_arrival_date(new_df_test)

In [None]:
new_df_test_sum = new_df_test.groupby('arrival_date').sum()

In [None]:
new_df_test_sum.head()

In [None]:
new_df_test_sum['pred_revenue'].describe()

In [None]:
new_df_test_sum['pred_revenue'].hist()
plt.show()

In [None]:
# Arrival dates, in the order of the aggregated frame.
test_index = new_df_test_sum.index

In [None]:
# Daily label: predicted daily revenue divided by 10000, floored.
label = np.floor(new_df_test_sum['pred_revenue'].to_numpy() / 10000)

In [None]:
# Hand the dates and labels to the project's save helper.
from HTML.save import save_prediction
save_prediction('../outputs//xgboost2.csv', test_index, label)