In [None]:
from HTML import config as Config
from HTML.dataset import *
from HTML.config import ratio, nominal, ordinal, meaningless
# from HTML.preprocessing import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw training table from the path held in the project config module.
df_train = pd.read_csv(Config.train_path)
df_train.head()

In [None]:
# Row/column count of the raw table, before preprocessing is applied.
df_train.shape

In [None]:
def preprocessing(df):
    """Return a cleaned, feature-augmented copy of a bookings frame.

    The input frame is not modified. The returned frame:

    * has the 'company', 'ID' and 'index' columns removed when present;
    * has missing 'children' set to 0 and missing 'country' / 'agent' set to
      the string 'None';
    * gains the derived columns 'stays_in_nights', 'same_room_type',
      'persons', 'previous_booking', 'confirmed_lead_time' and
      'previous_cancellation_rate';
    * has every column listed in the module-level `ordinal`, `nominal` and
      `meaningless` lists (plus 'same_room_type') cast to str, and every
      remaining column cast to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings; must contain the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    result = df.copy()
    result = result.drop(columns=['company', 'ID', 'index'], errors='ignore')
    result = result.fillna({
        'children': 0,
        'country': 'None',
        'agent': 'None'
    })
    result['stays_in_nights'] = result['stays_in_weekend_nights'] + result['stays_in_week_nights']
    # Read from `result` like every other derived column (the original read from `df`).
    result['same_room_type'] = (result['assigned_room_type'] == result['reserved_room_type']).astype(str)
    result['persons'] = result['adults'] + result['children'] + result['babies']
    result['previous_booking'] = result['previous_cancellations'] + result['previous_bookings_not_canceled']
    result['confirmed_lead_time'] = result['lead_time'] - result['days_in_waiting_list']
    # Share of earlier bookings that were cancelled; guests with no earlier
    # bookings get the neutral value 0.5. Vectorised instead of a per-row loop.
    has_history = result['previous_booking'] > 0
    result['previous_cancellation_rate'] = (
        result['previous_cancellations']
        .div(result['previous_booking'])
        .where(has_history, .5)
    )
    # Sorted so the column order of the two dtype groups is deterministic.
    categories = sorted(set(result.columns) & set(ordinal + nominal + meaningless + ['same_room_type']))
    numeral = sorted(set(result.columns) - set(categories))
    result[categories] = result[categories].astype(str)
    result[numeral] = result[numeral].astype(float)
    return result

In [None]:
# Replace the raw frame with its preprocessed version.
# NOTE(review): this rebinds df_train, so re-running the cell feeds already-preprocessed data back into preprocessing().
df_train = preprocessing(df_train)
df_train.head()

In [None]:
# Shape after preprocessing (columns dropped and derived columns added).
df_train.shape

In [None]:
# Classification target: the cancellation flag of every booking.
y_train_is_canceled = df_train.loc[:, 'is_canceled'].to_numpy()
# Regression target: ADR, keeping only bookings whose ADR is below 5000.
y_train_adr = df_train.loc[df_train['adr'] < 5000, 'adr'].to_numpy()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder shared by the train and test feature matrices built below.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4 — confirm the pinned version.
# NOTE(review): handle_unknown is not set, so presumably unseen test categories make transform() fail — see the remapping done before the test transform.
enc = OneHotEncoder(drop='if_binary', sparse=False)

In [None]:
# Columns never used as model inputs: the two targets ('adr', 'is_canceled') plus other excluded columns.
not_features = {'arrival_date_year', 'adr', 'reservation_status', 'reservation_status_date', 'is_canceled'}

In [None]:
# Categorical inputs: every object-dtype column except not_features, in sorted name order, cast to str.
categories_train = df_train[sorted(list(set(df_train.columns[df_train.dtypes == object]) - not_features))].astype(str)
# Fit the encoder on the training categories and encode them in one step.
cat = enc.fit_transform(categories_train.to_numpy())

In [None]:
# Rows x categorical columns before encoding.
categories_train.shape

In [None]:
# Rows x encoded indicator columns.
cat.shape

In [None]:
# Numeric inputs: every float-dtype column except not_features, in sorted name order.
number = df_train[sorted(list(set(df_train.columns[df_train.dtypes == float]) - not_features))]
num = number.to_numpy()

In [None]:
# Rows x numeric feature columns.
num.shape

In [None]:
# Full design matrix: encoded categoricals followed by the numeric columns.
tmp = np.hstack((cat, num))
# The cancellation model sees every row.
x_train_is_canceled = np.copy(tmp)
# The ADR model only sees rows whose ADR is below 5000 (same filter as y_train_adr).
x_train_adr = tmp[(df_train['adr'] < 5000).to_numpy()].copy()

In [None]:
# dummies = pd.get_dummies(
#         data=df_train[set(df_train.columns) - {'reservation_status', 'reservation_status_date', 'is_canceled'}],
#         columns=set(nominal+ordinal) - {'reservation_status', 'reservation_status_date', 'is_canceled'},
#         drop_first=True,
#         dummy_na=True)
# dummies = dummies[sorted(set(dummies.columns) - {'index', 'ID'} - set(meaningless))]
# x_train_res = dummies[sorted(list(set(dummies.columns) - {'adr'}))].to_numpy()
# x_train_adr = dummies[dummies['adr'] < 5000][sorted(list(set(dummies.columns) - {'adr'}))].to_numpy()

In [None]:
# Sanity check: feature and target row counts must agree for each task.
print('adr:', x_train_adr.shape, y_train_adr.shape)
print('res:', x_train_is_canceled.shape, y_train_is_canceled.shape)

In [None]:
from sklearn.ensemble import VotingClassifier, VotingRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

In [None]:
# Hyper-parameters of the XGBoost ADR regressor (GPU histogram tree method, GPU predictor).
params_xgb_r = dict(
    objective='reg:squarederror',
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.3,
    gamma=10,
    gpu_id=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.3,
    max_delta_step=0,
    max_depth=6,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=-1,
    num_parallel_tree=1,
    random_state=1126,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.7,
    tree_method='gpu_hist',
    validate_parameters=1,
    verbosity=None,
    predictor='gpu_predictor',
    deterministic_histogram=False,
)
regr_xgb = XGBRegressor(**params_xgb_r)

In [None]:
# Hyper-parameters of the LightGBM ADR regressor.
params_lgbm_r = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=0.5,
    importance_type='split',
    learning_rate=0.1,
    max_depth=30,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=64,
    objective=None,
    random_state=1126,
    reg_alpha=0.0,
    reg_lambda=0.0,
    silent=True,
    subsample=0.5,
    subsample_for_bin=2000,
    subsample_freq=0,
)
regr_lgbm = LGBMRegressor(**params_lgbm_r)

In [None]:
# Hyper-parameters of the CatBoost ADR regressor (RMSE loss).
params_cat_r = dict(
    depth=8,
    l2_leaf_reg=1,
    rsm=1,
    loss_function='RMSE',
    border_count=254,
    subsample=0.8,
    n_estimators=100,
    random_state=1126,
    max_leaves=31,
)
regr_cat = CatBoostRegressor(**params_cat_r)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ADR rows for validation; the seed is fixed so the split is repeatable.
x_train_adr_t, x_train_adr_v, y_train_adr_t, y_train_adr_v = train_test_split(x_train_adr, y_train_adr, test_size=.2, random_state=1126)

In [None]:
# Fit the three ADR regressors on the training split and monitor the hold-out split.
# NOTE(review): early_stopping_rounds=1 stops at the first round without improvement — confirm this is intended.
regr_xgb.fit(
    x_train_adr_t, y_train_adr_t,
    eval_metric=['rmse', 'mae'],
    eval_set=[(x_train_adr_t, y_train_adr_t), (x_train_adr_v, y_train_adr_v)],
    early_stopping_rounds=1
)
regr_lgbm.fit(
    x_train_adr_t, y_train_adr_t,
    eval_metric=['rmse', 'mae'],
    eval_set=[(x_train_adr_t, y_train_adr_t), (x_train_adr_v, y_train_adr_v)],
    early_stopping_rounds=1
)
# CatBoost used to be fitted on the full x_train_adr with no eval_set, so its
# early stopping had nothing to monitor and the hold-out rows were used for
# training. It now uses the same train/validation split as the other two models.
regr_cat.fit(
    x_train_adr_t, y_train_adr_t,
    eval_set=(x_train_adr_v, y_train_adr_v),
    early_stopping_rounds=1
)

In [None]:
# Hyper-parameters of the XGBoost cancellation classifier (GPU histogram tree method, GPU predictor).
# NOTE(review): 'count:poisson' is a count-regression objective; for a 0/1 target
# 'binary:logistic' is the usual choice and keeps predict_proba within [0, 1].
# Presumably this came out of a parameter search — confirm before changing.
params_xgb_c = dict(
    objective='count:poisson',
    use_label_encoder=True,
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.7,
    gamma=10,
    gpu_id=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.3,
    max_delta_step=0.699999988,
    max_depth=10,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=-1,
    num_parallel_tree=1,
    random_state=1126,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=None,
    subsample=0.3,
    tree_method='gpu_hist',
    validate_parameters=1,
    verbosity=None,
    predictor='gpu_predictor',
    deterministic_histogram=False,
)
clf_xgb = XGBClassifier(**params_xgb_c)

In [None]:
# Hyper-parameters of the LightGBM cancellation classifier.
params_lgbm_c = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1,
    importance_type='split',
    learning_rate=0.1,
    max_depth=30,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=1126,
    reg_alpha=0.0,
    reg_lambda=0.0,
    silent=True,
    subsample=1,
    subsample_for_bin=2000000,
    subsample_freq=0,
)
clf_lgbm = LGBMClassifier(**params_lgbm_c)

In [None]:
# Hyper-parameters of the CatBoost cancellation classifier (Logloss).
params_cat_c = dict(
    depth=6,
    l2_leaf_reg=10,
    rsm=0.5,
    loss_function='Logloss',
    border_count=64,
    subsample=0.66,
    n_estimators=100,
    random_state=1126,
    max_leaves=31,
)
clf_cat = CatBoostClassifier(**params_cat_c)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all rows for validating the cancellation classifiers; the seed is fixed so the split is repeatable.
x_train_is_canceled_t, x_train_is_canceled_v, y_train_is_canceled_t, y_train_is_canceled_v = train_test_split(x_train_is_canceled, y_train_is_canceled, test_size=.2, random_state=1126)

In [None]:
# Fit the three cancellation classifiers on the training split and monitor the hold-out split.
# NOTE(review): XGBoost stops after 1 non-improving round while the other two wait 10 — confirm the 1 is not a typo.
clf_xgb.fit(
    x_train_is_canceled_t, y_train_is_canceled_t,
    eval_metric=['error', 'logloss'],
    eval_set=[(x_train_is_canceled_t, y_train_is_canceled_t), (x_train_is_canceled_v, y_train_is_canceled_v)],
    early_stopping_rounds=1
)
clf_lgbm.fit(
    x_train_is_canceled_t, y_train_is_canceled_t,
    eval_metric=['error', 'logloss'],
    eval_set=[(x_train_is_canceled_t, y_train_is_canceled_t), (x_train_is_canceled_v, y_train_is_canceled_v)],
    early_stopping_rounds=10
)
# CatBoost used to be fitted on the full matrix with no eval_set, so its early
# stopping had nothing to monitor and the hold-out rows were used for training.
# It now uses the same train/validation split as the other two models.
clf_cat.fit(
    x_train_is_canceled_t, y_train_is_canceled_t,
    eval_set=(x_train_is_canceled_v, y_train_is_canceled_v),
    early_stopping_rounds=10
)

In [None]:
# ADR predictions of each regressor on every ADR-filtered training row.
# NOTE(review): x_train_adr contains the rows the models were fitted on, so metrics computed from these predictions are optimistic.
pred_adr_xgb = regr_xgb.predict(x_train_adr)
pred_adr_lgbm = regr_lgbm.predict(x_train_adr)
pred_adr_cat = regr_cat.predict(x_train_adr)

In [None]:
# Per-model MAE and MSE of the ADR predictions against y_train_adr.
for name, p in (('xgb', pred_adr_xgb), ('lgbm', pred_adr_lgbm), ('cat', pred_adr_cat)):
    print(f'--- {name} ---')
    print('MAE:', np.abs(p - y_train_adr).mean())
    print('MSE:', np.square(p - y_train_adr).mean())

In [None]:
# Try every integer weighting (1..3 per model) of the three ADR predictions
# and print the MAE / MSE of the resulting weighted average.
for i, j, k in ((a, b, c) for a in (1, 2, 3) for b in (1, 2, 3) for c in (1, 2, 3)):
    # Equal weights all give the same average, so only [1, 1, 1] is kept.
    if i == j == k and i != 1:
        continue
    p = (i * pred_adr_xgb + j * pred_adr_lgbm + k * pred_adr_cat) / (i+j+k)
    print(f'[{i}, {j}, {k}]: MAE: {np.mean(np.abs(p - y_train_adr))} / MSE: {np.mean((p - y_train_adr)**2)}')

In [None]:
# Final ADR ensemble: weighted average with weights 3:3:1 (xgb:lgbm:cat).
pred_adr = (3 * pred_adr_xgb + 3 * pred_adr_lgbm + 1 * pred_adr_cat) / (3+3+1)

In [None]:
# Mean absolute error of the ensemble against y_train_adr.
np.mean(np.abs(pred_adr - y_train_adr))

In [None]:
# Mean squared error of the ensemble against y_train_adr.
np.mean((pred_adr - y_train_adr)**2)

In [None]:
# Class probabilities from each classifier on the ADR-filtered rows (same rows as the ADR predictions).
prob_is_canceled_xgb = clf_xgb.predict_proba(x_train_adr)
prob_is_canceled_lgbm = clf_lgbm.predict_proba(x_train_adr)
prob_is_canceled_cat = clf_cat.predict_proba(x_train_adr)

In [None]:
# NOTE(review): `p` is whatever an earlier loop cell last bound to it — looks like a leftover inspection cell.
p

In [None]:
# Cancellation labels for the ADR-filtered rows, as integers.
y_train_is_canceled[df_train['adr'] < 5000].astype(int)

In [None]:
# Try every integer weighting (1..3 per model) of the averaged class
# probabilities and print the accuracy of the rounded class-1 probability.
y = y_train_is_canceled[df_train['adr'] < 5000].astype(int)
for i in [1,2,3]:
    for j in [1,2,3]:
        for k in [1,2,3]:
            # Equal weights all give the same average, so only [1, 1, 1] is kept.
            if len({i,j,k}) == 1 and i != 1:
                continue
            p = i * prob_is_canceled_xgb + j * prob_is_canceled_lgbm + k * prob_is_canceled_cat
            p /= (i+j+k)
            p = np.round(p[:,1])

            # The value printed under "Acc" used to be mean(|p - y|), which is the
            # error rate; report the share of matching labels instead.
            print(f'[{i}, {j}, {k}]: Acc: {np.mean(p == y)}')

In [None]:
# Hard class predictions from each classifier on the ADR-filtered rows.
pred_is_canceled_xgb = clf_xgb.predict(x_train_adr)
pred_is_canceled_lgbm = clf_lgbm.predict(x_train_adr)
pred_is_canceled_cat = clf_cat.predict(x_train_adr)

In [None]:
# Cast the predicted labels to int so they can be compared with the integer labels below.
pred_is_canceled_xgb = pred_is_canceled_xgb.astype(int)
pred_is_canceled_lgbm = pred_is_canceled_lgbm.astype(int)
pred_is_canceled_cat = pred_is_canceled_cat.astype(int)

In [None]:
# Per-model accuracy of the hard cancellation predictions on the ADR-filtered rows.
y = y_train_is_canceled[df_train['adr'] < 5000].astype(int)
for p, name in zip([pred_is_canceled_xgb, pred_is_canceled_lgbm, pred_is_canceled_cat],['xgb','lgbm','cat']):
    print(f'--- {name} ---')
    # The value printed under "Acc" used to be mean(|p - y|), which is the error
    # rate; report the share of matching labels instead.
    print('Acc:', np.mean(p == y))

In [None]:
# Weighted hard voting: try every integer weighting (1..3 per model) and print
# the accuracy of the vote on the ADR-filtered rows.
y = y_train_is_canceled[df_train['adr'] < 5000].astype(int)
for i in [1,2,3]:
    for j in [1,2,3]:
        for k in [1,2,3]:
            # Equal weights all give the same vote, so only [1, 1, 1] is kept.
            if len({i,j,k}) == 1 and i != 1:
                continue
            # Skip weightings where one model alone holds more than half the weight.
            if i > (i+j+k) / 2 or j > (i+j+k) / 2 or k > (i+j+k) / 2:
                continue
            p = i * pred_is_canceled_xgb + j * pred_is_canceled_lgbm + k * pred_is_canceled_cat
            # Predict "canceled" when at least half of the total weight votes for it.
            p = p >= (i+j+k) / 2
            # The value printed under "Acc" used to be mean(|p - y|), which is the
            # error rate; report the share of matching labels instead.
            print(f'[{i}, {j}, {k}]: Acc: {np.mean(p == y)}')

In [None]:
# Final cancellation ensemble: probabilities averaged with weights 1:2:1 (xgb:lgbm:cat),
# then the class-1 column rounded to a 0/1 label.
# NOTE(review): this rebinds `tmp`, which earlier held the training design matrix.
tmp = (1 * prob_is_canceled_xgb + 2 * prob_is_canceled_lgbm + 1 * prob_is_canceled_cat) / (1+2+1)
tmp = np.round(tmp[:,1]).astype(int)
# Distinct predicted labels.
np.unique(tmp)

In [None]:
tmp

In [None]:
# Accuracy of the ensemble labels on the ADR-filtered rows (1 - error rate).
1 - np.mean(np.abs(tmp - y_train_is_canceled[df_train['adr'] < 5000].astype(int)))

In [None]:
# Per-booking actual and predicted revenue for the ADR-filtered rows.
# Revenue = ADR x nights stayed x (1 - cancellation flag).
new_df_train = df_train.loc[df_train['adr'] < 5000].copy().reset_index()
new_df_train['revenue'] = (
    new_df_train['adr']
    * new_df_train[['stays_in_weekend_nights', 'stays_in_week_nights']].sum(axis=1)
    * (1 - new_df_train['is_canceled'].astype(int))
)
# The prediction arrays line up with the frame row by row after reset_index().
new_df_train['pred_adr'] = pred_adr
new_df_train['pred_is_canceled'] = tmp
new_df_train['pred_revenue'] = (
    new_df_train['pred_adr']
    * new_df_train[['stays_in_weekend_nights', 'stays_in_week_nights']].sum(axis=1)
    * (1 - new_df_train['pred_is_canceled'])
)
new_df_train[['revenue', 'pred_revenue']].head()

In [None]:
# Predicted vs. actual revenue per booking.
plt.scatter(new_df_train['pred_revenue'], new_df_train['revenue'], alpha=.3)
plt.show()

In [None]:
# Per-booking mean squared error of the revenue prediction.
np.mean((new_df_train['revenue'] - new_df_train['pred_revenue']).to_numpy() ** 2)

In [None]:
# Per-booking mean absolute error of the revenue prediction.
np.mean(np.abs((new_df_train['revenue'] - new_df_train['pred_revenue']).to_numpy()))

In [None]:
# preprocessing() cast these columns away from int; restore int before calling add_arrival_date.
# add_arrival_date is not defined in this notebook (presumably provided by the wildcard import from HTML.dataset);
# the next cell relies on it adding an 'arrival_date' column.
dates = ['arrival_date_year', 'arrival_date_day_of_month']
new_df_train[dates] = new_df_train[dates].astype(int)
new_df_train = add_arrival_date(new_df_train)

In [None]:
# Total actual and predicted revenue per arrival date.
new_df_train_sum = new_df_train[['revenue', 'pred_revenue', 'arrival_date']].groupby('arrival_date').sum()

In [None]:
new_df_train_sum.head()

In [None]:
# Predicted vs. actual revenue per arrival date.
plt.scatter(new_df_train_sum['pred_revenue'], new_df_train_sum['revenue'], alpha=.3)
plt.show()

In [None]:
# Daily mean squared error of the revenue prediction.
np.mean((new_df_train_sum['revenue'] - new_df_train_sum['pred_revenue']).to_numpy() ** 2)

In [None]:
# Daily mean absolute error of the revenue prediction.
np.mean(np.abs((new_df_train_sum['revenue'] - new_df_train_sum['pred_revenue']).to_numpy()))

In [None]:
# Load the per-date labels and join them to the daily revenue sums on arrival_date.
y_train = pd.read_csv(Config.train_label_path)
train = new_df_train_sum.merge(y_train, left_index=True, right_on='arrival_date')

In [None]:
# Predicted and actual daily revenue, each plotted against the label.
plt.scatter(train['pred_revenue'], train['label'], alpha=.5)
plt.scatter(train['revenue'], train['label'], alpha=.5)
plt.show()

In [None]:
# Mean absolute gap between floor(actual revenue / 10000) and the label.
np.mean(np.abs(np.floor(train['revenue'] / 10000) - train['label']))

In [None]:
# Same gap, using the predicted revenue.
np.mean(np.abs(np.floor(train['pred_revenue'] / 10000) - train['label']))

In [None]:
# Same gap, without flooring the predicted revenue.
np.mean(np.abs(train['pred_revenue'] / 10000 - train['label']))

In [None]:
# Actual, then predicted, daily revenue in the row order of the merged frame.
plt.plot(train['revenue'])
plt.show()
plt.plot(train['pred_revenue'])
plt.show()

In [None]:
# Read the raw test table from the path held in the project config module.
df_test = pd.read_csv(Config.test_path)
df_test.head()

In [None]:
# Apply the same preprocessing as for the training table.
# NOTE(review): this rebinds df_test, so re-running the cell feeds already-preprocessed data back into preprocessing().
df_test = preprocessing(df_test)
df_test.head()

In [None]:
# `num` still holds the training numeric matrix at this point.
num.shape

In [None]:
df_test.shape

In [None]:
# Categorical test inputs, selected the same way as for the training set.
categories_test = df_test[sorted(list(set(df_test.columns[df_test.dtypes == object]) - not_features))].astype(str)
# The encoder was fitted on the training categories only. For columns that have
# a 'None' level in training, map test categories never seen in training onto 'None'.
# NOTE(review): columns without a 'None' level are left as-is, so an unseen
# category there will presumably still make enc.transform fail.
for c in categories_train.columns:
    if 'None' in categories_train[c].unique():
        unknown = set(categories_test[c].unique()) - set(categories_train[c].unique())
        # Single .loc assignment instead of chained indexing, which triggers
        # SettingWithCopyWarning and has no effect under pandas copy-on-write.
        categories_test.loc[categories_test[c].isin(unknown), c] = 'None'
# NOTE: this rebinds `cat`, which previously held the encoded training matrix.
cat = enc.transform(categories_test.to_numpy())

In [None]:
# Rows x categorical columns of the test set before encoding.
categories_test.shape

In [None]:
# Rows x encoded indicator columns; the column count should match the training matrix.
cat.shape

In [None]:
# Numeric test inputs: every float-dtype column except not_features, in sorted name order.
# NOTE: this rebinds `number` / `num`, which previously held the training values.
number = df_test[sorted(list(set(df_test.columns[df_test.dtypes == float]) - not_features))]
num = number.to_numpy()

In [None]:
num.shape

In [None]:
# Test design matrix: encoded categoricals followed by the numeric columns, as for training.
tmp = np.concatenate((cat, num), axis=1)
x_test = tmp.copy()

In [None]:
print('test:', x_test.shape)

In [None]:
# ADR predictions of each regressor on the test set.
# NOTE: from here on the pred_* / prob_* / tmp names are rebound to test-set values.
pred_adr_xgb = regr_xgb.predict(x_test)
pred_adr_lgbm = regr_lgbm.predict(x_test)
pred_adr_cat = regr_cat.predict(x_test)

In [None]:
# Same 3:3:1 weighting (xgb:lgbm:cat) as used on the training set.
pred_adr = (3 * pred_adr_xgb + 3 * pred_adr_lgbm + 1 * pred_adr_cat) / (3+3+1)

In [None]:
# Class probabilities from each classifier on the test set.
prob_is_canceled_xgb = clf_xgb.predict_proba(x_test)
prob_is_canceled_lgbm = clf_lgbm.predict_proba(x_test)
prob_is_canceled_cat = clf_cat.predict_proba(x_test)

In [None]:
# Same 1:2:1 weighting (xgb:lgbm:cat) as used on the training set; class-1 column rounded to a 0/1 label.
tmp = (1 * prob_is_canceled_xgb + 2 * prob_is_canceled_lgbm + 1 * prob_is_canceled_cat) / (1+2+1)
tmp = np.round(tmp[:,1]).astype(int)
# Distinct predicted labels.
np.unique(tmp)

In [None]:
tmp

In [None]:
# Per-booking predicted revenue on the test set:
# predicted ADR x nights stayed x (1 - predicted cancellation flag).
new_df_test = df_test.copy().reset_index()
new_df_test['pred_adr'] = pred_adr
new_df_test['pred_is_canceled'] = tmp
new_df_test['pred_revenue'] = new_df_test['pred_adr'] * np.sum(new_df_test[['stays_in_weekend_nights', 'stays_in_week_nights']], axis=1) * (1-new_df_test['pred_is_canceled'])
new_df_test[['pred_adr', 'pred_is_canceled', 'pred_revenue']].head()

In [None]:
# Cast the training cancellation flag to int so describe() treats it as numeric.
new_df_train['is_canceled'] = new_df_train['is_canceled'].astype(int)
new_df_train[['pred_adr', 'adr', 'pred_is_canceled', 'is_canceled', 'pred_revenue', 'revenue']].describe()

In [None]:
# Summary statistics of the test predictions, for comparison with the training summary above.
new_df_test[['pred_adr', 'pred_is_canceled', 'pred_revenue']].describe()

In [None]:
# preprocessing() cast these columns away from int; restore int before calling add_arrival_date.
# add_arrival_date is not defined in this notebook (presumably provided by the wildcard import from HTML.dataset);
# the next cell relies on it adding an 'arrival_date' column.
dates = ['arrival_date_year', 'arrival_date_day_of_month']
new_df_test[dates] = new_df_test[dates].astype(int)
new_df_test = add_arrival_date(new_df_test)

In [None]:
# Total predicted revenue per arrival date. Only the column used downstream is
# aggregated, matching the training aggregation; summing the whole frame would
# also try to sum the string columns (concatenation in pandas 2.x).
new_df_test_sum = new_df_test[['pred_revenue', 'arrival_date']].groupby('arrival_date').sum()

In [None]:
new_df_test_sum.head()

In [None]:
# Summary statistics of the predicted daily revenue on the test set.
new_df_test_sum['pred_revenue'].describe()

In [None]:
# Daily revenue distributions: actual (train), predicted (train) and predicted (test).
new_df_train_sum['revenue'].hist(density=True, alpha=.4)
new_df_train_sum['pred_revenue'].hist(density=True, alpha=.4)
new_df_test_sum['pred_revenue'].hist(density=True, alpha=.4)
plt.legend(['train (actual)', 'train (pred)', 'test'])
plt.show()

In [None]:
# Predicted daily revenue over the test dates.
plt.plot(new_df_test_sum['pred_revenue'])
plt.show()

In [None]:
# Number of distinct arrival dates in the training set.
len(new_df_train_sum)

In [None]:
# Number of distinct arrival dates in the test set.
len(new_df_test_sum)

In [None]:
# Last 200 points of actual training revenue followed by predicted test revenue.
plt.plot((new_df_train_sum['revenue'].to_list() + new_df_test_sum['pred_revenue'].to_list())[-200:])
plt.show()

In [None]:
# Arrival dates, in the order of the daily sums, used as the submission index.
test_index = new_df_test_sum.index

In [None]:
# Label = floor(predicted daily revenue / 10000), the same scaling compared against the training labels.
label = np.floor(new_df_test_sum['pred_revenue'].to_numpy() / 10000)

In [None]:
label

In [None]:
# Hand the per-date labels to the project's save_prediction helper together with the output path.
# NOTE(review): the path contains a doubled slash ('outputs//') — presumably harmless, but worth tidying.
from HTML.save import save_prediction
save_prediction('../outputs//votingv2.csv', test_index, label)