In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb

import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from category_encoders import LeaveOneOutEncoder
import optuna
from sklearn.preprocessing import StandardScaler

In [2]:
from warnings import filterwarnings
filterwarnings("ignore")

# Loading Data

In [3]:
BASE_PATH = Path("/kaggle/input/playground-series-s3e7")

train = pd.read_csv(BASE_PATH / "train.csv").drop(columns="id")
train["is_original"] = 0
test = pd.read_csv(BASE_PATH / "test.csv")
# we'll need the test ids to make the submission file
test_idx = test.id
test = test.drop(columns="id")
test["is_original"] = 0

original = pd.read_csv("/kaggle/input/reservation-cancellation-prediction/train__dataset.csv")
original["is_original"] =  1

In [4]:
all_datasets = {"train": train,
               "test": test,
               "original": original}

# Preliminary Data Analysis

## Checking for Missing values

In [5]:
pd.concat([dataset.isnull().sum().rename(f"Missing in {dataset_name}") 
               for dataset_name, dataset in all_datasets.items()],
                 axis=1)

Unnamed: 0,Missing in train,Missing in test,Missing in original
no_of_adults,0,0.0,0
no_of_children,0,0.0,0
no_of_weekend_nights,0,0.0,0
no_of_week_nights,0,0.0,0
type_of_meal_plan,0,0.0,0
required_car_parking_space,0,0.0,0
room_type_reserved,0,0.0,0
lead_time,0,0.0,0
arrival_year,0,0.0,0
arrival_month,0,0.0,0


In [6]:
train['arrival_year_month'] = pd.to_datetime(train['arrival_year'].astype(str)
                                            +train['arrival_month'].astype(str), format='%Y%m')
test['arrival_year_month'] = pd.to_datetime(test['arrival_year'].astype(str)
                                            +test['arrival_month'].astype(str), format='%Y%m')
original["arrival_year_month"] = pd.to_datetime(original["arrival_year"].astype(str)
                                            +original["arrival_month"].astype(str), format="%Y%m")

train.loc[train.arrival_date > train.arrival_year_month.dt.days_in_month, 'arrival_date'] = train.arrival_year_month.dt.days_in_month
test.loc[test.arrival_date > test.arrival_year_month.dt.days_in_month, 'arrival_date'] = test.arrival_year_month.dt.days_in_month
original.loc[original.arrival_date > original.arrival_year_month.dt.days_in_month, 'arrival_date'] = original.arrival_year_month.dt.days_in_month

train.drop(columns='arrival_year_month', inplace=True)
test.drop(columns='arrival_year_month', inplace=True)
original.drop(columns="arrival_year_month", inplace=True)

# Checking for categorical values

In [7]:
pd.concat([train.dtypes.rename("Data Type")] + \
          [dataset.nunique().rename(f"{dataset_name} UniqueValues") for dataset_name, dataset in all_datasets.items()],
          axis=1).sort_values(by="train UniqueValues")

Unnamed: 0,Data Type,train UniqueValues,test UniqueValues,original UniqueValues
is_original,int64,1,1.0,1
repeated_guest,int64,2,2.0,2
booking_status,int64,2,,2
required_car_parking_space,int64,2,2.0,2
arrival_year,int64,2,2.0,2
type_of_meal_plan,int64,4,4.0,4
market_segment_type,int64,5,5.0,5
no_of_adults,int64,5,5.0,5
no_of_children,int64,6,6.0,5
no_of_special_requests,int64,6,6.0,6


In [8]:
cat_features = [col for col in train.columns if train[col].nunique() <= 31]

# removinng booking status and is_original
cat_features = cat_features[:-2]
cat_features

['no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'type_of_meal_plan',
 'required_car_parking_space',
 'room_type_reserved',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'market_segment_type',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_special_requests']

In [9]:
X = train.drop(columns="booking_status")
y = train.booking_status
X_original = original.drop(columns="booking_status")
y_original = original.booking_status

In [10]:
len_X = len(X)

In [11]:
X[cat_features] = X[cat_features].astype("category")
test[cat_features] = test[cat_features].astype("category")
X_original[cat_features] = X_original[cat_features].astype("category")

In [12]:
X_combined = pd.concat([X, X_original], axis=0)
y_combined = pd.concat([y, y_original], axis=0)

In [13]:
loe = LeaveOneOutEncoder(sigma=0.05)
loe.fit(X_combined[cat_features], y=y_combined)
X_combined[cat_features] = loe.transform(X_combined[cat_features])
test[cat_features] = loe.transform(test[cat_features])

In [32]:
test["no_of_children"] = test["no_of_children"].astype("int")

In [33]:
test["no_of_previous_cancellations"] = test["no_of_previous_cancellations"].astype("int")

In [14]:
numerical_features = ["lead_time", "avg_price_per_room"]

sc = StandardScaler()
sc.fit(X_combined[numerical_features])
X_combined[numerical_features] = sc.transform(X_combined[numerical_features])
test[numerical_features] = sc.transform(test[numerical_features])

In [15]:
X = X_combined.iloc[:len_X, :]
y = y_combined.iloc[:len_X]
X_org = X_combined.iloc[len_X: , :]
y_org = y_combined.iloc[len_X:]

In [16]:
len(X), len(X_org), len(y), len(y_org)

(42100, 18137, 42100, 18137)

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True,
                                                 random_state=1337, stratify=y)

In [21]:
X_train = pd.concat([X_train, X_org])
y_train = pd.concat([y_train, y_org])

# XGBoost

In [19]:
xgb_params = {'n_estimators': 366,
                 'max_depth': 10,
                 'learning_rate': 0.05,
                 'min_child_weight': 2,
                 'gamma': 0.00095,
                 'subsample': 0.85,
                 'colsample_bytree': 0.3,
                 'early_stoppig_rounds': 95}

xgb_clf = xgb.XGBClassifier(**xgb_params)

In [22]:
xgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

Parameters: { "early_stoppig_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.3,
              early_stoppig_rounds=95, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=0.00095,
              gpu_id=-1, grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_bin=256,
              max_cat_to_onehot=4, max_delta_step=0, max_depth=10, max_leaves=0,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=366, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [23]:
lgbm_params = {'n_estimators': 1852,
 'num_rounds': 444,
 'learning_rate': 0.271,
 'num_leaves': 276,
 'max_depth': 7,
 'min_data_in_leaf': 237,
 'lambda_l1': 0.0004,
 'lambda_l2': 0.025,
 'min_gain_to_split': 0.04,
 'bagging_fraction': 0.35,
 'feature_fraction': 0.47,
 'early_stopping_rounds': 55}

lgbm_clf = lgbm.LGBMClassifier(objective="binary", is_unbalance=True, **lgbm_params)

In [24]:
lgbm_clf.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            verbose=-1,
        )



LGBMClassifier(bagging_fraction=0.35, early_stopping_rounds=55,
               feature_fraction=0.47, is_unbalance=True, lambda_l1=0.0004,
               lambda_l2=0.025, learning_rate=0.271, max_depth=7,
               min_data_in_leaf=237, min_gain_to_split=0.04, n_estimators=1852,
               num_leaves=276, num_rounds=444, objective='binary')

In [28]:
cat_params = {'n_estimators': 1733, 'loss_function': 'CrossEntropy', 
              'learning_rate': 0.24, 'l2_leaf_reg': 0.35, 
              'colsample_bylevel': 0.0554046992591773, 'depth': 7, 
              'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 
              'min_data_in_leaf': 8, 'one_hot_max_size': 18, 'early_stopping_rounds': 103
             }
cat_clf = catboost.CatBoostClassifier(**cat_params)
cat_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

<catboost.core.CatBoostClassifier at 0x7fceeedf3610>

In [34]:
y_preds_xgb = xgb_clf.predict_proba(test)[:, 1]
y_preds_lgbm = lgbm_clf.predict_proba(test)[:, 1]
y_preds_cat = cat_clf.predict_proba(test)[:, 1]

In [40]:
y_preds_final = np.mean([y_preds_xgb, y_preds_lgbm, y_preds_cat], axis=0)
y_preds_final.shape

(28068,)

In [41]:
submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_final})
submission.head()

Unnamed: 0,id,booking_status
0,42100,0.117819
1,42101,0.071584
2,42102,0.357406
3,42103,0.07528
4,42104,0.506617


In [38]:
# submission = pd.DataFrame({"id": test_idx, "booking_status": y_preds_final})
# submission.head()

Unnamed: 0,id,booking_status
0,42100,0.118409
1,42101,0.080162
2,42102,0.364822
3,42103,0.060078
4,42104,0.524641


In [42]:
submission.to_csv("submission.csv", index=False)