## 0. Load Required Libraries

In [48]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np 
import joblib
import os
import yaml
import src.util as util

## 1. Load Configuration File

In [49]:
# Load the project configuration through the local helper module. The result
# is indexed by key in later cells (dataset paths, predictor names and the
# validation ranges consumed by check_data).
config_data = util.load_config()

## 2. Data Collection

In [50]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Load every CSV file in the raw-data directory into one DataFrame.

    Parameters
    ----------
    config : dict
        Project configuration; ``config["raw_dataset_dir"]`` must hold the
        path of the directory that contains the raw CSV files. The path may
        be given with or without a trailing separator.

    Returns
    -------
    pd.DataFrame
        The rows of all CSV files stacked in file-name order with a fresh
        0..n-1 index, or an empty DataFrame when the directory contains no
        CSV file.
    """
    # Raw dataset dir
    raw_dataset_dir = config["raw_dataset_dir"]

    # Look for CSV files only, so stray directory entries (checkpoint folders,
    # placeholder files, ...) are not handed to read_csv. Sorting makes the
    # row order reproducible, since os.listdir returns entries in arbitrary order.
    csv_names = sorted(
        name for name in os.listdir(raw_dataset_dir)
        if name.lower().endswith(".csv")
    )

    # Read each file once and concatenate a single time at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = [
        pd.read_csv(os.path.join(raw_dataset_dir, name))
        for name in tqdm(csv_names)
    ]

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not frames:
        return pd.DataFrame()

    # ignore_index avoids duplicated index labels when several files are stacked.
    return pd.concat(frames, ignore_index=True)

In [51]:
# Read the raw data from the directory named in the configuration.
raw_dataset = read_raw_data(config_data)

100%|██████████| 1/1 [00:00<00:00,  4.36it/s]


In [52]:
# Preview the loaded dataset; the rendered table is truncated by pandas to the
# first and last rows.
raw_dataset

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,Not_Canceled
36271,INN36272,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,INN36273,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,INN36274,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,Canceled


## 2. Data Definition

In [64]:
# Data dictionary: the data type and a short explanation of each variable.
# The text lives in a bare string literal, so it has no effect at runtime.
"""
[object] Booking_ID: unique identifier of each booking
[integer] no_of_adults: Number of adults
[integer] no_of_children: Number of Children
[integer] no_of_weekend_nights: Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel
[integer] no_of_week_nights: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel
[object] type_of_meal_plan: Type of meal plan booked by the customer:
[integer] required_car_parking_space: Does the customer require a car parking space? (0 - No, 1- Yes)
[object] room_type_reserved: Type of room reserved by the customer. The values are ciphered (encoded) by INN Hotels.
[integer] lead_time: Number of days between the date of booking and the arrival date
[integer] arrival_year: Year of arrival date
[integer] arrival_month: Month of arrival date
[integer] arrival_date: Date of the month
[object] market_segment_type: Market segment designation.
[integer] repeated_guest: Is the customer a repeated guest? (0 - No, 1- Yes)
[integer] no_of_previous_cancellations: Number of previous bookings that were canceled by the customer prior to the current booking
[integer] no_of_previous_bookings_not_canceled: Number of previous bookings not canceled by the customer prior to the current booking
[float] avg_price_per_room: Average price per day of the reservation; prices of the rooms are dynamic. (in euros)
[integer] no_of_special_requests: Total number of special requests made by the customer (e.g. high floor, view from the room, etc)
[object] booking_status: Flag indicating if the booking was canceled or not.

"""
# Prints a single blank line. Because it is the last statement of the cell, the
# string above is not echoed as the cell's output (presumably the reason it is here).
print()




## 3. Data Validation

### 3.1. Data Types

In [54]:
# Summarise the column dtypes and non-null counts of the dataset.
# NOTE(review): the recorded output lists 16 columns, which suggests this cell
# was executed after the column drop that appears further down in the notebook;
# a fresh top-to-bottom run would report every loaded column here — confirm the
# intended cell order.
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          36275 non-null  int64  
 1   no_of_children                        36275 non-null  int64  
 2   no_of_weekend_nights                  36275 non-null  int64  
 3   no_of_week_nights                     36275 non-null  int64  
 4   required_car_parking_space            36275 non-null  int64  
 5   room_type_reserved                    36275 non-null  object 
 6   lead_time                             36275 non-null  int64  
 7   arrival_year                          36275 non-null  int64  
 8   arrival_month                         36275 non-null  int64  
 9   arrival_date                          36275 non-null  int64  
 10  repeated_guest                        36275 non-null  int64  
 11  no_of_previous_

### 3.2. Range

In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# variables, transposed so that each variable occupies one row.
raw_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
no_of_adults,36275.0,1.844962,0.518715,0.0,2.0,2.0,2.0,4.0
no_of_children,36275.0,0.105279,0.402648,0.0,0.0,0.0,0.0,10.0
no_of_weekend_nights,36275.0,0.810724,0.870644,0.0,0.0,1.0,2.0,7.0
no_of_week_nights,36275.0,2.2043,1.410905,0.0,1.0,2.0,3.0,17.0
required_car_parking_space,36275.0,0.030986,0.173281,0.0,0.0,0.0,0.0,1.0
lead_time,36275.0,85.232557,85.930817,0.0,17.0,57.0,126.0,443.0
arrival_year,36275.0,2017.820427,0.383836,2017.0,2018.0,2018.0,2018.0,2018.0
arrival_month,36275.0,7.423653,3.069894,1.0,5.0,8.0,10.0,12.0
arrival_date,36275.0,15.596995,8.740447,1.0,8.0,16.0,23.0,31.0
repeated_guest,36275.0,0.025637,0.158053,0.0,0.0,0.0,0.0,1.0


### 3.3. Handling Variable Errors

In [53]:
# Remove Booking_ID, type_of_meal_plan and market_segment_type from the dataset.
raw_dataset = raw_dataset.drop(
    columns=["Booking_ID", "type_of_meal_plan", "market_segment_type"]
)

In [55]:
# Cast all numeric columns to a 32-bit integer in one astype call.
# The width is spelled out ("int32") because plain `int` maps to a
# platform-dependent width, so the resulting dtype would differ between
# machines; int32 is the dtype shown in the recorded output of this cell.
# NOTE(review): avg_price_per_room holds fractional prices (e.g. 106.68) and is
# described as float in the Data Definition section; the integer cast is kept
# from the original cell and truncates the decimals — confirm this is intended.
raw_dataset = raw_dataset.astype(
    dict.fromkeys(
        [
            "no_of_adults",
            "no_of_children",
            "no_of_weekend_nights",
            "no_of_week_nights",
            "required_car_parking_space",
            "lead_time",
            "arrival_year",
            "arrival_month",
            "arrival_date",
            "repeated_guest",
            "no_of_previous_cancellations",
            "no_of_previous_bookings_not_canceled",
            "avg_price_per_room",
            "no_of_special_requests",
        ],
        "int32",
    )
)
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   no_of_adults                          36275 non-null  int32 
 1   no_of_children                        36275 non-null  int32 
 2   no_of_weekend_nights                  36275 non-null  int32 
 3   no_of_week_nights                     36275 non-null  int32 
 4   required_car_parking_space            36275 non-null  int32 
 5   room_type_reserved                    36275 non-null  object
 6   lead_time                             36275 non-null  int32 
 7   arrival_year                          36275 non-null  int32 
 8   arrival_month                         36275 non-null  int32 
 9   arrival_date                          36275 non-null  int32 
 10  repeated_guest                        36275 non-null  int32 
 11  no_of_previous_cancellations

In [56]:
# Persist the cleaned dataset at the path stored under "cleaned_raw_dataset_path"
# in the configuration.
# NOTE(review): presumably util.pickle_dump serialises the object to that file —
# confirm in src/util.py.
util.pickle_dump(raw_dataset, config_data["cleaned_raw_dataset_path"])

## 4. Data Defense

In [59]:
def check_data(input_data, params):
    """Validate the dtypes and value ranges of a dataset against ``params``.

    Parameters
    ----------
    input_data : pd.DataFrame
        Dataset to validate. It must expose ``room_type_reserved`` and every
        numeric column listed in the body as attributes.
    params : dict
        Validation settings: ``"object_columns"`` (expected list of
        object-dtype column names), ``"range_room_type_reserved"`` (allowed
        room types) and one ``"range_<column>"`` entry per numeric column
        whose first two items are the inclusive lower and upper bound.

    Raises
    ------
    AssertionError
        On the first check that fails, with a message naming the column.
    """
    # Data types: the object-dtype columns must match the configured list
    # exactly, order included. (The equivalent datetime and int32 dtype checks
    # are currently disabled.)
    object_columns = input_data.select_dtypes("object").columns.to_list()
    assert object_columns == params["object_columns"], "an error occurs in object column(s)."

    # Categorical range: every observed room type must be an allowed one.
    observed_room_types = set(input_data.room_type_reserved)
    allowed_room_types = set(params["range_room_type_reserved"])
    assert observed_room_types.issubset(allowed_room_types), "an error occurs in room_type_reserved."

    # Numeric ranges, checked in this order; each column is compared with the
    # inclusive bounds stored under "range_<column>".
    numeric_columns = (
        "no_of_adults",
        "avg_price_per_room",
        "no_of_weekend_nights",
        "no_of_week_nights",
        "required_car_parking_space",
        "lead_time",
        "arrival_year",
        "arrival_month",
        "arrival_date",
        "repeated_guest",
        "no_of_previous_cancellations",
        "no_of_previous_bookings_not_canceled",
        "no_of_special_requests",
    )
    for column in numeric_columns:
        values = getattr(input_data, column)
        bounds = params["range_" + column]
        rows_in_range = values.between(bounds[0], bounds[1]).sum()
        assert rows_in_range == len(input_data), f"an error occurs in {column} range."
    
    

In [60]:
# Validate the dataset against the dtypes and ranges stored in the
# configuration; an AssertionError is raised on the first check that fails.
check_data(raw_dataset, config_data)

## 5. Data Splitting

In [61]:
# Separate the inputs (variables/features) from the target (label/output).
# Both are copied so later changes do not touch raw_dataset.
x = raw_dataset[config_data["predictors"]].copy()
y = raw_dataset.booking_status.copy()

In [62]:
# First split: 70% training data and 30% hold-out data, stratified on the
# target. random_state is fixed so the split is reproducible. The hold-out
# part is stored in x_test / y_test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [63]:
# Second split: divide the hold-out data evenly (0.5:0.5) into a validation set
# and a test set, again stratified on the target.
# NOTE(review): x_test / y_test are both the input and an output of this call,
# so re-running this cell on its own halves the test set again; re-run the
# first split beforehand.
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [36]:
# Persist every split. Each configured "<split>_set_path" entry is a pair:
# index 0 receives the features and index 1 receives the target.
for split_key, features, target in (
    ("train_set_path", x_train, y_train),
    ("valid_set_path", x_valid, y_valid),
    ("test_set_path", x_test, y_test),
):
    util.pickle_dump(features, config_data[split_key][0])
    util.pickle_dump(target, config_data[split_key][1])