# ホテル予約キャンセル予測分析


```
sample_base_notebook.ipynbは各ソース機能に切り出す前のNotebookの例です。
機能をnotebook上に実装する際の参考にしてください。
"./src"の処理は以下に実装した機能をpythonファイルに切り出したものです。
```


# 機能・ロジックの実装

## データ読み込み

In [37]:
# interface: (data_loader.py)
import pandas as pd

def raw_data_loader(path="./data/raw/dataset.csv"):
    """Read the raw dataset CSV and return it as a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to "./data/raw/dataset.csv".

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)

def save_dataset(df, file_path):
    """Write ``df`` to ``file_path`` as CSV using pandas' default options.

    Args:
        df: DataFrame to persist.
        file_path: Destination path handed to ``DataFrame.to_csv``.

    Note:
        No ``index`` argument is passed, so pandas' default applies and the
        index is written as the first column.
    """
    df.to_csv(path_or_buf=file_path)

In [38]:
# interface: (model_loader.py)
import joblib

def save_model(model, file_path):
    """Persist ``model`` to ``file_path`` with ``joblib.dump``.

    Args:
        model: Object to serialize.
        file_path: Destination path handed to joblib.
    """
    joblib.dump(value=model, filename=file_path)

def load_model(file_path):
    """Load and return the object stored at ``file_path`` via ``joblib.load``.

    Args:
        file_path: Path of a file previously written with joblib.

    Returns:
        The deserialized object.

    Note:
        joblib persistence is pickle-based, so only load files that come
        from a trusted source.
    """
    return joblib.load(file_path)

## データ前処理(実装)

### 欠損値補間

In [39]:
# usecase: データの前処理(data_preparation.py)
def complement_repeated_guest_by_median(repeated_guest_col):
    """Fill missing entries of ``repeated_guest_col`` with the column median.

    Args:
        repeated_guest_col: Column (pandas Series) that may contain NaN.

    Returns:
        A new column with every missing value replaced by the median of the
        input column; the input itself is not modified.
    """
    return repeated_guest_col.fillna(repeated_guest_col.median())


def complement_children_by_zero(no_of_children_col):
    """Replace missing values in ``no_of_children_col`` with 0.

    Args:
        no_of_children_col: Column (pandas Series) that may contain NaN.

    Returns:
        A new column in which every missing value is 0.
    """
    return no_of_children_col.fillna(value=0)
    
def complement_required_car_parking_space_by_zero(required_car_parking_space_col):
    """Replace missing values in ``required_car_parking_space_col`` with 0.

    Args:
        required_car_parking_space_col: Column (pandas Series) that may
            contain NaN.

    Returns:
        A new column in which every missing value is 0.
    """
    return required_car_parking_space_col.fillna(value=0)

def drop_index(df):
    """Return ``df`` without the rows that contain any missing value.

    Despite the name, this does not drop the index: it removes every row
    (index label) that has at least one NaN.

    Args:
        df: DataFrame to filter.

    Returns:
        A new DataFrame holding only the fully populated rows.
    """
    complete_rows = df.dropna(axis=0, how="any")
    return complete_rows

def null_colum_names(df):
    """Return the labels of the columns in ``df`` that contain missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        A pandas Index with the names of columns having at least one null.
    """
    null_counts = df.isnull().sum()
    has_null = null_counts > 0
    return null_counts.loc[has_null].index


### 特徴量生成

In [40]:
# usecase: データの前処理(data_preparation.py)
def set_one_hot_vector_of(col_name, df):
    """One-hot encode the column ``col_name`` of ``df``.

    Args:
        col_name: Name of the categorical column to expand.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame where ``col_name`` is replaced by one indicator
        column per category, as produced by ``pd.get_dummies``.
    """
    return pd.get_dummies(data=df, columns=[col_name])

def calc_total_price(children_col, adults_col, price_par_person_col):
    """Compute the total price as (children + adults) * price per person.

    Args:
        children_col: Number of children per booking.
        adults_col: Number of adults per booking.
        price_par_person_col: Price charged per person.

    Returns:
        The element-wise product of the party size and the per-person price.
    """
    party_size = children_col + adults_col
    return party_size * price_par_person_col


----

# 処理の確認

In [41]:
# Parameter setup
from src.entities.constants import preprocessed_dir, version, model_dir, date_jst, dataset_path
# NOTE: this import rebinds raw_data_loader, replacing the function of the
# same name defined earlier in this notebook.
from src.interfaces.data_loader import raw_data_loader

# Destination file paths
preprocess_fpath = f"{preprocessed_dir}/preprocessed_dataset_{version}_{date_jst}.csv"
model_fpath = f"{model_dir}/model_{version}_{date_jst}"

# Load the raw data
raw_df = raw_data_loader(dataset_path)

In [42]:
# Display the loaded raw DataFrame
raw_df

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
0,1,0.0,0,1,breakfast_and_dinner,0.0,14,2.0,single,5500.0,1
1,1,1.0,1,0,,0.0,107,1.0,single,5500.0,0
2,2,2.0,0,1,dinner,0.0,50,0.0,twin,20400.0,1
3,1,0.0,1,1,dinner,1.0,5,2.0,single,5500.0,1
4,1,0.0,0,1,breakfast,1.0,5,2.0,single,5500.0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,1,0.0,1,3,breakfast,0.0,11,1.0,single,5500.0,1
996,1,0.0,1,3,breakfast,1.0,20,1.0,single,5500.0,1
997,2,1.0,1,3,,0.0,50,,twin,17700.0,1
998,1,2.0,0,2,breakfast_and_dinner,0.0,54,3.0,deluxe_twin,21300.0,1


## 欠損値補間

__repeated_guest__

In [43]:
# Flag the rows where repeated_guest is missing and preview the first ten
na_index = raw_df["repeated_guest"].isna()
raw_df.loc[na_index].head(n=10)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
35,1,,1,2,dinner,,30,,twin,,1
37,2,,1,2,,0.0,99,,deluxe_twin,,1
39,1,,2,4,dinner,,36,,single,,1
51,2,,0,1,dinner,1.0,3,,twin,,1
66,1,,1,5,breakfast,0.0,57,,single,,1
72,1,1.0,0,2,dinner,0.0,45,,single,5500.0,1
80,1,0.0,1,2,,1.0,18,,deluxe_twin,,1
142,2,,1,2,breakfast,1.0,48,,twin,,1
144,1,,2,2,breakfast_and_dinner,,33,,twin,,1
148,2,,0,1,breakfast_and_dinner,,76,,twin,,1


In [44]:
# Impute missing repeated_guest values with the column median
raw_df["repeated_guest"] = complement_repeated_guest_by_median(raw_df["repeated_guest"])

# Confirm the imputation: print the remaining null count, then show the
# rows that were previously missing
print(raw_df["repeated_guest"].isna().sum())

raw_df.loc[na_index].head(n=10)

0


Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
35,1,,1,2,dinner,,30,1.0,twin,,1
37,2,,1,2,,0.0,99,1.0,deluxe_twin,,1
39,1,,2,4,dinner,,36,1.0,single,,1
51,2,,0,1,dinner,1.0,3,1.0,twin,,1
66,1,,1,5,breakfast,0.0,57,1.0,single,,1
72,1,1.0,0,2,dinner,0.0,45,1.0,single,5500.0,1
80,1,0.0,1,2,,1.0,18,1.0,deluxe_twin,,1
142,2,,1,2,breakfast,1.0,48,1.0,twin,,1
144,1,,2,2,breakfast_and_dinner,,33,1.0,twin,,1
148,2,,0,1,breakfast_and_dinner,,76,1.0,twin,,1


__no_of_children__

In [45]:
# Flag the rows where no_of_children is missing and preview the first ten
na_index = raw_df["no_of_children"].isna()
raw_df.loc[na_index].head(n=10)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
35,1,,1,2,dinner,,30,1.0,twin,,1
37,2,,1,2,,0.0,99,1.0,deluxe_twin,,1
39,1,,2,4,dinner,,36,1.0,single,,1
51,2,,0,1,dinner,1.0,3,1.0,twin,,1
66,1,,1,5,breakfast,0.0,57,1.0,single,,1
142,2,,1,2,breakfast,1.0,48,1.0,twin,,1
144,1,,2,2,breakfast_and_dinner,,33,1.0,twin,,1
148,2,,0,1,breakfast_and_dinner,,76,1.0,twin,,1
153,2,,0,2,breakfast_and_dinner,0.0,24,1.0,deluxe_twin,,1
216,2,,1,3,breakfast,,60,1.0,twin,,1


In [46]:
# Fill missing no_of_children values with 0
raw_df["no_of_children"] = complement_children_by_zero(raw_df["no_of_children"])

# Confirm the imputation on the column that was just filled. The original
# check inspected repeated_guest and discarded the result; print it so the
# remaining null count is actually shown.
print(raw_df["no_of_children"].isnull().sum())
raw_df[na_index].head(10)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
35,1,0.0,1,2,dinner,,30,1.0,twin,,1
37,2,0.0,1,2,,0.0,99,1.0,deluxe_twin,,1
39,1,0.0,2,4,dinner,,36,1.0,single,,1
51,2,0.0,0,1,dinner,1.0,3,1.0,twin,,1
66,1,0.0,1,5,breakfast,0.0,57,1.0,single,,1
142,2,0.0,1,2,breakfast,1.0,48,1.0,twin,,1
144,1,0.0,2,2,breakfast_and_dinner,,33,1.0,twin,,1
148,2,0.0,0,1,breakfast_and_dinner,,76,1.0,twin,,1
153,2,0.0,0,2,breakfast_and_dinner,0.0,24,1.0,deluxe_twin,,1
216,2,0.0,1,3,breakfast,,60,1.0,twin,,1


__required_car_parking_space__

In [47]:
# Flag the rows where required_car_parking_space is missing and preview the first ten
na_index = raw_df["required_car_parking_space"].isna()
raw_df.loc[na_index].head(n=10)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
35,1,0.0,1,2,dinner,,30,1.0,twin,,1
39,1,0.0,2,4,dinner,,36,1.0,single,,1
144,1,0.0,2,2,breakfast_and_dinner,,33,1.0,twin,,1
148,2,0.0,0,1,breakfast_and_dinner,,76,1.0,twin,,1
216,2,0.0,1,3,breakfast,,60,1.0,twin,,1
267,4,0.0,1,2,breakfast,,13,1.0,family,,1
272,2,0.0,1,3,,,32,1.0,twin,,1
305,1,0.0,4,2,breakfast_and_dinner,,42,1.0,single,,1
312,1,0.0,1,1,breakfast_and_dinner,,8,1.0,single,,1
384,2,0.0,0,2,breakfast_and_dinner,,7,1.0,twin,,1


In [48]:
# Fill missing required_car_parking_space values with 0
raw_df["required_car_parking_space"] = complement_required_car_parking_space_by_zero(raw_df["required_car_parking_space"])

# Confirm the imputation on the column that was just filled. The original
# check inspected repeated_guest and discarded the result; print it so the
# remaining null count is actually shown.
print(raw_df["required_car_parking_space"].isnull().sum())
raw_df[na_index].head(10)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status
35,1,0.0,1,2,dinner,0.0,30,1.0,twin,,1
39,1,0.0,2,4,dinner,0.0,36,1.0,single,,1
144,1,0.0,2,2,breakfast_and_dinner,0.0,33,1.0,twin,,1
148,2,0.0,0,1,breakfast_and_dinner,0.0,76,1.0,twin,,1
216,2,0.0,1,3,breakfast,0.0,60,1.0,twin,,1
267,4,0.0,1,2,breakfast,0.0,13,1.0,family,,1
272,2,0.0,1,3,,0.0,32,1.0,twin,,1
305,1,0.0,4,2,breakfast_and_dinner,0.0,42,1.0,single,,1
312,1,0.0,1,1,breakfast_and_dinner,0.0,8,1.0,single,,1
384,2,0.0,0,2,breakfast_and_dinner,0.0,7,1.0,twin,,1


__その他の欠損を削除__

In [57]:
# Count the missing values that remain in each column
raw_df.isna().sum()

no_of_adults                    0
no_of_children                  0
no_of_weekend_nights            0
no_of_week_nights               0
type_of_meal_plan             205
required_car_parking_space      0
lead_time                       0
repeated_guest                  0
room_type_reserved              0
price_per_person               70
booking_status                  0
dtype: int64

In [58]:
# Drop the rows that still contain missing values (drop_index returns
# df.dropna(), which removes rows, not columns)
completed_df =  drop_index(raw_df)

# Confirm that no missing values remain in any column
completed_df.isnull().sum()

no_of_adults                  0
no_of_children                0
no_of_weekend_nights          0
no_of_week_nights             0
type_of_meal_plan             0
required_car_parking_space    0
lead_time                     0
repeated_guest                0
room_type_reserved            0
price_per_person              0
booking_status                0
dtype: int64

## 特徴量生成

__type_of_meal_plan__

In [59]:
# One-hot encode the meal plan column of the completed data
preproc_df = set_one_hot_vector_of(col_name="type_of_meal_plan", df=completed_df)

# Display the encoded DataFrame
preproc_df

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,repeated_guest,room_type_reserved,price_per_person,booking_status,type_of_meal_plan_breakfast,type_of_meal_plan_breakfast_and_dinner,type_of_meal_plan_dinner
0,1,0.0,0,1,0.0,14,2.0,single,5500.0,1,False,True,False
2,2,2.0,0,1,0.0,50,0.0,twin,20400.0,1,False,False,True
3,1,0.0,1,1,1.0,5,2.0,single,5500.0,1,False,False,True
4,1,0.0,0,1,1.0,5,2.0,single,5500.0,0,True,False,False
5,1,0.0,1,1,0.0,21,6.0,single,5500.0,0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,1,0.0,0,0,0.0,52,1.0,single,5500.0,1,True,False,False
995,1,0.0,1,3,0.0,11,1.0,single,5500.0,1,True,False,False
996,1,0.0,1,3,1.0,20,1.0,single,5500.0,1,True,False,False
998,1,2.0,0,2,0.0,54,3.0,deluxe_twin,21300.0,1,False,True,False


__room_type_reserved__

In [60]:
# One-hot encode the room type column on top of the previous encoding
preproc_df = set_one_hot_vector_of(col_name="room_type_reserved", df=preproc_df)

# Display the encoded DataFrame
preproc_df

__total_price__

In [63]:
# Add total_price = (children + adults) * price per person, computed from
# the completed (pre-encoding) columns
preproc_df["total_price"] = calc_total_price(
    children_col=completed_df["no_of_children"],
    adults_col=completed_df["no_of_adults"],
    price_par_person_col=completed_df["price_per_person"],
)

# Display the DataFrame with the new feature
preproc_df

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,repeated_guest,price_per_person,booking_status,type_of_meal_plan_breakfast,type_of_meal_plan_breakfast_and_dinner,type_of_meal_plan_dinner,room_type_reserved_deluxe_twin,room_type_reserved_double,room_type_reserved_family,room_type_reserved_single,room_type_reserved_twin,total_price
0,1,0.0,0,1,0.0,14,2.0,5500.0,1,False,True,False,False,False,False,True,False,5500.0
2,2,2.0,0,1,0.0,50,0.0,20400.0,1,False,False,True,False,False,False,False,True,81600.0
3,1,0.0,1,1,1.0,5,2.0,5500.0,1,False,False,True,False,False,False,True,False,5500.0
4,1,0.0,0,1,1.0,5,2.0,5500.0,0,True,False,False,False,False,False,True,False,5500.0
5,1,0.0,1,1,0.0,21,6.0,5500.0,0,False,True,False,False,False,False,True,False,5500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,1,0.0,0,0,0.0,52,1.0,5500.0,1,True,False,False,False,False,False,True,False,5500.0
995,1,0.0,1,3,0.0,11,1.0,5500.0,1,True,False,False,False,False,False,True,False,5500.0
996,1,0.0,1,3,1.0,20,1.0,5500.0,1,True,False,False,False,False,False,True,False,5500.0
998,1,2.0,0,2,0.0,54,3.0,21300.0,1,False,True,False,True,False,False,False,False,63900.0
