In [33]:
from datetime import datetime

import pandas as pd
import numpy as np

In [34]:
pd.set_option("display.max_columns", 100)

## Preprocessing

In [35]:
train = pd.read_csv('train_final.csv')

In [36]:
train.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0,44638.0
mean,0.297594,99.395291,2016.15675,27.095166,15.778328,0.931516,2.507303,1.852883,0.102424,0.008289,0.034052,0.069672,0.140799,0.238026,2.149133,101.624687,0.067723,0.603253
std,0.457205,104.485734,0.70504,13.654692,8.807604,1.002021,1.928111,0.578898,0.393273,0.091649,0.181364,0.726281,1.540109,0.682989,17.134385,48.42993,0.255077,0.803659
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,16.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.38,0.0,0.0
50%,0.0,65.0,2016.0,27.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.5,0.0,0.0
75%,1.0,154.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,16.0,40.0,55.0,3.0,2.0,1.0,26.0,72.0,21.0,391.0,510.0,8.0,5.0


In [37]:
# Сумма ночевок
train['total_nights'] = train['stays_in_weekend_nights'] + train['stays_in_week_nights']
# Проживание длительностью > 7 ночей
train['long_stay_flag'] = (train['total_nights'] > 7).astype(int)
# Общее кол-во гостей
train['total_guests'] = train[['adults', 'children', 'babies']].sum(axis=1)
# Плотность проживания
train['guests_per_night'] = train.apply(lambda x: x.total_guests / x.total_nights if x.total_nights > 0 else 0, axis=1)
# Кол-во запросов на гостя
train['special_requests_per_guest'] = train.apply(lambda x: x.total_of_special_requests / x.total_guests if x.total_guests > 0 else 0, axis=1)
# Имеется ли special_request
train['has_special_request'] = (train['total_of_special_requests'] > 0).astype(int)

In [38]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
train["country_encoded"] = le.fit_transform(train["country"])

train["non_refund"] = (train["deposit_type"] == "Non Refund").astype(int)
train["reservation_match"] = (train["reserved_room_type"] == train["assigned_room_type"]).astype(int)
train["is_resort"] = (train["hotel"] == "Resort Hotel").astype(int)
train["arrival_date_month"] = train["arrival_date_month"].apply(lambda m: datetime.strptime(m, "%B").month)

meal_mapping = {
    "Undefined": 0,
    "SC": 0,
    "BB": 1,
    "HB": 2,
    "FB": 3
}
train["meal_encoded"] = train["meal"].apply(lambda x: meal_mapping[x])


features_to_encode = ["market_segment", "customer_type", "distribution_channel"]
feature_to_drop = ["hotel", "arrival_date_week_number", "arrival_date_day_of_month",
                   "reserved_room_type", "assigned_room_type", "meal", "country",
                   "market_segment", "distribution_channel", "deposit_type",
                   "customer_type", "reservation_status_date"]


def encode_labels_by_cancel_ratio(df, feature):
    cats = train.groupby(feature)["is_canceled"].mean().sort_values().index.tolist()
    mapping = dict(zip(cats, list(range(len(cats)))))
    return df[feature].apply(lambda x: mapping[x])

for feature in features_to_encode:
    train[feature + "_encoded"] = encode_labels_by_cancel_ratio(train, feature)

train = train.drop(columns=feature_to_drop)

In [39]:
train.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,total_nights,long_stay_flag,total_guests,guests_per_night,special_requests_per_guest,has_special_request,country_encoded,non_refund,reservation_match,is_resort,meal_encoded,market_segment_encoded,customer_type_encoded,distribution_channel_encoded
0,0,3,2016,2,0,2,2,0.0,0,0,0,0,0,0,81.0,1,0,2,0,2.0,1.0,0.0,0,121,0,0,1,1,1,3,0
1,1,50,2016,11,1,2,2,0.0,0,0,0,0,0,0,93.6,0,2,3,0,2.0,0.666667,1.0,1,12,0,1,0,1,5,3,3
2,0,14,2015,9,1,5,2,2.0,0,0,0,0,0,0,166.0,0,3,6,0,4.0,0.666667,0.75,1,71,0,1,0,1,5,2,3
3,0,18,2015,12,2,0,2,0.0,0,0,0,0,0,0,107.0,0,0,2,0,2.0,1.0,0.0,0,46,0,1,0,1,5,3,3
4,1,263,2016,9,2,4,2,0.0,0,0,0,0,0,0,100.3,0,0,6,0,2.0,0.333333,0.0,0,38,0,1,0,1,5,3,3


In [40]:
X = train.drop(columns=["is_canceled"])
y = train["is_canceled"]

## Modelling

In [41]:
RANDOM_SEED = 47

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

**Бабин Я.В.**

**Поиск гиперпараметров**

In [43]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [61]:
model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED, max_depth=None, min_samples_split=5)
model.fit(X_train, y_train)

In [62]:
from sklearn.metrics import roc_auc_score, recall_score, f1_score, precision_score

**Без преобразований**

In [63]:
y_pred = model.predict(X_valid)
y_pred_proba = model.predict_proba(X_valid)[:, 1]
print("Accuracy:", accuracy_score(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_proba))
print("Recall:", recall_score(y_valid, y_pred))
print("Precision:", precision_score(y_valid, y_pred))
print("f1:", f1_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))

Accuracy: 0.881608422939068
ROC-AUC: 0.9374495724989571
Recall: 0.7173750932140194
Precision: 0.8654970760233918
f1: 0.7845056065239552
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      6246
           1       0.87      0.72      0.78      2682

    accuracy                           0.88      8928
   macro avg       0.88      0.83      0.85      8928
weighted avg       0.88      0.88      0.88      8928



**StandardScaler**

In [56]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)
X_valid_scaled = standard_scaler.transform(X_valid)

In [57]:
model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED, max_depth=None, min_samples_split=5)
model.fit(X_train_scaled, y_train)

In [53]:
y_pred = model.predict(X_valid_scaled)
print("Accuracy:", accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))

Accuracy: 0.7001568100358423
ROC-AUC: 0.5009321401938852
Recall: 0.0018642803877703207
Precision: 1.0
f1: 0.003721622627465575
              precision    recall  f1-score   support

           0       0.70      1.00      0.82      6246
           1       1.00      0.00      0.00      2682

    accuracy                           0.70      8928
   macro avg       0.85      0.50      0.41      8928
weighted avg       0.79      0.70      0.58      8928



Т.к. деревья не чувствительны к масштабу, смысла применения нет.

**PCA**

In [58]:
from sklearn.decomposition import PCA

pca = PCA(n_components=X.shape[1])
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)

model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED, max_depth=None, min_samples_split=5)
model.fit(X_train_pca, y_train)

y_pred = model.predict(X_valid_pca)
print("Accuracy:", accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))

Accuracy: 0.8761200716845878
ROC-AUC: 0.8214685586694949
Recall: 0.6845637583892618
Precision: 0.8759541984732825
f1: 0.7685223943072416
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      6246
           1       0.88      0.68      0.77      2682

    accuracy                           0.88      8928
   macro avg       0.88      0.82      0.84      8928
weighted avg       0.88      0.88      0.87      8928



**StandardScaler + PCA**

In [59]:
pca = PCA(n_components=X.shape[1])
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)

model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED, max_depth=None, min_samples_split=5)
model.fit(X_train_pca, y_train)

y_pred = model.predict(X_valid_pca)
print("Accuracy:", accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))

Accuracy: 0.8671594982078853
ROC-AUC: 0.8059160547314039
Recall: 0.6524981357196122
Precision: 0.873253493013972
f1: 0.7469056764831413
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      6246
           1       0.87      0.65      0.75      2682

    accuracy                           0.87      8928
   macro avg       0.87      0.81      0.83      8928
weighted avg       0.87      0.87      0.86      8928

