In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from copy import deepcopy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import matplotlib
%matplotlib inline
matplotlib.rcParams.update({'font.size': 14})

In [2]:
def rename_features(date):
    return date.rename(columns={'№ брони':'id',
                        'Номеров':'rooms_cnt',
                        'Стоимость':'cost',
                        'Внесена предоплата':'prepayment',
                        'Способ оплаты':'psp',
                        'Дата бронирования':'booked_at',
                        'Дата отмены':'cancelled_at',
                        'Заезд':'arrival_date',
                        'Ночей':'nights_cnt',
                        'Выезд':'departure_date',
                        'Источник':'source',
                        'Статус брони':'status',
                        'Категория номера':'room_type',
                        'Гостей':'guests_cnt',
                        'Гостиница' : 'hotel',})

def prepare_data(data, flag):
    new_data = rename_features(data)
    new_data = shuffle(new_data)
    new_data = pd.DataFrame.reindex(new_data)

    new_data['cost'] = np.log(1 + new_data['cost'])
    new_data['prepayment'] = np.log(1 + new_data['prepayment'])
    new_data['book_arrival_delta'] = (new_data['arrival_date'] - new_data['booked_at']).dt.days
    if flag:
        new_data['is_cancelled'] = (new_data['status'] == 'Отмена').astype(int)
        new_data = new_data.drop(['Unnamed: 0', 'id', 'cancelled_at', 'booked_at', 'arrival_date', 'departure_date', 'status'], axis=1)
    else:
        new_data = new_data.drop(['Unnamed: 0', 'id', 'booked_at', 'arrival_date', 'departure_date'], axis=1)
    
    columns_list = deepcopy(new_data.select_dtypes(include=['object']).columns)
    new_data = pd.get_dummies(new_data, columns=columns_list, prefix=columns_list, dtype=int)
    return new_data

def complement_data(train, test):
    columns_to_train = list(set(test.columns) - set(train.columns))
    columns_to_test = list(set(train.columns) - set(test.columns))

    add_to_train = pd.DataFrame(0, index=train.index, columns=columns_to_train)
    add_to_test = pd.DataFrame(0, index=test.index, columns=columns_to_test)

    train = pd.concat([train, add_to_train], axis=1)
    test = pd.concat([test, add_to_test], axis=1)
    
    train = train.sort_index(axis=1)
    test = test.sort_index(axis=1)
    test = test.drop('is_cancelled', axis=1)
    return train, test

In [3]:
raw_train = pd.read_excel('/kaggle/input/psb-hack/train.xlsx')
raw_test = pd.read_excel('/kaggle/input/psb-hack/test.xlsx')

In [4]:
raw_train

Unnamed: 0.1,Unnamed: 0,№ брони,Номеров,Стоимость,Внесена предоплата,Способ оплаты,Дата бронирования,Дата отмены,Заезд,Ночей,Выезд,Источник,Статус брони,Категория номера,Гостей,Гостиница
0,0,20230428-6634-194809261,1,25700.0,0,Внешняя система оплаты,2023-04-20 20:37:30,2023-04-20 20:39:15,2023-04-28 15:00:00,3,2023-05-01 12:00:00,Яндекс.Путешествия,Отмена,Номер «Стандарт»,2,1
1,1,20220711-6634-144460018,1,24800.0,12400,Отложенная электронная оплата: Банк Россия (ба...,2022-06-18 14:17:02,NaT,2022-07-11 15:00:00,2,2022-07-13 12:00:00,Официальный сайт,Активный,Номер «Стандарт»,2,1
2,2,20221204-16563-171020423,1,25800.0,12900,Банк. карта: Банк Россия (банк. карта),2022-11-14 22:59:30,NaT,2022-12-04 15:00:00,2,2022-12-06 12:00:00,Официальный сайт,Активный,Номер «Студия»,2,4
3,3,20230918-7491-223512699,1,10500.0,0,Внешняя система оплаты (С предоплатой),2023-09-08 15:55:53,NaT,2023-09-18 15:00:00,1,2023-09-19 12:00:00,Bronevik.com(new),Активный,Номер «Стандарт»,1,3
4,4,20230529-6634-200121971,1,28690.0,28690,Система быстрых платежей: Эквайринг ComfortBoo...,2023-05-20 19:54:13,NaT,2023-05-29 15:00:00,2,2023-05-31 12:00:00,Официальный сайт,Активный,Номер «Люкс»,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26169,26169,20230310-7492-177993190,1,18240.0,9120,Банк. карта: Банк Россия (банк. карта),2023-01-07 17:45:18,NaT,2023-03-10 15:00:00,2,2023-03-12 12:00:00,Официальный сайт,Активный,Номер «Стандарт»,2,2
26170,26170,20230625-16563-206126520,1,69600.0,23200,Банк. карта: Банк Россия (банк. карта),2023-06-20 17:54:17,NaT,2023-06-25 15:00:00,3,2023-06-28 12:00:00,Официальный сайт,Активный,Номер «Студия»,3,4
26171,26171,20220624-7492-137587082,1,55600.0,13900,Банк. карта: Банк Россия (банк. карта),2022-05-08 19:24:05,NaT,2022-06-24 15:00:00,4,2022-06-28 12:00:00,Официальный сайт,Активный,Номер «Стандарт»,2,2
26172,26172,20220427-7491-125459150,1,6300.0,0,Гарантия банковской картой,2022-02-19 09:55:50,2022-04-16 23:14:35,2022-04-27 15:00:00,1,2022-04-28 12:00:00,booking.com,Отмена,Номер «Стандарт»,2,3


In [5]:
train = prepare_data(raw_train, 1)
test = prepare_data(raw_test, 0)

In [6]:
train_completed, test_completed = complement_data(train, test)

In [7]:
y = train_completed['is_cancelled']
X = train_completed.drop('is_cancelled', axis=1)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
X_train.columns

Index(['book_arrival_delta', 'cost', 'guests_cnt', 'hotel', 'nights_cnt',
       'prepayment',
       'psp_Банк. карта (SberPay): Эквайринг ComfortBooking (Банк. карта) (SberPay)',
       'psp_Банк. карта (Yandex Pay): Эквайринг ComfortBooking (Банк. карта) (Yandex Pay)',
       'psp_Банк. карта [Кешбэк. МИР]: Эквайринг ComfortBooking (Банк. карта)',
       'psp_Банк. карта [Кешбэк. МИР]: Эквайринг TravelLine Pro (Банк. карта)',
       'psp_Банк. карта: Банк Россия (банк. карта)',
       'psp_Банк. карта: Эквайринг ComfortBooking (Банк. карта)',
       'psp_Внешняя система оплаты',
       'psp_Внешняя система оплаты (Банковская карта)',
       'psp_Внешняя система оплаты (Оплата наличными)',
       'psp_Внешняя система оплаты (С предоплатой)',
       'psp_Гарантия банковской картой',
       'psp_Отложенная электронная оплата: Банк Россия (банк. карта)',
       'psp_При заселении',
       'psp_Система быстрых платежей: Эквайринг ComfortBooking (Система быстрых платежей)',
       'room_t

## Тестирование различных моделей

In [10]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier

In [11]:
import warnings
warnings.filterwarnings("ignore")

In [12]:
rf_model = RandomForestClassifier()
scores = cross_val_score(rf_model, X_train, y_train, cv=4, scoring='roc_auc')
print(scores)
print('avg auc-roc:', sum(scores) / len(scores))

[0.71034915 0.68797461 0.71305979 0.69878258]
avg auc-roc: 0.7025415322510584


In [13]:
lr_model = LogisticRegression()
scores = cross_val_score(lr_model, X_train, y_train, cv=4, scoring='roc_auc')
print(scores)
print('avg auc-roc:', sum(scores) / len(scores))

[0.71688296 0.71352741 0.72950439 0.72102541]
avg auc-roc: 0.7202350405618827


### CatBoost

In [14]:
import optuna
from catboost import cv, Pool
from catboost import CatBoostClassifier
from catboost import Pool

In [15]:
def cb_prepare_data(data, flag):
    new_data = rename_features(data)
    new_data = shuffle(new_data)
    new_data = pd.DataFrame.reindex(new_data)

    new_data['cost'] = np.log(1 + new_data['cost'])
    new_data['prepayment'] = np.log(1 + new_data['prepayment'])
    new_data['book_arrival_delta'] = (new_data['arrival_date'] - new_data['booked_at']).dt.days
    
    if flag:
        new_data['is_cancelled'] = (new_data['status'] == 'Отмена').astype(int)
        new_data = new_data.drop(['Unnamed: 0', 'id', 'cancelled_at', 'booked_at', 'arrival_date', 'departure_date', 'status'], axis=1)
    else:
        new_data = new_data.drop(['Unnamed: 0', 'id', 'booked_at', 'arrival_date', 'departure_date'], axis=1)
    
    return new_data

In [16]:
cb_train = cb_prepare_data(raw_train, 1)
cb_test = cb_prepare_data(raw_test, 0)

In [17]:
y = cb_train['is_cancelled']
X = cb_train.drop('is_cancelled', axis=1)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
X

Unnamed: 0,rooms_cnt,cost,prepayment,psp,nights_cnt,source,room_type,guests_cnt,hotel,book_arrival_delta
24992,1,11.069774,9.971193,Отложенная электронная оплата: Банк Россия (ба...,3,Официальный сайт,Номер «Студия»,2,4,86
5989,1,9.575053,0.000000,Внешняя система оплаты (Оплата наличными),1,Bronevik.com(new),Номер «Стандарт»,2,3,0
6085,1,10.083348,0.000000,Отложенная электронная оплата: Банк Россия (ба...,3,Бронирование из экстранета,Номер «Стандарт»,2,3,33
22641,1,11.383966,9.723224,Система быстрых платежей: Эквайринг ComfortBoo...,5,Официальный сайт,Номер «Студия»,1,4,41
17973,1,10.724390,9.010791,Банк. карта: Банк Россия (банк. карта),5,Официальный сайт,Номер «Стандарт»,2,3,42
...,...,...,...,...,...,...,...,...,...,...
23150,1,8.936035,0.000000,Гарантия банковской картой,1,booking.com,Номер «Стандарт»,2,2,27
24879,1,10.635879,9.249657,Банк. карта: Банк Россия (банк. карта),4,Официальный сайт,Номер «Стандарт»,2,1,2
692,1,9.374243,0.000000,Отложенная электронная оплата: Банк Россия (ба...,2,Официальный сайт,Номер «Стандарт»,2,1,36
23226,1,10.114599,10.114599,Банк. карта [Кешбэк. МИР]: Эквайринг TravelLin...,2,Официальный сайт,Номер «Люкс»,3,1,68


In [20]:
cat_features = ['psp', 'source', 'room_type', 'hotel']

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 600, 1000),
        'depth': trial.suggest_int('depth', 6, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
    }
    
    model = CatBoostClassifier(**params, l2_leaf_reg = 2.1, cat_features = cat_features)
    model.fit(X_train, y_train, verbose = 200)
    
    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    
    return score

In [22]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 5, n_jobs = 1)

[I 2025-05-24 15:52:40,122] A new study created in memory with name: no-name-2c09b806-b1c5-410a-8077-fccd1de42cd5


0:	learn: 0.6371634	total: 80.4ms	remaining: 1m 17s
200:	learn: 0.2919422	total: 3.76s	remaining: 14.3s
400:	learn: 0.2667631	total: 7.79s	remaining: 10.9s
600:	learn: 0.2475525	total: 11.9s	remaining: 7.16s
800:	learn: 0.2310498	total: 16s	remaining: 3.26s


[I 2025-05-24 15:52:59,893] Trial 0 finished with value: 0.5803799692168533 and parameters: {'iterations': 964, 'depth': 7, 'learning_rate': 0.07033670779638461}. Best is trial 0 with value: 0.5803799692168533.


963:	learn: 0.2190006	total: 19.4s	remaining: 0us
0:	learn: 0.6561170	total: 56.1ms	remaining: 35.6s
200:	learn: 0.2622342	total: 25.9s	remaining: 56s
400:	learn: 0.1955417	total: 1m 4s	remaining: 37.5s
600:	learn: 0.1613288	total: 1m 44s	remaining: 6.09s
635:	learn: 0.1571157	total: 1m 51s	remaining: 0us


[I 2025-05-24 15:54:52,123] Trial 1 finished with value: 0.5850459481690088 and parameters: {'iterations': 636, 'depth': 12, 'learning_rate': 0.045405041294948996}. Best is trial 1 with value: 0.5850459481690088.


0:	learn: 0.6200880	total: 17.7ms	remaining: 17.5s
200:	learn: 0.2937014	total: 3.2s	remaining: 12.5s
400:	learn: 0.2715718	total: 6.69s	remaining: 9.8s
600:	learn: 0.2558251	total: 10.1s	remaining: 6.51s
800:	learn: 0.2417729	total: 13.5s	remaining: 3.17s


[I 2025-05-24 15:55:09,195] Trial 2 finished with value: 0.582299120787297 and parameters: {'iterations': 989, 'depth': 6, 'learning_rate': 0.09375321973018919}. Best is trial 1 with value: 0.5850459481690088.


988:	learn: 0.2299248	total: 16.8s	remaining: 0us
0:	learn: 0.6115217	total: 23.2ms	remaining: 20.4s
200:	learn: 0.2765942	total: 3.91s	remaining: 13.2s
400:	learn: 0.2469347	total: 8.6s	remaining: 10.2s
600:	learn: 0.2227641	total: 12.8s	remaining: 5.89s
800:	learn: 0.2030161	total: 16.9s	remaining: 1.62s


[I 2025-05-24 15:55:27,914] Trial 3 finished with value: 0.5789941160658615 and parameters: {'iterations': 878, 'depth': 7, 'learning_rate': 0.10542590104379657}. Best is trial 1 with value: 0.5850459481690088.


877:	learn: 0.1968660	total: 18.4s	remaining: 0us
0:	learn: 0.6775977	total: 21.8ms	remaining: 17.3s
200:	learn: 0.3167506	total: 3.5s	remaining: 10.3s
400:	learn: 0.3071355	total: 7.01s	remaining: 6.85s
600:	learn: 0.2988496	total: 10.8s	remaining: 3.44s


[I 2025-05-24 15:55:42,861] Trial 4 finished with value: 0.5793869667259035 and parameters: {'iterations': 793, 'depth': 7, 'learning_rate': 0.01878032090660728}. Best is trial 1 with value: 0.5850459481690088.


792:	learn: 0.2893671	total: 14.7s	remaining: 0us


In [23]:
study.best_params

{'iterations': 636, 'depth': 12, 'learning_rate': 0.045405041294948996}

In [24]:
def objective_2(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 800, 1400),
        'depth': trial.suggest_int('depth', 10, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
    }
    
    model = CatBoostClassifier(**params, l2_leaf_reg = 2.1, cat_features = cat_features)
    model.fit(X_train, y_train, verbose = 200)
    
    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    
    return score

In [25]:
study_2 = optuna.create_study(direction = 'maximize')
study_2.optimize(objective_2, n_trials = 7, n_jobs = 1)

[I 2025-05-24 15:55:42,901] A new study created in memory with name: no-name-95e44460-9de6-4d55-912a-769331598239


0:	learn: 0.6089840	total: 41.2ms	remaining: 47.2s
200:	learn: 0.1230495	total: 4m 11s	remaining: 19m 44s
400:	learn: 0.0642767	total: 9m 8s	remaining: 17m 1s
600:	learn: 0.0422685	total: 14m 12s	remaining: 12m 54s
800:	learn: 0.0304987	total: 19m 1s	remaining: 8m 13s
1000:	learn: 0.0231750	total: 23m 49s	remaining: 3m 28s
1146:	learn: 0.0189153	total: 27m 22s	remaining: 0us


[I 2025-05-24 16:23:10,636] Trial 0 finished with value: 0.5874237613980897 and parameters: {'iterations': 1147, 'depth': 15, 'learning_rate': 0.10842349019239034}. Best is trial 0 with value: 0.5874237613980897.


0:	learn: 0.6425577	total: 41.1ms	remaining: 36.4s
200:	learn: 0.1911800	total: 3m 6s	remaining: 10m 36s
400:	learn: 0.1152443	total: 7m 55s	remaining: 9m 36s
600:	learn: 0.0792500	total: 12m 31s	remaining: 5m 57s
800:	learn: 0.0587863	total: 17m 19s	remaining: 1m 51s
886:	learn: 0.0514791	total: 19m 27s	remaining: 0us


[I 2025-05-24 16:42:41,952] Trial 1 finished with value: 0.5898772538008458 and parameters: {'iterations': 887, 'depth': 15, 'learning_rate': 0.06287340417584165}. Best is trial 1 with value: 0.5898772538008458.


0:	learn: 0.5938351	total: 40.5ms	remaining: 44.3s
200:	learn: 0.1639013	total: 35s	remaining: 2m 35s
400:	learn: 0.1016323	total: 1m 13s	remaining: 2m 6s
600:	learn: 0.0701471	total: 1m 52s	remaining: 1m 32s
800:	learn: 0.0513279	total: 2m 33s	remaining: 56.5s
1000:	learn: 0.0396287	total: 3m 13s	remaining: 18.3s
1095:	learn: 0.0355449	total: 3m 32s	remaining: 0us


[I 2025-05-24 16:46:14,730] Trial 2 finished with value: 0.5893372144448957 and parameters: {'iterations': 1096, 'depth': 12, 'learning_rate': 0.1302018077870585}. Best is trial 1 with value: 0.5898772538008458.


0:	learn: 0.6410866	total: 39.7ms	remaining: 49.5s
200:	learn: 0.2311342	total: 31.1s	remaining: 2m 42s
400:	learn: 0.1684588	total: 1m 10s	remaining: 2m 28s
600:	learn: 0.1285894	total: 1m 50s	remaining: 1m 58s
800:	learn: 0.1029673	total: 2m 30s	remaining: 1m 24s
1000:	learn: 0.0855585	total: 3m 8s	remaining: 46.7s
1200:	learn: 0.0717156	total: 3m 46s	remaining: 9.04s
1248:	learn: 0.0691576	total: 3m 55s	remaining: 0us


[I 2025-05-24 16:50:10,971] Trial 3 finished with value: 0.5880747134824013 and parameters: {'iterations': 1249, 'depth': 12, 'learning_rate': 0.06479852063859998}. Best is trial 1 with value: 0.5898772538008458.


0:	learn: 0.5993742	total: 38.7ms	remaining: 33.9s
200:	learn: 0.1518802	total: 59.3s	remaining: 3m 19s
400:	learn: 0.0892571	total: 2m 9s	remaining: 2m 33s
600:	learn: 0.0601877	total: 3m 18s	remaining: 1m 31s
800:	learn: 0.0430748	total: 4m 28s	remaining: 25.5s
876:	learn: 0.0391657	total: 4m 54s	remaining: 0us


[I 2025-05-24 16:55:06,321] Trial 4 finished with value: 0.5866515419510021 and parameters: {'iterations': 877, 'depth': 13, 'learning_rate': 0.12214156122804073}. Best is trial 1 with value: 0.5898772538008458.


0:	learn: 0.6588958	total: 40.3ms	remaining: 53.3s
200:	learn: 0.2569090	total: 47.1s	remaining: 4m 23s
400:	learn: 0.1856189	total: 1m 56s	remaining: 4m 28s
600:	learn: 0.1469204	total: 3m 10s	remaining: 3m 49s
800:	learn: 0.1200702	total: 4m 25s	remaining: 2m 53s
1000:	learn: 0.0993887	total: 5m 40s	remaining: 1m 49s
1200:	learn: 0.0849445	total: 6m 53s	remaining: 42.3s
1323:	learn: 0.0781586	total: 7m 35s	remaining: 0us


[I 2025-05-24 17:02:43,260] Trial 5 finished with value: 0.589491144077302 and parameters: {'iterations': 1324, 'depth': 13, 'learning_rate': 0.04188468333131109}. Best is trial 1 with value: 0.5898772538008458.


0:	learn: 0.6319168	total: 39ms	remaining: 54.2s
200:	learn: 0.1638899	total: 3m 38s	remaining: 21m 33s
400:	learn: 0.0965649	total: 8m 27s	remaining: 20m 51s
600:	learn: 0.0650213	total: 13m 3s	remaining: 17m 9s
800:	learn: 0.0474773	total: 17m 53s	remaining: 13m 10s
1000:	learn: 0.0364437	total: 22m 34s	remaining: 8m 47s
1200:	learn: 0.0294203	total: 27m 15s	remaining: 4m 18s
1390:	learn: 0.0242696	total: 31m 50s	remaining: 0us


[I 2025-05-24 17:34:39,409] Trial 6 finished with value: 0.5886873047734447 and parameters: {'iterations': 1391, 'depth': 15, 'learning_rate': 0.07693792416585346}. Best is trial 1 with value: 0.5898772538008458.


In [26]:
study_2.best_params

{'iterations': 887, 'depth': 15, 'learning_rate': 0.06287340417584165}

In [25]:
bp = {
    'iterations' : 887,
    'depth': 15,
    'learning_rate': 0.06287340417584165,
    'loss_function': 'Logloss'
}

cb_model = CatBoostClassifier(**bp, cat_features = cat_features)
cb_model.fit(X, y, verbose = 200)

0:	learn: 0.6423165	total: 1.71s	remaining: 25m 16s
200:	learn: 0.2221672	total: 3m 22s	remaining: 11m 30s
400:	learn: 0.1549474	total: 8m 12s	remaining: 9m 56s
600:	learn: 0.1169079	total: 13m 20s	remaining: 6m 21s
800:	learn: 0.0931673	total: 18m 21s	remaining: 1m 58s
886:	learn: 0.0855284	total: 20m 26s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7e7f0a730040>

In [27]:
cb_prediction = cb_model.predict(cb_test)

In [28]:
cb_prediction.sum()

540

In [None]:
prediction_to_csv(cb_prediction, 'cb_try_0')

In [None]:
example = pd.read_csv('example.csv')
example['0'].sum()

## Предикты

In [None]:
def prediction_to_csv(prediction, file_name):
    df = pd.DataFrame(prediction)
    df.to_csv(f'{file_name}.csv', index=False, header=False)

In [None]:
lr_model.fit(X, y)
prediction = lr_model.predict(test_completed)

In [None]:
prediction_to_csv(prediction, 'lr_try_1')

In [None]:
df = pd.DataFrame(prediction, columns=['result'])
df

In [None]:
rf_model.fit(X_train_fit, y_train_fit)
prediction = rf_model.predict(test)
prediction

In [None]:
prediction = pd.DataFrame(prediction, columns=[0])
prediction

In [None]:
prediction = prediction.drop(index=0).reset_index(drop=True)
prediction

In [None]:
prediction.to_csv('rf_try_1.csv', index=False)