In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV, cross_val_predict
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import tqdm

In [None]:
# Seasonal multipliers applied to the nightly room price in profit().
SUMMER_COEF = 1.4
SPRING_AUTUMN_COEF = 1.2

# Lower-case month names that trigger each multiplier; any other month
# is priced without a multiplier.
SUMMER_LIST = ['june', 'july', 'august']
SPRING_AUTUMN_LIST = [
    'september', 'october', 'november',
    'march', 'april', 'may',
]

# Price of one night, keyed by reserved room category.
PRICE_PER_CATEGORY_LIVING = dict(A=1000, B=800, C=600, D=550, E=500, F=450, G=350)

# Price of a single cleaning, keyed by reserved room category.
PRICE_PER_CATEGORY_CLEANING = dict(A=400, B=350, C=350, D=150, E=150, F=150, G=150)

# Seed passed to every model that accepts one.
RANDOM_STATE = 12345

# Amount subtracted from the deposit gain in the final profit figure.
BUDGET = 400_000

pd.set_option("display.max_columns", 200)

In [None]:
# Load the train/test sets from the working directory and fall back to the
# /datasets directory when the local copies are missing.
try:
    data_train = pd.read_csv("hotel_train.csv")
    data_test = pd.read_csv("hotel_test.csv")
except FileNotFoundError:
    # Only a missing file triggers the fallback; parse errors, permission
    # problems and interrupts are no longer silently swallowed.
    data_train = pd.read_csv("/datasets/hotel_train.csv")
    data_test = pd.read_csv("/datasets/hotel_test.csv")

In [None]:
# Keep an untouched copy of the raw test set: it is preprocessed separately
# further down and used for the deposit/profit calculation, while data_test
# itself gets one-hot encoded for the models.
data_test_V2 = data_test.copy()

In [None]:
# Overview of the training set's columns, dtypes and non-null counts.
data_train.info()

In [None]:
def profit(data):
    """Return the profit (positive) or loss (negative) of one booking.

    `data` is a single booking row supporting key access. The function reads
    "is_canceled", "arrival_date_month", "reserved_room_type" and, for
    bookings that were not canceled, "total_nights".

    Not canceled: nightly price (with the seasonal multiplier) times the
    number of nights, minus one cleaning per ``total_nights // 2 + 1``.
    Canceled: minus one seasonally priced night and one cleaning.
    """
    stayed = data["is_canceled"] != 1
    month = data["arrival_date_month"]
    room_type = data["reserved_room_type"]

    # Months outside both lists are priced without a multiplier; the integer
    # 1 keeps the result an int in that case, as in a plain price * nights.
    if month in SUMMER_LIST:
        season_coef = SUMMER_COEF
    elif month in SPRING_AUTUMN_LIST:
        season_coef = SPRING_AUTUMN_COEF
    else:
        season_coef = 1

    night_price = PRICE_PER_CATEGORY_LIVING[room_type] * season_coef
    cleaning_price = PRICE_PER_CATEGORY_CLEANING[room_type]

    if stayed:
        nights = data["total_nights"]
        return night_price * nights - cleaning_price * (nights // 2 + 1)
    return (night_price + cleaning_price) * -1

In [None]:
def seasons(data):
    """Map a lower-case month name to "autumn", "spring", "summer" or "winter".

    Any value that is not one of the twelve listed month names yields None.
    """
    months_by_season = (
        ("autumn", ('september', 'october', 'november')),
        ("spring", ('march', 'april', 'may')),
        ("summer", ('june', 'july', 'august')),
        ("winter", ('december', 'january', 'february')),
    )
    for season_name, months in months_by_season:
        if data in months:
            return season_name
    return None

In [None]:
def preprocessing(data:pd.DataFrame) -> pd.DataFrame:
    """Clean a raw bookings frame and add the per-booking profit.

    Steps:
      1. drop the id and the year / day-of-month / week-number columns;
      2. lower-case the month and strip whitespace from room type and meal;
      3. add a "profit" column computed by ``profit`` for every row;
      4. replace the month name with its season via ``seasons``.

    The input frame is not modified; a new frame is returned.
    """
    data = data.drop(["id", "arrival_date_year", "arrival_date_day_of_month", "arrival_date_week_number"], axis=1)

    data["arrival_date_month"] = data["arrival_date_month"].str.lower()
    data["reserved_room_type"] = data["reserved_room_type"].str.strip()
    data["meal"] = data["meal"].str.strip()

    # Iterate over rows positionally: the previous `data.loc[x]` lookup for
    # x in range(len(data)) silently required a 0..n-1 index and broke (or
    # picked the wrong rows) on any filtered or re-indexed frame.
    # profit() must run before the month is replaced by its season below,
    # because it compares against month names.
    data["profit"] = [profit(row) for _, row in data.iterrows()]

    data["arrival_date_month"] = data["arrival_date_month"].apply(seasons)

    return data

In [None]:
def getfullitemsforOHE(wholedf,featlist,sort=True):
    """Collect the unique values of each listed column, one entry per column.

    With ``sort`` left at its default every entry is a sorted list; when
    ``sort`` compares equal to False the raw ``unique()`` arrays are returned
    in order of first appearance. The result has the same order as
    ``featlist``.
    """
    unique_values = (wholedf[feat].unique() for feat in featlist)
    if sort == False:
        return list(unique_values)
    return [sorted(values) for values in unique_values]

In [None]:
def features_target(data:pd.DataFrame):
    """Split a preprocessed frame into model inputs and the label.

    Returns ``(features, target)`` where ``target`` is the "is_canceled"
    column and ``features`` is every other column except "profit".
    """
    target = data["is_canceled"]
    features = data.drop(columns=["is_canceled", "profit"])
    return features, target

In [None]:
# Raw month spellings, as they look before preprocessing() lower-cases them.
display(data_train["arrival_date_month"].unique())

In [None]:
# Raw room-type values, as they look before preprocessing() strips whitespace.
display(data_train["reserved_room_type"].unique())

In [None]:
# Raw meal values, as they look before preprocessing() strips whitespace.
display(data_train["meal"].unique())

In [None]:
# Train and test stacked on a fresh 0..n-1 index; used only to collect the
# complete set of category values for the one-hot encoder.
full_data = pd.concat((data_train, data_test), ignore_index=True)

In [None]:
# Run the same cleaning on all three frames.
# NOTE: each name is rebound to its processed version, so this cell is not
# idempotent: preprocessing() drops "id" and the date columns, and a second
# run would try to drop them again.
data_train = preprocessing(data_train)
data_test = preprocessing(data_test)
full_data = preprocessing(full_data)

In [None]:
# Categorical columns that are one-hot encoded and then dropped from the frames.
cat_columns = ["country", "arrival_date_month", "meal", "distribution_channel", "reserved_room_type", "customer_type"]

In [None]:
# Sorted category values per column, taken from train and test together so
# that both sets are encoded into the same columns.
cats = getfullitemsforOHE(full_data, cat_columns)

In [None]:
# Inspect the category lists that define the encoder's output columns.
print(cats)

In [None]:
# One-hot encode the categorical columns using the fixed category lists in
# `cats`, so train and test always end up with the same columns.
# NOTE(review): `sparse=` and `get_feature_names` belong to older
# scikit-learn releases; confirm the installed version before upgrading.
ohe=OneHotEncoder(categories=cats, sparse=False,handle_unknown="ignore")

X_train=ohe.fit_transform(data_train[cat_columns])
# Carry the source index over so the later join aligns row by row even if
# the frame no longer has a plain 0..n-1 index.
ohe_train = pd.DataFrame(X_train,columns=ohe.get_feature_names(cat_columns), index=data_train.index)

# The encoder is fitted on the training data only; the test set is just
# transformed with the already fitted encoder.
X_test=ohe.transform(data_test[cat_columns])
ohe_test = pd.DataFrame(X_test,columns=ohe.get_feature_names(cat_columns), index=data_test.index)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
data_train = data_train.drop(columns=cat_columns).join(ohe_train)

data_test = data_test.drop(columns=cat_columns).join(ohe_test)

In [None]:
# Separate copy for exploration, so the outlier filtering applied to
# data_train later does not affect what is plotted from data_explore.
data_explore = data_train.copy()

In [None]:
# Pairwise Pearson correlation between all columns (including one-hot ones).
corr = data_explore.corr(method="pearson")

In [None]:
# Hide the upper triangle (and diagonal): the matrix is symmetric, so only
# the lower half is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

# NOTE(review): a 300x300 inch figure is extremely large to render; confirm
# this size is really needed for the number of columns.
f, ax = plt.subplots(figsize=(300, 300))

# Diverging palette so negative and positive correlations get distinct colours.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Colours are centred on 0 and capped at 0.3.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Distribution of per-booking profit, used to spot outliers.
data_explore.boxplot("profit")

In [None]:
# Number of bookings whose profit exceeds 6000.
high_profit_count = (data_explore["profit"] > 6000).sum()
display(high_profit_count)

In [None]:
# Distribution of the number of special requests, used to spot outliers.
data_explore.boxplot("total_of_special_requests")

In [None]:
# Number of bookings with three or more special requests.
many_requests_count = (data_explore["total_of_special_requests"] >= 3).sum()
display(many_requests_count)

In [None]:
# Distribution of stay length in nights, used to spot outliers.
data_explore.boxplot("total_nights")

In [None]:
# Number of bookings lasting eight nights or more.
long_stay_count = (data_explore["total_nights"] >= 8).sum()
display(long_stay_count)

In [None]:
# Percentage of training bookings that were canceled.
canceled_share = (data_train["is_canceled"] == 1).sum() / len(data_train) * 100
display(canceled_share)

In [None]:
# Canceled bookings of each season as a percentage of ALL training bookings
# (the denominator is the whole set, not the season's own bookings).
canceled = data_train["is_canceled"] == 1
for season in ("summer", "autumn", "winter", "spring"):
    in_season = data_train[f"arrival_date_month_{season}"] == 1
    display((canceled & in_season).sum() / len(data_train) * 100)


In [None]:
# Canceled bookings with at least one baby, at least one child, or exactly
# two adults, each as a percentage of all training bookings.
canceled = data_train["is_canceled"] == 1
total_bookings = len(data_train)
display((canceled & (data_train["babies"] >= 1)).sum() / total_bookings * 100)
display((canceled & (data_train["children"] >= 1)).sum() / total_bookings * 100)
display((canceled & (data_train["adults"] == 2)).sum() / total_bookings * 100)

In [None]:
# Canceled bookings with exactly one baby, one child, or one adult, each as
# a percentage of all training bookings.
canceled = data_train["is_canceled"] == 1
total_bookings = len(data_train)
for guest_column in ("babies", "children", "adults"):
    display((canceled & (data_train[guest_column] == 1)).sum() / total_bookings * 100)

In [None]:
# Canceled bookings without babies / without children, each as a percentage
# of all training bookings.
canceled = data_train["is_canceled"] == 1
total_bookings = len(data_train)
for guest_column in ("babies", "children"):
    display((canceled & (data_train[guest_column] == 0)).sum() / total_bookings * 100)

In [None]:
# Remove outliers from the training set only: very profitable bookings,
# bookings with three or more special requests, and stays of eight nights
# or longer (the thresholds counted in the exploration cells).
is_outlier = (
    (data_train["profit"] > 6000)
    | (data_train["total_of_special_requests"] >= 3)
    | (data_train["total_nights"] >= 8)
)
data_train = data_train[~is_outlier]

In [None]:
# Share of each customer type among canceled training bookings (the bar
# height is the mean of the 0/1 indicator column).
f, ax = plt.subplots(figsize=(15, 15))

customer_type_columns = [
    'customer_type_Contract',
    'customer_type_Group',
    'customer_type_Transient',
    'customer_type_Transient-Party',
]
canceled_bookings = data_train.loc[data_train["is_canceled"] == 1, customer_type_columns]
sns.barplot(data=canceled_bookings, ax=ax)

In [None]:
# Share of each reserved room type among canceled training bookings.
f, ax = plt.subplots(figsize=(15, 15))

room_type_columns = [
    'reserved_room_type_A',
    'reserved_room_type_B',
    'reserved_room_type_C',
    'reserved_room_type_D',
    'reserved_room_type_E',
    'reserved_room_type_F',
    'reserved_room_type_G',
]
canceled_bookings = data_train.loc[data_train["is_canceled"] == 1, room_type_columns]
sns.barplot(data=canceled_bookings, ax=ax)

In [None]:
# Share of each distribution channel among canceled training bookings.
f, ax = plt.subplots(figsize=(15, 15))

channel_columns = [
    'distribution_channel_Corporate',
    'distribution_channel_Direct',
    'distribution_channel_GDS',
    'distribution_channel_TA/TO',
]
canceled_bookings = data_train.loc[data_train["is_canceled"] == 1, channel_columns]
sns.barplot(data=canceled_bookings, ax=ax)

In [None]:
# Share of each meal option among canceled training bookings.
f, ax = plt.subplots(figsize=(15, 15))

meal_columns = ['meal_BB', 'meal_FB', 'meal_HB', 'meal_SC']
canceled_bookings = data_train.loc[data_train["is_canceled"] == 1, meal_columns]
sns.barplot(data=canceled_bookings, ax=ax)

In [None]:
# Share of each country among canceled training bookings.
f, ax = plt.subplots(figsize=(200, 200))

# Derive the one-hot country columns from the frame instead of hard-coding
# ~170 names: a hard-coded list raises KeyError as soon as the set of
# countries in the data changes, and silently omits any new country.
country_columns = [column for column in data_train.columns if column.startswith("country_")]

sns.barplot(data=data_train.loc[data_train["is_canceled"] == 1, country_columns], ax=ax)

In [None]:
# Model inputs and labels; "profit" is excluded from the inputs because
# features_target() drops it together with the label.
features_train, target_train = features_target(data_train)
features_test, target_test = features_target(data_test)

In [None]:
# Baseline random forest with fixed hyper-parameters; its feature
# importances are inspected in the following cells.
model = RandomForestClassifier(n_estimators=80, max_depth=15, random_state=RANDOM_STATE)
model.fit(features_train, target_train)

# Probability of the positive class (canceled), cut at the default 0.5.
probabilities = model.predict_proba(features_test)
probabilities_one = probabilities[:, 1]
prediction_test = probabilities_one > 0.5

In [None]:
# Column positions ordered from least to most important feature.
sort = model.feature_importances_.argsort()

In [None]:
# Names of the more important half of the features (`sort` is ascending by
# importance). The split point must be based on the number of columns;
# len(features_train) is the number of rows and made this list empty.
values = list(features_train.columns[sort].values[(features_train.shape[1] // 2):])

In [None]:
# Bar chart of feature importances in ascending order; only the features
# listed in `values` get a tick label.
plt.figure(figsize=(200, 200))
plt.bar(features_train.columns[sort], model.feature_importances_[sort])
plt.xticks(values, rotation=90)
# The bars are vertical: feature names are on the x axis and importance on
# the y axis (the x axis used to be labelled "Feature Importance").
plt.xlabel("Feature")
plt.ylabel("Feature Importance")

In [None]:
# Grid shared by the random forest and CatBoost searches.
params_cb_rf = {
    "n_estimators": list(range(20, 100, 20)),
    "max_depth": list(range(5, 17, 2)),
}

# Grid for Gaussian naive Bayes: 100 log-spaced smoothing values from 1 to 1e-9.
params_gnb = {'var_smoothing': np.logspace(0,-9, num=100)}

# Macro-averaged recall is the model-selection metric for every grid search.
recall_params = make_scorer(recall_score , average='macro')

In [None]:
# Grid search for the random forest, scored by macro recall.
model = RandomForestClassifier(random_state=RANDOM_STATE)

CV_model = GridSearchCV(estimator=model, param_grid=params_cb_rf, cv=5, scoring=recall_params)
CV_model.fit(features_train, target_train)

# NOTE: these are the RANDOM FOREST hyper-parameters.
best_params = CV_model.best_params_

# Out-of-fold probabilities of the best configuration.
y_pred = cross_val_predict(CV_model.best_estimator_, features_train, target_train, method="predict_proba", cv=5)

# Choose the cut-off that maximises F1 on the out-of-fold probabilities.
# The previous code took the predicted probability of the last training
# sample, which is an arbitrary number and not a threshold.
threshold_grid = np.linspace(0.05, 0.95, 91)
f1_per_threshold = [
    f1_score(target_train, (y_pred[:, 1] > cutoff).astype(int), zero_division=0)
    for cutoff in threshold_grid
]
trashold = threshold_grid[int(np.argmax(f1_per_threshold))]

print('Best score: ', CV_model.best_score_)
print('Best trashold: ', trashold)

In [None]:
# Grid search for Gaussian naive Bayes, scored by macro recall.
model = GaussianNB()

CV_model = GridSearchCV(estimator=model,
                 param_grid=params_gnb,
                 cv=5, scoring=recall_params)

CV_model.fit(features_train, target_train)

# Out-of-fold probabilities of the best configuration.
y_pred = cross_val_predict(CV_model.best_estimator_, features_train, target_train, method="predict_proba", cv=5)

# Choose the cut-off that maximises F1 on the out-of-fold probabilities.
# The previous code took the predicted probability of the last training
# sample, which is an arbitrary number and not a threshold.
threshold_grid = np.linspace(0.05, 0.95, 91)
f1_per_threshold = [
    f1_score(target_train, (y_pred[:, 1] > cutoff).astype(int), zero_division=0)
    for cutoff in threshold_grid
]
trashold = threshold_grid[int(np.argmax(f1_per_threshold))]

print('Best score: ', CV_model.best_score_)
print('Best trashold: ', trashold)

In [None]:
# Grid search for CatBoost, scored by macro recall.
# Training output is silenced (as for the final CatBoost model): with the
# default logging every iteration of every one of the grid's fits is printed.
model = CatBoostClassifier(random_state=RANDOM_STATE, logging_level='Silent')

CV_model = GridSearchCV(estimator=model, param_grid=params_cb_rf, cv = 5, scoring=recall_params)
CV_model.fit(features_train, target_train)

# Out-of-fold probabilities of the best configuration.
y_pred = cross_val_predict(CV_model.best_estimator_, features_train, target_train, method="predict_proba", cv=5)


In [None]:
# Choose the cut-off that maximises F1 on the out-of-fold probabilities in
# y_pred; this value is applied to the test-set probabilities further down.
# The previous code took the predicted probability of the first training
# sample, which is an arbitrary number and not a threshold.
threshold_grid = np.linspace(0.05, 0.95, 91)
f1_per_threshold = [
    f1_score(target_train, (y_pred[:, 1] > cutoff).astype(int), zero_division=0)
    for cutoff in threshold_grid
]
trashold = threshold_grid[int(np.argmax(f1_per_threshold))]

print('Best score: ', CV_model.best_score_)
print('Best trashold: ', trashold)

In [None]:
# Build the final model from the hyper-parameters found by the CatBoost
# grid search. At this point CV_model is the CatBoost search, whereas
# `best_params` was stored after the random forest search and must not be
# reused here.
catboost_best_params = CV_model.best_params_
catboostclassifier_model = CatBoostClassifier(n_estimators=catboost_best_params["n_estimators"], max_depth=catboost_best_params["max_depth"], random_state=RANDOM_STATE, logging_level='Silent')
catboostclassifier_model.fit(features_train, target_train)

In [None]:
# Probability of the positive class (canceled) on the test set, cut at the
# threshold computed from the cross-validated predictions.
probabilities = catboostclassifier_model.predict_proba(features_test)
probabilities_one = probabilities[:, 1]
prediction_test = probabilities_one > trashold

In [None]:
# Test-set quality of the thresholded CatBoost predictions.
catboostclassifier_precision = precision_score(target_test, prediction_test)
catboostclassifier_recall = recall_score(target_test, prediction_test)
catboostclassifier_f1 = f1_score(target_test, prediction_test)

In [None]:
# Report the test metrics with four decimals ("Pecision" typo corrected).
print(
    f"Best F1 = {catboostclassifier_f1:.4f}\n"
    f"Precision = {catboostclassifier_precision:.4f}\n"
    f"Recall = {catboostclassifier_recall:.4f}"
)

In [None]:
# Preprocess the raw test copy: this adds the "profit" column and replaces
# month names with seasons, but keeps the categorical columns un-encoded.
# NOTE: not idempotent, because preprocessing() drops "id" and the date columns.
data_test_V2 = preprocessing(data_test_V2)

In [None]:
def profit_whith_deposit(data):
    """Return the booking's profit when the deposit scheme is applied.

    `data` is one booking row with "predictions" and "is_canceled". When the
    model flagged the booking (``predictions == 1``) and it was in fact
    canceled, only 20% of the booking's profit/loss remains; otherwise the
    profit is unchanged.

    The base value is taken from the row's precomputed "profit" entry.
    Recomputing it with ``profit(data)`` is wrong for preprocessed rows:
    preprocessing() replaces month names with season names, so the seasonal
    coefficients would never be applied and the result would disagree with
    the "profit" column. ``profit(data)`` is used only as a fallback for rows
    that carry no "profit" entry.
    """
    base_profit = data["profit"] if "profit" in data else profit(data)
    if data["predictions"] == 1 and data["is_canceled"]:
        # presumably an 80% deposit covering the loss; confirm the rate
        return base_profit * 0.2
    return base_profit

In [None]:
# Attach the thresholded model predictions to the test bookings.
# assumes prediction_test is in the same row order as data_test_V2; both
# derive from the same test file, TODO confirm no rows were filtered.
data_test_V2["predictions"] = prediction_test

In [None]:
# Per-booking profit under the deposit scheme.
data_test_V2["profit_deposit"] = data_test_V2.apply(profit_whith_deposit, axis=1)

In [None]:
# Final profit (the printed label is Russian for "Final profit"): total
# profit with deposits minus total profit without them, minus BUDGET.
print(f"Итоговая прибыль = {data_test_V2['profit_deposit'].sum() - data_test_V2['profit'].sum() - BUDGET}")