In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pandas.tseries.holiday import USFederalHolidayCalendar

In [2]:
dateparser = lambda x: pd.Timestamp(x).to_pydatetime().weekday()

df = pd.read_csv('../datasets/agoda_cancellation_train.csv')
week1_df = pd.read_csv('test_set_week_1.csv')

In [3]:
def pre_process(df, flag=False):
    no_use_now = ['h_booking_id', 'language', 'hotel_chain_code', 'hotel_area_code', 'hotel_brand_code',
                 'hotel_city_code', 'origin_country_code', 'h_customer_id', 'guest_nationality_country_name',     
                 'hotel_id', 'customer_nationality', 'hotel_country_code']

    one_hot_cols = ['charge_option', 'accommadation_type_name',
                    'original_payment_method', 'original_payment_type', 'original_payment_currency']

    dates = ['checkin_date', 'checkout_date', 'booking_datetime', 'cancellation_policy_code']

    req = ['request_highfloor', 'request_nonesmoke', 'request_latecheckin', 'request_largebed', 
           'request_twinbeds', 'request_airport', 'request_earlycheckin']

    response = ['cancellation_datetime']

    to_remove = no_use_now + one_hot_cols + dates + response + req 
    
    
    df['checkin_date_month'] = pd.to_datetime(df.checkin_date).apply(lambda d: d.month).astype(int)
    df['checkin_date_weekday'] = pd.to_datetime(df.checkin_date).apply(dateparser).astype(int)

    df['checkout_date_month'] = pd.to_datetime(df.checkout_date).apply(lambda d: d.month).astype(int)
    df['checkout_date_weekday'] = pd.to_datetime(df.checkout_date).apply(dateparser).astype(int)


    df['booking_datetime_month'] = pd.to_datetime(df.booking_datetime).apply(lambda d: d.month).astype(int)
    df['booking_datetime_weekday'] = pd.to_datetime(df.booking_datetime).apply(dateparser).astype(int)

    df['hotel_live_date'] = pd.to_datetime(df.hotel_live_date).apply(lambda d: d.year - 2000).astype(int)


    df["special_request"] = np.where((df.request_highfloor == 1.0)  | (df.request_nonesmoke == 1.0) | 
                                     (df.request_latecheckin == 1.0) | (df.request_largebed == 1.0) | 
                                     (df.request_twinbeds == 1.0) | (df.request_airport == 1.0) | 
                                     (df.request_earlycheckin == 1.0), 1.0, 0.0)

    for col in one_hot_cols:
        df = pd.concat([df, pd.get_dummies(df[col])], axis=1)


    df.is_user_logged_in = df.is_user_logged_in.astype(int)
    df.is_first_booking = df.is_first_booking.astype(int)


    df['first_policy_days'] = df.cancellation_policy_code.apply(lambda s: ((s.split('_')[0]).split('D')[0]))
    df['first_policy'] = df.cancellation_policy_code.apply(lambda s: (s.split('_')[0]).split('D')[1] if (len((s.split('_')[0]).split('D')) == 2) else np.nan)

    df = df[(df.first_policy.apply(str) != 'nan')]
    df = df[(df.first_policy_days != 'UNKNOWN')]

    df['first_policy_days'] = df.first_policy_days.astype(int)

    df['first_prec'] = np.where(
                        df.first_policy.astype(str).apply(lambda s: (str(s)[-1]) == 'N'),
                        df.first_policy_days.astype(int) / ((pd.to_datetime(df.checkout_date) - pd.to_datetime(df.checkin_date)).apply(lambda d: d.days).astype(int)) * 100,
                        df.first_policy.apply(lambda p: int(str(p)[:-1])))


    df['first_policy'] = df.first_policy.astype(str).apply(lambda s: (str(s)[-1]) == 'N').astype(int)
    
    df['days_from_order_to_checkin'] = (df.checkin_date.apply(lambda d: dt.datetime.strptime(d[:-9], '%Y-%m-%d')) - df.booking_datetime.apply(lambda d: dt.datetime.strptime(d[:-9], '%Y-%m-%d'))).dt.days
    df['total_stay_nights'] = (df.checkout_date.apply(lambda d: dt.datetime.strptime(d[:-9], '%Y-%m-%d')) - df.checkin_date.apply(lambda d: dt.datetime.strptime(d[:-9], '%Y-%m-%d'))).dt.days
    df['in_cancel_on_order'] = (df.days_from_order_to_checkin < df.first_policy_days).astype(int)    
    
#     cal = USFederalHolidayCalendar()
#     holidays = cal.holidays(start='2018-01-01', end='2018-12-31').to_pydatetime()
#     df['in_holidays'] = [int(date in holidays) for date in df.checkin_date.apply(lambda d: dt.datetime.strptime(d[:-9], '%Y-%m-%d'))]
    
    if flag:
        y = (df.cancellation_datetime.fillna(0) != 0).astype(int)
        df.drop(to_remove, axis=1, inplace=True)
        return df, y
    to_remove.remove('cancellation_datetime')
    df.drop(to_remove, axis=1, inplace=True)
    return df

In [4]:
# pre process:
df, y = pre_process(df, True)
week1_df = pre_process(week1_df)
df = df.loc[:, week1_df.columns]
df = df[[col for col in df.columns if col != 'UNKNOWN']]
week1_df = week1_df[[col for col in week1_df.columns if col != 'UNKNOWN']]

In [5]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hotel_live_date,58091.0,14.188997,2.837117,-1.0,12.0,15.0,17.0,19.0
hotel_star_rating,58091.0,3.223434,1.169664,-1.0,3.0,3.0,4.0,5.0
guest_is_not_the_customer,58091.0,0.214629,0.410568,0.0,0.0,0.0,0.0,1.0
no_of_adults,58091.0,2.344029,1.324086,1.0,2.0,2.0,2.0,38.0
no_of_children,58091.0,0.148715,0.530727,0.0,0.0,0.0,0.0,10.0
...,...,...,...,...,...,...,...,...
first_policy,58091.0,0.373535,0.483746,0.0,0.0,0.0,1.0,1.0
first_prec,58091.0,162.567639,941.521846,0.0,100.0,100.0,100.0,36500.0
days_from_order_to_checkin,58091.0,29.776678,46.596508,-1.0,1.0,10.0,37.0,448.0
total_stay_nights,58091.0,1.970890,1.660980,1.0,1.0,1.0,2.0,30.0


In [6]:
# split data
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=np.random.randint(1000))

# Random Forest Classifier


In [7]:
# # train model
# clf = RandomForestClassifier()
# clf.fit(X_train, y_train)

In [8]:
# # test score
# clf.score(X_test, y_test)

# Logistic Regression

In [9]:
# train model
lr = LogisticRegression(max_iter=12000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=12000)

In [10]:
# test score
lr.score(X_test, y_test)

0.7895258463303949

In [11]:
# predict:
week1_y = lr.predict(week1_df)

# results

In [12]:
# clf = RandomForestClassifier()
# clf.fit(df, y)

In [13]:
# clf.predict(X_test).sum()

In [14]:
# predict:
week1_y = lr.predict(week1_df)
# week1_y = clf.predict(week1_df)

In [15]:
# results:
week1_df['predicted_values'] = week1_y
week1_df.to_csv('207042714_315317255_207902537.csv')
week1_df

Unnamed: 0,hotel_live_date,hotel_star_rating,guest_is_not_the_customer,no_of_adults,no_of_children,no_of_extra_bed,no_of_room,original_selling_amount,is_user_logged_in,is_first_booking,...,USD,VND,ZAR,first_policy_days,first_policy,first_prec,days_from_order_to_checkin,total_stay_nights,in_cancel_on_order,predicted_values
0,17,3.0,0,4,0,0,2,89.32,0,1,...,0,0,0,2,0,100.0,24,1,0,0
1,14,3.0,0,2,0,0,1,135.36,1,1,...,0,0,0,7,1,350.0,71,2,0,0
2,12,3.0,0,4,0,0,2,215.04,1,0,...,0,0,0,3,0,100.0,148,1,0,1
3,17,5.0,0,2,1,0,1,930.67,0,1,...,0,0,0,27,0,100.0,26,4,1,0
4,18,5.0,0,2,0,0,1,233.10,0,0,...,0,0,0,3,0,50.0,26,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,15,3.0,0,4,0,0,2,148.84,0,0,...,0,0,0,1,1,100.0,163,1,0,0
696,12,4.5,0,4,1,0,3,217.08,1,0,...,0,0,0,1,1,100.0,151,1,0,0
697,18,0.0,0,4,0,0,1,97.01,0,1,...,0,0,0,14,0,100.0,36,1,0,1
698,15,3.0,0,2,0,0,1,124.80,0,0,...,0,0,0,1,1,20.0,38,5,0,0


not for this week 

In [16]:
# np.where(df.cancellation_policy_code.apply(lambda s: s.split('_')[1] if len(s.split('_')) > 1 else '').apply(lambda y: 'D' in y), 1, 0).sum()

In [17]:
# currency = {'RON': 0.253, 'RUB': 0.015, 'SAR': 0.267, 'SEK': 0.114, 'SGD': 0.747, 'THB': 0.031,
#             'TRY': 0.206, 'TWD': 0.033, 'UAH': 0.036, 'USD': 1, 'VND': 0.000044, 'XPF': 0.0099,
#             'ZAR': 0.075, 'IDR': 0.000070, 'ILS': 0.278, 'INR': 0.0146, 'JOD': 1.4093, 'KHR': 0.00025,
#             'JPY': 0.0082, "KRW": 0.0009, "KWD": 3.3106, "KZT": 0.0029, "LAK": 0.000085, "LKR": 0.0065,
#             "MXN": 0.0521, "MYR": 0.239, "NGN": 0.00274, "NOK": 0.11, "NZD": 0.6923, "OMR": 2.6017,
#             "PHP": 0.019, "PKR": 0.0083, "PLN": 0.2658, "QAR": 0.2746, "AED": 0.27224, "ARS" : 0.02662,
#             "AUD": 0.7475, "BDT": 1 / 83.874, "BHD": 2.6507, "BRL": 1 / 3.6535, "CAD": 0.7717, 
#             "CHF": 1.0225, "CNY": 0.1514, "CZK":  21.7429, "DKK": 1 / 6.3177, "EGP": 1/ 17.8071,
#             "EUR": 1.1811, "FJD": 0.4805, "GBP": 1.3349, "HKD": 0.1276, "HUF": 1 / 279.99}

# df['price_norm'] = df.original_selling_amount * df.original_payment_currency.apply(lambda x: currency[x])