# LogReg on **is_canceled**/ LinReg on **adr** (selected features)
**___**
## Preprocessing.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

# Load the training bookings and print the first record as a quick look at the schema.
df = pd.read_csv("train.csv")
print(df.loc[0])

# Keep only the columns needed to compute per-booking revenue.
revenue_columns = [
    'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month',
    'is_canceled', 'adr', 'stays_in_weekend_nights', 'stays_in_week_nights',
]
df_req = df[revenue_columns]

# Revenue of one booking = adr * total nights, forced to 0 when the booking was canceled.
not_canceled = df_req['is_canceled'] == 0
nights_stayed = df_req['stays_in_weekend_nights'] + df_req['stays_in_week_nights']
df_req = df_req.assign(rev=not_canceled * df_req['adr'] * nights_stayed)
print(df_req.head())

ID                                           0
hotel                             Resort Hotel
is_canceled                                  0
lead_time                                  342
arrival_date_year                         2015
arrival_date_month                        July
arrival_date_week_number                    27
arrival_date_day_of_month                    1
stays_in_weekend_nights                      0
stays_in_week_nights                         0
adults                                       2
children                                     0
babies                                       0
meal                                        BB
country                                    PRT
market_segment                          Direct
distribution_channel                    Direct
is_repeated_guest                            0
previous_cancellations                       0
previous_bookings_not_canceled               0
reserved_room_type                           C
assigned_room

In [2]:
# Lookup tables that turn month names / single-digit days into zero-padded strings,
# so that the assembled 'YYYY-MM-DD' keys sort chronologically as plain text.
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
sort_dict_m = {name: f'{number:02d}' for number, name in enumerate(month_names, start=1)}
sort_dict_d = {day: f'{day:02d}' for day in range(1, 10)}

df_req = (
    df_req
    .replace({'arrival_date_month': sort_dict_m})
    .replace({'arrival_date_day_of_month': sort_dict_d})
)

# Assemble the date key from its three parts (year and day are cast to text first).
year_text = df_req['arrival_date_year'].astype(str)
month_text = df_req['arrival_date_month']
day_text = df_req['arrival_date_day_of_month'].astype(str)
df_req = df_req.assign(arrival_date=year_text + '-' + month_text + '-' + day_text)

# Total revenue per arrival date.
rev = df_req.groupby('arrival_date')[['rev']].sum()
print(rev)

                       rev
arrival_date              
2015-07-01    20311.186621
2015-07-02    16530.645277
2015-07-03    12966.714164
2015-07-04    17480.654256
2015-07-05    19591.458478
...                    ...
2017-03-27    26217.381380
2017-03-28    16185.177703
2017-03-29    24002.255525
2017-03-30    33095.297394
2017-03-31    36062.103164

[640 rows x 1 columns]


___
## LogReg on **is_canceled** / LinReg on **adr** with manually selected features.

In [3]:
# Load the test bookings.
df_test = pd.read_csv("test.csv")

# Features used to predict is_canceled; 'hotel' and 'deposit_type' are categorical
# and get one-hot encoded.
fisc_columns = [
    'hotel', 'lead_time', 'is_repeated_guest', 'previous_cancellations',
    'previous_bookings_not_canceled', 'days_in_waiting_list', 'deposit_type',
]
df_test_fisc = df_test[fisc_columns]
df_test_fisc_encoded = pd.get_dummies(df_test_fisc, columns=['hotel', 'deposit_type'])

# interaction_only=True: expand with products of distinct features only
# (no powers of a single feature).
poly = PolynomialFeatures(interaction_only=True)
df_test_fisc_inter = poly.fit(df_test_fisc_encoded).transform(df_test_fisc_encoded)

In [4]:
# Train a logistic regression for is_canceled on the selected features and
# predict cancellations for the test bookings.
X_fisc = df[['hotel', 'lead_time', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled',
             'days_in_waiting_list', 'deposit_type']]
X_fisc = X_fisc.dropna()
X_fisc_encoded = pd.get_dummies(X_fisc, columns=['hotel', 'deposit_type'])

# Take the target from the same rows that survived dropna(); reading df['is_canceled']
# directly would misalign X and y as soon as a single feature row is dropped.
isc = df.loc[X_fisc.index, 'is_canceled']

# Fit the interaction expansion on the training features.
X_fisc_inter = poly.fit_transform(X_fisc_encoded)
reg = LogisticRegression(random_state=0).fit(X_fisc_inter, isc)
print(reg.score(X_fisc_inter, isc))

# Give the test dummies exactly the training column layout (categories absent from the
# test split become all-zero columns, unseen ones are dropped), then reuse the fitted
# expansion instead of re-fitting it on test data.
df_test_fisc_encoded = df_test_fisc_encoded.reindex(columns=X_fisc_encoded.columns, fill_value=0)
df_test_fisc_inter = poly.transform(df_test_fisc_encoded)

isc_pred = reg.predict(df_test_fisc_inter)
print(isc_pred)



0.7911308736930658
[0 0 0 ... 0 0 0]


In [5]:
# Build the test-side feature matrix for the adr regression.
fadr_features = ['stays_in_weekend_nights', 'stays_in_week_nights', 'reserved_room_type', 'assigned_room_type', 'customer_type']
fadr_categorical = ['reserved_room_type', 'assigned_room_type', 'customer_type']

df_test_fadr = df_test[fadr_features]
df_test_fadr_encoded = pd.get_dummies(df_test_fadr, columns=fadr_categorical)

# Some categories seen in training (e.g. room type 'L') do not occur in the test split.
# Instead of inserting them at hard-coded positions, derive the column layout from the
# training data and reindex: missing dummies are added as all-zero columns in the right
# place, and the column order is guaranteed to match the training matrix.
train_fadr_columns = pd.get_dummies(df[fadr_features].dropna(), columns=fadr_categorical).columns
df_test_fadr_encoded = df_test_fadr_encoded.reindex(columns=train_fadr_columns, fill_value=0)

df_test_fadr_inter = poly.fit_transform(df_test_fadr_encoded)

In [6]:
# Train a linear regression for adr on the selected features and predict adr
# for the test bookings.
X_fadr = df[['stays_in_weekend_nights', 'stays_in_week_nights', 'reserved_room_type', 'assigned_room_type', 'customer_type']]
X_fadr = X_fadr.dropna()
X_fadr_encoded = pd.get_dummies(X_fadr, columns=['reserved_room_type', 'assigned_room_type', 'customer_type'])

# Take the target from the rows that survived dropna() so X and y stay aligned.
adr = df.loc[X_fadr.index, 'adr']
X_fadr_inter = poly.fit_transform(X_fadr_encoded)

# `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
# raises TypeError. Plain least squares is used here; put a scaler in front of the
# estimator if feature scaling is wanted.
# NOTE(review): predictions may differ slightly from the old normalize=True run if the
# design matrix is rank-deficient - confirm against the previous output.
reg = LinearRegression().fit(X_fadr_inter, adr)
print(reg.score(X_fadr_inter, adr))

adr_pred = reg.predict(df_test_fadr_inter)
print(adr_pred)

0.18371867578406753
[98. 60. 58. ... 64. 76. 76.]


___
## Calculate **revenue**.

In [7]:
# Predicted revenue per test booking: predicted adr * total nights, zeroed out for
# bookings predicted as canceled.
# Vectorised and positional (.to_numpy()), so it does not rely on df_test carrying a
# default 0..n-1 index the way per-element `series[i]` lookups in a loop do.
total_nights = (df_test['stays_in_weekend_nights'] + df_test['stays_in_week_nights']).to_numpy()
rev_pred = (isc_pred == 0) * adr_pred * total_nights
print(rev_pred)

[686. 840. 406. ... 192. 532.  76.]


In [8]:
# Sum predicted revenue per arrival date.
# assign() adds 'rev' as the last column on the first run and simply overwrites it on a
# re-run; DataFrame.insert() would raise ValueError the second time the cell is executed.
df_test = df_test.assign(rev=rev_pred)

# Zero-pad month and day so the 'YYYY-MM-DD' keys sort chronologically as text.
# replace() leaves already-converted values untouched, so this is safe to re-run.
df_test = df_test.replace({'arrival_date_month':sort_dict_m})
df_test = df_test.replace({'arrival_date_day_of_month':sort_dict_d})
df_test = df_test.assign(arrival_date = lambda x: x["arrival_date_year"].astype(str)+'-'+x["arrival_date_month"]+'-'+x["arrival_date_day_of_month"].astype(str))

df_test_rev = df_test[['arrival_date', 'rev']].groupby(['arrival_date']).sum()
print(df_test_rev)

                  rev
arrival_date         
2017-04-01    49962.0
2017-04-02    34222.0
2017-04-03    51670.0
2017-04-04    24264.0
2017-04-05    47382.0
...               ...
2017-08-27    54086.0
2017-08-28    75314.0
2017-08-29    36204.0
2017-08-30    27490.0
2017-08-31    43962.0

[153 rows x 1 columns]


___
## From **revenue** to **ranking**.

In [9]:
# Load the training labels; the 'label' column is the regression target used below.
# NOTE(review): presumably one row per arrival date, in the same order as `rev` - confirm.
df_y = pd.read_csv("train_label.csv")
rank = df_y.loc[:, 'label']

In [10]:
# Map daily revenue to the ranking label with a degree-2 polynomial regression.
# NOTE: this rebinds `poly`, which earlier cells use for the interaction-only expansion;
# re-run those cells from the top rather than out of order.
poly = PolynomialFeatures(2)    # default degree: 2, may change
rev_inter = poly.fit_transform(rev)

# `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2 (TypeError there).
reg = LinearRegression().fit(rev_inter, rank)
print(reg.score(rev_inter, rank))

# Reuse the expansion fitted on the training revenue; select 'rev' explicitly so a
# mutated df_test_rev fails loudly instead of feeding the wrong column to the model.
df_test_rev_inter = poly.transform(df_test_rev[['rev']])

# Clamp predictions into the valid label range [0, 9], then round to the nearest
# integer label (half-to-even, same as Python's round()).
rank_pred = np.rint(np.clip(reg.predict(df_test_rev_inter), 0, 9))

0.96217008017884


In [11]:
# Generate submission file: one predicted label per arrival date.
# Build a fresh frame instead of aliasing df_test_rev and mutating it in place;
# the in-place drop removed 'rev' from df_test_rev itself, so re-running this cell
# raised KeyError and re-running the ranking cell silently used the wrong column.
ans = pd.DataFrame({'label': rank_pred}, index=df_test_rev.index)
print(ans)
# Writing the file is left disabled; uncomment to export.
# ans.to_csv('out.csv')

              label
arrival_date       
2017-04-01      4.0
2017-04-02      3.0
2017-04-03      5.0
2017-04-04      2.0
2017-04-05      4.0
...             ...
2017-08-27      5.0
2017-08-28      7.0
2017-08-29      3.0
2017-08-30      2.0
2017-08-31      4.0

[153 rows x 1 columns]
