In [1]:
import pandas as pd
import numpy as np
from HTML import config as Config

In [2]:
# English month name -> calendar month number (1-12), in calendar order.
month_number = dict(zip(
    (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ),
    range(1, 13),
))


def parse_date(year, month, date):
    """Build a zero-padded 'YYYY-MM-DD' string from separate date parts.

    Parameters
    ----------
    year : int
        Calendar year; padded to at least four digits.
    month : str
        English month name; must be a key of ``month_number``.
    date : int
        Day of the month; padded to two digits.

    Returns
    -------
    str
        The date formatted as 'YYYY-MM-DD'.

    Raises
    ------
    KeyError
        If ``month`` is not a key of ``month_number``.
    """
    # Resolve the month first so an unknown name fails before any formatting.
    month_index = month_number[month]
    pieces = (
        format(year, '04d'),
        format(month_index, '02d'),
        format(date, '02d'),
    )
    return '-'.join(pieces)

In [3]:
def add_arrival_date(x):
    """Add an 'arrival_date' column built from the three arrival-date parts.

    Reads 'arrival_date_year', 'arrival_date_month' and
    'arrival_date_day_of_month' from every row, formats them with
    ``parse_date`` and stores the result in a new 'arrival_date' column.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing the three arrival-date columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same object ``x``, now carrying the 'arrival_date' column.
    """
    date_parts = x[['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']]
    # Walk the rows positionally: the values line up with the frame's row
    # order whatever its index labels are, and no integer key is used on a
    # string-labelled Series (positional fallback is deprecated in pandas).
    x['arrival_date'] = [
        parse_date(year, month, day)
        for year, month, day in date_parts.itertuples(index=False, name=None)
    ]
    return x

In [4]:
# Read both splits, then stack them (train rows first) so every later
# transformation is applied to train and test identically.
x_train = pd.read_csv(Config.train_path)
x_test = pd.read_csv(Config.test_path)

# Number of training rows: the first N rows of the stacked frame are the
# training split, which is how it is separated again further down.
N = len(x_train)

X = add_arrival_date(pd.concat([x_train, x_test], ignore_index=True))

In [5]:
# Column names grouped by level of measurement. `nominal` and `ordinal`
# are the columns that get one-hot encoded by pd.get_dummies below.

# Unordered categories (including numeric codes that act as labels).
# NOTE(review): 'reservation_status' appears to be missing from the test
# file (its *_nan dummy averages 1.0 for every test day in the output
# below) -- confirm it, and 'is_canceled', really belong in the features.
nominal = [
    'hotel', 'is_canceled',
    'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month',
    'country', 'market_segment', 'distribution_channel',
    'is_repeated_guest',
    'reserved_room_type', 'assigned_room_type',
    'agent', 'company', 'customer_type',
    'reservation_status',
]

# Categories treated as ordered; encoded the same way as the nominal ones.
ordinal = ['meal', 'deposit_type']

# Numeric column without a meaningful zero.
interval = ['arrival_date_year']

# Numeric counts / amounts with a true zero.
ratio = [
    'lead_time',
    'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'children', 'babies',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'booking_changes', 'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces', 'total_of_special_requests',
]

In [6]:
# One-hot encode every categorical column. 'ID' and
# 'reservation_status_date' are dropped first; the remaining columns
# (numeric ones and 'arrival_date') pass through unchanged.
dummies = pd.get_dummies(
    X.drop(['ID', 'reservation_status_date'], axis=1),
    columns=[*nominal, *ordinal],
    dummy_na=True,    # missing values get their own indicator column
    drop_first=True,  # drop the first level of each encoded column
)

In [7]:
# Replace remaining NaNs with the column mean. `numeric_only=True` keeps the
# string column 'arrival_date' out of the mean computation: pandas >= 2.0 no
# longer drops non-numeric columns silently and would raise TypeError here.
dummies = dummies.fillna(dummies.mean(numeric_only=True))

In [8]:
# Undo the earlier stacking: the first N rows are the training split, the
# rest are the test split.
train_rows, test_rows = dummies.iloc[:N], dummies.iloc[N:]

# Collapse each split to one row per arrival date by averaging every column.
x_train_processed = train_rows.groupby('arrival_date').mean()
x_test_processed = test_rows.groupby('arrival_date').mean()

In [9]:
# Load the training labels; this table is joined to the daily training
# features on its 'arrival_date' column in the next cell.
y_train = pd.read_csv(Config.train_label_path)

In [10]:
# Join the daily training features (indexed by arrival date) with the label
# table on its 'arrival_date' column. The result is only displayed, not stored.
pd.merge(
    x_train_processed,
    y_train,
    left_index=True,
    right_on='arrival_date',
)

Unnamed: 0,lead_time,arrival_date_year,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,meal_FB,meal_HB,meal_SC,meal_Undefined,meal_nan,deposit_type_Non Refund,deposit_type_Refundable,deposit_type_nan,arrival_date,label
0,186.745902,2015.0,0.368852,2.745902,1.827869,0.016393,0.000000,0.090164,0.073770,0.270492,...,0.008197,0.622951,0.000000,0.000000,0.0,0.000000,0.0,0.0,2015-07-01,2.0
1,153.129032,2015.0,0.806452,3.129032,2.000000,0.043011,0.000000,0.505376,0.032258,0.043011,...,0.010753,0.096774,0.000000,0.000000,0.0,0.000000,0.0,0.0,2015-07-02,1.0
2,72.857143,2015.0,0.946429,3.339286,2.000000,0.071429,0.000000,0.017857,0.000000,0.107143,...,0.000000,0.339286,0.017857,0.000000,0.0,0.000000,0.0,0.0,2015-07-03,1.0
3,154.965909,2015.0,1.863636,3.125000,1.943182,0.090909,0.022727,0.363636,0.000000,0.090909,...,0.011364,0.125000,0.000000,0.000000,0.0,0.261364,0.0,0.0,2015-07-04,1.0
4,83.641509,2015.0,2.283019,3.679245,1.943396,0.188679,0.000000,0.150943,0.000000,0.207547,...,0.000000,0.264151,0.018868,0.000000,0.0,0.000000,0.0,0.0,2015-07-05,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,92.825137,2017.0,1.049180,1.781421,1.846995,0.010929,0.000000,0.005464,0.098361,0.169399,...,0.000000,0.027322,0.087432,0.000000,0.0,0.245902,0.0,0.0,2017-03-27,2.0
636,61.102273,2017.0,0.522727,3.500000,1.761364,0.034091,0.011364,0.011364,0.488636,0.125000,...,0.000000,0.079545,0.125000,0.000000,0.0,0.000000,0.0,0.0,2017-03-28,1.0
637,44.642857,2017.0,0.246753,2.590909,1.675325,0.051948,0.012987,0.012987,0.467532,0.201299,...,0.000000,0.045455,0.155844,0.000000,0.0,0.000000,0.0,0.0,2017-03-29,2.0
638,115.849162,2017.0,0.541899,2.849162,1.737430,0.050279,0.016760,0.005587,0.217877,0.268156,...,0.000000,0.039106,0.173184,0.089385,0.0,0.094972,0.0,0.0,2017-03-30,3.0


In [11]:
# Load the test-period table read from Config.test_nolabel_path; it is
# joined to the daily test features on its 'arrival_date' column in the
# next cell.
y_test = pd.read_csv(Config.test_nolabel_path)

In [12]:
# Join the daily test features (indexed by arrival date) with the test table
# on its 'arrival_date' column. The result is only displayed, not stored.
pd.merge(
    x_test_processed,
    y_test,
    left_index=True,
    right_on='arrival_date',
)

Unnamed: 0,lead_time,arrival_date_year,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,reservation_status_nan,meal_FB,meal_HB,meal_SC,meal_Undefined,meal_nan,deposit_type_Non Refund,deposit_type_Refundable,deposit_type_nan,arrival_date
0,95.265152,2017.0,1.704545,2.265152,2.068182,0.151515,0.007576,0.000000,0.053030,0.189394,...,1.0,0.000000,0.030303,0.151515,0.015152,0.0,0.000000,0.000000,0.0,2017-04-01
1,73.364583,2017.0,1.937500,1.885417,1.947917,0.083333,0.010417,0.010417,0.104167,0.302083,...,1.0,0.000000,0.093750,0.145833,0.000000,0.0,0.000000,0.000000,0.0,2017-04-02
2,76.995671,2017.0,1.086580,2.220779,1.761905,0.082251,0.004329,0.008658,0.082251,0.220779,...,1.0,0.000000,0.056277,0.064935,0.000000,0.0,0.207792,0.000000,0.0,2017-04-03
3,104.544000,2017.0,0.208000,2.840000,1.912000,0.152000,0.000000,0.008000,0.248000,0.312000,...,1.0,0.000000,0.040000,0.072000,0.000000,0.0,0.320000,0.000000,0.0,2017-04-04
4,119.769841,2017.0,0.214286,3.269841,1.702381,0.051587,0.000000,0.003968,0.158730,0.738095,...,1.0,0.000000,0.011905,0.063492,0.003968,0.0,0.186508,0.000000,0.0,2017-04-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,93.954023,2017.0,1.643678,1.534483,1.948276,0.258621,0.000000,0.000000,0.022989,0.293103,...,1.0,0.011494,0.143678,0.178161,0.005747,0.0,0.000000,0.000000,0.0,2017-08-27
149,139.222749,2017.0,1.180095,3.194313,1.696682,0.151659,0.004739,0.000000,0.028436,0.644550,...,1.0,0.014218,0.469194,0.037915,0.000000,0.0,0.000000,0.004739,0.0,2017-08-28
150,141.672000,2017.0,0.416000,3.472000,1.816000,0.168000,0.000000,0.024000,0.440000,0.512000,...,1.0,0.000000,0.080000,0.176000,0.000000,0.0,0.080000,0.000000,0.0,2017-08-29
151,98.303371,2017.0,0.516854,3.404494,1.955056,0.213483,0.011236,0.022472,0.112360,0.382022,...,1.0,0.000000,0.134831,0.078652,0.011236,0.0,0.000000,0.000000,0.0,2017-08-30
