In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from random import randrange

# Extract Data from time period 

In [4]:
raw_data = pd.read_csv('hotel_bookings.csv')

In [5]:
def clean_data(raw_data):
    #--> Map the name of the months with their corresponding numbers
    def map_months(x):
        if x == 'January':
            x = 1
        elif x == 'February':
            x = 2
        elif x == 'March':
            x = 3
        elif x == 'April':
            x = 4
        elif x == 'May':
            x = 5
        elif x == 'June':
            x = 6
        elif x == 'July':
            x = 7
        elif x == 'August':
            x = 8
        elif x == 'September':
            x = 9
        elif x == 'October':
            x = 10
        elif x == 'November':
            x = 11
        elif x == 'December':
            x = 12
        return x

    # Use the above mapping to create a new feature
    raw_data['arrival_date_month_number'] = raw_data['arrival_date_month'].apply(lambda x : map_months(x))
    raw_data['arrival_date'] = raw_data['arrival_date_day_of_month'].astype('str') + '/' + raw_data['arrival_date_month_number'].astype('str') + '/'+ raw_data['arrival_date_year'].astype('str')
    # Convert into datetime variable
    raw_data['arrival_date'] = pd.to_datetime(raw_data['arrival_date'])
    #compute number of days each stay is 
    raw_data['length_stay_days'] = raw_data['stays_in_week_nights'] + raw_data['stays_in_weekend_nights']
    #compute number occupants
    raw_data['occupants'] = raw_data['adults'] + raw_data['children']
    #get rid of samples with length_stay == 0 or adr == 0 
    raw_data = raw_data[raw_data['length_stay_days']>=1]
    raw_data = raw_data[raw_data['adr'] > 0]
    #get rid of a few samples where country is na 
    raw_data = raw_data[~raw_data['country'].isna()]
    #get rid of samples with 0 occupants 
    raw_data = raw_data[raw_data['occupants'] > 0]
    return raw_data

In [6]:
cleaned_data = clean_data(raw_data)

In [7]:
max(cleaned_data['arrival_date'])

Timestamp('2017-12-08 00:00:00')

In [6]:
"""
data cleaning notes: 

-remove bookings with length_stay_days == 0  (only 715 samples)
-remove bookings with adr == (1959 samples)
-remove bookings with na in the following: country (only 488 samples)
"""

'\ndata cleaning notes: \n\n-remove bookings with length_stay_days == 0  (only 715 samples)\n-remove bookings with adr == (1959 samples)\n-remove bookings with na in the following: country (only 488 samples)\n'

In [50]:
def generate_prescriptive_params(raw_data,start_day,time_window,selected_features):
    """
    Function to extract all the parameters we need for the prescriptive model, which will be run in Julia.
    
    @param raw_data: dataframe with raw bookings data. Must have the column 'arrival_date' which is a datetime variable
    @param start_day: first day of window we consider for accepting/rejecting bookings.
                      format should be string of the form "<year>-<month>-<day>"
    @param time_window: Number of days we add onto start_day to create the time window we consider. We get data on 
                        every booking with an arrival date which falls into our window.
    @param selected_features: list of features to pull for each booking. Should match whatever predictive model
                              we trained 
                    
    @return p: vector length n. The (total) price of each booking 
    @return s: vector length n (integers in {0,...,T }). The starting day of each booking
    @return e: vector length n (integers in {0,...,T}). The ending day of each booking
    @return t: vector length n. This gives the number of people that each booking is for
    @return X: n x p matrix containing selected features of the booking
    @return y: whether the booking was canceled or not
    """    
    start_day = datetime.strptime(start_day,'%Y-%m-%d')
    #get bookings in specified time window
    window_data = raw_data[(raw_data['arrival_date'] >=  start_day) &
                           (raw_data['arrival_date'] <= start_day + timedelta(days=time_window))]
    window_data = window_data.sort_values(by='arrival_date',ascending=True)
    print("window_data shape", window_data.shape)
    #compute arrival date as an int ()
    timedeltas_s = window_data['arrival_date'] - start_day
    print("time_deltas_s len: ", len(timedeltas_s))
    s = pd.Series([int(x.days) for x in timedeltas_s])
    print("s shape: ", s.shape)
    #compute end date as an int
    print("len window_data[length_stay_days] shape",(window_data['length_stay_days']).shape)
    e = np.array(s) + np.array(window_data['length_stay_days'])
    print("e len: ", len(e))
    p = window_data['adr']*window_data['length_stay_days']
    t = window_data['occupants']
    X = pd.get_dummies(window_data[selected_features])
    y = window_data['is_canceled']
    return (window_data, np.array(s), np.array(e), 
           np.array(p), np.array(t), X, y)
    

# Fit kNN model

In [51]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [52]:
def get_training_data(raw_data,start_date,end_date,selected_features):
    """
    Function to generate training data for a predictive model (OCT,kNN)
    which will later be used for prescriptions
    
    @param raw_data: dataframa holding raw booking data
    @param start_date: format "<year>-<month>-<day>"
    @param end_date: format "<year>-<month>-<day>"
    @param selected_features: list of features model should be fitted on 
    """
    start_date = datetime.strptime(start_date,'%Y-%m-%d')
    end_date = datetime.strptime(end_date,'%Y-%m-%d')
    #get bookings in specified time window
    window_data = raw_data[(raw_data['arrival_date'] >=  start_date) &
                           (raw_data['arrival_date'] <= end_date)]
    window_data = window_data.sort_values(by='arrival_date',ascending=True)
    X = window_data[selected_features]
    X = pd.get_dummies(X)
    y = np.array(window_data['is_canceled'])
    return X,y

In [53]:
def get_prescriptive_params_knn(data,start_date_knn,end_date_knn,
                                start_date_test,time_window,
                                selected_features,k):
    window_data, s, e, p, t, X_new, y_new = generate_prescriptive_params(data,start_date_test,time_window,selected_features)

    X,y = get_training_data(raw_data,start_date_knn,end_date_knn,selected_features)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    sclr = StandardScaler()
    sclr.fit(X_train)
    X_train_std, X_test_std = sclr.transform(X_train), sclr.transform(X_test)
    nbrs = NearestNeighbors(n_neighbors=k).fit(X_train_std)
    distance, indices = nbrs.kneighbors(X_new)
    y_neighbors = y[indices]
    
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train_std,y_train)
    y_pred = clf.predict_proba(X_test_std)
    auc = roc_auc_score(y_test, y_pred[:, 1])
    print("kNN test auc: ", auc)
    
    return window_data, s, e, p, t, X_new, np.array(y_new), y_neighbors


In [52]:
start_date_knn = "2017-01-01"
end_date_knn = "2017-05-30"
start_date_test = "2017-07-01"
time_window = 21
selected_features = ['lead_time','length_stay_days','adults','children','babies','meal',
                     'market_segment','distribution_channel','is_repeated_guest','previous_cancellations',
                     'reserved_room_type','booking_changes','deposit_type',
                     'customer_type','adr','required_car_parking_spaces','total_of_special_requests',
                     'occupants']

window_data, s, e, p, t, X_new, y_new, y_neighbors =  get_prescriptive_params_knn(cleaned_data,start_date_knn,end_date_knn,
                                                                                    start_date_test,time_window,
                                                                                    selected_features,
                                                                                 k=15)

window_data shape (2724, 36)
time_deltas_s len:  2724
s shape:  (2724,)
len window_data[length_stay_days] shape (2724,)
e len:  2724


  f"X has feature names, but {self.__class__.__name__} was fitted without"


kNN test auc:  0.8652269938344447


In [12]:
window_data[window_data['arrival_date'] == '2017-07-10']

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date_month_number,arrival_date,length_stay_days,occupants


In [13]:
cleaned_data[cleaned_data['arrival_date'] == '2017-07-10']

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date_month_number,arrival_date,length_stay_days,occupants


In [55]:
np.savetxt('e.csv',e,delimiter=',')
np.savetxt('s.csv',s,delimiter=',')
np.savetxt('p.csv',p,delimiter=',')
np.savetxt('t.csv',t,delimiter=',')
X_new.to_csv('X_new.csv',header=True)
np.savetxt('y_new.csv',y_new,delimiter=',')
np.savetxt('y_neighbors.csv',y_neighbors,delimiter=',')

# Get params of consecutive time windows for $\alpha$ testing

In [50]:
def get_prescriptive_params_knn_consecutive(data,start_date_knn,end_date_knn,
                                            start_date_test,
                                            time_window1, time_window2,
                                            selected_features,k):
    window_data1, s1, e1, p1, t1, X_new1, y_new1 = generate_prescriptive_params(data,start_date_test,time_window1,selected_features)
    
    start_date_period2 = (datetime.strptime(start_date_test,'%Y-%m-%d') + timedelta(days=time_window1+1)).strftime("%Y-%m-%d")
    print("start_date_period2: ",start_date_period2)
    window_data2, s2, e2, p2, t2, X_new2, y_new2 = generate_prescriptive_params(data,start_date_period2,time_window2,selected_features)

    X_new1, X_new2 = X_new1.align(X_new2, join='inner', axis=1)
    
    X,y = get_training_data(raw_data,start_date_knn,end_date_knn,selected_features)
    X, X_new1 = X.align(X_new1, join='inner', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    sclr = StandardScaler()
    sclr.fit(X_train)
    X_train_std, X_test_std = sclr.transform(X_train), sclr.transform(X_test)
    nbrs = NearestNeighbors(n_neighbors=k).fit(X_train_std)
    
    distance1, indices1 = nbrs.kneighbors(X_new1)
    y_neighbors1 = y[indices1]
    
    distance2, indices2 = nbrs.kneighbors(X_new2)
    y_neighbors2 = y[indices2]
    
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train_std,y_train)
    y_pred = clf.predict_proba(X_test_std)
    auc = roc_auc_score(y_test, y_pred[:, 1])
    print("kNN test auc: ", auc)
    
    return (window_data1, s1, e1, p1, t1, X_new1, np.array(y_new1), y_neighbors1,
            window_data2, s2, e2, p2, t2, X_new2, np.array(y_new2), y_neighbors2)
            



In [55]:
start_date_knn = "2016-01-01"
end_date_knn = "2017-04-30"
start_date_test = "2017-05-01"
time_window1 = 20
time_window2 = 20
selected_features = ['lead_time','length_stay_days','adults','children','babies','meal',
                     'market_segment','distribution_channel','is_repeated_guest','previous_cancellations',
                     'reserved_room_type','booking_changes','deposit_type',
                     'customer_type','adr','required_car_parking_spaces','total_of_special_requests',
                     'occupants']
k = 15

In [56]:
(window_data1, s1, e1, p1, t1, X_new1, y_new1, y_neighbors1,
            window_data2, s2, e2, p2, t2, X_new2, y_new2, y_neighbors2) = get_prescriptive_params_knn_consecutive(cleaned_data,start_date_knn,end_date_knn,
                                                                                                            start_date_test,
                                                                                                            time_window1, time_window2,
                                                                                                            selected_features,k)

window_data shape (3164, 36)
time_deltas_s len:  3164
s shape:  (3164,)
len window_data[length_stay_days] shape (3164,)
e len:  3164
start_date_period2:  2017-05-22
window_data shape (3303, 36)
time_deltas_s len:  3303
s shape:  (3303,)
len window_data[length_stay_days] shape (3303,)
e len:  3303
kNN test auc:  0.8806863012255388


In [57]:
np.savetxt('e1.csv',e1,delimiter=',')
np.savetxt('s1.csv',s1,delimiter=',')
np.savetxt('p1.csv',p1,delimiter=',')
np.savetxt('t1.csv',t1,delimiter=',')
X_new1.to_csv('X_new1.csv',header=True)
np.savetxt('y_new1.csv',y_new1,delimiter=',')
np.savetxt('y_neighbors1.csv',y_neighbors1,delimiter=',')

np.savetxt('e2.csv',e2,delimiter=',')
np.savetxt('s2.csv',s2,delimiter=',')
np.savetxt('p2.csv',p2,delimiter=',')
np.savetxt('t2.csv',t2,delimiter=',')
X_new2.to_csv('X_new2.csv',header=True)
np.savetxt('y_new2.csv',y_new2,delimiter=',')
np.savetxt('y_neighbors2.csv',y_neighbors2,delimiter=',')

# Sampling several days

In [None]:
"""
1. Set N = number of days to sample 
2. Randomly sample 10 arrival dates between 2017-04-30 and 2017-12-08
"""

In [54]:
def random_date(start, end):
    """
    This function will return a random datetime between two string dates
    """
    
    start = datetime.strptime(start,'%Y-%m-%d')
    end = datetime.strptime(end,'%Y-%m-%d')
    delta = end - start
    int_delta = delta.days 
    random_day = randrange(int_delta)
    return start + timedelta(days=random_day)

In [59]:
def sample_random_days(start,end,time_window=21,N=100):
    start_date_knn = "2016-01-01"
    end_date_knn = "2017-04-30"
    selected_features = ['lead_time','length_stay_days','adults','children','babies','meal',
                     'market_segment','distribution_channel','is_repeated_guest','previous_cancellations',
                     'reserved_room_type','booking_changes','deposit_type',
                     'customer_type','adr','required_car_parking_spaces','total_of_special_requests',
                     'occupants']
    selected_days = []
    for i in range(N):
        rand_date = random_date(start, end)
        selected_days.append(rand_date)
        rand_date = rand_date.strftime("%Y-%m-%d")
        
        window_data, s, e, p, t, X_new, y_new, y_neighbors =  get_prescriptive_params_knn(cleaned_data,start_date_knn,end_date_knn,
                                                                                    rand_date,time_window,
                                                                                    selected_features,
                                                                                 k=15)
        np.savetxt('100days/e' + str(i+1) + '.csv',e,delimiter=',')
        np.savetxt('100days/s' + str(i+1) + '.csv',s,delimiter=',')
        np.savetxt('100days/p' + str(i+1) + '.csv',p,delimiter=',')
        np.savetxt('100days/t' + str(i+1) + '.csv',t,delimiter=',')
        X_new.to_csv('100days/X_new' + str(i+1) + '.csv',header=True)
        np.savetxt('100days/y_new' + str(i+1) + '.csv',y_new,delimiter=',')
        np.savetxt('100days/y_neighbors' + str(i+1) + '.csv',y_neighbors,delimiter=',')
    with open('100days/aa_selected_days.pickle','wb') as f:
        pickle.dump(selected_days,f)
    return selected_days
    

In [56]:
random_date("2017-04-30", "2017-11-15")

datetime.datetime(2017, 6, 21, 0, 0)

In [60]:
selected_days = sample_random_days("2017-04-30","2017-11-15",time_window=21,N=5)

window_data shape (2567, 36)
time_deltas_s len:  2567
s shape:  (2567,)
len window_data[length_stay_days] shape (2567,)
e len:  2567


  f"X has feature names, but {self.__class__.__name__} was fitted without"


ValueError: X has 42 features, but NearestNeighbors is expecting 44 features as input.