In [11]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import time
import random
import xgboost as xgb

from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [12]:
# import datasets
test = pd.read_csv("../test.csv")
train = pd.read_csv("../train.csv")

In [13]:
def trim_searchid(ids, booking_bools):
    '''
    removes searchid's where no hotel was booked from training data
    '''
    no_booking = []
    booking = False
    
    for i in tqdm(range(len(ids))):
        
        if booking_bools[i] == 1:
            booking = True
        # calculate relative price to average price
        if ids[i-1] != ids[i] or i == len(ids)-1: 
            if booking == False:
                no_booking.append(ids[i-1])
            booking = False

        
    ids_to_be_removed = set(no_booking)
    return no_booking    

In [14]:
'''
ID's waar niks is geboekt worden weggehaald
'''
ids = train['srch_id'].to_numpy()
booking_bools = train['booking_bool'].to_numpy()
ids_to_be_removed = set(trim_searchid(ids, booking_bools))

print('number of entries with all hotels:',len(train.index))

train = train[~train['srch_id'].isin(ids_to_be_removed)]
# test = test[~test['srch_id'].isin(ids_to_be_removed)]
    
print('after entries with no booking are removed:',len(train.index))

100%|██████████| 4958347/4958347 [00:07<00:00, 671527.59it/s]


number of entries with all hotels: 4958347
after entries with no booking are removed: 3386771


In [15]:
def get_booked_hotels(hotels, booking_bools):
    '''werkt nog niet!!'''
    bookings = len(np.where(booking_bools==1)[0])
    proportion_booked = bookings / len(booking_bools)
    
    to_be_kept = []
    
    for i in tqdm(range(len(booking_bools))):
        if booking_bools[i] == 1:
            to_be_kept.append(hotels[i])

    return set(to_be_kept)
    
                
hotels = train['prop_id'].to_numpy()
booking_bools = train['booking_bool'].to_numpy()

hotels_to_keep = get_booked_hotels(hotels, booking_bools)
all_hotels = set(hotels)
non_booked_hotels = [hotel for hotel in all_hotels if hotel not in hotels_to_keep]


nr_of_non_booked_to_be_kept = len(hotels_to_keep)
hotels_to_keep = list(hotels_to_keep)

random.shuffle(non_booked_hotels)
hotels_to_keep.extend(non_booked_hotels[0:nr_of_non_booked_to_be_kept])

print('number of hotels before', len(set(hotels)))
print('number of hotels after', len(hotels_to_keep))


# print('with all hotels:', len(train.index))
# train = train[train.index.isin(indices)]
# print('after most hotels with no bookings are removed:',len(train.index))


100%|██████████| 3386771/3386771 [00:02<00:00, 1293666.10it/s]


number of hotels before 118839
number of hotels after 83346


In [16]:
print('entries with all hotels:', len(train.index))
train = train[train['prop_id'].isin(hotels_to_keep)]
print('after some hotels with no bookings are removed:',len(train.index))

entries with all hotels: 3386771
after some hotels with no bookings are removed: 2988344


In [8]:
#Hotel quality (2nd place feature engineering solution)
hotel_quality = pd.DataFrame(train.prop_id.value_counts(dropna = False))

hotel_quality = hotel_quality.join(pd.DataFrame(train.prop_id[train.booking_bool == 1].value_counts().astype(int)), rsuffix = "book")
hotel_quality = hotel_quality.join(pd.DataFrame(train.prop_id[train.click_bool == 1].value_counts().astype(int)), rsuffix = "click")
hotel_quality.columns = ["counts", "booked", "clicked"]


hotel_quality["%booked_prop"] = hotel_quality.booked / hotel_quality.counts * 100
hotel_quality["%clicked_prop"] = hotel_quality.clicked / hotel_quality.counts * 100


train = train.join(hotel_quality['%booked_prop'], on = "prop_id").fillna(0)
train = train.join(hotel_quality['%clicked_prop'], on = "prop_id").fillna(0)
test = test.join(hotel_quality['%booked_prop'], on = "prop_id").fillna(0)
test = test.join(hotel_quality['%clicked_prop'], on = "prop_id").fillna(0)
test

Unnamed: 0.1,Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,orig_destination_distance,random_bool,comp_rate,comp_inv,comp_percent,month,daypart,prop_location_score2,%booked_prop,%clicked_prop
0,0,1,24,216,0.0,0.0,219,3180,3,4.5,...,8.0,0,0.00,0.0,0.000000,2,3,0.07,0.000000,0.000000
1,1,1,24,216,0.0,0.0,219,5543,3,4.5,...,8.0,0,0.00,0.0,0.000000,2,3,0.08,6.000000,6.000000
2,2,1,24,216,0.0,0.0,219,14142,2,3.5,...,8.0,0,0.50,0.0,10.000000,2,3,0.06,1.600000,2.400000
3,3,1,24,216,0.0,0.0,219,22393,3,4.5,...,8.0,0,0.00,0.0,0.000000,2,3,0.06,1.694915,1.694915
4,4,1,24,216,0.0,0.0,219,24194,3,4.5,...,8.0,0,0.00,0.0,0.000000,2,3,0.20,1.764706,1.764706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959178,4959178,332787,24,216,0.0,0.0,117,32019,4,3.5,...,8.0,0,0.50,0.0,49.500000,5,2,0.06,0.000000,0.000000
4959179,4959179,332787,24,216,0.0,0.0,117,33959,4,3.0,...,8.0,0,0.25,0.0,19.000000,5,2,0.30,11.111111,11.111111
4959180,4959180,332787,24,216,0.0,0.0,117,35240,4,0.0,...,8.0,0,0.25,0.0,24.666667,5,2,0.00,0.000000,0.000000
4959181,4959181,332787,24,216,0.0,0.0,117,94437,4,0.0,...,8.0,0,0.00,0.0,27.500000,5,2,0.09,0.000000,0.000000


## Hele dateset:

In [17]:
'''
drop cols we don't want to use for the classifier
'''

print(len(train.columns), 'cols left')
print('-----------------------')


booking_bool = train['booking_bool'].to_numpy()
test_ids = test['srch_id'].to_numpy()

train_copy = train
for colname in train.columns:
    if colname not in test.columns:
        print('dropping', colname)
        train_copy = train_copy.drop(colname, axis=1)


print(len(train_copy.columns), 'cols left')
print('-----------------------')


cols_to_be_removed = ['Unnamed: 0',
                      'srch_id',
                      'visitor_hist_starrating',
                      'visitor_hist_adr_usd',
                      'srch_query_affinity_score',
                      'comp_percent']

for colname in cols_to_be_removed:
    print('dropping', colname)
    train_copy = train_copy.drop(colname, axis=1)
    test = test.drop(colname, axis=1)
    


print('-----------------------')
print(len(train_copy.columns), 'cols left')




34 cols left
-----------------------
dropping click_bool
dropping booking_bool
dropping position
31 cols left
-----------------------
dropping Unnamed: 0
dropping srch_id
dropping visitor_hist_starrating
dropping visitor_hist_adr_usd
dropping srch_query_affinity_score
dropping comp_percent
-----------------------
25 cols left


In [None]:
clf = RandomForestClassifier().fit(train_copy, booking_bool)
predictions = clf.predict_proba(test)

# clf = LogisticRegression().fit(train_copy, booking_bool)
# predictions = clf.predict_proba(test)

boos = xgb.XGBRegressor().fit(train_copy, booking_bool)
boost_predictions = boost.predict(test)

In [None]:
test_results = pd.DataFrame()
test_results['srch_id'] = test_ids
test_results['prop_id'] = [int(x) for x in test['prop_id']] #somehow it changes the prop_id values into floats?

pred_1 = []
for i in range(len(predictions)):
    pred_1.append(predictions[i][1])
    
test_results['Probability_of_booking'] = pred_1
test_results

In [None]:
final_results = test_results.sort_values(by = ['srch_id','Probability_of_booking'], ascending=[True,False])
final_results = final_results.drop('Probability_of_booking', 1)
print(final_results)


In [None]:
final_results
final_results.to_csv('results/randomforest2.csv', index=False)