In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Read the training and test CSV files into `df` and `df_test`.
# Both paths are relative to the directory the notebook is run from.
df, df_test = map(
    pd.read_csv,
    ('./dataset/training_set_VU_DM.csv', './dataset/test_set_VU_DM.csv'),
)

In [2]:
# Relevance grade per row, derived from the two interaction flags:
#   5 - the user purchased a room at this hotel
#   1 - the user clicked through to see more information on this hotel
#   0 - the user neither clicked on this hotel nor purchased a room at it
# A booking overrides a click, so booked rows end up with 5 even if
# click_bool is also 1.
df['target_label'] = (
    df['click_bool'].eq(1).astype('int64')   # 1 for clicks, 0 otherwise
    .mask(df['booking_bool'].eq(1), 5)       # bookings are upgraded to 5
)

# Dropping columns 
For starters, we can drop columns that do not provide valuable information or are missing a lot of values

- Dropping the per-company comparison attributes (`comp1_*` … `comp8_*`), since the majority of their values are missing and they seem to provide little information. (The only use that comes to mind: someone who visits Expedia may trust the brand more, so seeing it offer the more expensive option would not change their mind. That is WAY too hard to capture, so it is probably better to drop them for now.)
- date_time: dropped, since when the search took place should not matter much (it might matter in some cases, e.g. trends and seasonality, but that would also be extremely hard to exploit)
- gross_bookings_usd: dropped, since our model should not care about how much they spent on the hotel, only whether they purchased or not
- click_bool, booking_bool: transformed into the `target_label` column
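- srch_query_affinity_score: I do not understand this attribute or how it is supposed to be relevant. Will remove for now, revisit later (note: it is not in the drop lists below yet)
- srch_booking_window: irrelevant for ranking (note: it is not in the drop lists below yet)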
- prop_location_score2: missing 22%, seems very valuable, find suitable imputation method
- 'orig_destination_distance', 'srch_query_affinity_score': LOTS of missing values, Might impute later to test
- random_bool: an interesting attribute but fail to see its relevance, will revisit!

In [3]:
# Per-company comparison columns comp1_* ... comp8_*. They are generated
# instead of typed out twice, in the same order as the hand-written lists:
# rate, inv, rate_percent_diff for each company index.
comp_columns = [
    f'comp{idx}_{suffix}'
    for idx in range(1, 9)
    for suffix in ('rate', 'inv', 'rate_percent_diff')
]

# Columns dropped from the training frame only. click_bool / booking_bool
# have already been folded into target_label at this point.
train_only_columns = ['click_bool', 'booking_bool', 'gross_bookings_usd']

drop_columns_train = ['date_time', 'site_id', *comp_columns, *train_only_columns,
                      'orig_destination_distance', 'random_bool']

# The test list is derived from the training list, so the two can no longer
# drift apart when a column is added to or removed from the shared part.
drop_columns_test = [col for col in drop_columns_train if col not in train_only_columns]

# Reassign instead of dropping in place, so the cell states its own lineage.
df = df.drop(columns=drop_columns_train)
df_test = df_test.drop(columns=drop_columns_test)

# Experimenting with imputation
User history rating: the majority of the values are null. Use mean imputation to fill these values. The idea is that the majority of people will not rate too high or too low. 
Another approach besides mean imputation is to learn the distribution of the data (for example, a normal distribution for the average rating) and impute based on that, so the column would retain its normal distribution.

In [4]:
# Disabled experiment, kept for reference: mean-impute the listed columns,
# using each frame's own column mean (training and test are handled separately).
# NOTE(review): 'prop_country_id' looks like an identifier rather than a
# quantity, so its mean may not be meaningful; confirm before re-enabling.
# for column in ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_review_score', 'prop_country_id']:
#     mean_value = df[column].mean()
#     df[column].fillna(mean_value, inplace=True)
# 
# for column in ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_review_score', 'prop_country_id']:
#     mean_value = df_test[column].mean()
#     df_test[column].fillna(mean_value, inplace=True)

# Normalization

# Removing the unique identifiers

In [5]:
# Identifier columns removed from both frames before the "noIDs" snapshot is written.
unique_ids = ['visitor_location_country_id', 'srch_destination_id']

df.drop(unique_ids, axis='columns', inplace=True)
df_test.drop(unique_ids, axis='columns', inplace=True)

# Save both cleaned frames; index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf="./dataset/train_clean_v1_noIDs.csv", index=False)
df_test.to_csv(path_or_buf="./dataset/test_clean_v1_noIDs.csv", index=False)

# Aggregated features 
We will try to aggregate some of the features, since the aggregated values might be easier for the model to understand.


In [6]:
def add_aggregate_features(frame):
    """Add the aggregated price and location-score features to ``frame``.

    The columns are computed from the rows of ``frame`` only, so the training
    and test frames each use their own statistics.

    Columns added:
        MEAN_price_per_prop: mean ``price_usd`` over all rows sharing the
            row's ``prop_id``.
        SUB_price_MEAN: ``price_usd`` minus ``MEAN_price_per_prop``.
        SUB_propscore2_MEAN: ``prop_location_score2`` minus the mean
            ``prop_location_score2`` of the row's ``srch_id`` group.

    Args:
        frame: DataFrame containing ``prop_id``, ``srch_id``, ``price_usd``
            and ``prop_location_score2``. It is modified in place.

    Returns:
        The same ``frame`` object, with the three columns added.
    """
    # Price of each hotel relative to its own mean price across the whole frame.
    frame['MEAN_price_per_prop'] = frame.groupby('prop_id')['price_usd'].transform('mean')
    frame['SUB_price_MEAN'] = frame['price_usd'] - frame['MEAN_price_per_prop']

    # Disabled: the same per-query centring for the star rating.
    # mean_starrating_per_query = frame.groupby('srch_id')['prop_starrating'].transform('mean')
    # frame['SUB_starrating_MEAN'] = frame['prop_starrating'] - mean_starrating_per_query

    # Location score 2 relative to the mean of the other hotels in the same query;
    # might be more informative for the model if positives tend to be above average.
    mean_score2_per_query = frame.groupby('srch_id')['prop_location_score2'].transform('mean')
    frame['SUB_propscore2_MEAN'] = frame['prop_location_score2'] - mean_score2_per_query
    return frame


# Apply the identical feature engineering to both frames.
df = add_aggregate_features(df)
df_test = add_aggregate_features(df_test)


In [7]:
# Display the training frame to check the newly added feature columns.
df

Unnamed: 0,srch_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,target_label,MEAN_price_per_prop,SUB_price_MEAN,SUB_propscore2_MEAN
0,1,,,219,893,3,3.5,1,2.83,0.0438,...,0,4,0,1,1,,0,118.758742,-13.988742,-0.005192
1,1,,,219,10404,4,4.0,1,2.20,0.0149,...,0,4,0,1,1,,0,152.054082,18.685918,-0.034092
2,1,,,219,21315,3,4.5,1,2.20,0.0245,...,0,4,0,1,1,,0,168.540871,11.259129,-0.024492
3,1,,,219,27348,2,4.0,1,2.83,0.0125,...,0,4,0,1,1,,0,82.598870,520.171130,-0.036492
4,1,,,219,29604,4,3.5,1,2.64,0.1241,...,0,4,0,1,1,,0,137.648135,5.931865,0.075108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,,,219,77700,3,4.0,1,1.61,0.0471,...,21,3,0,1,0,,0,131.241702,-13.241702,-0.023325
4958343,332785,,,219,88083,3,4.0,1,1.95,0.1520,...,21,3,0,1,0,,0,84.545789,4.454211,0.081575
4958344,332785,,,219,94508,3,3.5,1,1.10,0.0164,...,21,3,0,1,0,,0,116.537209,-17.537209,-0.054025
4958345,332785,,,219,128360,3,5.0,1,1.95,0.0662,...,21,3,0,1,0,,5,150.336757,-11.336757,-0.004225


# Data output

In [8]:
# Write the feature-augmented training and test frames to disk;
# index=False keeps the row index out of the CSV files.
df.to_csv(path_or_buf="./dataset/train_new_feature.csv", index=False)
df_test.to_csv(path_or_buf="./dataset/test_new_feature.csv", index=False)