In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [40]:
df_train = pd.read_csv("./dataset/training_set_VU_DM.csv")
df_test = pd.read_csv("./dataset/test_set_VU_DM.csv")

# Get only the bets inv column (if any other company has better deals
inv_columns = [f'comp{i}_rate' for i in range(1, 9)]
# Get only the best rate (the lowest)
rate_columns = [f'comp{i}_rate_percent_diff' for i in range(1, 9)]

print(inv_columns)
print(rate_columns)

['comp1_rate', 'comp2_rate', 'comp3_rate', 'comp4_rate', 'comp5_rate', 'comp6_rate', 'comp7_rate', 'comp8_rate']
['comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate_percent_diff', 'comp5_rate_percent_diff', 'comp6_rate_percent_diff', 'comp7_rate_percent_diff', 'comp8_rate_percent_diff']


In [41]:
# Compute the maximum value across these columns for each row
df_train['comp_inv'] = df_train[inv_columns].max(axis=1)
df_test['comp_inv'] = df_test[inv_columns].max(axis=1)
df_train['comp_rate_percent_diff'] = df_train[rate_columns].max(axis=1)
df_test['comp_rate_percent_diff'] = df_train[rate_columns].max(axis=1)
print(df_train.head(n=10))

   srch_id            date_time  site_id  visitor_location_country_id  visitor_hist_starrating  visitor_hist_adr_usd  prop_country_id  prop_id  prop_starrating  prop_review_score  prop_brand_bool  prop_location_score1  prop_location_score2  prop_log_historical_price  position  price_usd  promotion_flag  srch_destination_id  srch_length_of_stay  srch_booking_window  srch_adults_count  srch_children_count  srch_room_count  srch_saturday_night_bool  srch_query_affinity_score  orig_destination_distance  random_bool  comp1_rate  comp1_inv  comp1_rate_percent_diff  comp2_rate  comp2_inv  comp2_rate_percent_diff  comp3_rate  comp3_inv  comp3_rate_percent_diff  comp4_rate  comp4_inv  comp4_rate_percent_diff  comp5_rate  comp5_inv  comp5_rate_percent_diff  comp6_rate  comp6_inv  comp6_rate_percent_diff  comp7_rate  comp7_inv  comp7_rate_percent_diff  comp8_rate  comp8_inv  comp8_rate_percent_diff  click_bool  gross_bookings_usd  booking_bool  comp_inv  comp_rate_percent_diff
0        1  2013-04

In [42]:

# Min Max scale
scaler = MinMaxScaler()
print(df_train["prop_starrating"].head(n=5))
print(df_train["prop_review_score"].head(n=5))
df_train['prop_starrating'] = scaler.fit_transform(df_train[['prop_starrating']])
df_test['prop_starrating'] = scaler.fit_transform(df_test[['prop_starrating']])
df_train['prop_review_score'] = scaler.fit_transform(df_train[['prop_review_score']])
df_test['prop_review_score'] = scaler.fit_transform(df_test[['prop_review_score']])
print(df_train["prop_starrating"].head(n=5))
print(df_train["prop_review_score"].head(n=5))

0    3
1    4
2    3
3    2
4    4
Name: prop_starrating, dtype: int64
0    3.5
1    4.0
2    4.5
3    4.0
4    3.5
Name: prop_review_score, dtype: float64
0    0.6
1    0.8
2    0.6
3    0.4
4    0.8
Name: prop_starrating, dtype: float64
0    0.7
1    0.8
2    0.9
3    0.8
4    0.7
Name: prop_review_score, dtype: float64


In [43]:
print(df_train["prop_location_score1"].head(n=5))
df_train['prop_location_score1'] = scaler.fit_transform(df_train[['prop_location_score1']])
df_test['prop_location_score1'] = scaler.fit_transform(df_test[['prop_location_score1']])
print(df_train["prop_location_score1"].head(n=5))

0    2.83
1    2.20
2    2.20
3    2.83
4    2.64
Name: prop_location_score1, dtype: float64
0    0.405444
1    0.315186
2    0.315186
3    0.405444
4    0.378223
Name: prop_location_score1, dtype: float64


In [44]:

print(df_train["prop_log_historical_price"].head(n=5))
df_train['prop_log_historical_price'] = scaler.fit_transform(df_train[['prop_log_historical_price']])
df_test['prop_log_historical_price'] = scaler.fit_transform(df_test[['prop_log_historical_price']])
print(df_train["prop_log_historical_price"].head(n=5))

df_train = df_train[df_test["price_usd"] < 1000000]

0    4.95
1    5.03
2    4.92
3    4.39
4    4.93
Name: prop_log_historical_price, dtype: float64
0    0.797101
1    0.809984
2    0.792271
3    0.706924
4    0.793881
Name: prop_log_historical_price, dtype: float64


  df_train = df_train[df_test["price_usd"] < 1000000]


In [45]:

# Interaction should be -1 if nothing is done, -0.6 if it's clicked, 1 if it's booked
def get_interaction(row):
    if row["booking_bool"] == 1:
        return 1
    elif row["click_bool"] == 1:
        return -0.6
    else:
        return -1
    
df_train["interaction"] = df_train.apply(get_interaction, axis=1)
print(df_train[["booking_bool", "click_bool", "interaction"]])

         booking_bool  click_bool  interaction
0                   0           0         -1.0
1                   0           0         -1.0
2                   0           0         -1.0
3                   0           0         -1.0
4                   0           0         -1.0
...               ...         ...          ...
4958342             0           0         -1.0
4958343             0           0         -1.0
4958344             0           0         -1.0
4958345             1           1          1.0
4958346             0           0         -1.0

[4958287 rows x 3 columns]


In [46]:
drop_cols = [ 'date_time', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'srch_booking_window',
    'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 
    'comp2_rate', 'comp2_inv','comp2_rate_percent_diff', 
    'comp3_rate', 'comp3_inv','comp3_rate_percent_diff', 
    'comp4_rate', 'comp4_inv','comp4_rate_percent_diff', 
    'comp5_rate', 'comp5_inv','comp5_rate_percent_diff', 
    'comp6_rate', 'comp6_inv','comp6_rate_percent_diff', 
    'comp7_rate', 'comp7_inv','comp7_rate_percent_diff', 
    'comp8_rate', 'comp8_inv','comp8_rate_percent_diff'
    ]
df_train = df_train.drop(columns=drop_cols)
df_train = df_train.drop(columns=['gross_bookings_usd', 'click_bool', 'gross_bookings_usd'])
df_test = df_test.drop(columns=drop_cols)

In [47]:
df_train.to_csv("./dataset/train_LFM.csv")
df_test.to_csv("./dataset/test_LFM.csv")