In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import gc

In [2]:
train = r"VU_DM_data/training_set_VU_DM.csv"
df_train = pd.read_csv(train)

# Complete Preprocessing & Engineering of Training

## 1) Imputing values for features with < 50 % missing values (orig_destination_distance, prop_location_score2, prop_review_score)
#### All values with > 50 % null/NAN will be excluded from modelling

In [7]:
#Gets score from the 25th quantile for each location
loc_score_q = df_train.groupby("prop_country_id")["prop_location_score2"].quantile(q=0.25)

In [8]:
df_train.reset_index(inplace=True)

In [9]:
df_train.sort_values(by = 'prop_country_id', ascending = False, inplace = True)
df_train.head()

Unnamed: 0,index,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
3882746,3882746,260622,2012-11-30 14:05:55,5,219,,,230,7093,2,...,,,,,,,,0,,0
4538553,4538553,304468,2013-04-18 15:45:51,5,219,,,230,31239,4,...,,,,,,,,0,,0
4538551,4538551,304468,2013-04-18 15:45:51,5,219,,,230,10722,3,...,,,,,,,,0,,0
4538550,4538550,304468,2013-04-18 15:45:51,5,219,,,230,5218,4,...,,,,,,,,0,,0
4538549,4538549,304468,2013-04-18 15:45:51,5,219,,,230,1626,3,...,,,,,,,,0,,0


In [10]:
loc_score_q = pd.DataFrame(loc_score_q)

In [11]:
loc_score_q.sort_values(by = 'prop_country_id', ascending = False, inplace = True)
loc_score_q.reset_index(inplace = True)

In [12]:
loc_score_q.head()

Unnamed: 0,prop_country_id,prop_location_score2
0,230,0.0251
1,229,0.0641
2,226,0.0804
3,225,0.013
4,224,0.0561


In [13]:
new_df = pd.DataFrame()
props = df_train['prop_country_id'].unique()

In [14]:
for item in props:
    df = df_train[df_train['prop_country_id'] == item][['index','prop_country_id','prop_location_score2']]
    df['QUANT_prop_location_score2'] = float(loc_score_q[loc_score_q['prop_country_id'] == item]['prop_location_score2'])
    new_df = pd.concat([new_df,df])
    print(item)

230
229
226
225
224
223
222
221
220
219
217
216
215
214
213
212
211
208
207
206
205
204
203
202
201
200
197
196
195
194
192
190
189
187
186
185
184
182
181
180
179
178
177
173
172
171
169
167
165
164
163
162
161
160
159
158
157
156
155
154
153
152
151
149
146
145
143
141
140
138
137
136
135
134
132
131
129
128
127
126
125
124
123
122
120
119
117
113
112
111
109
108
106
103
102
100
99
98
97
96
93
92
91
90
89
88
87
86
83
82
81
80
79
77
76
74
73
72
71
70
69
68
65
63
62
61
60
59
57
56
55
53
51
50
48
47
46
45
44
42
41
40
39
38
37
35
34
33
32
31
30
27
26
25
23
22
21
20
19
18
17
16
15
14
13
11
10
9
7
4
2
1


In [15]:
new_df["prop_location_score2"] = new_df["prop_location_score2"].fillna(new_df["QUANT_prop_location_score2"])

In [16]:
new_df["QUANT_prop_location_score2"].isnull().sum()

2

In [17]:
new_df.fillna(-1, inplace = True)

In [18]:
new_df.isnull().sum()

index                         0
prop_country_id               0
prop_location_score2          0
QUANT_prop_location_score2    0
dtype: int64

In [19]:
new_df.sort_values(by = 'index', ascending = True, inplace = True)
df_train.sort_values(by = 'index', ascending = True, inplace = True)

In [20]:
df_train['prop_location_score2'] = new_df["QUANT_prop_location_score2"]

In [21]:
df_train.drop('index', axis = 1, inplace = True)

In [22]:
df_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


In [23]:
### prop_location_score2 imputation finished

## Removing unimportant features 

In [24]:
#Remove all competitor information

remove = []
for i in range(1,9):
  remove += ["comp%d_rate" % i, "comp%d_inv" % i, "comp%d_rate_percent_diff" % i]


#Remove other features with high % missing values
remove += ["srch_query_affinity_score","visitor_hist_adr_usd","visitor_hist_starrating"]

#Remove un-useful data
remove+= ['date_time','site_id']

#Remove from dataframe:
print("Removed labels:\n")
for label in remove:
  print(label)
  df_train.drop(label, axis = 1, inplace = True)

Removed labels:

comp1_rate
comp1_inv
comp1_rate_percent_diff
comp2_rate
comp2_inv
comp2_rate_percent_diff
comp3_rate
comp3_inv
comp3_rate_percent_diff
comp4_rate
comp4_inv
comp4_rate_percent_diff
comp5_rate
comp5_inv
comp5_rate_percent_diff
comp6_rate
comp6_inv
comp6_rate_percent_diff
comp7_rate
comp7_inv
comp7_rate_percent_diff
comp8_rate
comp8_inv
comp8_rate_percent_diff
srch_query_affinity_score
visitor_hist_adr_usd
visitor_hist_starrating
date_time
site_id


In [25]:
df_train.columns

Index(['srch_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'orig_destination_distance', 'random_bool',
       'click_bool', 'gross_bookings_usd', 'booking_bool'],
      dtype='object')

## Imputing Remaining Missing Values

In [26]:
df_train["prop_review_score"].fillna(df_train["prop_review_score"].median(), inplace = True)

In [27]:
df_train["orig_destination_distance"].fillna(df_train["orig_destination_distance"].median(), inplace = True)

## Frequency Encoding

In [28]:
freq1 = df_train['visitor_location_country_id'].value_counts()
freq1 = np.log(freq1)
df_train['visitor_location_country_id'] = df_train['visitor_location_country_id'].apply(lambda x : freq1[x])

In [29]:
freq2 = df_train['prop_country_id'].value_counts()
freq2 = np.log(freq2)
df_train['prop_country_id'] = df_train['prop_country_id'].apply(lambda x : freq2[x])

In [30]:
#Remove gross_bookings_usd
df_train.drop('gross_bookings_usd',axis = 1, inplace = True)

# Norm value by group type Engineering

In [31]:
def normalize_features(input_df, group_key, target_column, take_log10=False):

    # for numerical stability
    epsilon = 1e-4
    if take_log10:
        input_df[target_column] = np.log10(input_df[target_column] + epsilon)
    methods = ["mean", "std"]

    df = input_df.groupby(group_key).agg({target_column: methods})

    df.columns = df.columns.droplevel()
    col = {}
    for method in methods:
        col[method] = target_column + "_" + method

    df.rename(columns=col, inplace=True)
    df_merge = input_df.merge(df.reset_index(), on=group_key)
    df_merge[target_column + "_norm_by_" + group_key] = (
        df_merge[target_column] - df_merge[target_column + "_mean"]
    ) / df_merge[target_column + "_std"]
    df_merge = df_merge.drop(labels=[col["mean"], col["std"]], axis=1)

    gc.collect()
    return df_merge

In [32]:
df_train = normalize_features(df_train,'srch_id','price_usd',take_log10=True)
df_train = normalize_features(df_train, group_key="prop_id", target_column="price_usd")
df_train  = normalize_features(df_train, group_key="srch_id", target_column="prop_starrating")

In [33]:
df_train = normalize_features(
        df_train, group_key="srch_id", target_column="prop_location_score1"
    )
df_train = normalize_features(
        df_train, group_key="srch_id", target_column="prop_review_score"
    )

In [34]:
gc.collect()

20

# Estimated Position engineering

In [35]:
#Dictionary of srch_destination_id : prop_id pairs with estimated position
#This will be applied to the test data to create a new estimated_position feature
srch_id_dest_id_dict = df_train.loc[df_train["random_bool"] == 0]
srch_id_dest_id_dict = df_train.groupby(["srch_destination_id", "prop_id"]).agg(
        {"position": "mean"})

In [36]:
def input_estimated_position(training_data, srch_id_dest_id_dict):
    training_data = training_data.merge(
        srch_id_dest_id_dict, how="left", on=["srch_destination_id", "prop_id"]
    )
    print(training_data.head())
    return training_data

In [37]:
srch_id_dest_id_dict = srch_id_dest_id_dict.rename(
        index=str, columns={"position": "estimated_position"}
    ).reset_index()
srch_id_dest_id_dict["srch_destination_id"] = (
        srch_id_dest_id_dict["srch_destination_id"].astype(str).astype(int)
    )
srch_id_dest_id_dict["prop_id"] = (
        srch_id_dest_id_dict["prop_id"].astype(str).astype(int)
    )
srch_id_dest_id_dict["estimated_position"] = (
        1 / srch_id_dest_id_dict["estimated_position"]
    )
df_train = input_estimated_position(df_train, srch_id_dest_id_dict)

   srch_id  visitor_location_country_id  prop_country_id  prop_id  \
0        1                      7.26543        14.924114      893   
1        1                      7.26543        14.924114    10404   
2        1                      7.26543        14.924114    21315   
3        1                      7.26543        14.924114    27348   
4        1                      7.26543        14.924114    29604   

   prop_starrating  prop_review_score  prop_brand_bool  prop_location_score1  \
0                3                3.5                1                  2.83   
1                4                4.0                1                  2.20   
2                3                4.5                1                  2.20   
3                2                4.0                1                  2.83   
4                4                3.5                1                  2.64   

   prop_location_score2  prop_log_historical_price  ...  \
0                0.0182                      

In [38]:
df_train.head()

Unnamed: 0,srch_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,...,orig_destination_distance,random_bool,click_bool,booking_bool,price_usd_norm_by_srch_id,price_usd_norm_by_prop_id,prop_starrating_norm_by_srch_id,prop_location_score1_norm_by_srch_id,prop_review_score_norm_by_srch_id,estimated_position
0,1,7.26543,14.924114,893,3,3.5,1,2.83,0.0182,4.95,...,386.6,1,0,0,-0.90623,-0.762514,-0.093205,1.022407,0.016094,0.037809
1,1,7.26543,14.924114,10404,4,4.0,1,2.2,0.0182,5.03,...,386.6,1,0,0,0.345195,0.836064,1.211671,-0.192089,0.466718,0.042151
2,1,7.26543,14.924114,21315,3,4.5,1,2.2,0.0182,4.92,...,386.6,1,0,0,0.477681,0.565701,-0.093205,-0.192089,0.917342,0.044553
3,1,7.26543,14.924114,27348,2,4.0,1,2.83,0.0182,4.39,...,386.6,1,0,0,3.577428,7.967538,-1.398082,1.022407,0.466718,0.033579
4,1,7.26543,14.924114,29604,4,3.5,1,2.64,0.0182,4.93,...,386.6,1,0,0,-0.098745,0.714899,1.211671,0.65613,0.016094,0.132398


In [39]:
df_train.columns

Index(['srch_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'orig_destination_distance', 'random_bool',
       'click_bool', 'booking_bool', 'price_usd_norm_by_srch_id',
       'price_usd_norm_by_prop_id', 'prop_starrating_norm_by_srch_id',
       'prop_location_score1_norm_by_srch_id',
       'prop_review_score_norm_by_srch_id', 'estimated_position'],
      dtype='object')

In [40]:
df_train.drop('position', axis = 1, inplace = True)

In [41]:
df_train.isnull().sum()

srch_id                                     0
visitor_location_country_id                 0
prop_country_id                             0
prop_id                                     0
prop_starrating                             0
prop_review_score                           0
prop_brand_bool                             0
prop_location_score1                        0
prop_location_score2                        0
prop_log_historical_price                   0
price_usd                                   0
promotion_flag                              0
srch_destination_id                         0
srch_length_of_stay                         0
srch_booking_window                         0
srch_adults_count                           0
srch_children_count                         0
srch_room_count                             0
srch_saturday_night_bool                    0
orig_destination_distance                   0
random_bool                                 0
click_bool                        

In [42]:
df_train = df_train.fillna(-1)

In [43]:
for item in df_train.columns:
    print(item, np.isinf(df_train[item]).sum())

srch_id 0
visitor_location_country_id 0
prop_country_id 0
prop_id 0
prop_starrating 0
prop_review_score 0
prop_brand_bool 0
prop_location_score1 0
prop_location_score2 0
prop_log_historical_price 0
price_usd 0
promotion_flag 0
srch_destination_id 0
srch_length_of_stay 0
srch_booking_window 0
srch_adults_count 0
srch_children_count 0
srch_room_count 0
srch_saturday_night_bool 0
orig_destination_distance 0
random_bool 0
click_bool 0
booking_bool 0
price_usd_norm_by_srch_id 0
price_usd_norm_by_prop_id 1610
prop_starrating_norm_by_srch_id 0
prop_location_score1_norm_by_srch_id 414
prop_review_score_norm_by_srch_id 0
estimated_position 0


In [44]:
df_train.replace([np.inf, -np.inf], np.nan, inplace=True)
df_train = df_train.fillna(-1)

# Apply all the same processing to Test Data

In [45]:
# Complete Preprocessing & Engineering of Training
test = r"VU_DM_data/test_set_VU_DM.csv"
df_test = pd.read_csv(test)


In [46]:
#Gets score from the 25th quantile for each location
loc_score_q = df_test.groupby("prop_country_id")["prop_location_score2"].quantile(q=0.25)

In [47]:
df_test.reset_index(inplace=True)

In [48]:
df_test.sort_values(by = 'prop_country_id', ascending = False, inplace = True)
df_test.head()

Unnamed: 0,index,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,...,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff
2152986,2152986,144912,2013-01-02 19:55:57,5,219,,,230,136160,3,...,,,,,,,,,,
1114461,1114461,74788,2013-03-30 00:42:48,5,219,,,230,19539,3,...,,,,,,,,,,
1114463,1114463,74788,2013-03-30 00:42:48,5,219,,,230,53173,4,...,,,,,,,,,,
1114464,1114464,74788,2013-03-30 00:42:48,5,219,,,230,61354,3,...,,,,,,,,,,
1114465,1114465,74788,2013-03-30 00:42:48,5,219,,,230,78477,3,...,,,,,,,,,,


In [49]:
loc_score_q = pd.DataFrame(loc_score_q)

In [50]:
loc_score_q.sort_values(by = 'prop_country_id', ascending = False, inplace = True)
loc_score_q.reset_index(inplace = True)

In [51]:
new_df = pd.DataFrame()
props = df_test['prop_country_id'].unique()

In [52]:
for item in props:
    df = df_test[df_test['prop_country_id'] == item][['index','prop_country_id','prop_location_score2']]
    df['QUANT_prop_location_score2'] = float(loc_score_q[loc_score_q['prop_country_id'] == item]['prop_location_score2'])
    new_df = pd.concat([new_df,df])
    print(item)

230
229
226
225
224
223
222
221
220
219
217
216
215
214
213
212
211
208
207
206
205
204
203
202
201
200
197
196
195
194
192
190
189
187
186
185
184
182
181
180
179
178
177
173
172
171
169
167
165
164
163
162
161
160
159
158
157
156
155
154
153
152
151
149
146
145
143
141
140
138
137
136
135
134
132
131
129
128
127
126
125
124
123
122
120
119
117
113
112
109
108
106
103
102
100
99
98
97
96
93
92
91
90
89
88
86
83
82
81
80
77
76
74
73
72
71
70
69
68
65
63
62
61
60
59
57
56
55
53
51
50
48
47
46
45
44
42
41
40
39
38
37
35
34
33
32
31
30
27
26
25
23
22
21
20
18
17
16
15
14
13
11
10
9
4
2
1


In [53]:
new_df["prop_location_score2"] = new_df["prop_location_score2"].fillna(new_df["QUANT_prop_location_score2"])

In [54]:
new_df["QUANT_prop_location_score2"].isnull().sum()

5

In [55]:
new_df.fillna(-1, inplace = True)

In [56]:
new_df.isnull().sum()

index                         0
prop_country_id               0
prop_location_score2          0
QUANT_prop_location_score2    0
dtype: int64

In [57]:
new_df.sort_values(by = 'index', ascending = True, inplace = True)
df_test.sort_values(by = 'index', ascending = True, inplace = True)

In [58]:
df_test['prop_location_score2'] = new_df["QUANT_prop_location_score2"]

In [59]:
df_test.drop('index', axis = 1, inplace = True)

In [60]:
df_test.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff
0,1,2013-02-02 15:27:40,24,216,,,219,3180,3,4.5,...,,,,,,,,,,
1,1,2013-02-02 15:27:40,24,216,,,219,5543,3,4.5,...,,,,,,,,,,
2,1,2013-02-02 15:27:40,24,216,,,219,14142,2,3.5,...,10.0,,,,,,,,,
3,1,2013-02-02 15:27:40,24,216,,,219,22393,3,4.5,...,,,,,,,,,,
4,1,2013-02-02 15:27:40,24,216,,,219,24194,3,4.5,...,,,,,,,,,,


# Removing Unimportant features

In [61]:
#Remove all competitor information

remove = []
for i in range(1,9):
  remove += ["comp%d_rate" % i, "comp%d_inv" % i, "comp%d_rate_percent_diff" % i]


#Remove other features with high % missing values
remove += ["srch_query_affinity_score","visitor_hist_adr_usd","visitor_hist_starrating"]

#Remove un-useful data
remove+= ['date_time','site_id']

#Remove from dataframe:
print("Removed labels:\n")
for label in remove:
  print(label)
  df_test.drop(label, axis = 1, inplace = True)

Removed labels:

comp1_rate
comp1_inv
comp1_rate_percent_diff
comp2_rate
comp2_inv
comp2_rate_percent_diff
comp3_rate
comp3_inv
comp3_rate_percent_diff
comp4_rate
comp4_inv
comp4_rate_percent_diff
comp5_rate
comp5_inv
comp5_rate_percent_diff
comp6_rate
comp6_inv
comp6_rate_percent_diff
comp7_rate
comp7_inv
comp7_rate_percent_diff
comp8_rate
comp8_inv
comp8_rate_percent_diff
srch_query_affinity_score
visitor_hist_adr_usd
visitor_hist_starrating
date_time
site_id


In [62]:
df_test.columns

Index(['srch_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id',
       'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'orig_destination_distance', 'random_bool'],
      dtype='object')

# Impute remaining missing values

In [63]:
df_test["prop_review_score"].fillna(df_test["prop_review_score"].median(), inplace = True)

In [64]:
df_test["orig_destination_distance"].fillna(df_test["orig_destination_distance"].median(), inplace = True)

# Frequency Encoding

In [65]:
freq1 = df_test['visitor_location_country_id'].value_counts()
freq1 = np.log(freq1)
df_test['visitor_location_country_id'] = df_test['visitor_location_country_id'].apply(lambda x : freq1[x])

In [66]:
freq2 = df_test['prop_country_id'].value_counts()
freq2 = np.log(freq2)
df_test['prop_country_id'] = df_test['prop_country_id'].apply(lambda x : freq2[x])

# Norm values by group type engineering

In [67]:
df_test = normalize_features(df_test,'srch_id','price_usd',take_log10=True)
df_test = normalize_features(df_test, group_key="prop_id", target_column="price_usd")
df_test  = normalize_features(df_test, group_key="srch_id", target_column="prop_starrating")

In [68]:
df_test = normalize_features(
        df_test, group_key="srch_id", target_column="prop_location_score1"
    )
df_test = normalize_features(
        df_test, group_key="srch_id", target_column="prop_review_score"
    )

In [69]:
gc.collect()

20

# Estimated position dict imputation to test data

In [70]:
df_test = input_estimated_position(df_test, srch_id_dest_id_dict)

   srch_id  visitor_location_country_id  prop_country_id  prop_id  \
0        1                    12.288496        14.921711     3180   
1        1                    12.288496        14.921711     5543   
2        1                    12.288496        14.921711    14142   
3        1                    12.288496        14.921711    22393   
4        1                    12.288496        14.921711    24194   

   prop_starrating  prop_review_score  prop_brand_bool  prop_location_score1  \
0                3                4.5                1                  2.94   
1                3                4.5                1                  2.64   
2                2                3.5                1                  2.71   
3                3                4.5                1                  2.40   
4                3                4.5                1                  2.94   

   prop_location_score2  prop_log_historical_price  ...  srch_room_count  \
0                0.0182     

In [71]:
df_test.isnull().sum()

srch_id                                      0
visitor_location_country_id                  0
prop_country_id                              0
prop_id                                      0
prop_starrating                              0
prop_review_score                            0
prop_brand_bool                              0
prop_location_score1                         0
prop_location_score2                         0
prop_log_historical_price                    0
price_usd                                    0
promotion_flag                               0
srch_destination_id                          0
srch_length_of_stay                          0
srch_booking_window                          0
srch_adults_count                            0
srch_children_count                          0
srch_room_count                              0
srch_saturday_night_bool                     0
orig_destination_distance                    0
random_bool                                  0
price_usd_nor

In [72]:
df_test['estimated_position'].fillna(0.0575, inplace = True)

In [73]:
df_test = df_test.fillna(0)

In [74]:
for item in df_test.columns:
    print(item, np.isinf(df_test[item]).sum())

srch_id 0
visitor_location_country_id 0
prop_country_id 0
prop_id 0
prop_starrating 0
prop_review_score 0
prop_brand_bool 0
prop_location_score1 0
prop_location_score2 0
prop_log_historical_price 0
price_usd 0
promotion_flag 0
srch_destination_id 0
srch_length_of_stay 0
srch_booking_window 0
srch_adults_count 0
srch_children_count 0
srch_room_count 0
srch_saturday_night_bool 0
orig_destination_distance 0
random_bool 0
price_usd_norm_by_srch_id 0
price_usd_norm_by_prop_id 1403
prop_starrating_norm_by_srch_id 0
prop_location_score1_norm_by_srch_id 332
prop_review_score_norm_by_srch_id 0
estimated_position 0


In [84]:
df_test.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test.fillna(0, inplace = True)

In [76]:
print(len(df_test.columns),len(df_train.drop(['click_bool', 'booking_bool'],axis = 1).columns))
#df_test.drop('random_bool', axis = 1, inplace = True)

for item1, item2 in zip(df_test.columns,df_train.drop(['click_bool', 'booking_bool'],axis = 1).columns):
    m = 0
    if item1 != item2:
        m = 'MISMATCH'
    print(m, item1,'____',item2)

27 27
0 srch_id ____ srch_id
0 visitor_location_country_id ____ visitor_location_country_id
0 prop_country_id ____ prop_country_id
0 prop_id ____ prop_id
0 prop_starrating ____ prop_starrating
0 prop_review_score ____ prop_review_score
0 prop_brand_bool ____ prop_brand_bool
0 prop_location_score1 ____ prop_location_score1
0 prop_location_score2 ____ prop_location_score2
0 prop_log_historical_price ____ prop_log_historical_price
0 price_usd ____ price_usd
0 promotion_flag ____ promotion_flag
0 srch_destination_id ____ srch_destination_id
0 srch_length_of_stay ____ srch_length_of_stay
0 srch_booking_window ____ srch_booking_window
0 srch_adults_count ____ srch_adults_count
0 srch_children_count ____ srch_children_count
0 srch_room_count ____ srch_room_count
0 srch_saturday_night_bool ____ srch_saturday_night_bool
0 orig_destination_distance ____ orig_destination_distance
0 random_bool ____ random_bool
0 price_usd_norm_by_srch_id ____ price_usd_norm_by_srch_id
0 price_usd_norm_by_prop_id 

# Save to Training & Test csv files

In [77]:
df_train.to_csv('/Users/jerenolsen/desktop/Train_Final.csv',index=False)

In [87]:
df_test.to_csv('/Users/jerenolsen/desktop/Test_Final.csv',index=False)