# Imports

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import lightgbm

# Loading and checking data

In [None]:
# Read only the first million rows of the training CSV into `df`.
df = pd.read_csv(
    'drive/MyDrive/training_set_VU_DM.csv',
    nrows=1_000_000,
)

In [None]:
# Release the training frame to prevent runtime crashes and restarts, but only
# once a fitted `model` exists. This cell sits above cells that still read
# `df`, so an unconditional `del df` makes "Restart & Run All" fail with a
# NameError further down; re-run this cell by hand after training instead.
# NOTE(review): frames derived from `df` later in the notebook may keep the
# underlying data alive -- confirm this actually frees the memory you expect.
if 'model' in globals() and 'df' in globals():
    del df

In [None]:
# Load the test CSV into `df_test`; no row limit is applied here.
df_test = pd.read_csv(filepath_or_buffer='drive/MyDrive/test_set_VU_DM.csv')

In [None]:
# Sanity check: column names of the test frame, then the shape of the training frame.
print(df_test.columns, df.shape, sep='\n')

Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv

# Making training and evaluation data for the model

In [None]:
# Keep only the columns that contain no missing values at all.
df = df.dropna(axis=1)
print(df.shape)

# Columns left out of the feature matrix (this includes the label column
# booking_bool, which is used as the target below).
NON_FEATURE_COLS = ['date_time', 'random_bool', 'click_bool', 'booking_bool', 'position', 'site_id']

# Target row for the train/eval cut; the actual cut is moved to a search boundary.
SPLIT_ROW = 800000


def _ranking_inputs(frame):
    """Return ``(features, labels, group_sizes)`` for one data split.

    ``group_sizes`` is the number of rows per ``srch_id`` in order of first
    appearance (``sort=False``). That matches the row order of ``frame`` as
    long as all rows of a search are contiguous, which is what the ``group``
    argument of the ranker relies on.
    """
    group_sizes = frame.groupby('srch_id', sort=False)['srch_id'].count().to_numpy()
    features = frame.drop(NON_FEATURE_COLS, axis=1)
    labels = frame['booking_bool']
    return features, labels, group_sizes


# Cutting blindly at SPLIT_ROW can put part of one search in the training set
# and the rest in the evaluation set. Push the cut forward to the end of the
# search it lands in so every srch_id group stays whole.
srch_ids = df['srch_id'].to_numpy()
split_row = min(SPLIT_ROW, len(srch_ids))
while 0 < split_row < len(srch_ids) and srch_ids[split_row] == srch_ids[split_row - 1]:
    split_row += 1

df_train = df[:split_row]
df_eval = df[split_row:]

X_train, y_train, qids_train = _ranking_inputs(df_train)
X_eval, y_eval, qids_eval = _ranking_inputs(df_eval)

(1000000, 23)


# Training a simple LightGBM model with the LambdaRank objective

In [None]:
# LambdaRank ranker; NDCG on the evaluation split is reported at each cut-off in eval_at.
model = lightgbm.LGBMRanker(objective='lambdarank', metric='ndcg')
model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_eval, y_eval)],
    eval_group=[qids_eval],
    eval_at=[5, 10, 20, 50],
    # Early stopping is requested through the callback: the
    # `early_stopping_rounds` fit argument was deprecated and then removed
    # from LightGBM's scikit-learn API, while the callback works in both
    # old and new releases.
    callbacks=[lightgbm.early_stopping(stopping_rounds=100)],
)

[1]	valid_0's ndcg@5: 0.492969	valid_0's ndcg@10: 0.540524	valid_0's ndcg@20: 0.582007	valid_0's ndcg@50: 0.60121
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's ndcg@5: 0.494726	valid_0's ndcg@10: 0.543152	valid_0's ndcg@20: 0.583886	valid_0's ndcg@50: 0.602951
[3]	valid_0's ndcg@5: 0.49785	valid_0's ndcg@10: 0.547382	valid_0's ndcg@20: 0.587057	valid_0's ndcg@50: 0.605709
[4]	valid_0's ndcg@5: 0.501605	valid_0's ndcg@10: 0.550864	valid_0's ndcg@20: 0.590038	valid_0's ndcg@50: 0.608289
[5]	valid_0's ndcg@5: 0.502322	valid_0's ndcg@10: 0.551538	valid_0's ndcg@20: 0.59035	valid_0's ndcg@50: 0.608615
[6]	valid_0's ndcg@5: 0.50541	valid_0's ndcg@10: 0.553088	valid_0's ndcg@20: 0.592502	valid_0's ndcg@50: 0.610396
[7]	valid_0's ndcg@5: 0.507229	valid_0's ndcg@10: 0.553765	valid_0's ndcg@20: 0.593329	valid_0's ndcg@50: 0.611254
[8]	valid_0's ndcg@5: 0.507806	valid_0's ndcg@10: 0.555363	valid_0's ndcg@20: 0.594544	valid_0's ndcg@50: 0.612608
[9]	valid_0's ndcg@5:

LGBMRanker(metric='ndcg', objective='lambdarank')

# Making predictions to rank prop_ids within each srch_id

In [None]:
# Drop every test column that contains a missing value.
df_test = df_test.dropna(axis=1)

# Select exactly the columns the model was fitted on, in the fitted order.
# Building X_test by dropping a hard-coded list only lines up with the training
# features when dropna() happens to remove the same columns from both files;
# asking the booster for its feature names makes the match explicit and raises
# a KeyError naming the column if a training feature is missing here.
X_test = df_test[model.booster_.feature_name()]

results = model.predict(X_test)
print(len(results))  # one score per test row; no need to build a Python list to count

4959183


## Making a new DataFrame to eventually export to CSV

In [None]:
# Pair each (prop_id, srch_id) row with its predicted score.
# assign() returns a new frame, so the column is not written into a slice of
# df_test and pandas does not emit SettingWithCopyWarning.
new_df = df_test[['prop_id', 'srch_id']].assign(predictions=results)
print(new_df.head(30))

    prop_id  srch_id  predictions
0      3180        1    -0.044488
1      5543        1    -0.194896
2     14142        1    -0.280203
3     22393        1    -0.369562
4     24194        1     0.080896
5     28181        1    -0.017819
6     34263        1     0.082528
7     37567        1    -0.070235
8     50162        1    -0.017675
9     54937        1     0.268866
10    56050        1    -0.596198
11    61632        1    -0.579347
12    61934        1     0.291902
13    63894        1    -0.116910
14    72090        1     0.100238
15    73666        1    -0.543963
16    74045        1    -0.015865
17    78599        1    -0.129128
18    82231        1    -0.105815
19    89466        1    -0.368643
20    90385        1     0.146368
21    94729        1    -0.370000
22    95031        1     0.108740
23    99484        1     0.277868
24   123675        1    -0.180872
25   128085        1    -0.069352
26   128871        1    -0.372044
27   134992        1    -0.498895
28   139162   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Order rows by search id (ascending) and, within each search, by score (descending).
sorted_df = new_df.sort_values(
    by=['srch_id', 'predictions'],
    ascending=[True, False],
)
print(sorted_df.iloc[:30])

    prop_id  srch_id  predictions
12    61934        1     0.291902
23    99484        1     0.277868
9     54937        1     0.268866
20    90385        1     0.146368
22    95031        1     0.108740
14    72090        1     0.100238
6     34263        1     0.082528
4     24194        1     0.080896
16    74045        1    -0.015865
8     50162        1    -0.017675
5     28181        1    -0.017819
0      3180        1    -0.044488
25   128085        1    -0.069352
7     37567        1    -0.070235
18    82231        1    -0.105815
13    63894        1    -0.116910
17    78599        1    -0.129128
24   123675        1    -0.180872
1      5543        1    -0.194896
2     14142        1    -0.280203
19    89466        1    -0.368643
3     22393        1    -0.369562
21    94729        1    -0.370000
26   128871        1    -0.372044
28   139162        1    -0.472094
27   134992        1    -0.498895
15    73666        1    -0.543963
11    61632        1    -0.579347
10    56050   

## Export the predictions CSV


In [None]:
# Remove the score column and put srch_id before prop_id (required format).
export_df = sorted_df.drop(columns=['predictions'])[['srch_id', 'prop_id']]

# Write the frame as out.csv inside out.zip, without the index column;
# the .zip can be submitted instead of the raw .csv, which saves some space.
compression_opts = {'method': 'zip', 'archive_name': 'out.csv'}
export_df.to_csv('out.zip', index=False, compression=compression_opts)