## Create a validation set

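We create training and validation sets by holding out 10% or 1% of all `srch_id`s as a validation set. For each split we also build balanced training sets by sampling an equal number of instances from each class (`booking_bool` 1 vs. 0): 50K per class for the 100K set and 5K per class for the 10K set.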


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure Visualization Defaults
%matplotlib inline

In [None]:
# Read the complete training CSV into memory; every split is derived from this frame
train = pd.read_csv(filepath_or_buffer='data/training_set_VU_DM.csv')

### Create a validation set from 10 percent of all search ids

In [None]:
# Hold out the searches with srch_id % 10 == 1 as the validation set.
# .copy() detaches the slice from `train` so a column can be added safely.
validation = train.loc[train['srch_id'] % 10 == 1].copy()

# Relevance label per row: max(5 * booking_bool, click_bool).
# With 0/1 flags this is 5 for a booking, 1 for a click only, 0 otherwise.
validation['rank_score'] = np.maximum(
    5 * validation['booking_bool'].to_numpy(),
    validation['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
validation = validation.sort_values(
    by=['srch_id', 'rank_score'],
    ascending=[True, False],
)

# Write the ranked validation set to disk
validation.to_csv('validation_10perc_ranked.csv', index=False)

In [None]:
# Training counterpart of the 10% split: the searches with srch_id % 10 == 2
# (a residue class disjoint from the validation one).
training = train.loc[train['srch_id'] % 10 == 2].copy()

# Relevance label per row: max(5 * booking_bool, click_bool)
training['rank_score'] = np.maximum(
    5 * training['booking_bool'].to_numpy(),
    training['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
training = training.sort_values(
    by=['srch_id', 'rank_score'],
    ascending=[True, False],
)

# Write the ranked training set to disk
training.to_csv('training_10perc_ranked.csv', index=False)

Balanced datasets

In [None]:
# Balanced training set with 100K datapoints: 50K rows with booking_bool == 1
# and 50K rows with booking_bool == 0, drawn from every search that is not in
# the 10% validation split (srch_id % 10 != 1).
training = train.loc[train['srch_id'] % 10 != 1]

# A fixed random_state makes the sampled file reproducible across notebook
# runs; without it every execution writes a different training set.
sub1 = training[training['booking_bool'] == 1].sample(50000, random_state=42)
sub0 = training[training['booking_bool'] == 0].sample(50000, random_state=42)
training = pd.concat([sub0, sub1])

# Relevance label per row: max(5 * booking_bool, click_bool)
training['rank_score'] = np.maximum(
    5 * training['booking_bool'].to_numpy(),
    training['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
training.sort_values(['srch_id', 'rank_score'], ascending=[True, False], inplace=True)

# save the training set
training.to_csv('training_10perc_ranked_balanced_100K.csv', index=False)

In [None]:
# Balanced training set with 10K datapoints: 5K rows with booking_bool == 1
# and 5K rows with booking_bool == 0, drawn from every search that is not in
# the 10% validation split (srch_id % 10 != 1).
training = train.loc[train['srch_id'] % 10 != 1]

# A fixed random_state makes the sampled file reproducible across notebook
# runs; without it every execution writes a different training set.
sub1 = training[training['booking_bool'] == 1].sample(5000, random_state=42)
sub0 = training[training['booking_bool'] == 0].sample(5000, random_state=42)
training = pd.concat([sub0, sub1])

# Relevance label per row: max(5 * booking_bool, click_bool)
training['rank_score'] = np.maximum(
    5 * training['booking_bool'].to_numpy(),
    training['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
training.sort_values(['srch_id', 'rank_score'], ascending=[True, False], inplace=True)

# save the training set
training.to_csv('training_10perc_ranked_balanced_10K.csv', index=False)

### Create a validation set from 1 percent of all search ids

In [None]:
# Hold out the searches with srch_id % 100 == 1 as the validation set.
# .copy() detaches the slice from `train` so a column can be added safely.
validation = train.loc[train['srch_id'] % 100 == 1].copy()

# Relevance label per row: max(5 * booking_bool, click_bool).
# With 0/1 flags this is 5 for a booking, 1 for a click only, 0 otherwise.
validation['rank_score'] = np.maximum(
    5 * validation['booking_bool'].to_numpy(),
    validation['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
validation = validation.sort_values(
    by=['srch_id', 'rank_score'],
    ascending=[True, False],
)

# Write the ranked validation set to disk
validation.to_csv('validation_1perc_ranked.csv', index=False)

In [None]:
# Training counterpart of the 1% split: the searches with srch_id % 100 == 2
# (a residue class disjoint from the validation one).
training = train.loc[train['srch_id'] % 100 == 2].copy()

# Relevance label per row: max(5 * booking_bool, click_bool)
training['rank_score'] = np.maximum(
    5 * training['booking_bool'].to_numpy(),
    training['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
training = training.sort_values(
    by=['srch_id', 'rank_score'],
    ascending=[True, False],
)

# Write the ranked training set to disk
training.to_csv('training_1perc_ranked.csv', index=False)

Balanced datasets

In [None]:
# Balanced training set with 100K datapoints: 50K rows with booking_bool == 1
# and 50K rows with booking_bool == 0, drawn from every search that is not in
# the 1% validation split (srch_id % 100 != 1).
training = train.loc[train['srch_id'] % 100 != 1]

# A fixed random_state makes the sampled file reproducible across notebook
# runs; without it every execution writes a different training set.
sub1 = training[training['booking_bool'] == 1].sample(50000, random_state=42)
sub0 = training[training['booking_bool'] == 0].sample(50000, random_state=42)
training = pd.concat([sub0, sub1])

# Relevance label per row: max(5 * booking_bool, click_bool)
training['rank_score'] = np.maximum(
    5 * training['booking_bool'].to_numpy(),
    training['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
training.sort_values(['srch_id', 'rank_score'], ascending=[True, False], inplace=True)

# save the training set
training.to_csv('training_1perc_ranked_balanced_100K.csv', index=False)

In [None]:
# Balanced training set with 10K datapoints: 5K rows with booking_bool == 1
# and 5K rows with booking_bool == 0, drawn from every search that is not in
# the 1% validation split (srch_id % 100 != 1).
training = train.loc[train['srch_id'] % 100 != 1]

# A fixed random_state makes the sampled file reproducible across notebook
# runs; without it every execution writes a different training set.
sub1 = training[training['booking_bool'] == 1].sample(5000, random_state=42)
sub0 = training[training['booking_bool'] == 0].sample(5000, random_state=42)
training = pd.concat([sub0, sub1])

# Relevance label per row: max(5 * booking_bool, click_bool)
training['rank_score'] = np.maximum(
    5 * training['booking_bool'].to_numpy(),
    training['click_bool'].to_numpy(),
)

# Group rows by search and, inside each search, put the highest score first
training.sort_values(['srch_id', 'rank_score'], ascending=[True, False], inplace=True)

# save the training set
training.to_csv('training_1perc_ranked_balanced_10K.csv', index=False)

### Evaluate the ranking

In [None]:
def dcgm38(sequence, k=None):
    """Return the discounted cumulative gain of the rows in *sequence*.

    Rows are scored in the order in which they appear. The row at 1-based
    position ``i`` contributes ``(2 ** rank_score - 1) / log2(i + 1)``.

    Args:
        sequence: DataFrame with a numeric ``rank_score`` column; the row
            order is the ranking being evaluated.
        k: Optional cutoff. When given, only the first ``k`` rows are scored.
            The default ``None`` scores every row, which is what the function
            has always done (NOTE(review): the "38" in the name suggests a
            cutoff of 38 was intended; confirm before relying on it).

    Returns:
        The DCG as a float; ``0.0`` for an empty frame.
    """
    # Vectorised gains: avoids the per-row Series boxing of DataFrame.iterrows
    gains = 2.0 ** sequence['rank_score'].to_numpy(dtype=float) - 1.0
    if k is not None:
        gains = gains[:k]
    # Positions are 1-based, so the discount for position i is log2(i + 1)
    discounts = np.log2(np.arange(2, gains.size + 2))
    return float(np.sum(gains / discounts))

In [None]:
# DCG of every search in `validation`, taking the rows in their current order.
# `validation` was sorted by descending rank_score within each search, so this
# is the ideal (maximum) DCG per search.
srch_ids = np.unique(validation['srch_id'])

# One groupby pass instead of a full-frame boolean mask per search id.
# sort=True yields groups in ascending srch_id order, matching `srch_ids`,
# and groupby keeps the original row order inside each group.
dcgm = np.array(
    [dcgm38(group) for _, group in validation.groupby('srch_id', sort=True)],
    dtype=float,
)

In [None]:
# Baseline: DCG of every search after randomly permuting its rank scores,
# i.e. the score a random ordering of each search's rows would obtain.
validation_shuffle = validation.copy()

srch_ids = np.unique(validation_shuffle['srch_id'])
dcgm_shuffle = np.empty(len(srch_ids))

# One groupby pass instead of a full-frame boolean mask per search id;
# sort=True yields groups in ascending srch_id order, matching `srch_ids`.
for i, (_, group) in enumerate(validation_shuffle.groupby('srch_id', sort=True)):
    # np.random.permutation returns a shuffled copy, so the array backing the
    # frame is never mutated in place (it may be a read-only view).
    shuffled_scores = np.random.permutation(group['rank_score'].to_numpy())
    # assign() builds a new frame; the previous sort_values call here was a
    # no-op because its result was discarded.
    dcgm_shuffle[i] = dcgm38(group.assign(rank_score=shuffled_scores))

In [None]:
# NDCG per search = DCG of the shuffled order / DCG of the ideal order.
# A search whose ideal DCG is 0 (no row with a positive rank_score) has an
# undefined NDCG: 0/0 would yield NaN and turn the whole mean into NaN, so
# those searches are left out of the average.
has_relevant = dcgm > 0
ndcg_shuffle = dcgm_shuffle[has_relevant] / dcgm[has_relevant]
print('mean NDCG unsorted:', np.mean(ndcg_shuffle))