# Introduction: Setting up
---

In [1]:
# Imports
import sklearn as sk
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import Lasso, RidgeClassifier, LogisticRegression
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC, SVC
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA, TruncatedSVD

Config parameters:

nrows: default dataframelength: 4958347

classifiers: (SVC, linearSVC, RidgeClassifier)
    - SVC params: C (0-1), kernel (rbf, linear, poly), max_iter (-1, int), random_state (int)
    - linearSVC params: C (0-1), penalty (l1, l2), max_iter (1000, int), random_state (int) 
    - RidgeClassifier params: max_iter (1000, int), random_state (int) 
    
feature_selection: (SelectFromModel, SelectKBest, RFE)
    - SelectFromModel params: threshold (0-int), max_features (0-int)
    - SelectKBest params: threshold (0-int), k (0-int)
    - RFE params: n_features_to_select (0-int), step

In [2]:
import importlib
import config
importlib.reload(config)
from config import Config

# Config Settings
config = Config(
    nrows=1000,
    pre_feature_selection=False,
    train_data_subset=0.8,
    classifier=SVC,
    classifier_dict={'C' : 1, 'kernel' : 'rbf', 'random_state' : 2},
    feature_selection=SelectFromModel,
    feature_selection_dict={'threshold' : 1},
    dimensionality_reduc=True,
    dimension_features=25,
    feature_engineering=True,  
    naive_imputing=True #todo faster method for averaging nan values if naive=False
)

In [3]:
if config.nrows is not None:
    train_data = pd.read_csv('data/training_set_VU_DM.csv', nrows=config.nrows)
else:
    train_data = pd.read_csv('data/training_set_VU_DM.csv')
original_columns = train_data.columns
train_data.head(5) # Show top 5

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0


# Manual Column exploration
---
## Main columns
- `search_id` seems to represent each individual 'user'.
- `booking_bool` is essentially the answer.

## Categorical features
The following features are categorical (to be onehot-encoded):

User-specific
- `site_id`: category of website Expedia used
- `visitor_location_country_id`: categories of which country user is from
- `srch_destination_id`: where did the user search from
- `srch_saturday_night_bool`: boolean if stay includes staturday

Hotel-specific:
- `prop_id`: categories of associated hotels
- `prop_brand_bool`: boolean if hotel is part of chain or not
- `promotion_flag`: displaying promotion or not

Expedia-specific vs competitors 1_8:
- `comp{i}_rate`: if expedia has a lower price, do +1, 0 if same, -1 price is higher, null if no competitive data
- `comp{i}_inv`: if competitor has no availability, +1, 0 if both have availability, null if no competitive data

## Numerical features

User-specific
- `visitor_hist_starrating`: average of previous stars of associated user
- `visitor_hist_adr_usd`: average price per night of hotels of associated user
- `srch_length_of_stay`: number of nights stays **searched** 
- `srch_booking_window`: number of days ahead the start of booking window **searched**
- `srch_adults_count`: number of adults **searched**
- `srch_children_count`: number of children **searched**
- `srch_room_count`: number of rooms **searched**
- `random_bool`: if sort was random at time of search
- `gross_booking_usd`: ❗Training-only❗ payment includign taxes, etc for hotel

Hotel-specific
- `prop_starrating`: star rating of hotel (1-5)
- `prop_review_score`: average review score of hotel (1-5)
- `prop_location_score_1`: score1 of hotel's location desirability
- `prop_location_score_2`: score2 of hotel's location desirability
- `prop_log_historical_price`: logarithm of average price of hotel lately (0 == not sold)
- `price_usd`: displayed price of hotel.
    - ❗ Important: Different countries have different conventions.
    - Value can change per night
- `srch_query_affinity_score`: log probability a hotel is clicked in internet searches

User-hotel coupled:
- `orig_destination_distance`: distance between hotel and customer at search-time (null means no distance calculated)

Expedia-specific vs competitors 1_8:
- `comp{i}_rate_percent_diff`: absolute difference between expedia and competitor's price, with null being no competitive data


## Unknown type
- `date_time`

Putting all columns except competitor columns in either categorical list or numerical list.

In [4]:
categorical_cols = ['srch_id', 'date_time', 'site_id', 'visitor_location_country_id', 'prop_country_id',
                    'prop_id', 'prop_brand_bool', 'promotion_flag', 'position',
                    'srch_destination_id', 'srch_saturday_night_bool', 'random_bool',
                    'click_bool', 'booking_bool'                  
                   ]
numerical_cols = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 
                  'prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2',
                  'prop_log_historical_price', 'price_usd', 
                  'srch_length_of_stay', 'srch_booking_window', 
                  'srch_adults_count', 'srch_children_count',
                  'srch_room_count', 'srch_query_affinity_score', 
                  'orig_destination_distance', 'gross_bookings_usd'
                  ]

# Feature Preprocessing
---

## Feature Engineering

When engineering we want to add them to either the numerical or categorical columnlist and append them to the dataframe

In [5]:
if config.feature_engineering:

    time = pd.to_datetime(train_data['date_time'])
    train_data['month']=time.dt.month
    train_data['year']=time.dt.year
    train_data['same_country_visitor_prop']=np.where(train_data['visitor_location_country_id'] == train_data['prop_country_id'],1,0)
    train_data['viable_comp']= np.where(
                      (train_data['comp1_rate']== -1)& (train_data['comp1_inv']== 0) |
                      (train_data['comp2_rate']== -1)& (train_data['comp2_inv']== 0) |
                      (train_data['comp3_rate']== -1)& (train_data['comp3_inv']== 0) |
                      (train_data['comp4_rate']== -1)& (train_data['comp4_inv']== 0) |
                      (train_data['comp5_rate']== -1)& (train_data['comp5_inv']== 0) |
                      (train_data['comp6_rate']== -1)& (train_data['comp6_inv']== 0) |
                      (train_data['comp7_rate']== -1)& (train_data['comp7_inv']== 0) |
                      (train_data['comp8_rate']== -1)& (train_data['comp8_inv']== 0) 
                      ,1,0)

    mcol=train_data.loc[:,['prop_location_score1', 'prop_location_score2']]
    train_data['prop_mean_score'] = mcol.mean(axis=1)
    categorical_engi = ['same_country_visitor_prop', 'viable_comp']
    numerical_engi = ['prop_mean_score', 'month', 'year']
    for col in categorical_engi:
        categorical_cols.append(col)
    for col in numerical_engi:
        numerical_cols.append(col)


    

In [6]:
for column in train_data.columns:
    print (column, train_data[column].dtype)

srch_id int64
date_time object
site_id int64
visitor_location_country_id int64
visitor_hist_starrating float64
visitor_hist_adr_usd float64
prop_country_id int64
prop_id int64
prop_starrating int64
prop_review_score float64
prop_brand_bool int64
prop_location_score1 float64
prop_location_score2 float64
prop_log_historical_price float64
position int64
price_usd float64
promotion_flag int64
srch_destination_id int64
srch_length_of_stay int64
srch_booking_window int64
srch_adults_count int64
srch_children_count int64
srch_room_count int64
srch_saturday_night_bool int64
srch_query_affinity_score float64
orig_destination_distance float64
random_bool int64
comp1_rate float64
comp1_inv float64
comp1_rate_percent_diff float64
comp2_rate float64
comp2_inv float64
comp2_rate_percent_diff float64
comp3_rate float64
comp3_inv float64
comp3_rate_percent_diff float64
comp4_rate float64
comp4_inv float64
comp4_rate_percent_diff float64
comp5_rate float64
comp5_inv float64
comp5_rate_percent_diff floa

If we engineer new features, we might want to remove their old columns from the dataframe and columnlists.

In [7]:
if config.feature_engineering:
    train_data = train_data.drop(columns=['date_time', 'visitor_location_country_id', 'prop_country_id', 
                             'prop_location_score1', 'prop_location_score2'])
    for i in range(8):
        train_data = train_data.drop(columns=['comp' + str(i+1) + '_rate'])
        train_data = train_data.drop(columns=['comp' + str(i+1) + '_inv'])
        train_data = train_data.drop(columns=['comp' + str(i+1) + '_rate_percent_diff'])
    categorical_to_remove = ['visitor_location_country_id', 'date_time', 'prop_country_id']
    numerical_to_remove = ['prop_location_score1', 'prop_location_score2']
    
    for col in categorical_to_remove:
        categorical_cols.remove(col)
    for col in numerical_to_remove:
        numerical_cols.remove(col)        

## Data cleanup: Imputing missing values

In [8]:
# We will have to cleanup our data next up. Let's first impute the missing columns. 
# To do this we search for the columns with nans
na_cols = train_data.isna().any()
nan_cols = train_data.columns[na_cols]
nan_cols

Index(['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_review_score',
       'srch_query_affinity_score', 'orig_destination_distance',
       'gross_bookings_usd'],
      dtype='object')

Aside from `comp{i}_rate` and `comp2_inv`, all of these columns are numerical features. We could, initially,
simply replace all these values with -1 for the moment.

❗ Important: Note, this is actually incorrect, but might work for the moment.

In [9]:
# Simple numerical impute: select numerical data, fill it with -1
if config.naive_imputing:    
    imputed_numerical_data = train_data[nan_cols].filter(regex='[^comp\d_(rate|inv)$]')
    imputed_numerical_data = imputed_numerical_data.fillna(-1)
    train_data.update(imputed_numerical_data)

    # Manual cleanup to ensure no problem with space
    del imputed_numerical_data
    train_data.head(5)

In [10]:
# Simple naive categorical impute
if config.naive_imputing:    
    na_cols = train_data.columns[train_data.isna().any()]
    imputed_categorical_data = train_data[na_cols].fillna(-2)
    train_data.update(imputed_categorical_data)

    # Cleanup
    del imputed_categorical_data
    train_data.head(5)

A second, less naive approach is to average numerical values grouped by either their hotel (prop_id) or the user (srch_id).
On top of that we would want to remove columns with over 50% null Values (refence for this?)

In [11]:
#remove columns with over 50% nans
if config.naive_imputing == False:

    for column in train_data.columns:
        if train_data[column].isnull().sum()/len(train_data) > 0.5:
            train_data = train_data.drop(columns=column, axis=1)

train_data.isnull().sum()/len(train_data)
    #remove data with > 0.50 nans

srch_id                      0.0
site_id                      0.0
visitor_hist_starrating      0.0
visitor_hist_adr_usd         0.0
prop_id                      0.0
prop_starrating              0.0
prop_review_score            0.0
prop_brand_bool              0.0
prop_log_historical_price    0.0
position                     0.0
price_usd                    0.0
promotion_flag               0.0
srch_destination_id          0.0
srch_length_of_stay          0.0
srch_booking_window          0.0
srch_adults_count            0.0
srch_children_count          0.0
srch_room_count              0.0
srch_saturday_night_bool     0.0
srch_query_affinity_score    0.0
orig_destination_distance    0.0
random_bool                  0.0
click_bool                   0.0
gross_bookings_usd           0.0
booking_bool                 0.0
month                        0.0
year                         0.0
same_country_visitor_prop    0.0
viable_comp                  0.0
prop_mean_score              0.0
dtype: flo

### slow method of averaging mean values

In [12]:
if config.naive_imputing == False:

    #fill in nans with mean values:
    na_cols = train_data.isna().any()
    nan_cols = train_data.columns[na_cols]
    for column in nan_cols:
        print (column)
        if column in ['visitor_hist_starrating', 'visitor_hist_adr_usd',  
                         'srch_length_of_stay', 'srch_booking_window', 
                         'srch_adults_count', 'srch_children_count',
                         'srch_room_count'                      
                        ]:
            train_data[column] = train_data.groupby('srch_id').transform(lambda x: x.fillna(x.mean()))
        elif column in ['prop_starrating', 'prop_review_score', 
                           'prop_location_score1', 'prop_location_score2', 
                           'prop_log_historical_price', 'price_usd',
                           'search_', 'orig_destination_distance',  
                           'srch_query_affinity_score'
                          ]:
            train_data[column] = train_data.groupby('prop_id').transform(lambda x: x.fillna(x.mean()))

    train_data.isnull().sum()/len(train_data)
    
if config.naive_imputing == True:
        # Here we definehow we would like to encode

    # For One-Hot Encoding
    # Onehot encode the categorical variables
    oh_impute = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-2)
    oh_encoder = OneHotEncoder(handle_unknown='ignore')
    oh_pipeline = Pipeline([
        ('impute', oh_impute),
        ('encode', oh_encoder)
    ])

    # Encode the numerical values
    num_impute = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)
    num_scale_encoder = StandardScaler()
    num_pipeline = Pipeline([
        ('impute', num_impute),
        ('encode', num_scale_encoder)
    ])




## Feature encoding

In [13]:
# Imports for feature transformation
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [14]:
# Manual feature-selection
# We do a preselection of columns that we feel will become useful features after encoding
if config.pre_feature_selection:
    chosen_columns = ['prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                      'prop_log_historical_price', 'price_usd', 'srch_query_affinity_score',  'promotion_flag']
    if config.feature_engineering:
        chosen_columns = ['prop_starrating', 'prop_review_score', 'prop_mean_score', 'same_country_visitor_prop'
                          'prop_log_historical_price', 'price_usd', 'srch_query_affinity_score',  'promotion_flag', 
                          'month', 'viable_comp', 'year']
else:
    chosen_columns = categorical_cols + numerical_cols

for column in categorical_cols:
    train_data[column]=train_data[column].astype('category')
    
    
chosen_columns = list(set(chosen_columns) & set(train_data.columns))
# Select the chosen columns, and 
# define the corresponding transformer's transformations to their columns
chosen_oh_cols = list(set(chosen_columns) & set(categorical_cols))
chosen_num_cols = list(set(chosen_columns) & set(numerical_cols))

In [15]:
chosen_train_data = train_data[chosen_columns]

df_transformer = ColumnTransformer([
    ('oh', oh_pipeline, chosen_oh_cols),
    ('num', num_pipeline, chosen_num_cols),
], remainder='drop')

# We fit this transformer on our training data, and transform/encode our training data
encoded_X = df_transformer.fit_transform(chosen_train_data)

# We also represent this same X using the original columns.
new_oh_columns = df_transformer.named_transformers_.oh.named_steps.encode.get_feature_names(chosen_oh_cols)
encoded_columns = [ *new_oh_columns, *chosen_num_cols]

## Feature selection

In [16]:
# We extract the y-target in general
X_only = train_data.copy()
y = X_only.pop('booking_bool')

In [17]:
##### We apply feature selection using the model from our config
if config.dimensionality_reduc:
    pca = TruncatedSVD(n_components=config.dimension_features)
    feature_encoded_X = pca.fit_transform(encoded_X)
else:
    classifier = config.classifier(**config.classifier_dict)
    feature_selector = config.feature_selection(classifier, **config.feature_selection_dict)
    feature_encoded_X = feature_selector.fit_transform(encoded_X, y)
    # TODO: support_ method might not work on every featureselector we choose, test this
    bool_vec = feature_selector.support_

In [18]:
train_data.isna().any()
# Utility cell to investigate data elements
# Data elements we have available
# Encoded data:
    # - df_encoded_X: Dataframe that contains the preprocessed features
    # - encoded_X: numpy version of `encoded_df`
# Original data:
    # - train_data: training data, but cleaned up
    # - X_only: `train_data` without `booking_bool`

srch_id                      False
site_id                      False
visitor_hist_starrating      False
visitor_hist_adr_usd         False
prop_id                      False
prop_starrating              False
prop_review_score            False
prop_brand_bool              False
prop_log_historical_price    False
position                     False
price_usd                    False
promotion_flag               False
srch_destination_id          False
srch_length_of_stay          False
srch_booking_window          False
srch_adults_count            False
srch_children_count          False
srch_room_count              False
srch_saturday_night_bool     False
srch_query_affinity_score    False
orig_destination_distance    False
random_bool                  False
click_bool                   False
gross_bookings_usd           False
booking_bool                 False
month                        False
year                         False
same_country_visitor_prop    False
viable_comp         

# Training a model

We will now try a various amount of models with parameters.

In [19]:
# Utility functions

# Gets the sizes of same search-id chunks.
get_user_groups_from_df = lambda df: df.groupby('srch_id').size().tolist()

In [20]:
# Reassign `srch_id`
# feature_encoded_X['srch_id'] = train_data['srch_id'].astype(int)
srch_id_col = train_data['srch_id'].astype(int)
srch_id_col = np.array(srch_id_col)
# srch_id_col = srch_id_col.reshape(-1,1)
# print (srch_id_col.shape)
# print(feature_encoded_X.shape)

# feature_encoded_X_2 = np.vstack((feature_encoded_X, srch_id_col))
# print (feature_encoded_X_2.shape)

### Learn-to-rank with LGBMRanker

If we decide to split our data into train/val, we can do it this way.

In [21]:
from sklearn.model_selection import GroupShuffleSplit

# feature_encoded_X = encoded_X
# Split data into 80% train and 20% validation, maintaining the groups however.
train_inds, val_inds = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7).split(feature_encoded_X, groups=srch_id_col))

# Split train / validation by their indices
df_X_train = feature_encoded_X[train_inds]
y_train = y[train_inds]
df_X_val = feature_encoded_X[val_inds]
y_val = y[val_inds]

# Get the groups related to `srch_id`
query_train = get_user_groups_from_df(train_data.iloc[train_inds])
query_val = get_user_groups_from_df(train_data.iloc[val_inds])

# Remove srch_id
# df_X_train.pop('srch_id')
# df_X_val.pop('srch_id')
print("Ready to rank!")

Ready to rank!


In [22]:
y_val

28     0
29     0
30     0
31     0
32     0
      ..
954    0
955    0
956    0
957    0
958    0
Name: booking_bool, Length: 220, dtype: category
Categories (2, int64): [0, 1]

In [23]:
for column in feature_encoded_X:
    print (column.dtype)

float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64


In [None]:
#tidreassigning datatypes of x/y train/val (bool, np.array, to_list(), int) bugs gbm.fit


In [24]:
# We define our ranker (default parameters)
gbm = lgb.LGBMRanker()

gbm.fit(df_X_train, y_train, group=query_train,
        eval_set=[(df_X_val, y_val)], eval_group=[query_val],
        eval_at=[5, 10, 20], early_stopping_rounds=50)

ValueError: Series.dtypes must be int, float or bool

In [None]:
# Save model
import os
def ensure_path(path_to_file):
    os.makedirs(os.path.dirname(path_to_file), exist_ok=True)

ensure_path('storage/best_gbm.txt')
gbm.booster_.save_model('storage/best_gbm.txt')

In [None]:
del train_data, encoded_X, feature_encoded_X, X_only

In [None]:
import gc
from guppy import hpy
gc.collect()
h=hpy()
h.heap()

# Testing
---

## Testing with LGBM-Ranker

In [None]:
# Read test data, and use the same columns as was used for training
df_test_data = pd.read_csv('data/test_set_VU_DM.csv')
chosen_test_data = df_test_data[chosen_columns]

# Apply transformations (encoding + selection)
encoded_test_data = df_transformer.transform(chosen_test_data)

if config.PCA_use:
    filtered_test_data = pca.transform(encoded_test_data)
else:
    filtered_test_data = encoded_test_data[:, bool_vec]
                                    
X_test = filtered_test_data

In [None]:
filtered_test_data

#### Predicting on a per-group basis: slow as hell (skip section for faster method)

In [None]:
# # Split test-data into groups based on the original data
# groups = df_test_data.groupby('srch_id').indices
# groups_by_idxs = list(groups.values())
# print (groups_by_idxs)

In [None]:
# Predictions
def predict_for_group(X_test, group_idxs, df_test_data):
    # Use gbm to predict
    X_test_group = X_test[group_idxs]
    preds = gbm.predict(X_test_group)
    preds = preds.argsort()[::-1] # Reverses
    
    # Get th
    pred_idxs = group_idxs[preds]
    pred_props = df_test_data.loc[pred_idxs, ['srch_id', 'prop_id']]
    
    return pred_props

In [None]:
# Doing it on a 'per-group basis'
# Commented because it is slow.
# # Perform the prediction (Can take a while, shitton of predictions)
# result = []

# for i, idx_group in enumerate(groups_by_idxs):
#     preds = predict_for_group(X_test, idx_group, df_test_data)
#     result.append(preds)
    
#     if i % 10000 == 0:
#         print(f"Doing group {i + 1} / {len(groups_by_idxs)} now")

In [None]:
# len(result)

#### Predicting all at once: More performant

In [None]:
pred_all = gbm.predict(X_test)
df_test_data['pred'] = pred_all

In [None]:
pred_all

In [None]:
# Sort predictions based on srch_id and pred
sorted_preds = df_test_data[['srch_id', 'prop_id', 'pred']].sort_values(by=['srch_id', 'pred'], ascending=[True, False]).reset_index()

# Save
sorted_preds[['srch_id', 'prop_id']].to_csv('results.csv', index=False)

In [None]:
sorted_preds

In [None]:
pd.read_csv('results.csv', nrows=100)