# Introduction: Setting up
---

In [1]:
# Imports
import sklearn as sk
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import Lasso, RidgeClassifier, LogisticRegression
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC, SVC

Config parameters:

nrows: default dataframelength: 4958347

classifiers: (SVC, linearSVC, RidgeClassifier)
    - SVC params: C (0-1), kernel (rbf, linear, poly), max_iter (-1, int), random_state (int)
    - linearSVC params: C (0-1), penalty (l1, l2), max_iter (1000, int), random_state (int) 
    - RidgeClassifier params: max_iter (1000, int), random_state (int) 
    
feature_selection: (SelectFromModel, SelectKBest, RFE)
    - SelectFromModel params: threshold (0-int), max_features (0-int)
    - SelectKBest params: threshold (0-int), k (0-int)
    - RFE params: n_features_to_select (0-int), step

In [2]:
import importlib
import config
importlib.reload(config)
from config import Config

# Config Settings
config = Config(
    nrows=None,
    pre_feature_selection=False, #todo Bug in prefeature selection = False
    train_data_subset=0.8,
    classifier=RidgeClassifier,
    classifier_dict={'max_iter' : 1000, 'random_state' : 2},
    feature_selection=RFE,
    feature_selection_dict={'n_features_to_select' : 5, 'step' : 1} 
)

In [3]:
if config.nrows is not None:
    train_data = pd.read_csv('data/training_set_VU_DM.csv', nrows=config.nrows)
else:
    train_data = pd.read_csv('data/training_set_VU_DM.csv')
original_columns = train_data.columns
train_data.head(5) # Show top 5
train_data_nans = train_data

In [4]:
# #remove columns with over 50% nans
# for column in train_data_nans.columns:
#     if train_data_nans[column].isnull().sum()/len(train_data_nans) > 0.5:
#         train_data_nans = train_data_nans.drop(columns=column, axis=1)
        
# train_data_nans.isnull().sum()/len(train_data_nans)
# #remove data with > 0.50 nans

In [5]:
#fill in nans with mean values:
# na_cols = train_data_nans.isna().any()
# nan_cols = train_data_nans.columns[na_cols]
# for column in nan_cols:
#     print (column)
#     if column in ['visitor_hist_starrating', 'visitor_hist_adr_usd',  
#                      'srch_length_of_stay', 'srch_booking_window', 
#                      'srch_adults_count', 'srch_children_count',
#                      'srch_room_count'                      
#                     ]:
#         train_data_nans[column] = train_data_nans.groupby('srch_id').transform(lambda x: x.fillna(x.mean()))
#     elif column in ['prop_starrating', 'prop_review_score', 
#                        'prop_location_score1', 'prop_location_score2', 
#                        'prop_log_historical_price', 'price_usd',
#                        'search_', 'orig_destination_distance',  
#                        'srch_query_affinity_score'
#                       ]:
#         train_data_nans[column] = train_data_nans.groupby('prop_id').transform(lambda x: x.fillna(x.mean()))

    

# train_data_nans.isnull().sum()/len(train_data_nans)




In [6]:
print (train_data_nans.head(5))

   srch_id            date_time  site_id  visitor_location_country_id  \
0        1  2013-04-04 08:32:15       12                          187   
1        1  2013-04-04 08:32:15       12                          187   
2        1  2013-04-04 08:32:15       12                          187   
3        1  2013-04-04 08:32:15       12                          187   
4        1  2013-04-04 08:32:15       12                          187   

   visitor_hist_starrating  visitor_hist_adr_usd  prop_country_id  prop_id  \
0                      NaN                   NaN              219      893   
1                      NaN                   NaN              219    10404   
2                      NaN                   NaN              219    21315   
3                      NaN                   NaN              219    27348   
4                      NaN                   NaN              219    29604   

   prop_starrating  prop_review_score  ...  comp6_rate_percent_diff  \
0                3   

# Manual Column exploration
---
## Main columns
- `search_id` seems to represent each individual 'user'.
- `booking_bool` is essentially the answer.

## Categorical features
The following features are categorical (to be onehot-encoded):

User-specific
- `site_id`: category of website Expedia used
- `visitor_location_country_id`: categories of which country user is from
- `srch_destination_id`: where did the user search from
- `srch_saturday_night_bool`: boolean if stay includes staturday

Hotel-specific:
- `prop_id`: categories of associated hotels
- `prop_brand_bool`: boolean if hotel is part of chain or not
- `promotion_flag`: displaying promotion or not

Expedia-specific vs competitors 1_8:
- `comp{i}_rate`: if expedia has a lower price, do +1, 0 if same, -1 price is higher, null if no competitive data
- `comp{i}_inv`: if competitor has no availability, +1, 0 if both have availability, null if no competitive data

## Numerical features

User-specific
- `visitor_hist_starrating`: average of previous stars of associated user
- `visitor_hist_adr_usd`: average price per night of hotels of associated user
- `srch_length_of_stay`: number of nights stays **searched** 
- `srch_booking_window`: number of days ahead the start of booking window **searched**
- `srch_adults_count`: number of adults **searched**
- `srch_children_count`: number of children **searched**
- `srch_room_count`: number of rooms **searched**
- `random_bool`: if sort was random at time of search
- `gross_booking_usd`: ❗Training-only❗ payment includign taxes, etc for hotel

Hotel-specific
- `prop_starrating`: star rating of hotel (1-5)
- `prop_review_score`: average review score of hotel (1-5)
- `prop_location_score_1`: score1 of hotel's location desirability
- `prop_location_score_2`: score2 of hotel's location desirability
- `prop_log_historical_price`: logarithm of average price of hotel lately (0 == not sold)
- `price_usd`: displayed price of hotel.
    - ❗ Important: Different countries have different conventions.
    - Value can change per night
- `srch_query_affinity_score`: log probability a hotel is clicked in internet searches

User-hotel coupled:
- `orig_destination_distance`: distance between hotel and customer at search-time (null means no distance calculated)

Expedia-specific vs competitors 1_8:
- `comp{i}_rate_percent_diff`: absolute difference between expedia and competitor's price, with null being no competitive data


## Unknown type
- `date_time`

# Feature Preprocessing
---

## Data cleanup: Imputing missing values

In [7]:
# We will have to cleanup our data next up. Let's first impute the missing columns. 
# To do this we search for the columns with nans
na_cols = train_data.isna().any()
nan_cols = train_data.columns[na_cols]
nan_cols

Index(['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_review_score',
       'prop_location_score2', 'srch_query_affinity_score',
       'orig_destination_distance', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff', 'gross_bookings_usd'],
      dtype='object')

Aside from `comp{i}_rate` and `comp2_inv`, all of these columns are numerical features. We could, initially,
simply replace all these values with -1 for the moment.

❗ Important: Note, this is actually incorrect, but might work for the moment.

In [8]:
# Simple numerical impute: select numerical data, fill it with -1
imputed_numerical_data = train_data[nan_cols].filter(regex='[^comp\d_(rate|inv)$]')
imputed_numerical_data = imputed_numerical_data.fillna(-1)
train_data.update(imputed_numerical_data)

# Manual cleanup to ensure no problem with space
del imputed_numerical_data
train_data.head(5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,893,3,3.5,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0
1,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,10404,4,4.0,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0
2,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,21315,3,4.5,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0
3,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,27348,2,4.0,...,-1.0,,,-1.0,-1.0,0.0,5.0,0,-1.0,0
4,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,29604,4,3.5,...,-1.0,,,-1.0,0.0,0.0,-1.0,0,-1.0,0


In [9]:
# Simple naive categorical impute
na_cols = train_data.columns[train_data.isna().any()]
imputed_categorical_data = train_data[na_cols].fillna(-2)
train_data.update(imputed_categorical_data)

# Cleanup
del imputed_categorical_data
train_data.head(5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,893,3,3.5,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0
1,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,10404,4,4.0,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0
2,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,21315,3,4.5,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0
3,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,27348,2,4.0,...,-1.0,-2.0,-2.0,-1.0,-1.0,0.0,5.0,0,-1.0,0
4,1,2013-04-04 08:32:15,12,187,-1.0,-1.0,219,29604,4,3.5,...,-1.0,-2.0,-2.0,-1.0,0.0,0.0,-1.0,0,-1.0,0


## Feature encoding

In [10]:
# Imports for feature transformation
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [11]:
import numpy as np
# Here we definehow we would like to encode

# For One-Hot Encoding
# Onehot encode the categorical variables
oh_columns = ['site_id', 'visitor_location_country_id', 'prop_country_id', 
              'prop_id', 'prop_brand_bool', 'promotion_flag', 
              'srch_destination_id', 'srch_saturday_night_bool', 'random_bool', 'click_bool'
             ]
oh_impute = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-2)
oh_encoder = OneHotEncoder(handle_unknown='ignore')
oh_pipeline = Pipeline([
    ('impute', oh_impute),
    ('encode', oh_encoder)
])
# TODO: competitor columns
for column in oh_columns:
    train_data[column]=train_data[column].astype('category')


# Encode the numerical values
num_scale_columns = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 
                     'prop_starrating', 'prop_review_score', 
                     'prop_location_score1', 'prop_location_score2', 
                     'prop_log_historical_price', 'price_usd', 
                     'srch_length_of_stay', 'srch_booking_window', 
                     'srch_adults_count', 'srch_children_count',
                     'srch_room_count', 'srch_query_affinity_score', 
                     'orig_destination_distance' 
                    ]
num_impute = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)
num_scale_encoder = StandardScaler()
num_pipeline = Pipeline([
    ('impute', num_impute),
    ('encode', num_scale_encoder)
])

In [12]:
# Manual feature-selection
# We do a preselection of columns that we feel will become useful features after encoding
if config.pre_feature_selection == True:
    chosen_columns = ['prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2', 
                      'prop_log_historical_price', 'price_usd', 'srch_query_affinity_score',  'promotion_flag']
else:
    chosen_columns = oh_columns + num_scale_columns

# Select the chosen columns, and 
# define the corresponding transformer's transformations to their columns
chosen_oh_cols = list(set(chosen_columns) & set(oh_columns))
chosen_num_cols = list(set(chosen_columns) & set(num_scale_columns))

In [13]:
chosen_train_data = train_data[chosen_columns]

df_transformer = ColumnTransformer([
    ('oh', oh_pipeline, chosen_oh_cols),
    ('num', num_pipeline, chosen_num_cols),
], remainder='drop')

# We fit this transformer on our training data, and transform/encode our training data
encoded_X = df_transformer.fit_transform(chosen_train_data)

# We also represent this same X using the original columns.
new_oh_columns = df_transformer.named_transformers_.oh.named_steps.encode.get_feature_names(chosen_oh_cols)
encoded_columns = [ *new_oh_columns, *chosen_num_cols]
df_encoded_X = pd.DataFrame(encoded_X, columns=encoded_columns)

ValueError: Shape of passed values is (4958347, 1), indices imply (4958347, 147681)

## Feature selection

In [None]:
# We extract the y-target in general
X_only = train_data.copy()
y = X_only.pop('booking_bool')

In [None]:
##### We apply feature selection using the model from our config
classifier = config.classifier(**config.classifier_dict)
feature_selector = config.feature_selection(classifier, **config.feature_selection_dict)
feature_encoded_X = feature_selector.fit_transform(df_encoded_X, y)
# TODO: support_ method might not work on every featureselector we choose, test this
bool_vec = feature_selector.support_
feature_cols = np.array(encoded_columns)[bool_vec]
df_feature_encoded = pd.DataFrame(feature_encoded_X, columns=feature_cols)

In [None]:
# Utility cell to investigate data elements
# Data elements we have available
# Encoded data:
    # - df_encoded_X: Dataframe that contains the preprocessed features
    # - encoded_X: numpy version of `encoded_df`
# Original data:
    # - train_data: training data, but cleaned up
    # - X_only: `train_data` without `booking_bool`

# Training a model

We will now try a various amount of models with parameters.

In [None]:
# Utility functions

# Gets the sizes of same search-id chunks.
get_user_groups_from_df = lambda df: df.groupby('srch_id').size().tolist()

In [None]:
# Reassign `srch_id`
df_feature_encoded['srch_id'] = train_data['srch_id'].astype(int)

### Learn-to-rank with LGBMRanker

If we decide to split our data into train/val, we can do it this way.

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Split data into 80% train and 20% validation, maintaining the groups however.
train_inds, val_inds = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7).split(df_feature_encoded, groups=df_feature_encoded['srch_id']))

# Split train / validation by their indices
df_X_train = df_feature_encoded.iloc[train_inds]
y_train = y[train_inds]
df_X_val = df_feature_encoded.iloc[val_inds]
y_val = y[val_inds]

# Get the groups related to `srch_id`
query_train = get_user_groups_from_df(df_X_train)
query_val = get_user_groups_from_df(df_X_val)

# Remove srch_id
df_X_train.pop('srch_id')
df_X_val.pop('srch_id')
print("Ready to rank!")

In [None]:
# We define our ranker (default parameters)
gbm = lgb.LGBMRanker()

gbm.fit(df_X_train, y_train, group=query_train,
        eval_set=[(df_X_val, y_val)], eval_group=[query_val],
        eval_at=[5, 10, 20], early_stopping_rounds=50)

In [None]:
# Save model
import os
def ensure_path(path_to_file):
    os.makedirs(os.path.dirname(path_to_file), exist_ok=True)

ensure_path('storage/best_gbm.txt')
gbm.booster_.save_model('storage/best_gbm.txt')

# Testing
---

## Testing with LGBM-Ranker

In [None]:
# Read test data, and use the same columns as was used for training
df_test_data = pd.read_csv('data/test_set_VU_DM.csv')
chosen_test_data = df_test_data[chosen_columns]

# Apply transformations (encoding + selection)
encoded_test_data = df_transformer.transform(chosen_test_data)
df_encoded_test_data = pd.DataFrame(encoded_test_data, columns=encoded_columns)

feature_cols = (list(feature_cols))
filtered_test_data = df_encoded_test_data[feature_cols]
                                    
X_test = filtered_test_data

#### Predicting on a per-group basis: slow as hell (skip section for faster method)

In [None]:
# Split test-data into groups based on the original data
groups = df_test_data.groupby('srch_id').indices
groups_by_idxs = list(groups.values())
print (groups_by_idxs)

In [None]:
# Predictions
def predict_for_group(X_test, group_idxs, df_test_data):
    # Use gbm to predict
    X_test_group = X_test[group_idxs]
    preds = gbm.predict(X_test_group)
    preds = preds.argsort()[::-1] # Reverses
    
    # Get th
    pred_idxs = group_idxs[preds]
    pred_props = df_test_data.loc[pred_idxs, ['srch_id', 'prop_id']]
    
    return pred_props

In [None]:
# Doing it on a 'per-group basis'
# Commented because it is slow.
# # Perform the prediction (Can take a while, shitton of predictions)
# result = []

# for i, idx_group in enumerate(groups_by_idxs):
#     preds = predict_for_group(X_test, idx_group, df_test_data)
#     result.append(preds)
    
#     if i % 10000 == 0:
#         print(f"Doing group {i + 1} / {len(groups_by_idxs)} now")

In [None]:
# len(result)

#### Predicting all at once: More performant

In [None]:
pred_all = gbm.predict(X_test)
df_test_data['pred'] = pred_all

In [None]:
pred_all

In [None]:
# Sort predictions based on srch_id and pred
sorted_preds = df_test_data[['srch_id', 'prop_id', 'pred']].sort_values(by=['srch_id', 'pred'], ascending=[True, False]).reset_index()

# Save
sorted_preds[['srch_id', 'prop_id']].to_csv('results.csv', index=False)

In [None]:
sorted_preds

In [None]:
pd.read_csv('results.csv', nrows=100)