### Useful Resources

- [A Practical Guide to LambdaMART in LightGbm](https://medium.datadriveninvestor.com/a-practical-guide-to-lambdamart-in-lightgbm-f16a57864f6)
- [lightgbm.LGBMRanker Documentation](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#)
- [LightGBM Parameter Tuning Guide](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)
- [How To Use Optuna to Tune LightGBM Hyperparameters](https://forecastegy.com/posts/how-to-use-optuna-to-tune-lightgbm-hyperparameters/)
- [Optuna](https://optuna.readthedocs.io/en/stable/tutorial/index.html)

### Global Constants

In [1]:
# --------------- Model --------------- #
# Cut-off k for NDCG@k: passed to LightGBM as eval_at and reused for the
# manual NDCG computation on the second validation set.
K = 5    
# Number of iterations/trees for LightGBM (used as n_estimators)
NUM_ITERATIONS = 50 # 1000 (presumably the value for a full run -- TODO confirm)

# --------------- Tuning --------------- #
# Number of Optuna trials passed to study.optimize
N_TRIALS = 10   # 30 (presumably the value for a full run -- TODO confirm)

### Libraries

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import pickle

import psutil
import os
from pathlib import Path
from datetime import datetime
import json

from data import read_processed_train, read_processed_val, read_processed_test

  from .autonotebook import tqdm as notebook_tqdm


### Read Data

In [3]:
# Load the processed training split (reader imported from the project's data module).
train_df = read_processed_train()

In [4]:
# Load validation split 1; it is later passed to LightGBM as the eval set during fitting.
val_df = read_processed_val(num=1)

In [5]:
# Load validation split 2; it is later used for the final manual NDCG@K estimate.
val2_df = read_processed_val(num=2)

In [6]:
# Load the processed test split (the rows that are ranked for the submission file).
test_df = read_processed_test()

### Change Target

**Function Definitions**
- Take in train_df or val_df and modify the target column
- All columns can be used
- Do not change the row order
- **target must be an integer**

In [7]:
def base_target(df):
    """
    Build the relevance label described as the assignment weighting (no
    discounting): 2 for a booked row, 1 for a clicked-but-not-booked row and
    0 otherwise. The 'target' column is written onto df itself, which is
    also returned.
    """
    was_clicked = df['click_bool'].eq(1)
    was_booked = df['booking_bool'].eq(1)

    # Bookings are assigned last so that they override the click label
    df['target'] = 0
    df.loc[was_clicked, 'target'] = 1
    df.loc[was_booked, 'target'] = 2

    return df

In [8]:
# Add others...

**Apply Functions**

- Apply one of the above functions to train_df and val_df

In [9]:
# Choice of target function
target_func = base_target

# ---------------------------------- #

# Relabel every labelled split (train and both validation sets) with the
# chosen target function, in that order
train_df, val_df, val2_df = (
    target_func(frame) for frame in (train_df, val_df, val2_df)
)

### Feature Engineering

**Function Definitions**
- Take in a dataframe and modify/add columns
- Must be applicable to train_df, val_df and test_df
- Do not change the row order

In [10]:
def hour_day_month_year(df):
    """
    Function to parse the date_time column into datetimes and derive the
    hour, day, month and year columns from it. The frame is modified in place
    and returned.
    """
    timestamps = pd.to_datetime(df['date_time'])
    df['date_time'] = timestamps

    # One new column per calendar component, added in this order
    for component in ('hour', 'day', 'month', 'year'):
        df[component] = getattr(timestamps.dt, component)

    return df

In [11]:
# Features that will receive a rank column
ranked_cols = [
    'prop_starrating',
    'prop_review_score',
    'prop_location_score1',
    'prop_location_score2',
    'prop_log_historical_price',
    'price_usd',
    'orig_destination_distance',
]

def add_ranks(df):
    """
    Function to add, for every feature in ranked_cols, a '<feature>_rank'
    column holding the rank of the value within its srch_id. Ranks are
    ascending and dense, so tied values share a rank and no rank is skipped.
    """
    per_search = df.groupby('srch_id')

    for feature in ranked_cols:
        df[f'{feature}_rank'] = per_search[feature].rank(method='dense')

    return df

In [12]:
def starrating_review_diff(df):
    """
    Function to add 'starrating_review_diff': the absolute gap between
    'prop_starrating' and 'prop_review_score' on each row.
    """
    gap = df['prop_starrating'] - df['prop_review_score']
    df['starrating_review_diff'] = gap.abs()

    return df

In [13]:
def sum_adults_children(df):
    """
    Function to add 'sum_adults_children': the row-wise total of
    'srch_adults_count' and 'srch_children_count'.
    """
    adults = df['srch_adults_count']
    children = df['srch_children_count']
    df['sum_adults_children'] = adults.add(children)

    return df

In [14]:
# Estimate click and booking probability for each property

# Join train and val sets
# NOTE(review): both validation splits contribute their click/booking labels
# to these per-property rates, and the rates are later joined back onto the
# same validation rows as features. Scores measured on val_df / val2_df are
# therefore likely optimistic -- confirm whether the lookup should be built
# from train_df only.
train_val_df = pd.concat([train_df, val_df, val2_df], ignore_index=True)

# Probability of click and book for each prop_id
# (mean of the 0/1 indicator within each prop_id, broadcast back to every row)
train_val_df['click_prob'] = train_val_df.groupby('prop_id')['click_bool'].transform('mean')
train_val_df['booking_prob'] = train_val_df.groupby('prop_id')['booking_bool'].transform('mean')

# Create lookup tables
# The rates are constant within a prop_id, so dropping duplicate rows leaves
# one row per property, indexed by prop_id for use with DataFrame.join.
prob_lookup = train_val_df[['prop_id', 'click_prob', 'booking_prob']].drop_duplicates().set_index('prop_id')

def add_probabilities(df):
    """
    Function to add click and booking probabilities to each row. Unseen prop_ids
    are given -1.

    The probabilities are looked up in prob_lookup by 'prop_id'. Unlike the
    other feature functions, this one returns a new DataFrame and leaves the
    input untouched. It is safe to call on a frame that already carries the
    probability columns (for example when the cell is re-run): the stale
    columns are replaced rather than causing a column-overlap error in join.
    """
    prob_cols = ['click_prob', 'booking_prob']

    # DataFrame.join raises when both sides share a column name and no suffix
    # is given, so drop any leftovers from a previous call first
    df = df.drop(columns=prob_cols, errors='ignore')

    df = df.join(prob_lookup, on='prop_id')
    df[prob_cols] = df[prob_cols].fillna(-1)

    return df

In [15]:
# Jenia's features

def log_price(df):
    """
    Function to add 'log_price_usd', the natural log of (price_usd + 1)
    """
    shifted_price = df['price_usd'] + 1
    df['log_price_usd'] = np.log(shifted_price)

    return df

In [16]:
# Add others...

**Apply Functions**

- Apply some combination of the above functions to train_df, val_df and test_df

In [17]:
# List of functions to apply in order
# feature_functions = [hour_day_month_year, add_ranks, starrating_review_diff, sum_adults_children, add_probabilities, log_price]
feature_functions = [hour_day_month_year, add_ranks, starrating_review_diff, sum_adults_children, add_probabilities, log_price]

# ---------------------------------- #

# Apply listed functions
# Each split is rebound to the function's return value. This matters for
# add_probabilities, which returns a new DataFrame instead of modifying its
# argument in place.
for func in feature_functions:
    train_df = func(train_df)
    val_df = func(val_df)
    val2_df = func(val2_df)
    test_df = func(test_df)

Jenia's normalisation code

In [18]:
# by_cols_norm = [
#     'srch_id',
#     'srch_destination_id',
#     # 'srch_booking_window',
#     'prop_id',
#     'prop_country_id',
#     # 'month',
#     'site_id'
# ]

# cols = [
#     # 'price_usd',
#     'log_price_usd',
#     'prop_review_score',
#     'prop_location_score1',
#     'prop_location_score2',
#     'prop_log_historical_price'
# ]

# def fit(df, by_col, columns):
#     print(f'Fitting means ans stds... - {by_col}')
#     all_columns = [by_col] + columns
#     groups = train_df[all_columns].groupby(by_col)

#     means = groups.mean()
#     stds = groups.std()

#     return dict(zip(means.index, means.values)), dict(zip(stds.index, stds.values))

# def normalise_by_cols(df, means, stds, by_col, columns):
#     print('Transformong columns...')
#     for idx, col in enumerate(columns):
#         print(col)
#         upd_colname = f'norm_{col}_{by_col}'

#         df[upd_colname] = df[by_col].map(lambda x: -means[x][idx]  if x in means else 0)
#         df[upd_colname] = df[upd_colname] + df[col]
#         df[upd_colname] = df[upd_colname] / df[by_col].map(lambda x: -stds[x][idx] if x in stds else 1)
    
#     return df

# for by_col in by_cols_norm:
#     means, stds = fit(train_df, by_col, cols)

#     train_df = normalise_by_cols(train_df, means, stds, by_col, cols)
#     # val_a_df = normalise_by_cols(val_a_df, means, stds, by_col, cols)
#     # val_b_df = normalise_by_cols(val_b_df, means, stds, by_col, cols)
#     val_df = normalise_by_cols(val_df, means, stds, by_col, cols)
#     test_df = normalise_by_cols(test_df, means, stds, by_col, cols)

#     print('---')

### Final Data Preparation

Specify categorical variables

In [19]:
# Columns treated as categorical: specify_categorical casts each of these
# (when present in a frame) to the pandas 'category' dtype.
all_cat_cols = ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_brand_bool', 'promotion_flag', 'srch_destination_id', 'srch_saturday_night_bool', 'random_bool', 'month', 'year']

In [20]:
def specify_categorical(df):
    """
    Function to cast every column of df that is listed in all_cat_cols to the
    pandas 'category' dtype. The frame is modified in place and returned.
    """
    categorical_here = [name for name in df.columns if name in all_cat_cols]

    for name in categorical_here:
        df[name] = df[name].astype('category')

    return df

In [21]:
# Cast the categorical columns on every split.
# NOTE(review): each frame is converted independently, so the category sets
# (and hence the integer codes behind them) presumably differ between splits
# whenever a value is missing from one of them -- keep this in mind wherever
# a frame is turned into a plain array before prediction.
train_df = specify_categorical(train_df)
val_df = specify_categorical(val_df)
val2_df = specify_categorical(val2_df)
test_df = specify_categorical(test_df)

Removing unwanted columns and preparing the data for the model

In [22]:
# Specify additional columns to remove
add_drop_cols = ['srch_id', 'date_time', 'prop_id']

# ---------------------------------- #

# Remove columns that won't be used as features
train_only_cols = ['position', 'click_bool', 'booking_bool', 'target'] + add_drop_cols

# LightGBM expects the group array to list the query sizes in the same order
# as the rows. sort=False keeps the groups in order of first appearance, which
# equals the row order whenever the rows of a search are contiguous (and is
# identical to the sorted order when the frame is sorted by srch_id).

# Train data
group_train = train_df.groupby("srch_id", sort=False)["srch_id"].count().to_numpy()
X_train = train_df.drop(columns=train_only_cols)
y_train = train_df['target'].astype(int)

# Validation data
group_val = val_df.groupby("srch_id", sort=False)["srch_id"].count().to_numpy()
X_val = val_df.drop(columns=train_only_cols)
y_val = val_df['target'].astype(int)

# Validation 2 data
group_val2 = val2_df.groupby("srch_id", sort=False)["srch_id"].count().to_numpy()
X_val2 = val2_df.drop(columns=train_only_cols)
y_val2 = val2_df['target'].astype(int)

# Test data
# Kept as a DataFrame (drop already returns a new frame). The model is fitted
# on a DataFrame with 'category' columns, so LightGBM learns on the category
# codes of the training data. Converting to a NumPy array would hand it the
# raw ids instead, which it would misread as codes; with a DataFrame it
# re-applies the training categories at prediction time.
X_test = test_df.drop(columns=add_drop_cols)

Adding custom evaluation metric

In [None]:
# def dcg_at_k(r, k):
#     r = np.asfarray(r)[:k]
#     if r.size:
#         return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    
#     return 0.

# def ndcg_at_k(y_true, y_pred, k=5):
#     # Calculate the ideal NDCG
#     rank_true = sorted(y_true, reverse=True)
#     dcg_max = dcg_at_k(rank_true, k)

#     # Calculate the NDCG for the predictions
#     rank_pred = [x for _, x in sorted(zip(y_pred, y_true), reverse=True)]
#     dcg_pred = dcg_at_k(rank_pred, k)

#     # Return the NDCG
#     return dcg_pred / dcg_max

In [None]:
# def custom_eval_metric(y_true, y_pred, weight, group):
#     # Calculate the NDCG for each group
#     ndcg = []
#     start = 0
#     for grp in group:
#         end = start + grp
#         y_true_group = y_true[start:end]
#         y_pred_group = y_pred[start:end]

#         ndcg.append(ndcg_at_k(y_true_group, y_pred_group))

#         start = end

#     # Calculate the overall NDCG
#     ndcg_avg = np.average(ndcg, weights=weight)
#     return 'cust_ndcg', ndcg_avg, True # (eval_name, eval_result, is_higher_better)

### Tuning

In [None]:
# Every tuning run writes into its own timestamped folder under ./tuned_models
tune_folder_path = Path("./tuned_models") / datetime.now().strftime("%Y%m%d_%H%M%S")
tune_folder_path.mkdir(parents=True, exist_ok=True)

Constant hyperparameters

In [None]:
# Hyperparameters shared by every Optuna trial. Values suggested inside
# objective() are merged on top of these, so a key present in both (such as
# "bagging_freq") takes the tuned value.
const_params = {
    "objective": "lambdarank",
    "boosting_type": "dart",
    "metric": "ndcg",
    "n_estimators": NUM_ITERATIONS,
    'max_depth': -1,
    "importance_type": "gain",
    # One gain per label value 0..max(label): the gain of label i is i
    "label_gain": list(range(max(y_train.max(), y_val.max()) + 1)),
    "bagging_freq": 1,
    # Leave one physical core free. psutil.cpu_count(logical=False) may return
    # None when the count cannot be determined, so fall back to 1 and never
    # go below a single thread.
    "n_jobs": max(1, (psutil.cpu_count(logical=False) or 1) - 1),
    "verbosity": -1,
    # new params
    # "early_stopping_round": int(NUM_ITERATIONS/10),  # stops if 10% of iterations val score doesn't improve
    }

Define the objective

Changes:
- num_leaves [10, 35]

In [None]:
def objective(trial):
    """
    Optuna objective: sample one set of hyperparameters, fit an LGBMRanker
    with them on the training data, pickle the fitted model into the tuning
    folder as trial_<number>.pkl and return the NDCG@K that LightGBM reports
    for the first validation set ('valid_1').
    """
    # Sampled in this fixed order; these override same-named keys in const_params
    search_space = {
        "num_leaves": trial.suggest_int("num_leaves", 10, 35),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 5000),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.01, 1, log=True),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 0.9),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.01, 0.5),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
    }

    ranker = lgb.LGBMRanker(**{**const_params, **search_space})
    ranker.fit(
        X=X_train,
        y=y_train,
        group=group_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_group=[group_train, group_val],
        eval_at=[K],  # k for NDCG@k
        verbose=False,
    )

    # Keep every trial's model on disk; the best one is selected afterwards
    trial_model_path = tune_folder_path / f"trial_{trial.number}.pkl"
    with open(trial_model_path, "wb") as model_file:
        pickle.dump(ranker, model_file)

    return ranker.best_score_['valid_1'][f'ndcg@{K}']

Optimise the objective

In [None]:
# Maximise the validation NDCG@K returned by objective() over N_TRIALS trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS)

Save the results

In [None]:
# Keep only the best model: its pickle is renamed to best_model.pkl and the
# pickles of all other trials are deleted
for trial in study.trials:
    if trial.number == study.best_trial.number:
        (tune_folder_path / f"trial_{trial.number}.pkl").rename(tune_folder_path / "best_model.pkl")
    else:
        (tune_folder_path / f"trial_{trial.number}.pkl").unlink()

In [None]:
# Save the best hyperparameters (constant ones overlaid with the tuned ones)
with open(tune_folder_path / "best_params.json", "w") as f:
    best_params = {**const_params, **study.best_params}
    json.dump(best_params, f)

# Save some other relevant information: the global constants, the feature
# list and the outcome of the study
with open(tune_folder_path / "info.txt", "w") as f:
    f.writelines([
        f"{10 * '-'} Global constants {10 * '-'}\n\n",
        f"NUM_ITERATIONS = {NUM_ITERATIONS}\n",
        f"K = {K}\n",
        f"N_TRIALS = {N_TRIALS}\n\n",
        f"{10 * '-'} Features {10 * '-'}\n\n",
        f"{list(X_train.columns)}\n\n",
        f"{10 * '-'} Study Info {10 * '-'}\n\n",
        f"Best value: {study.best_value}\n",
        f"Best trial number: {study.best_trial.number}\n",
    ])

In [None]:
# Add plots

### Training

**Load the model (choose one)**

1. Create a new model with hand-picked hyperparameters

In [23]:
# Option 1: build an untrained ranker from hand-picked hyperparameters.
# The commented-out dict below is an alternative parameter set; it is only
# used if the commented-out constructor call at the end of the cell is
# enabled instead.
# params = {
#         # Constant parameters
#         "objective": "lambdarank",
#         "boosting_type": "dart",
#         "metric": "ndcg",
#         "n_estimators": NUM_ITERATIONS, 
#         "importance_type": "gain",
#         "label_gain": [i for i in range(max(y_train.max(), y_val.max()) + 1)],
#         "bagging_freq": 1,
#         "n_jobs": psutil.cpu_count(logical=False),
#         "verbosity": -1,
#         # Tuning parameters
#         "num_leaves": 32,
#         'max_depth': -1,
#         "learning_rate": 0.01,
#         "subsample": 0.8,
#         "colsample_bytree": 0.8,
#         "min_data_in_leaf": 50,
#     }

# label_gain lists one gain per label value; base_target produces the labels
# 0, 1 and 2.
# NOTE(review): this model is configured independently of const_params
# (different learning rate, no n_jobs / importance_type) -- confirm that the
# difference is intended.
model = lgb.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        n_estimators=NUM_ITERATIONS,
        learning_rate=0.12,
        max_position=5,
        label_gain=[0, 1, 2],
        boosting='dart',
    )
# model = lgb.LGBMRanker(**params)

2. Create a new model with the most recently tuned hyperparameters

In [None]:
# Build an untrained ranker from the hyperparameters of the latest tuning run.
root = Path("./tuned_models")

# Run folders are named with a %Y%m%d_%H%M%S timestamp, so the greatest name
# is the most recent run. Path.name is used instead of slicing the last 15
# characters of the path, which only works when every folder name happens to
# be exactly 15 characters long.
most_recent_dir = max(d.name for d in root.iterdir() if d.is_dir())

with open(root / most_recent_dir / "best_params.json", "r") as f:
    best_params = json.load(f)

model = lgb.LGBMRanker(**best_params)

3. Load the best model from the most recent tuning process

In [None]:
# Load the already-fitted best model of the latest tuning run.
root = Path("./tuned_models")

# Run folders are named with a %Y%m%d_%H%M%S timestamp, so the greatest name
# is the most recent run. Path.name is used instead of slicing the last 15
# characters of the path, which only works when every folder name happens to
# be exactly 15 characters long.
most_recent_dir = max(d.name for d in root.iterdir() if d.is_dir())
model_path = root / most_recent_dir / "best_model.pkl"

# The context manager closes the file handle once the model is loaded.
# NOTE: unpickling executes code stored in the file -- only load pickles
# produced by this project.
with open(model_path, "rb") as f:
    model = pickle.load(f)

4. Custom

In [None]:
# Create a lgb.LGBMRanker model however you like
root = Path("./tuned_models")
most_recent_dir = [str(d)[-15:] for d in sorted(root.iterdir()) if d.is_dir()][-1]

with open(root / most_recent_dir / "best_params.json", "r") as f:
    best_params = json.load(f)


best_params['n_estimators'] = 500

model = lgb.LGBMRanker(**best_params)
best_params

In [None]:
# model_path = "/Users/nathanjones/Development/VU/DMT/dmt_recom_sys/trained_models/20230527_230454/model.pkl"
# model = pickle.load(open(model_path, "rb"))

**Fitting the model** (skip if using a trained model)

In [24]:
# Fit on the training split and report NDCG@K on both the training data
# ('training') and the first validation split ('valid_1') after every
# iteration.
# NOTE(review): LightGBM reports that early stopping is not available in dart
# mode, so early_stopping_rounds presumably has no effect for the dart models
# configured in this notebook -- confirm before relying on it.
model.fit(
      X=X_train,
      y=y_train,
      group=group_train,
      eval_set=[(X_train, y_train),(X_val, y_val)],
      eval_group=[group_train, group_val],
      eval_at=[K], # k for NDCG@k; a list, matching the documented iterable type and objective()
      early_stopping_rounds=200,
      # eval_metric=custom_eval_metric,
      )







[1]	training's ndcg@5: 0.394146	valid_1's ndcg@5: 0.393542




[2]	training's ndcg@5: 0.414761	valid_1's ndcg@5: 0.411565
[3]	training's ndcg@5: 0.423561	valid_1's ndcg@5: 0.41834
[4]	training's ndcg@5: 0.4316	valid_1's ndcg@5: 0.42473
[5]	training's ndcg@5: 0.434367	valid_1's ndcg@5: 0.427477
[6]	training's ndcg@5: 0.437484	valid_1's ndcg@5: 0.428363
[7]	training's ndcg@5: 0.441781	valid_1's ndcg@5: 0.431528
[8]	training's ndcg@5: 0.441547	valid_1's ndcg@5: 0.431953
[9]	training's ndcg@5: 0.444465	valid_1's ndcg@5: 0.434062
[10]	training's ndcg@5: 0.446024	valid_1's ndcg@5: 0.434921
[11]	training's ndcg@5: 0.447892	valid_1's ndcg@5: 0.435354
[12]	training's ndcg@5: 0.448058	valid_1's ndcg@5: 0.435789
[13]	training's ndcg@5: 0.45021	valid_1's ndcg@5: 0.436111
[14]	training's ndcg@5: 0.451948	valid_1's ndcg@5: 0.436443
[15]	training's ndcg@5: 0.453285	valid_1's ndcg@5: 0.437048
[16]	training's ndcg@5: 0.454423	valid_1's ndcg@5: 0.437891
[17]	training's ndcg@5: 0.456136	valid_1's ndcg@5: 0.438164
[18]	training's ndcg@5: 0.45855	valid_1's ndcg@5: 0.4

### Plots

In [None]:
# Learning curve: the recorded evaluation metric per boosting iteration
ax_metric = lgb.plot_metric(model, figsize=(12, 8))

In [None]:
# Feature importance chart of the fitted model
ax_importance = lgb.plot_importance(
    model,
    xlabel='Importance (Gain)',
    figsize=(12, 8),
)

### Validating

Use the second (unseen) validation set to estimate the final NDCG@5 of the model

Make predictions

In [None]:
# Predict on a DataFrame copy rather than a NumPy array. The model is fitted
# on a DataFrame with 'category' columns, so LightGBM learns on the category
# codes of the training data; a NumPy array would hand it the raw ids, which
# it would misread as codes. With a DataFrame it re-applies the training
# categories before predicting.
X_val2_copy = X_val2.copy()

prediction = model.predict(X_val2_copy)
val2_df['prediction'] = prediction

# Within each search, order the rows from highest to lowest predicted score
pred_rank = val2_df.sort_values(by=['srch_id', 'prediction'], ascending=[True, False])

Calculate NDCG@5

In [None]:
def dcg(df, k):
    """
    Compute DCG@k per srch_id for a frame whose rows are already in ranked
    order within each search.

    Adds two columns to df in place: 'i' (1-based position of the row within
    its srch_id) and 'dg' (discounted gain, (2**target - 1) / log2(i + 1)).
    Returns a frame with one row per srch_id and the columns 'srch_id' and
    'dg', where 'dg' is the sum over the first k positions.
    """
    # 1-based rank of each row inside its search
    position = df.groupby('srch_id').cumcount() + 1
    df['i'] = position

    gain = (2 ** df['target']) - 1
    discount = np.log2(position + 1)
    df['dg'] = gain / discount

    # Only the top-k rows of every search contribute to the score
    top_k = df[position <= k]
    return top_k.groupby('srch_id')['dg'].sum().reset_index()

In [None]:
# Finds the DCG scores for the predicted ranking
pred_dcg = dcg(pred_rank, K)

# Finds the DCG scores for the true ranking
# (ideal ordering: rows sorted by descending target within each srch_id)
true_rank = val2_df.sort_values(by=['srch_id', 'target'], ascending=[True, False]).reset_index()
true_dcg = dcg(true_rank, K)

# Calculate average NDCG
# Both frames come back from dcg() with a fresh 0..n-1 index ordered by
# srch_id, so the division pairs the rows of the same search.
# NOTE(review): a search whose ideal DCG is 0 presumably yields NaN (0/0),
# which Series.mean() skips by default -- confirm that leaving such searches
# out of the average is intended.
ndcg_val2 = (pred_dcg['dg'] / true_dcg['dg']).mean()
print(f"Val 2 NDCG@{K} = {ndcg_val2}")

### Testing

In [None]:
# Score the test rows, then order them per search from highest to lowest
# predicted score; the submission file is written from this row order.
prediction = model.predict(X_test)
test_df['prediction'] = prediction
test_df = test_df.sort_values(by=['srch_id', 'prediction'], ascending=[True, False])

### Save Results

In [None]:
# Create the folder to store the results of the training
# (one timestamped folder per training run under ./trained_models)
train_folder_path = Path("./trained_models/" + datetime.now().strftime("%Y%m%d_%H%M%S"))
train_folder_path.mkdir(parents=True, exist_ok=True)

# Save the model
# The context manager flushes and closes the file; a bare open() passed to
# pickle.dump would leave the handle open until it is garbage collected.
with open(train_folder_path / "model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save the parameters
with open(train_folder_path / "params.txt", "w") as f:
    f.write(json.dumps(model.get_params()))

# Save the feature names
with open(train_folder_path / "feature_names.txt", "w") as f:
    f.write(str(list(X_train.columns)))

# Save the figures
ax_metric.figure.savefig(train_folder_path / "learning_curve.pdf", bbox_inches="tight")
ax_importance.figure.savefig(train_folder_path / "feature_importance.pdf", bbox_inches="tight")

# Save best evaluation scores
with open(train_folder_path / "best_scores.txt", "w") as f:
    f.write("Best training score: " + str(model.best_score_['training'][f'ndcg@{K}']) + "\n")
    f.write("Best validation 1 score: " + str(model.best_score_['valid_1'][f'ndcg@{K}']) + "\n")
    f.write("Best validation 2 score: " + str(ndcg_val2) + "\n")

# Save the submission file
# Only the ids are written; the ranking is carried by the row order of
# test_df, which was sorted by srch_id and descending prediction beforehand.
submission_file = train_folder_path / "submission.csv"
test_df[['srch_id', 'prop_id']].to_csv(submission_file, index=False)

### Playground