In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GroupKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ndcg_score
from sklearn.linear_model import Ridge
import xgboost as xgb
from catboost import CatBoostRanker, Pool
import joblib
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation,LGBMRanker
import itertools

## Data

In [None]:
def load_dataset(filename):
    """Read the CSV file at ``filename`` and return it as a pandas DataFrame."""
    dataset = pd.read_csv(filename)
    return dataset

# Raw training / test CSV files.
training_file_raw = 'training_set_VU_DM.csv'
test_file_raw = 'test_set_VU_DM.csv'
# Feature-engineered training / test CSV files (written in the Feature
# Engineering section, read back in the Models section).
training_file_stats = 'training_set_stats_VU_DM.csv'
test_file_stats = 'test_set_stats_VU_DM.csv'

In [None]:
# Load the raw training set and list its column names.
df = load_dataset(training_file_raw)
print(df.columns)

#### Features
|Feature  |Type | Description |Potential predictor|
|:------- |:----|:------------|------------------:|
|srch_id                     |int      |ID of search/user| |
|date_time                   |Datetime |Time of search| |
|site_id                     |int      |ID of Expedia link (.com/.co.uk/.co.jp...)| |
|visitor_location_country_id |int      |ID of user's country| *|
|visitor_hist_starrating     |float    |mean star rating of the customer's hotel purchases| *|
|visitor_hist_adr_usd        |float    |mean price per night of the customer's hotel purchases| *|
|prop_country_id             |int      |ID of the hotel's country| *|
|prop_id                     |int      |ID of hotel| |
|prop_starrating             |int      |star rating of hotel| *|
|prop_review_score           |float    |review score of hotel (rounded to 0.5)| **|
|prop_brand_bool             |int      |part of major hotel chain (1) or not (0)| *|
|prop_location_score1        |float    |desirability score of the hotel's location (primary score)                   |                  ** |
|prop_location_score2        |float    |desirability score of the hotel's location (secondary score)                 |                  ** |
|prop_log_historical_price   |float    |log of mean price of hotel in the last trading period                        |                   |
|price_usd                   |float    |displayed price of the hotel                                                 |                  ** |
|promotion_flag              |int      |1 if hotel had a sale price promotion                                        |                  ** |
|srch_destination_id         |int      |ID of the searched destination                                               |                   |
|srch_length_of_stay         |int      |number of nights in the stay                                                 |                  * |
|srch_booking_window         |int      |days between search and stay start                                           |                  * |
|srch_adults_count           |int      |number of adults in the search                                               |                  * |
|srch_children_count         |int      |number of children in the search                                             |                  * |
|srch_room_count             |int      |number of rooms in the search                                                |                  * |
|srch_saturday_night_bool    |bool     |1 if the stay includes a Saturday night                                      |                   |
|srch_query_affinity_score   |float    |log probability a hotel will be clicked on the internet                      |                   |
|orig_destination_distance   |float    |physical distance between hotel and customer                                 |                   |
|random_bool                 |bool     |1 if results were shown in random order                                      |                   |
|comp1_rate                  |int      |price comparison vs. competitor 1 (-1: higher, 0: same, +1: lower)           |                   |
|comp1_inv                   |int      |availability vs. competitor 1 (+1: competitor unavailable, 0: both available)|                   |
|comp1_rate_percent_diff     |float    |absolute percentage price difference with competitor 1                       |                   |
|comp2_rate                  |int      |same as comp1_rate for competitor 2                                          |                   |
|comp2_inv                   |int      |same as comp1_inv for competitor 2                                           |                   |
|comp2_rate_percent_diff     |float    |same as comp1_rate_percent_diff for competitor 2                             |                   |
|...                         |...      |same structure for competitors 3 through 8                                   |                   |
|position                    |int      |rank of hotel in search results (training data only)                         |                   |
|click_bool                  |bool     |1 if user clicked on the hotel  (training data only)                         |                   |
|booking_bool                |bool     |1 if user booked the hotel      (training data only)                         |                   |
|gross_bookings_usd           |float    |actual value of the booking (includes taxes, fees, etc.) (training data only)|                   |    

In [None]:
def dataset_stats(df):
    """Print a short missing-value report for ``df``.

    Reports the number of features and observations, how many rows and
    columns contain at least one missing value and, for every column that
    has missing values, the percentage of its entries that are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The report is written to stdout.
    """
    total_observations = len(df)
    # Build the null mask once and reuse it for every statistic instead of
    # rescanning the whole frame for each one and for each incomplete column.
    null_mask = df.isnull()
    missing_per_feature = null_mask.sum(axis=0)
    has_missing = missing_per_feature > 0

    print(f'Number of features: {len(df.columns)}')
    print(f'Number of observations: {total_observations}')
    print(f'Number of rows with missing values: {null_mask.any(axis=1).sum()}')
    print(f'Number of columns with missing values: {has_missing.sum()}')
    print('Percentage not-missing data for features with missing values:')
    for feature, n_missing in missing_per_feature[has_missing].items():
        present_pct = 100 * (total_observations - n_missing) / total_observations
        print(f"{feature}: {present_pct:.2f}% not missing")

# Missing-value report for the frame currently bound to `df`.
dataset_stats(df)

### Feature Engineering

In [None]:
# NOTE(review): this cell and the next both assign to `df`, so after running
# both only the test set remains bound. The feature-engineering cells below
# read the CSVs again into train_raw / test_raw and do not use `df`.
df = load_dataset(training_file_raw)

In [None]:
df = load_dataset(test_file_raw)

In [None]:
def add_engineered_columns(df_raw, price_cap=2060.0355):
    """Return a copy of ``df_raw`` with the engineered feature columns added.

    The input frame is not modified. Missing values are not imputed here.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame containing at least ``srch_id``, ``date_time``, ``price_usd``,
        ``srch_length_of_stay``, ``prop_review_score``,
        ``orig_destination_distance`` and ``prop_location_score2``.
    price_cap : float, optional
        Prices strictly above this value are treated as outliers and replaced
        by NaN. The default is the cutoff the notebook was tuned with; the
        original comment describes it as "0.999 percent of data" (presumably
        the 0.999 quantile -- TODO confirm) and as still adjustable.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_raw`` with ``price_usd`` capped and these columns added:
        ``price_per_night``, ``month``, ``review_score_relative``,
        ``price_relative``, ``log_price_usd``, ``log_price_per_night``,
        ``orig_dest_missing`` and ``loc_score2_missing``.
    """
    df = df_raw.copy()

    # Blank out extreme prices first so every price-derived feature below
    # is computed from the capped column.
    df.loc[df['price_usd'] > price_cap, 'price_usd'] = np.nan
    df['price_per_night'] = df['price_usd'] / df['srch_length_of_stay']
    df['month'] = pd.to_datetime(df['date_time']).dt.month

    # Position of each listing relative to the median of its own search.
    per_search = df.groupby('srch_id')
    df['review_score_relative'] = (
        df['prop_review_score'] - per_search['prop_review_score'].transform('median')
    )
    df['price_relative'] = df['price_usd'] - per_search['price_usd'].transform('median')

    df['log_price_usd'] = np.log1p(df['price_usd'])
    df['log_price_per_night'] = np.log1p(df['price_per_night'])

    # Indicator columns recording where the source value was missing.
    df['orig_dest_missing'] = df['orig_destination_distance'].isna().astype(int)
    df['loc_score2_missing'] = df['prop_location_score2'].isna().astype(int)

    return df

In [None]:
def compute_prop_stats(df):
    """Compute per-property mean and median of the numeric feature columns.

    Identifier columns, target-related columns and ``price_usd`` are left out
    of the aggregation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``prop_id`` column and the numeric columns to aggregate.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``prop_id``; holds one ``<col>_mean`` column per aggregated
        column followed by one ``<col>_median`` column per aggregated column.
    """
    exclude = [
        'srch_id', 'prop_id', 'position', 'click_bool', 'booking_bool',
        'gross_bookings_usd', 'relevance', 'price_usd', 'visitor_location_country_id',
        'prop_country_id', 'site_id', 'srch_destination_id'
    ]
    # errors='ignore': a frame that lacks some of the excluded columns (for
    # example one without the training-only targets) must not raise KeyError.
    numeric = df.select_dtypes('number').columns.drop(exclude, errors='ignore')
    per_property = df.groupby('prop_id')[numeric]
    means = per_property.mean().add_suffix('_mean')
    meds = per_property.median().add_suffix('_median')
    return means.join(meds)

def prepare_final_features(df, prop_stats):
    """Attach the per-property statistics to ``df`` and drop non-feature columns.

    ``prop_stats`` is left-joined on ``prop_id``, so rows whose property has no
    statistics get NaN in the added columns. The dropped columns are
    ``position``, ``click_bool``, ``booking_bool``, ``gross_bookings_usd``,
    ``price_usd`` and ``date_time``.
    """
    not_features = [
        'position', 'click_bool', 'booking_bool',
        'gross_bookings_usd', 'price_usd', 'date_time',
    ]
    enriched = df.merge(prop_stats, on='prop_id', how='left')
    # errors='ignore' skips any listed column that is not present in the frame.
    return enriched.drop(columns=not_features, errors='ignore')

In [None]:
# Read the raw training CSV (same file name as `training_file_raw`).
train_raw = pd.read_csv("training_set_VU_DM.csv")

In [None]:
train_fe = add_engineered_columns(train_raw)

# Graded relevance target: 0 by default, 1 for a click, 5 for a booking.
# The booking assignment runs last, so it overrides the click value.
train_fe['relevance'] = 0
train_fe.loc[train_fe['click_bool'] == 1, 'relevance'] = 1
train_fe.loc[train_fe['booking_bool'] == 1, 'relevance'] = 5

# Per-property statistics are computed from the training frame only and
# reused for the test frame further down.
prop_stats = compute_prop_stats(train_fe)

In [None]:
train_final = prepare_final_features(train_fe, prop_stats)

In [None]:
# Persist the engineered training set so the model sections can reload it.
train_final.to_csv(training_file_stats, index=False)

In [None]:
# Read the raw test CSV (same file name as `test_file_raw`).
test_raw  = pd.read_csv("test_set_VU_DM.csv")

In [None]:
test_fe  = add_engineered_columns(test_raw)

In [None]:
# Left-join the training-derived property statistics onto the test rows.
test_final  = prepare_final_features(test_fe, prop_stats)

In [None]:
test_final.to_csv(test_file_stats, index=False)

## Models

### XGBoost rank (relevance score)

In [None]:
# Load the engineered training set.
df = load_dataset(training_file_stats)

In [None]:
# Make sure training features don't include grouping feature srch_id or target feature relevance
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Make sure searches are not split across the training and test sets
# NOTE(review): `groups` is not referenced by any other cell visible in this notebook.
groups = df.groupby('srch_id').size().to_numpy()
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx, test_idx = next(gss.split(X, y, groups=df['srch_id']))

In [None]:
# Everything that is not held out for testing, plus its search ids.
X_train_all = X.iloc[train_idx]
y_train_all = y.iloc[train_idx]
groups_train_all = df.iloc[train_idx]['srch_id']

In [None]:
# Second grouped split: carve a validation set out of the non-test rows.
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))

In [None]:
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]

In [None]:
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dval = xgb.DMatrix(X_val, y_val, enable_categorical=True)

In [None]:
# Set group info
groups_train = groups_train_all.iloc[train_idx_final]
groups_val = groups_train_all.iloc[val_idx]

# Rows per search, in ascending srch_id order.
# NOTE(review): this assumes the rows are already ordered by srch_id so the
# sizes line up with the row order of the DMatrix -- TODO confirm.
group_train = groups_train.groupby(groups_train).size().to_numpy()
group_val = groups_val.groupby(groups_val).size().to_numpy()

In [None]:
dtrain.set_group(group_train)
dval.set_group(group_val)

In [None]:
# What to evaluate during training, last one is used for early stopping
evals = [(dtrain, "train"), (dval, "validation")]

#### Grid search

In [None]:
# Hyperparameters held fixed for every run of the grid search.
base_params = {
    "objective": "rank:ndcg",
    "tree_method": "hist",
    "device": "gpu",
    "eval_metric": "ndcg@5",
    "eta": 0.1,
    "subsample": 1,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,
    "max_depth": 6,
    "lambda": 1
}

# Hyperparameters that are varied; every combination is trained once.
param_grid = {
    "alpha": [0.5, 0],
    "gamma": [0, 1]
}

# Cartesian product of the grid, one dict per run.
keys = tuple(param_grid.keys())
values = tuple(param_grid.values())
param_combinations = [dict(zip(keys, picked)) for picked in itertools.product(*values)]

results = []
n_runs = len(param_combinations)
for i, combo in enumerate(param_combinations):
    print(f"Training model {i+1}/{n_runs} with params: {combo}")
    params = dict(base_params)
    params.update(combo)

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=100
    )

    # Record the varied parameters together with the early-stopped result.
    best_score, best_iteration = model.best_score, model.best_iteration
    record = dict(combo)
    record["valid_ndcg@5"] = best_score
    record["best_iteration"] = best_iteration
    results.append(record)
    print(f'validation: {best_score}, best_it: {best_iteration}')

    del model  # drop the booster before the next run starts

# Best validation score first; persist the table and show the top rows.
results_df = pd.DataFrame(results).sort_values(by="valid_ndcg@5", ascending=False)
results_df.to_csv("xgb_grid_search_results_eta1.csv", index=False)
print(results_df.head())

#### Single model training

In [None]:
# Used in evaluating model parameters
def train_xgb_validation(dtrain, dval):
    params = {
        "objective": "rank:ndcg", # ranking as oppposed to regression or binary classification
        "tree_method": "hist",
        "device": "gpu",          # making use of gpu in training
        "eval_metric": "ndcg@5",
        "eta": 0.1,                  # 0.3
        "max_depth": 6,               # 6
        "min_child_weight": 5,       # 1
        "gamma": 0,                   # 0
        "subsample": 1,             # 1
        "colsample_bytree": 0.8,      # 1
        "lambda": 1,                # 1
        "alpha": 0.5,                 # 0
        # "lambdarank_num_pair_per_sample": 5,
        # "lambdarank_unbiased": "true"
    }
    evals = [(dtrain, "train"), (dval, "validation")]
    
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=1070,     # max rounds
        evals=evals,
        # early_stopping_rounds=50, # stop training if metric has not improved for 30 rounds (prevent overfitting)
        verbose_eval=50            # how often to print metrics
    )
    return model, model.predict(dval)

In [None]:
# Used for final training after model fitting
def train_xgb(dtrain):
    params = {
        "objective": "rank:ndcg", # ranking as oppposed to regression or binary classification
        "tree_method": "hist",
        "device": "gpu",          # making use of gpu in training
        "eval_metric": "ndcg@5",
        "eta": 0.1,                  # 0.3
        "max_depth": 6,               # 6
        "min_child_weight": 5,       # 1
        "gamma": 0,                   # 0
        "subsample": 1,             # 1
        "colsample_bytree": 0.8,      # 1
        "lambda": 1,                # 1
        "alpha": 0.5,                 # 0
        # "lambdarank_num_pair_per_sample": 5,
        # "lambdarank_unbiased": "true"
    }
    evals = [(dtrain, "train")]
    
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=1070,     # max rounds
        evals=evals,
        # early_stopping_rounds=50, # stop training if metric has not improved for 30 rounds (prevent overfitting)
        verbose_eval=50            # how often to print metrics
    )
    return model

In [None]:
df = load_dataset(training_file_stats)

In [None]:
# Make sure training features don't include grouping feature srch_id or target feature relevance
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# final (all data)
# NOTE(review): this cell and the "evaluation" cell below both assign dtrain
# and groups_train; whichever ran last determines what train_xgb sees. With
# Run All, the evaluation (train split) variant wins.
dtrain = xgb.DMatrix(X, y, enable_categorical=True)
groups_train = df['srch_id']

In [None]:
# evaluation (train/test)
groups = df.groupby('srch_id').size().to_numpy()
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx, test_idx = next(gss.split(X, y, groups=df['srch_id']))

X_train_all = X.iloc[train_idx]
y_train_all = y.iloc[train_idx]
dtrain = xgb.DMatrix(X_train_all, y_train_all, enable_categorical=True)
groups_train = df.iloc[train_idx]['srch_id']

In [None]:
# Rows per search in ascending srch_id order, then train.
# NOTE(review): assumes the rows are already ordered by srch_id -- TODO confirm.
group_train = groups_train.groupby(groups_train).size().to_numpy()
dtrain.set_group(group_train)
xgb_model = train_xgb(dtrain)

In [None]:
# save model to json file to avoid having to train again
# (xgb_model is the booster returned by train_xgb in the previous cell)
xgb_model.save_model('xgboost_model_final.json')

#### Evaluation

In [None]:
# Load a previously saved booster for evaluation.
# NOTE(review): this file name differs from 'xgboost_model_final.json' saved
# in the training section -- confirm this is the model meant to be evaluated.
xgb_model = xgb.Booster()
xgb_model.load_model('xgboost_model_eta_0_1.json')

In [None]:
# Held-out rows, including srch_id and relevance, for per-search scoring.
test_df = df.iloc[test_idx].copy()

In [None]:
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Score the held-out rows with the XGBoost booster loaded for evaluation.
# (`model` is deleted at the end of every grid-search run, so it cannot be
# relied on here.)
preds = xgb_model.predict(dtest)
test_df['pred'] = preds

# Compute NDCG@5 per srch_id and take the average
ndcg_scores = []
for srch_id, group in test_df.groupby('srch_id'):
    if len(group) < 5:
        # Searches with fewer than 5 listings are left out of the average.
        # NOTE(review): this makes the score differ from an NDCG@5 averaged
        # over all searches -- confirm this is intended.
        continue
    # ndcg_score expects 2-D arrays of shape (n_queries, n_documents).
    true_relevance = group['relevance'].values.reshape(1, -1)
    predicted_scores = group['pred'].values.reshape(1, -1)
    score = ndcg_score(true_relevance, predicted_scores, k=5)
    ndcg_scores.append(score)
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.5f}")

#### Results
mean/median + transformed data: 0.39962  
mean/std/median + transformed data feature selection (marinde): 0.40116  
mean/std/median + transformed data fitted: 0.40361  
mean/std/median + transformed data eta 0.1 fitted for 0.3: 0.40855  
mean/std/median + transformed data eta 0.1: 0.41082  

### Ensemble

**XGBoost**

In [None]:
# Load the XGBoost booster saved by the single-model training section.
xgb_model = xgb.Booster()
xgb_model.load_model('xgboost_model_final.json')

**LightGBM**

In [None]:
# NOTE(review): 'lgbm_full.txt' is written in the LightGBM section by a model
# fitted on the full X, y. Those rows include the ones held out as the test
# split below, so an evaluation using this booster is likely optimistic.
lgb_model = lgb.Booster(model_file="lgbm_full.txt")

**CatBoost (not finished)**

In [None]:
# NOTE(review): cat_model is loaded but not used by the ensemble cells below.
cat_model = CatBoostRanker()
cat_model.load_model("catboost_model.cbm")

----------------------------

In [None]:
df = load_dataset(training_file_stats)

In [None]:
# make sure training features are only the ones available in the test file and don't include target feature relevance
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Same grouped split (random_state=1) as in the single-model sections.
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx_all, test_idx = next(gss.split(X, y, groups=df['srch_id']))

In [None]:
X_train_all, y_train_all = X.iloc[train_idx_all], y.iloc[train_idx_all]
groups_train_all = df.iloc[train_idx_all]['srch_id']

In [None]:
# Validation split carved out of the non-test rows.
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Base-model scores for the held-out rows. The XGBoost scores must come from
# xgb_model (loaded at the top of this section), not from the unrelated `model`.
xgb_pred = xgb_model.predict(dtest)
lgb_pred = lgb_model.predict(X_test)

# Bring both score columns onto [0, 1] so the blend weights are comparable;
# MinMaxScaler scales each column independently.
all_preds = np.vstack([xgb_pred, lgb_pred]).T
scaled = MinMaxScaler().fit_transform(all_preds)
xgb_scaled, lgb_scaled = scaled[:, 0], scaled[:, 1]

In [None]:
# Blend weights for the two scaled score columns.
xgb_proportion = 0.7
lgb_proportion = 0.3

In [None]:
# Weighted sum of the min-max scaled base-model scores.
total_pred = xgb_proportion * xgb_scaled + lgb_proportion * lgb_scaled

In [None]:
# Held-out rows, including srch_id and relevance, for per-search scoring.
test_df = df.iloc[test_idx].copy()

In [None]:
test_df['pred'] = total_pred

# compute NDCG@5 per srch_id and take the average
ndcg_scores = []
# Iterate the frame that just received the predictions (test_df).
for srch_id, group in test_df.groupby('srch_id'):
    if len(group) < 5:
        # Searches with fewer than 5 listings are left out of the average.
        continue
    # ndcg_score expects 2-D arrays of shape (n_queries, n_documents).
    true_relevance = group['relevance'].values.reshape(1, -1)
    predicted_scores = group['pred'].values.reshape(1, -1)
    score = ndcg_score(true_relevance, predicted_scores, k=5)
    ndcg_scores.append(score)
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.4f}")

#### Results
xgb/lgb 0.65/0.35: 0.4120  
xgb/lgb 0.7/0.3: 0.4120  
xgb/lgb 0.75/0.25: 0.4120  
xgb/lgb 0.8/0.2: 0.4115 (approximate, from memory; lower than the other ratios)  

### LightGBM

In [None]:
df = load_dataset(training_file_stats)

In [None]:
# Features exclude the grouping column srch_id and the target relevance.
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Grouped split so that a search never straddles train and test.
# NOTE(review): `groups` is not referenced by any other cell visible in this notebook.
groups = df.groupby('srch_id').size().to_numpy()
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx, test_idx = next(gss.split(X, y, groups=df['srch_id']))

In [None]:
X_train_all = X.iloc[train_idx]
y_train_all = y.iloc[train_idx]
groups_train_all = df.iloc[train_idx]['srch_id']

In [None]:
# Validation split carved out of the non-test rows.
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))

In [None]:
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
# Rows per search, in ascending srch_id order.
# NOTE(review): assumes the rows are already ordered by srch_id so the sizes
# line up with the row order of X_train / X_val -- TODO confirm.
train_group = groups_train_all.iloc[train_idx_final].value_counts().sort_index().to_numpy()
val_group   = groups_train_all.iloc[val_idx].value_counts().sort_index().to_numpy()

In [None]:
def train_lgb_val(X_train, y_train, train_group, X_val, y_val, val_group):
    model = LGBMRanker(
        objective='lambdarank',         # Change if classification/regression
        metric='ndcg',                  # Or 'binary_logloss', 'rmse', etc.
        ndcg_eval_at=[5],               # Only for ranking
        learning_rate=0.05,             # Lower for large data = better generalization
        num_leaves=63,                  # 2^6 - 1; adjusts capacity (correlates with depth)
        max_depth=6,                    # Prevents overly deep trees
        min_data_in_leaf=20,            # Helps generalization (reduce if underfitting)
        min_split_gain=0.0,             # Allow all splits (LightGBM often benefits from this)
        lambda_l1=0.0,                  # Add if sparse data or noise-prone
        lambda_l2=1.0,                  # Mild regularization
        bagging_fraction=0.8,           # Row subsampling (improves speed and generalization)
        bagging_freq=1,                 # Perform bagging every iteration
        feature_fraction=0.8,           # Column subsampling (helps with 200+ features)
        n_estimators=940,             # Large number to allow early stopping
        boosting_type='gbdt',
        verbosity=-1,
        device='gpu'
    )
    
    model.fit(
        X_train, y_train,
        group=train_group,
        eval_set=[(X_val, y_val)],
        eval_group=[val_group],
        callbacks=[log_evaluation(period=50)]
    )
    
    return model, model.predict(X_val)

In [None]:
def train_lgb(X_train, y_train, train_group):
    model = LGBMRanker(
        objective='lambdarank',         # Change if classification/regression
        metric='ndcg',                  # Or 'binary_logloss', 'rmse', etc.
        ndcg_eval_at=[5],               # Only for ranking
        learning_rate=0.05,             # Lower for large data = better generalization
        num_leaves=63,                  # 2^6 - 1; adjusts capacity (correlates with depth)
        max_depth=6,                    # Prevents overly deep trees
        min_data_in_leaf=20,            # Helps generalization (reduce if underfitting)
        min_split_gain=0.0,             # Allow all splits (LightGBM often benefits from this)
        lambda_l1=0.0,                  # Add if sparse data or noise-prone
        lambda_l2=1.0,                  # Mild regularization
        bagging_fraction=0.8,           # Row subsampling (improves speed and generalization)
        bagging_freq=1,                 # Perform bagging every iteration
        feature_fraction=0.8,           # Column subsampling (helps with 200+ features)
        n_estimators=940,             # Large number to allow early stopping
        boosting_type='gbdt',
        verbosity=-1,
        device='gpu'
    )
    
    model.fit(
        X_train, y_train,
        group=train_group,
        # eval_set=[(X_val, y_val)],
        # eval_group=[val_group],
        # callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=50)]
    )
    
    return model

In [None]:
# Fit on every row of the engineered training set and save the booster.
group_sizes = df.groupby('srch_id').size().tolist()
model = train_lgb(X, y, group_sizes)
model.booster_.save_model('lgbm_full.txt')

In [None]:
# Fit on the train split only, logging the metric on the validation split.
# This rebinds `model`, replacing the full-data model from the previous cell.
model, preds = train_lgb_val(X_train, y_train, train_group, X_val, y_val, val_group)
model.booster_.save_model('lgbm_val.txt')

In [None]:
# Sanity check: the validation group sizes must add up to the row count.
print(sum(val_group) == len(X_val))

#### Evaluation

In [None]:
# Release the fitted sklearn wrapper and its validation predictions.
del model, preds

In [None]:
# NOTE(review): "lgbm_param_copy.txt" is not written by any cell visible in
# this notebook (the cells above save 'lgbm_full.txt' and 'lgbm_val.txt') --
# confirm this is the model meant to be evaluated.
lgb_model = lgb.Booster(model_file="lgbm_param_copy.txt")

In [None]:
# Held-out rows, including srch_id and relevance, for per-search scoring.
test_df = df.iloc[test_idx].copy()

In [None]:
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
# XGBoost-specific container for the held-out rows.
dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# A LightGBM booster predicts from the feature frame itself; an xgboost
# DMatrix is not a supported input type.
preds = lgb_model.predict(X_test)
test_df['pred'] = preds

# Compute NDCG@5 per srch_id and take the average
ndcg_scores = []
for srch_id, group in test_df.groupby('srch_id'):
    if len(group) < 5:
        # Searches with fewer than 5 listings are left out of the average.
        continue
    # ndcg_score expects 2-D arrays of shape (n_queries, n_documents).
    true_relevance = group['relevance'].values.reshape(1, -1)
    predicted_scores = group['pred'].values.reshape(1, -1)
    score = ndcg_score(true_relevance, predicted_scores, k=5)
    ndcg_scores.append(score)
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.5f}")

### Meta Model

In [None]:
def train_xgb_kfold(X_train, y_train, group_train, X_val, y_val, group_val, val_idx):
    """Train XGBoost on one fold and return its scores for the fold's validation rows.

    ``group_train`` / ``group_val`` are the per-search row counts for the two
    sets. ``val_idx`` is accepted but not used.
    """
    fold_train = xgb.DMatrix(X_train, label=y_train)
    fold_val = xgb.DMatrix(X_val, label=y_val)
    fold_train.set_group(group_train)
    fold_val.set_group(group_val)

    # Only the validation scores are needed; the booster itself is discarded.
    _, val_scores = train_xgb_validation(fold_train, fold_val)
    return val_scores

In [None]:
def train_lgb_kfold(X_train, y_train, group_train, X_val, y_val, group_val, val_idx):
    """Train LightGBM on one fold and return its scores for the fold's validation rows.

    ``val_idx`` is accepted but not used.
    """
    fitted = train_lgb_val(X_train, y_train, group_train, X_val, y_val, group_val)
    # train_lgb_val returns (model, validation scores); keep the scores only.
    return fitted[1]

In [None]:
df = load_dataset(training_file_stats)

In [None]:
# Make sure training features are only the ones available in the test file and don't include target feature relevance
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Search id per row (used to keep searches intact across folds) and fold count.
group = df['srch_id']
n_splits = 5

In [None]:
# Out-of-fold base-model scores and matching labels, one entry per fold.
features = []
labels = []

In [None]:
# Collect out-of-fold predictions from both base models; folds are grouped by
# srch_id so a search never appears on both sides of a split.
kfold = GroupKFold(n_splits=n_splits)
fold_splits = kfold.split(X, y, groups=group)
for fold, (train_idx_fold, val_idx_fold) in enumerate(fold_splits):
    print(f"Fold {fold + 1}")

    X_train, y_train = X.iloc[train_idx_fold], y.iloc[train_idx_fold]
    X_val, y_val = X.iloc[val_idx_fold], y.iloc[val_idx_fold]

    # Rows per search (query) for the train and validation parts of this fold.
    group_train = y_train.groupby(group.iloc[train_idx_fold]).size().to_numpy()
    group_val = y_val.groupby(group.iloc[val_idx_fold]).size().to_numpy()

    fold_args = (X_train, y_train, group_train, X_val, y_val, group_val, val_idx_fold)
    preds_xgb = train_xgb_kfold(*fold_args)
    preds_lgb = train_lgb_kfold(*fold_args)

    # One row per validation sample: [xgb score, lgb score].
    features.append(np.column_stack((preds_xgb, preds_lgb)))
    labels.append(y_val.values)

In [None]:
# Stack the per-fold results into one (n_samples, 2) matrix and label vector.
meta_X = np.vstack(features)
meta_y = np.hstack(labels)

# Scale each base-model score column to [0, 1] before fitting.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(meta_X)

# Ridge regression from the scaled base-model scores to the relevance label.
meta_model = Ridge()
meta_model.fit(scaled, meta_y)

In [None]:
# Persist the fitted scaler; the meta-model's inputs must be transformed with it.
joblib.dump(scaler, 'meta_scaler.pkl')

In [None]:
joblib.dump(meta_model, 'meta_model.pkl')

#### Evaluation

In [None]:
model = joblib.load('meta_model.pkl')

In [None]:
dtest = xgb.DMatrix(X_test)

In [None]:
xgb_test_preds = xgb_model.predict(dtest)
lgb_test_preds = lgb_model.predict(X_test)
meta_test_X = np.vstack((xgb_test_preds, lgb_test_preds)).T

total_pred = meta_model.predict(meta_test_X)

In [None]:
test_df = df.iloc[test_idx].copy()

In [None]:
test_df['pred'] = total_pred

# compute NDCG@5 per srch_id and take the average
ndcg_scores = []
for srch_id, group in test_df.groupby('srch_id'):
    if len(group) < 5:
        continue  # the @5 in NDCG@5
    true_relevance = group['relevance'].values.reshape(1, -1)
    predicted_scores = group['pred'].values.reshape(1, -1)
    score = ndcg_score(true_relevance, predicted_scores, k=5)
    ndcg_scores.append(score)
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.4f}")

#### Results
xgb/lgb: 

## Submission generator

In [None]:
# Free the training-time objects before loading the test set.
# NOTE: raises NameError if any of these names is not currently defined.
del df, X, y, group, X_train, y_train, X_val, y_val, group_train, group_val

In [None]:
xgb_model = xgb.Booster()
xgb_model.load_model('xgboost_model_final.json')

In [None]:
cat_model = CatBoostRanker()
cat_model.load_model("catboost_model.cbm")

In [None]:
lgb_model = lgb.Booster(model_file="lgbm_full.txt")

In [None]:
meta_model = joblib.load('meta_model.pkl')

In [None]:
# Engineered test set written by the Feature Engineering section.
test_df = load_dataset(test_file_stats)

In [None]:
# Model input: every column except the grouping column srch_id.
test_df_without_srch = test_df.drop('srch_id', axis=1)

In [None]:
dtest = xgb.DMatrix(test_df_without_srch, enable_categorical=True)

In [None]:
group_id = test_df['srch_id']
test_pool = Pool(test_df_without_srch, group_id=group_id)

In [None]:
xgb_pred = xgb_model.predict(dtest)

In [None]:
# NOTE(review): cat_pred is computed but not used by the blending or
# meta-model cells below.
cat_pred = cat_model.predict(test_pool)

In [None]:
lgb_pred = lgb_model.predict(test_df_without_srch)

In [None]:
# Variant 1 -- weighted blend: scale each model's scores to [0, 1] with a
# scaler fitted on the test predictions themselves, then mix 0.7 / 0.3.
scaler_xgb = MinMaxScaler()
scaler_lgb = MinMaxScaler()

xgb_scaled = scaler_xgb.fit_transform(xgb_pred.reshape(-1, 1)).flatten()
lgb_scaled = scaler_lgb.fit_transform(lgb_pred.reshape(-1, 1)).flatten()

xgb_proportion = 0.7
lgb_proportion = 0.3
total_pred = xgb_proportion * xgb_scaled + lgb_proportion * lgb_scaled

In [None]:
# Variant 2 -- meta-model: reuse the scaler fitted during meta-model training.
scaler = joblib.load('meta_scaler.pkl')
meta_X = np.vstack((xgb_pred, lgb_pred)).T
scaled = scaler.transform(meta_X)

In [None]:
# NOTE(review): this overwrites the blended total_pred from variant 1. When
# the notebook is run top to bottom, every submission file written below
# therefore contains the meta-model ranking. Run only the cells of the
# variant you want before writing its file.
total_pred = meta_model.predict(scaled)

In [None]:
test_df['pred'] = total_pred

In [None]:
# Sort prop_ids within search (srch_id) by descending predicted relevance
test_df_filtered = test_df[['srch_id', 'prop_id', 'pred']]
test_df_sorted = test_df_filtered.sort_values(by=["srch_id", "pred"], ascending=[True, False])

In [None]:
# Visual check of the ordering for the first searches.
test_df_sorted.head(30)

#### XGBoost rank

In [None]:
# Writes the ranking currently held in test_df_sorted.
# NOTE(review): no visible cell sets test_df['pred'] from the raw XGBoost
# scores alone -- confirm the frame holds the intended ranking before running.
df_submission = test_df_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23 (1).csv', index=False)

#### Ensemble

In [None]:
# Writes the ranking currently held in test_df_sorted; for the ensemble
# submission, test_df['pred'] must have been set from the weighted blend.
df_submission = test_df_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23_ensemble.csv', index=False)

#### Meta

In [None]:
# Writes the ranking currently held in test_df_sorted; for the meta
# submission, test_df['pred'] must have been set from the meta-model output.
df_submission = test_df_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23_meta.csv', index=False)

----------------------------

In [None]:
# Row/column count of the last submission frame that was written.
df_submission.shape