In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ndcg_score
from sklearn.linear_model import Ridge
import xgboost as xgb
from catboost import CatBoostRanker, Pool
import joblib

# Data

In [None]:
def load_dataset(filename: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed file, using pandas' default parsing options.
    """
    return pd.read_csv(filename)

# Raw input files (read by the exploration and feature-engineering cells).
training_file_raw = 'training_set_VU_DM.csv'
test_file_raw = 'test_set_VU_DM.csv'
# Feature-engineered files: written by the feature-engineering cells and read
# back by the model and submission cells.
training_file_stats = 'training_set_stats_VU_DM.csv'
test_file_stats = 'test_set_stats_VU_DM.csv'

In [None]:
# Load the raw training set and list its column names.
df = load_dataset(training_file_raw)
print(df.columns)

#### Features
|Feature  |Type | Description |Potential predictor|
|:------- |:----|:------------|------------------:|
|srch_id                     |int      |ID of search/user| |
|date_time                   |Datetime |Time of search| |
|site_id                     |int      |ID of Expedia link (.com/.co.uk/.co.jp...)| |
|visitor_location_country_id |int      |ID of user's country| *|
|visitor_hist_starrating     |float    |mean star rating of the customer's hotel purchases| *|
|visitor_hist_adr_usd        |float    |mean price per night of the customer's hotel purchases| *|
|prop_country_id             |int      |ID of the hotel's country| *|
|prop_id                     |int      |ID of hotel| |
|prop_starrating             |int      |star rating of hotel| *|
|prop_review_score           |float    |review score of hotel (rounded to 0.5)| **|
|prop_brand_bool             |int      |part of major hotel chain (1) or not (0)| *|
|prop_location_score1        |float    |desirability score of the hotel's location (primary score)                   |                  ** |
|prop_location_score2        |float    |desirability score of the hotel's location (secondary score)                 |                  ** |
|prop_log_historical_price   |float    |log of mean price of hotel in the last trading period                        |                   |
|price_usd                   |float    |displayed price of the hotel                                                 |                  ** |
|promotion_flag              |int      |1 if hotel had a sale price promotion                                        |                  ** |
|srch_destination_id         |int      |ID of the searched destination                                               |                   |
|srch_length_of_stay         |int      |number of nights in the stay                                                 |                  * |
|srch_booking_window         |int      |days between search and stay start                                           |                  * |
|srch_adults_count           |int      |number of adults in the search                                               |                  * |
|srch_children_count         |int      |number of children in the search                                             |                  * |
|srch_room_count             |int      |number of rooms in the search                                                |                  * |
|srch_saturday_night_bool    |bool     |1 if the stay includes a Saturday night                                      |                   |
|srch_query_affinity_score   |float    |log probability a hotel will be clicked on the internet                      |                   |
|orig_destination_distance   |float    |physical distance between hotel and customer                                 |                   |
|random_bool                 |bool     |1 if results were shown in random order                                      |                   |
|comp1_rate                  |int      |price comparison vs. competitor 1 (-1: higher, 0: same, +1: lower)           |                   |
|comp1_inv                   |int      |availability vs. competitor 1 (+1: competitor unavailable, 0: both available)|                   |
|comp1_rate_percent_diff     |float    |absolute percentage price difference with competitor 1                       |                   |
|comp2_rate                  |int      |same as comp1_rate for competitor 2                                          |                   |
|comp2_inv                   |int      |same as comp1_inv for competitor 2                                           |                   |
|comp2_rate_percent_diff     |float    |same as comp1_rate_percent_diff for competitor 2                             |                   |
|...                         |...      |same structure for competitors 3 through 8                                   |                   |
|position                    |int      |rank of hotel in search results (training data only)                         |                   |
|click_bool                  |bool     |1 if user clicked on the hotel  (training data only)                         |                   |
|booking_bool                |bool     |1 if user booked the hotel      (training data only)                         |                   |
|gross_bookings_usd           |float    |actual value of the booking (includes taxes, fees, etc.) (training data only)|                   |    

In [None]:
def dataset_stats(df):
    """Print a short missing-value summary of ``df``.

    Prints the number of columns and rows, how many rows and how many columns
    contain at least one missing value, and, for every column with missing
    values, the percentage of rows in which that column is present.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    n_rows = len(df)
    null_mask = df.isnull()                 # computed once, reused below
    has_missing = null_mask.any(axis=0)     # per column: any missing value?

    print(f'Number of features: {len(df.columns)}')
    print(f'Number of observations: {n_rows}')
    print(f'Number of rows with missing values: {null_mask.any(axis=1).sum()}')
    print(f'Number of columns with missing values: {has_missing.sum()}')
    print('Percentage not-missing data for features with missing values:')
    for feature in df.columns[has_missing]:
        n_present = n_rows - null_mask[feature].sum()
        print(f"{feature}: {100 * n_present / n_rows:.2f}% not missing")

# Summarise missing values in the raw training set loaded above.
dataset_stats(df)

## Feature Engineering

In [None]:
# Feature engineering input, option 1: the raw training set.
df = load_dataset(training_file_raw)

In [None]:
# Feature engineering input, option 2: the raw test set.
# NOTE(review): this rebinds the `df` loaded in the preceding cell, so when
# both cells run in order everything below operates on the test set.
df = load_dataset(test_file_raw)

In [None]:
# Work on a copy so the raw frame `df` stays available for comparison.
transform_df = df.copy()

Prices greater than 10000 are treated as outliers and set to missing (NaN); the rows themselves are kept.

In [None]:
# Prices above 10000 are treated as outliers: the value is blanked (NaN),
# the row itself is kept.
transform_df['price_usd'] = transform_df['price_usd'].mask(df['price_usd'] > 10000)

Prices are given as the total price of the stay; we convert them to a price per night.

In [None]:
# Price divided by the number of nights searched for.
# NOTE(review): a srch_length_of_stay of 0 would produce inf here — confirm the
# column is always >= 1.
transform_df['price_per_night'] = transform_df['price_usd'] / transform_df['srch_length_of_stay']

In [None]:
# Mean / std / median of: the raw price, the price after outlier masking, and
# the derived price per night (one line each, in that order).
for price_series in (df['price_usd'], transform_df['price_usd'], transform_df['price_per_night']):
    print(price_series.mean(), price_series.std(), price_series.median())

Datetime has many unique values and a single date might not be informative. We transform to seasons, which might be more informative for booking.

In [None]:
def get_season_label(month):
    """Map a month number to a numeric season code.

    Parameters
    ----------
    month : int
        Month number (1-12).

    Returns
    -------
    float
        1.0 for winter (Dec-Feb), 2.0 for spring (Mar-May), 3.0 for summer
        (Jun-Aug) and 4.0 for every other value, i.e. fall (Sep-Nov).
    """
    season_by_months = (
        ((12, 1, 2), 1.0),  # Winter
        ((3, 4, 5), 2.0),   # Spring
        ((6, 7, 8), 3.0),   # Summer
    )
    for months, label in season_by_months:
        if month in months:
            return label
    return 4.0  # Fall

# Parse the search timestamp, take its month and convert it to a season code.
transform_df['season'] = pd.to_datetime(df['date_time']).dt.month.map(get_season_label)

In [None]:
# Display the new season column as a sanity check.
transform_df['season']

Column 'relevance' is created containing the NDCG@5 relevance scores as explained in the assignment.

In [None]:
# Relevance grades used for NDCG@5 at model evaluation:
# 5 = booked, 1 = clicked but not booked, 0 = neither.
was_clicked = transform_df['click_bool'] == 1
was_booked = transform_df['booking_bool'] == 1
transform_df['relevance'] = 0
transform_df.loc[was_clicked, 'relevance'] = 1
# Applied last so a booking overrides the click grade.
transform_df.loc[was_booked, 'relevance'] = 5

Per property id (prop_id), the mean, standard deviation, and median are computed for each numerical feature available in the test file. This allows for the model to learn relative differences between searches.

In [None]:
# Columns for which no per-property mean/std/median is computed: ID features,
# price_usd, and the columns that exist only in the training data (position,
# click/booking flags, gross_bookings_usd and the derived relevance).
exclude_from_stat_cols = ['srch_id', 'prop_id', 'position',
                          'click_bool', 'booking_bool', 'gross_bookings_usd',
                          'relevance', 'price_usd', 'visitor_location_country_id',
                          'prop_country_id', 'site_id', 'srch_destination_id'
                         ]
# errors='ignore': the training-only columns are absent when transform_df holds
# the test set; without it Index.drop raises KeyError there.
numeric_cols = transform_df.select_dtypes(include='number').columns.drop(
    exclude_from_stat_cols, errors='ignore'
)

In [None]:
# Replaces any earlier exclude_from_stat_cols: only the two ID columns srch_id
# and prop_id are excluded from the per-property mean/std/median.
# NOTE(review): with this list every other numeric column gets aggregated,
# including position / click_bool / booking_bool / relevance when transform_df
# is the training set.
exclude_from_stat_cols = ['srch_id', 'prop_id'
                         ]

In [None]:
# Columns to aggregate per property: every numeric column except those in
# exclude_from_stat_cols and the columns that exist only in the training data.
# Aggregating the latter would feed label-derived statistics (e.g. the mean of
# booking_bool) to the model and create features the test frame cannot have.
training_only_cols = ['position', 'click_bool', 'booking_bool',
                      'gross_bookings_usd', 'relevance']
numeric_cols = (
    transform_df.select_dtypes(include='number').columns
    .drop(exclude_from_stat_cols, errors='ignore')   # tolerate columns missing in the test set
    .drop(training_only_cols, errors='ignore')
)

In [None]:
# Per-property mean of every column in numeric_cols; indexed by prop_id, with
# '_mean' appended to each column name.
prop_means = transform_df.groupby('prop_id')[numeric_cols].mean().add_suffix('_mean')

In [None]:
# Per-property standard deviation, columns suffixed '_std'. pandas uses the
# sample std (ddof=1), so a property that occurs only once yields NaN.
prop_stds = transform_df.groupby('prop_id')[numeric_cols].std().add_suffix('_std')

In [None]:
# Per-property median, columns suffixed '_median'.
prop_medians = transform_df.groupby('prop_id')[numeric_cols].median().add_suffix('_median')

In [None]:
# One row per prop_id holding all *_mean, *_std and *_median columns.
prop_stats = prop_means.join([prop_stds, prop_medians])

Join the created stat columns with the dataframe, drop all unused columns

In [None]:
# Raw columns removed from the training file: the training-only columns,
# price_usd (price_per_night was derived from it) and date_time (season was
# derived from it).
exclude_from_training_cols = ['position', 'click_bool', 'booking_bool',
                              'gross_bookings_usd', 'price_usd', 'date_time']

In [None]:
# Training variant: left-join the per-property stats onto every row, then drop
# the columns listed in exclude_from_training_cols.
df_with_stats = transform_df.merge(prop_stats, on='prop_id', how='left').drop(exclude_from_training_cols, axis=1)

In [None]:
# Persist the engineered training frame; the model sections reload it from disk.
df_with_stats.to_csv(training_file_stats, index=False)

In [None]:
# Raw columns removed from the test file. price_usd is dropped together with
# date_time because the training file drops both; keeping price_usd would give
# the test frame a feature the trained models have never seen. The other
# columns removed from the training file exist only in the training data.
exclude_from_test_cols = ['price_usd', 'date_time']

In [None]:
# Test variant: left-join the per-property stats onto every row, then drop the
# columns listed in exclude_from_test_cols. Rebinds df_with_stats.
df_with_stats = transform_df.merge(prop_stats, on='prop_id', how='left').drop(exclude_from_test_cols, axis=1)

In [None]:
# Persist the engineered test frame; the submission section reloads it from disk.
df_with_stats.to_csv(test_file_stats, index=False)

Note: the merge and the CSV write above are slow (35+ minutes for the training set).

# Models

### XGBoost rank (relevance score)

In [None]:
# Engineered training data written by the feature-engineering section.
df = load_dataset(training_file_stats)

In [None]:
# Features: every column except the grouping key (srch_id) and the target.
# Target: the relevance grade.
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Hold out 20% of the searches as a test set. Splitting by srch_id keeps all
# rows of one search on the same side of the split.
# NOTE(review): `groups` (rows per search) is not referenced by any later cell.
groups = df.groupby('srch_id').size().to_numpy()
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx, test_idx = next(gss.split(X, y, groups=df['srch_id']))

In [None]:
# Non-test rows (features, target and their search ids); split again below
# into training and validation parts.
X_train_all = X.iloc[train_idx]
y_train_all = y.iloc[train_idx]
groups_train_all = df.iloc[train_idx]['srch_id']

In [None]:
# Carve 20% of the remaining searches off as a validation set, again by srch_id.
# The returned indices are positions within X_train_all.
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))

In [None]:
# Final training and validation features/targets.
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
# Held-out test features/targets (indices from the first split).
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
# XGBoost input matrices for training and validation (labels = relevance).
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dval = xgb.DMatrix(X_val, y_val, enable_categorical=True)

In [None]:
# Set group info: search id of every training / validation row.
groups_train = groups_train_all.iloc[train_idx_final]
groups_val = groups_train_all.iloc[val_idx]

# The group sizes are consumed positionally by DMatrix.set_group, so they only
# describe the data if each search occupies one contiguous run of rows.
# A run starts wherever the id differs from the previous row.
for split_name, ids in (("train", groups_train), ("validation", groups_val)):
    n_runs = (ids != ids.shift()).sum()
    assert n_runs == ids.nunique(), f"{split_name}: rows of a srch_id are not contiguous"

# sort=False keeps the groups in order of first appearance (= row order). The
# default sorts by srch_id, which matches row order only when the frame itself
# is sorted by srch_id.
group_train = groups_train.groupby(groups_train, sort=False).size().to_numpy()
group_val = groups_val.groupby(groups_val, sort=False).size().to_numpy()

In [None]:
# Tell XGBoost how many consecutive rows belong to each search.
dtrain.set_group(group_train)
dval.set_group(group_val)

In [None]:
# Discard a booster left over from an earlier run so a stale model cannot be
# evaluated by accident. pop() is a no-op on a fresh kernel, where a bare
# `del model` raises NameError and stops "Run All".
globals().pop('model', None)

In [None]:
# Data sets whose metric is reported every few rounds. The last entry
# (validation) is the one early stopping monitors.
evals = [(dtrain, "train"), (dval, "validation")]

In [None]:
# Ranker hyper-parameters. The trailing number after each tuned value is the
# XGBoost default it replaces; commented-out entries were tried and left
# disabled.
params = {
    "objective": "rank:ndcg", # ranking as opposed to regression or binary classification
    "tree_method": "hist",
    "device": "gpu",          # making use of gpu in training
    "eval_metric": "ndcg@5",
    "eta": 0.1,                  # 0.3
    "max_depth": 4,               # 6
    "min_child_weight": 15,       # 1
    "gamma": 1,                   # 0
    "subsample": 0.85,             # 1
    # "colsample_bytree": 0.8,      # 1
    # "lambda": 0.5,                # 1
    # "alpha": 0.5,                 # 0
    # "lambdarank_num_pair_per_sample": 5,
    # "lambdarank_unbiased": "true"
}

# Train until validation ndcg@5 stops improving.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=3000,     # upper bound only; training is expected to end through early stopping
    evals=evals,
    early_stopping_rounds=30, # stop when the monitored metric has not improved for 30 rounds (limits overfitting)
    verbose_eval=5            # print the metrics every 5 rounds
)

#### Evaluation 
**transformed set**  
Overfitting is determined by train-ndcg@5 - validation-ndcg@5 $> 0.05$  
  
Default values, proper train/test/val split: overfitting at validation-ndcg@5:0.38381  
  
eta:  
- 0.2: overfitting at validation-ndcg@5:0.38553
- 0.1: overfitting at validation-ndcg@5:0.38636
- 0.01: takes VERY long
- winner: 0.1
  
max_depth:
- 5: overfitting at validation-ndcg@5:0.38869
- 4: finished with validation-ndcg@5:0.38928
- winner: 4
  
min_child_weight:
- 5: overfitting at validation-ndcg@5:0.38385
- 10: overfitting at validation-ndcg@5:0.38606
- 15: overfitting at validation-ndcg@5:0.38667
- no massive difference between 10 and 15, evaluate again later
  
gamma:
- 1: overfitting at validation-ndcg@5:0.38436
- 2: overfitting at validation-ndcg@5:0.38365
- winner: 1

subsample:
- 0.8: overfitting at validation-ndcg@5:0.38525
- 0.6: overfitting at validation-ndcg@5:0.38117
- 0.75: overfitting at validation-ndcg@5:0.38330
- 0.85: overfitting at validation-ndcg@5:0.38591
- winner: 0.85
  
all winners so far together: finished with validation-ndcg@5:0.39155
  
lambdarank_num_pair_per_sample 5: causes fast overfitting, maybe try again later  
  
**untransformed set**

In [None]:
# Save the trained booster as JSON so later sections can load it instead of
# retraining.
model.save_model('xgboost_model_fitted.json')

In [None]:
# Reload the booster from disk (lets evaluation run without retraining).
model = xgb.Booster()
model.load_model('xgboost_model_fitted.json')

In [None]:
# Full hold-out rows (incl. srch_id and relevance) for per-search scoring.
df_test = df.iloc[test_idx].copy()

In [None]:
# XGBoost input matrix for the hold-out set.
dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Score every hold-out row with the booster.
preds = model.predict(dtest)
df_test['pred'] = preds

# NDCG@5 per search, averaged over the searches. Searches listing fewer than
# 5 properties are left out (the "@5" in NDCG@5).
ndcg_scores = [
    ndcg_score(
        rows['relevance'].to_numpy()[np.newaxis, :],
        rows['pred'].to_numpy()[np.newaxis, :],
        k=5,
    )
    for _, rows in df_test.groupby('srch_id')
    if len(rows) >= 5
]
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.5f}")

### Results
mean/std/median + transformed data: 0.39400  
mean/std/median: 0.40245

### CatBoost rank

In [None]:
# Engineered training data written by the feature-engineering section.
df = load_dataset(training_file_stats)

In [None]:
# Features: every column of the engineered file except the grouping key
# (srch_id) and the target. Target: the relevance grade.
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Hold out 20% of the searches as a test set (same seed as the XGBoost
# section), keeping each search intact.
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx_all, test_idx = next(gss.split(X, y, groups=df['srch_id']))
X_train_all, y_train_all = X.iloc[train_idx_all], y.iloc[train_idx_all]
groups_train_all = df.iloc[train_idx_all]['srch_id']

In [None]:
# Split the remaining searches 80/20 into training and validation parts.
# Indices are positions within X_train_all.
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
# Search id of every training / validation row, row-aligned with X_train and
# X_val (groups_train_all already holds the ids of the non-test rows).
train_group_id = groups_train_all.iloc[train_idx_final]
val_group_id = groups_train_all.iloc[val_idx]

In [None]:
# CatBoost data sets; group_id tells the ranker which rows form one search.
train_pool = Pool(X_train, y_train, group_id=train_group_id)
val_pool = Pool(X_val, y_val, group_id=val_group_id)

In [None]:
# Discard a CatBoost model left over from an earlier run. pop() is a no-op on
# a fresh kernel, where a bare `del cat_model` raises NameError and stops
# "Run All".
globals().pop('cat_model', None)

In [None]:
# CatBoost ranker, evaluated with NDCG on the top 5 results per search.
cat_model = CatBoostRanker(
    iterations=3000,
    learning_rate=0.2,
    depth=5,
    loss_function='YetiRank',
    eval_metric='NDCG:top=5',
    task_type='CPU',
    random_seed=2,
    early_stopping_rounds=30,
    verbose=5
)
# NOTE(review): two eval sets are passed (training and validation pools) —
# confirm in the CatBoost docs that early stopping monitors the validation pool.
cat_model.fit(train_pool, eval_set=[train_pool, val_pool])

In [None]:
# Held-out test features/targets (indices from the first split).
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
# CatBoost scores for every hold-out row.
preds_cat = cat_model.predict(X_test)

In [None]:
# Full hold-out rows (incl. srch_id and relevance) for per-search scoring.
df_test = df.iloc[test_idx].copy()

In [None]:
# Attach the CatBoost scores to the hold-out rows.
df_test['pred'] = preds_cat

# NDCG@5 per search, averaged over the searches. Searches listing fewer than
# 5 properties are left out (the "@5" in NDCG@5).
ndcg_scores = [
    ndcg_score(
        rows['relevance'].to_numpy()[np.newaxis, :],
        rows['pred'].to_numpy()[np.newaxis, :],
        k=5,
    )
    for _, rows in df_test.groupby('srch_id')
    if len(rows) >= 5
]
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.4f}")

In [None]:
# Save the trained ranker so later sections can load it instead of retraining.
cat_model.save_model("catboost_model.cbm")

### Results
mean/median/std, same settings as xgboost: 0.3920  
mean/median/std, lr 0.2, depth 5: 0.3973

### Ensemble

In [None]:
# Engineered training data written by the feature-engineering section.
df = load_dataset(training_file_stats)

In [None]:
# Features: every column of the engineered file except the grouping key
# (srch_id) and the target. Target: the relevance grade.
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Same grouped 80/20 split and seed as the single-model sections, so the
# hold-out searches are the ones the saved models did not train on.
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx_all, test_idx = next(gss.split(X, y, groups=df['srch_id']))
X_train_all, y_train_all = X.iloc[train_idx_all], y.iloc[train_idx_all]
groups_train_all = df.iloc[train_idx_all]['srch_id']

In [None]:
# Training / validation split of the non-test searches (same seed as before).
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
# Load the XGBoost ranker saved by the XGBoost section.
xgb_model = xgb.Booster()
xgb_model.load_model('xgboost_model_fitted.json')

In [None]:
# Load the CatBoost ranker saved by the CatBoost section.
cat_model = CatBoostRanker()
cat_model.load_model("catboost_model.cbm")

In [None]:
# Held-out test features/targets.
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
# XGBoost input matrix for the hold-out set.
dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Raw scores of both rankers for the hold-out rows.
xgb_pred = xgb_model.predict(dtest)
cat_pred = cat_model.predict(X_test)

# Put both score columns on a common [0, 1] scale before mixing them;
# MinMaxScaler rescales each column independently.
all_preds = np.vstack([xgb_pred, cat_pred]).T
scaled = MinMaxScaler().fit_transform(all_preds)
xgb_scaled, cat_scaled = scaled[:, 0], scaled[:, 1]

# Blend weights. Despite the *_ndcg names these are used only as relative
# weights; they are normalised to sum to 1 below.
xgb_ndcg = 0.7
cat_ndcg = 0.3
xgb_proportion = xgb_ndcg / (xgb_ndcg + cat_ndcg)
cat_proportion = cat_ndcg / (xgb_ndcg + cat_ndcg)
total_pred = xgb_proportion * xgb_scaled + cat_proportion * cat_scaled

In [None]:
# Full hold-out rows (incl. srch_id and relevance) for per-search scoring.
test_df = df.iloc[test_idx].copy()

In [None]:
# Index-aligned re-assignment of the relevance column. test_df is a copy of
# rows of df, so it already holds these values.
test_df['relevance'] = df['relevance']

In [None]:
# Score this section's own hold-out frame (test_df) so the cell does not
# depend on a df_test left over from an earlier section.
test_df['pred'] = total_pred

# Compute NDCG@5 per srch_id and take the average
ndcg_scores = []
for srch_id, group in test_df.groupby('srch_id'):
    if len(group) < 5:
        continue  # the @5 in NDCG@5
    # ndcg_score expects 2-D input: one row per query.
    true_relevance = group['relevance'].values.reshape(1, -1)
    predicted_scores = group['pred'].values.reshape(1, -1)
    score = ndcg_score(true_relevance, predicted_scores, k=5)
    ndcg_scores.append(score)
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.4f}")

### Results
xgboost and catboost same parameters, weighted on ndcg score: 0.4000  
catboost lr 0.2, depth 5, weighted on ndcg score: 0.4019  
xgboost and catboost same parameters, 80/20 weighting: 0.4029  
xgboost and catboost same parameters, 90/10 weighting: 0.4029  
xgboost and catboost same parameters, 70/30 weighting: 0.4028  

### Meta Model (XGBoost/CatBoost + Ridge regression)

In [None]:
# Engineered training data written by the feature-engineering section.
df = load_dataset(training_file_stats)

In [None]:
# Features: every column of the engineered file except the grouping key
# (srch_id) and the target. Target: the relevance grade.
X = df.drop(['srch_id', 'relevance'], axis=1)
y = df['relevance']

In [None]:
# Same grouped 80/20 split and seed as the single-model sections.
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx_all, test_idx = next(gss.split(X, y, groups=df['srch_id']))
X_train_all, y_train_all = X.iloc[train_idx_all], y.iloc[train_idx_all]
groups_train_all = df.iloc[train_idx_all]['srch_id']

In [None]:
# Training / validation split of the non-test searches; the validation part
# is what the meta model is fitted on below.
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))
X_train, y_train = X_train_all.iloc[train_idx_final], y_train_all.iloc[train_idx_final]
X_val, y_val = X_train_all.iloc[val_idx], y_train_all.iloc[val_idx]

In [None]:
# Load the XGBoost ranker saved by the XGBoost section.
xgb_model = xgb.Booster()
xgb_model.load_model('xgboost_model_fitted.json')

In [None]:
# Load the CatBoost ranker saved by the CatBoost section.
cat_model = CatBoostRanker()
cat_model.load_model("catboost_model.cbm")

In [None]:
# Validation features only (no labels needed for prediction).
dval = xgb.DMatrix(X_val, enable_categorical=True)

In [None]:
# Base-model scores on the validation rows become the meta model's two inputs.
xgb_val_pred = xgb_model.predict(dval)
cat_val_pred = cat_model.predict(X_val)

# NOTE(review): Ridge learns its own coefficient per column, so the fixed
# 0.8 / 0.2 pre-scaling mainly changes how strongly the L2 penalty acts on
# each input.
meta_X_val = np.column_stack((
    0.8 * xgb_val_pred,
    0.2 * cat_val_pred
))
meta_model = Ridge()

In [None]:
# Fit the stacker on validation predictions vs. the true relevance grades.
meta_model.fit(meta_X_val, y_val)

In [None]:
# Persist the fitted meta model.
joblib.dump(meta_model, 'meta_model.pkl')

In [None]:
# Reload the meta model from disk. joblib files are pickles: only load files
# you created yourself.
meta_model = joblib.load('meta_model.pkl')

In [None]:
# Held-out test features/targets.
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [None]:
# XGBoost input matrix for the hold-out set.
dtest = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Base-model scores on the hold-out rows, scaled exactly like the inputs the
# meta model was fitted on, then combined by the meta model.
xgb_test_pred = xgb_model.predict(dtest)
cat_test_pred = cat_model.predict(X_test)
meta_X_test = np.column_stack((
    0.8 * xgb_test_pred,
    0.2 * cat_test_pred
))
total_pred = meta_model.predict(meta_X_test)

In [None]:
# Full hold-out rows (incl. srch_id and relevance) for per-search scoring.
df_test = df.iloc[test_idx].copy()

In [None]:
# Re-assigns the relevance grades. df_test is a copy of rows of df, so it
# already holds the same values.
df_test['relevance'] = y_test

In [None]:
# Attach the meta model's scores to the hold-out rows.
df_test['pred'] = total_pred

# NDCG@5 per search, averaged over the searches. Searches listing fewer than
# 5 properties are left out (the "@5" in NDCG@5).
ndcg_scores = [
    ndcg_score(
        rows['relevance'].to_numpy()[np.newaxis, :],
        rows['pred'].to_numpy()[np.newaxis, :],
        k=5,
    )
    for _, rows in df_test.groupby('srch_id')
    if len(rows) >= 5
]
mean_ndcg = np.mean(ndcg_scores)
print(f"Mean NDCG@5: {mean_ndcg:.4f}")

#### Results
equal weight: 0.4023  
0.8/0.2 split: 0.4023

### XGBoost categorical (yes/no booking)

In [None]:
# The booking classifier is trained on the raw training columns. `df` holds
# the engineered stats frame at this point, which no longer contains position,
# click_bool, booking_bool, gross_bookings_usd or date_time, so the raw file is
# loaded into its own variable instead of being read from `df`.
booking_df = load_dataset(training_file_raw)
X = booking_df.drop(['position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'date_time'], axis=1)
y = booking_df[['booking_bool']]
# Kept for compatibility; the 0/1 labels in `y` are what the cells below use.
y_encoded = OrdinalEncoder().fit_transform(y)
# Row-wise (not per-search) split with the default 75/25 proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [None]:
# Binary classifier: probability that a listed property gets booked.
params = {
    "objective": "binary:logistic",
    "tree_method": "hist",
    "device": "cuda",
    "eval_metric": ["error", "logloss", "auc"]
         }
# NOTE(review): the set labelled "validation" is the test split, so early
# stopping is tuned on the same rows that are scored in the next cell.
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]
n = 200  # maximum number of boosting rounds


model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=10,
   early_stopping_rounds=20
)

In [None]:
# Turn predicted booking probabilities into 0/1 labels at a 0.5 threshold and
# compare them with the true labels stored in the DMatrix.
y_proba = model.predict(dtest_reg)
y_pred = (y_proba > 0.5).astype(int)
y_true = dtest_reg.get_label()
print(y_pred[:10])
print(y_true[:10])
# NOTE(review): bookings are presumably a small minority of rows, in which case
# accuracy alone says little — check the per-class figures below.
acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc)

print("\nClassification Report:")
print(classification_report(y_true, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))

### Results
all but date_time: 0.9719390409870384

## Submission generator

In [None]:
# Load the XGBoost ranker saved by the XGBoost section.
xgb_model = xgb.Booster()
xgb_model.load_model('xgboost_model_fitted.json')

In [None]:
# Load the CatBoost ranker saved by the CatBoost section.
cat_model = CatBoostRanker()
cat_model.load_model("catboost_model.cbm")

In [None]:
# Engineered test data written by the feature-engineering section.
test_df = load_dataset(test_file_stats)

In [None]:
# Remove per-property statistics of the ID columns, if present. This works on
# the frame loaded from test_file_stats in the previous cell rather than on
# the in-memory df_with_stats, which exists only when the feature-engineering
# cells ran in this kernel. errors='ignore' keeps the cell re-runnable and
# tolerates files written without these columns.
test_df = test_df.drop(['srch_id_std', 'srch_id_mean', 'srch_id_median', 'prop_id_mean', 'prop_id_std', 'prop_id_median'], axis=1, errors='ignore')

In [None]:
# Model input: everything except the grouping key (the models were trained
# without srch_id).
test_df_without_srch = test_df.drop('srch_id', axis=1)

In [None]:
# XGBoost input matrix for the test set (no labels available).
dtest = xgb.DMatrix(test_df_without_srch, enable_categorical=True)

In [None]:
# CatBoost data set for the test rows, grouped by search.
group_id = test_df['srch_id']
test_pool = Pool(test_df_without_srch, group_id=group_id)

In [None]:
# Raw scores of both rankers for every test row.
xgb_pred = xgb_model.predict(dtest)
cat_pred = cat_model.predict(test_pool)

# Rescale each model's scores to [0, 1] separately so they can be mixed.
scaler_xgb = MinMaxScaler()
scaler_cat = MinMaxScaler()

xgb_scaled = scaler_xgb.fit_transform(xgb_pred.reshape(-1, 1)).flatten()
cat_scaled = scaler_cat.fit_transform(cat_pred.reshape(-1, 1)).flatten()

# Blend weights (normalised to sum to 1 below); despite the *_ndcg names they
# are relative weights, not NDCG values.
xgb_ndcg = 0.85   # based on best average ndcg from evaluation
cat_ndcg = 0.15
xgb_proportion = xgb_ndcg / (xgb_ndcg + cat_ndcg)
cat_proportion = cat_ndcg / (xgb_ndcg + cat_ndcg)
total_pred = xgb_proportion * xgb_scaled + cat_proportion * cat_scaled

In [None]:
# Score used for ranking below.
# NOTE(review): this is the raw XGBoost score; the blended total_pred computed
# in the cell above is not used here.
test_df['pred'] = xgb_pred

In [None]:
# Keep only the submission columns plus the score, then order the properties
# within each search (srch_id ascending) by descending predicted relevance.
test_df_filtered = test_df[['srch_id', 'prop_id', 'pred']]
test_df_sorted = test_df_filtered.sort_values(by=["srch_id", "pred"], ascending=[True, False])

In [None]:
# Preview the ranking of the first searches.
test_df_sorted.head(30)

#### XGBoost rank

In [None]:
# Submission file: srch_id / prop_id pairs in ranked order, score removed.
df_submission = test_df_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23 (1).csv', index=False)

#### CatBoost + XGBoost ensemble

In [None]:
# Rank by the blended XGBoost + CatBoost score. test_df_sorted is ordered by
# the 'pred' column, which holds the raw XGBoost score, so writing it here
# would store the XGBoost ranking under the ensemble file name.
ensemble_sorted = (
    test_df[['srch_id', 'prop_id']]
    .assign(pred=total_pred)   # total_pred is row-aligned with test_df
    .sort_values(by=["srch_id", "pred"], ascending=[True, False])
)
df_submission = ensemble_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23 (2).csv', index=False)

#### XGBoost categorical

In [None]:
# NOTE(review): this writes the same test_df_sorted as the cells above. No
# prediction from the booking classifier is involved unless the 'pred' column
# was reassigned and the sort cell re-run beforehand.
df_submission = test_df_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23.csv', index=False)

----------------------------

In [None]:
# Sanity check: (number of rows, number of columns) of the last submission frame.
df_submission.shape