In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import ndcg_score

In [3]:
def load_dataset(filename):
    return pd.read_csv(filename)

training_file_raw = 'training_set_VU_DM.csv'
test_file_raw = 'test_set_VU_DM.csv'
training_file_stats = 'training_set_stats_VU_DM.csv'
test_file_stats = 'test_set_stats_VU_DM.csv'

In [4]:
def dataset_stats(df):
    print(f'Number of features: {len(df.columns)}')
    total_observations = len(df)
    print(f'Number of observations: {len(df)}')
    print(f'Number of rows with missing values: {df.isnull().any(axis=1).sum()}')
    print(f'Number of columns with missing values: {df.isnull().any(axis=0).sum()}')
    print(f'Percentage not-missing data for features with missing values:')
    for feature in df.columns[df.isnull().any()]:
        print(f"{feature}: {100*(total_observations - df[feature].isnull().sum())/total_observations:.2f}% not missing")
dataset_stats(load_dataset(training_file_raw))

Number of features: 54
Number of observations: 2476054
Number of rows with missing values: 2476054
Number of columns with missing values: 51
Percentage not-missing data for features with missing values:
visitor_location_country_id: 100.00% not missing
visitor_hist_starrating: 5.06% not missing
visitor_hist_adr_usd: 5.07% not missing
prop_country_id: 100.00% not missing
prop_id: 100.00% not missing
prop_starrating: 100.00% not missing
prop_review_score: 99.85% not missing
prop_brand_bool: 100.00% not missing
prop_location_score1: 100.00% not missing
prop_location_score2: 78.14% not missing
prop_log_historical_price: 100.00% not missing
position: 100.00% not missing
price_usd: 100.00% not missing
promotion_flag: 100.00% not missing
srch_destination_id: 100.00% not missing
srch_length_of_stay: 100.00% not missing
srch_booking_window: 100.00% not missing
srch_adults_count: 100.00% not missing
srch_children_count: 100.00% not missing
srch_room_count: 100.00% not missing
srch_saturday_night_

In [5]:
df_train = load_dataset(training_file_raw)
df_test = load_dataset(test_file_raw)

In [6]:
print(min(df_train['prop_starrating']))

0.0


In [7]:
def add_engineered_columns(df_raw):
    df = df_raw.copy()

    df.loc[df['price_usd'] > 2060.0355, 'price_usd'] = np.nan # 0.999 percent of data maar kunnen dit nog aanpassen
    df['price_per_night'] = df['price_usd'] / df['srch_length_of_stay']
    df['month'] = pd.to_datetime(df['date_time']).dt.month

    df['review_score_relative'] = (df['prop_review_score'] - df.groupby('srch_id')['prop_review_score'].transform('median'))
    df['price_relative'] = (df['price_usd'] - df.groupby('srch_id')['price_usd'].transform('median'))

    df['log_price_usd'] = np.log1p(df['price_usd'])
    df['log_price_per_night'] = np.log1p(df['price_per_night'])

    df['orig_dest_missing'] = df['orig_destination_distance'].isna().astype(int)
    df['loc_score2_missing'] = df['prop_location_score2'].isna().astype(int)

    #for col in ['orig_destination_distance', 'prop_location_score2']:
    #    if col in df.columns:
    #        df[col] = df[col].fillna(df[col].median())
    return df

In [8]:
def compute_prop_stats(df):
    exclude = [
        'srch_id', 'prop_id', 'position', 'click_bool', 'booking_bool',
        'gross_bookings_usd', 'relevance', 'price_usd', 'visitor_location_country_id',
        'prop_country_id', 'site_id', 'srch_destination_id'
    ]
    numeric = df.select_dtypes('number').columns.drop(exclude)
    means = df.groupby('prop_id')[numeric].mean().add_suffix('_mean')
    meds  = df.groupby('prop_id')[numeric].median().add_suffix('_median')
    stats = means.join(meds)
    return stats

def prepare_final_features(df, prop_stats):
    df = df.merge(prop_stats, on='prop_id', how='left')
    return df.drop(['position', 'click_bool', 'booking_bool', 'gross_bookings_usd', 'price_usd', 'date_time'], axis=1, errors='ignore') #ignores de waardes als er geen column is om te droppen

In [9]:
train_raw = pd.read_csv("training_set_VU_DM.csv")
test_raw  = pd.read_csv("test_set_VU_DM.csv")

train_fe = add_engineered_columns(train_raw)
test_fe  = add_engineered_columns(test_raw)

train_fe['relevance'] = 0
train_fe.loc[train_fe['click_bool'] == 1, 'relevance'] = 1
train_fe.loc[train_fe['booking_bool'] == 1, 'relevance'] = 5

prop_stats = compute_prop_stats(train_fe)

train_final = prepare_final_features(train_fe, prop_stats)
test_final  = prepare_final_features(test_fe, prop_stats)

In [26]:
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupShuffleSplit

# features + labels
X = train_final.drop(['srch_id', 'relevance'], axis=1)
y = train_final['relevance']

# grouping srch_id
groups = train_final.groupby('srch_id').size().to_numpy()

# splitting on srch_id
gss = GroupShuffleSplit(test_size=0.2, random_state=1)
train_idx, test_idx = next(gss.split(X, y, groups=train_final['srch_id']))

X_train_all = X.iloc[train_idx]
y_train_all = y.iloc[train_idx]
groups_train_all = train_final.iloc[train_idx]['srch_id']

# get val set out of training
gss_val = GroupShuffleSplit(test_size=0.2, random_state=2)
train_idx_final, val_idx = next(gss_val.split(X_train_all, y_train_all, groups=groups_train_all))

X_train = X_train_all.iloc[train_idx_final]
y_train = y_train_all.iloc[train_idx_final]
X_val = X_train_all.iloc[val_idx]
y_val = y_train_all.iloc[val_idx]

# group IDs
group_id_train = groups_train_all.iloc[train_idx_final].values
group_id_val = groups_train_all.iloc[val_idx].values

# making pools
train_pool = Pool(data=X_train, label=y_train, group_id=group_id_train)
val_pool = Pool(data=X_val, label=y_val, group_id=group_id_val)

# catboost model
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='YetiRank',
    eval_metric='NDCG:top=5',
    verbose=100
)

# train the model
model.fit(train_pool, eval_set=val_pool)

0:	test: 0.2398205	best: 0.2398205 (0)	total: 3.24s	remaining: 53m 56s
100:	test: 0.3783813	best: 0.3783813 (100)	total: 4m 50s	remaining: 43m 6s
200:	test: 0.3865961	best: 0.3865961 (200)	total: 8m 45s	remaining: 34m 49s
300:	test: 0.3913125	best: 0.3914641 (298)	total: 12m 32s	remaining: 29m 8s
400:	test: 0.3939231	best: 0.3943669 (396)	total: 16m 9s	remaining: 24m 8s
500:	test: 0.3947529	best: 0.3951660 (486)	total: 19m 22s	remaining: 19m 18s
600:	test: 0.3965940	best: 0.3965940 (600)	total: 22m 35s	remaining: 14m 59s
700:	test: 0.3959902	best: 0.3966720 (624)	total: 25m 42s	remaining: 10m 58s
800:	test: 0.3967029	best: 0.3968585 (793)	total: 28m 52s	remaining: 7m 10s
900:	test: 0.3982731	best: 0.3982731 (900)	total: 32m 1s	remaining: 3m 31s
999:	test: 0.3997913	best: 0.3999148 (997)	total: 35m 7s	remaining: 0us

bestTest = 0.3999148091
bestIteration = 997

Shrink model to first 998 iterations.


<catboost.core.CatBoostRanker at 0x11a9d0910>

In [27]:
feature_names = model.feature_names_
X_submit = test_final[feature_names]

catboost_preds = model.predict(X_submit)
test_final['pred'] = catboost_preds

test_df_filtered = test_final[['srch_id', 'prop_id', 'pred']]

test_df_sorted = test_df_filtered.sort_values(by=['srch_id', 'pred'], ascending=[True, False])

df_submission = test_df_sorted.drop(columns=['pred'])
df_submission.to_csv('VU-DM-2025-Group-23.csv', index=False)
print("file done")


file done


In [28]:
import itertools

base_params = {
    "iterations": 1000,
    "learning_rate": 0.1,
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=5",
    "verbose": 100,
    "random_seed": 42,
    "task_type": "CPU", 
    "early_stopping_rounds": 50
}

param_grid = {
    "depth": [6, 8],
    "l2_leaf_reg": [1, 5],
    "random_strength": [1, 5]
}

keys, values = zip(*param_grid.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

results = []

for i, combo in enumerate(param_combinations):
    print(f"\n Training model {i+1}/{len(param_combinations)} met params: {combo}")
    params = {**base_params, **combo}

    model = CatBoostRanker(**params)

    model.fit(
        train_pool,
        eval_set=val_pool,
        use_best_model=True
    )

    best_ndcg = model.get_best_score()["validation"]["NDCG:top=5"]

    results.append({
        **combo,
        "valid_ndcg@5": best_ndcg,
        "best_iteration": model.get_best_iteration()
    })

    del model 

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="valid_ndcg@5", ascending=False)
results_df.to_csv("catboost_grid_search_results.csv", index=False)

print("\n Beste resultaten:")
print(results_df.head())



🔧 Training model 1/8 met params: {'depth': 6, 'l2_leaf_reg': 1, 'random_strength': 1}
0:	test: 0.2346201	best: 0.2346201 (0)	total: 2.69s	remaining: 44m 42s
100:	test: 0.3781072	best: 0.3782044 (99)	total: 3m 43s	remaining: 33m 6s
200:	test: 0.3868314	best: 0.3868314 (200)	total: 6m 42s	remaining: 26m 41s
300:	test: 0.3920654	best: 0.3920654 (300)	total: 9m 43s	remaining: 22m 35s
400:	test: 0.3944636	best: 0.3946296 (394)	total: 12m 40s	remaining: 18m 55s
500:	test: 0.3968989	best: 0.3969003 (499)	total: 15m 35s	remaining: 15m 31s
600:	test: 0.3968503	best: 0.3974550 (562)	total: 18m 26s	remaining: 12m 14s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3974549705
bestIteration = 562

Shrink model to first 563 iterations.


KeyError: 'validation'