In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from lightning.pytorch import Trainer, LightningModule, LightningDataModule
from lightning.pytorch.callbacks import EarlyStopping

In [3]:
train_df = pd.read_csv("data/cleaned_training_data.csv")
test_df  = pd.read_csv("data/cleaned_test_data.csv")

In [4]:
train_df["relevance"] = 5 * train_df["booking_bool"].fillna(0) + 1 * train_df["click_bool"].fillna(0)

In [5]:
user_cols = [
    "visitor_hist_starrating", "visitor_hist_adr_usd", "query_affinity_score_cleaned",
    "query_affinity_missing", "orig_destination_distance", "srch_length_of_stay",
    "srch_booking_window", "srch_adults_count", "srch_children_count",
    "srch_room_count", "srch_saturday_night_bool",
    # temporal cyclic encodings
    "month_sin", "month_cos", "search_hour_sin", "search_hour_cos",
    "day_of_week_sin", "day_of_week_cos"
]

property_cols = [

    # quality / rating
    "prop_starrating",
    "prop_review_score_filled",
    "has_usable_review",
    "prop_brand_bool",

    # current‐price signals
    "price_usd_without_promo", 
    "promotion_flag",

    # competitive set metrics
    "num_comps_lower",
    "num_comps_higher",
]

In [6]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_df[user_cols + property_cols])

# Use only training rows without NaNs for training the GAN
df_train_clean = train_df.dropna(subset=user_cols + property_cols)

# Train GAN on relevant columns only
gan = CTGANSynthesizer(metadata, verbose=True, epochs=50)
gan.fit(df_train_clean[user_cols + property_cols])


Gen. (-0.17) | Discrim. (-0.17): 100%|██████████| 50/50 [1:17:13<00:00, 92.66s/it] 


In [8]:
train_df_fill = train_df.copy()

# Find rows with any NaNs
nan_rows = train_df_fill[user_cols + property_cols].isnull().any(axis=1)

# Generate synthetic samples
synth_data = gan.sample(num_rows=nan_rows.sum())

# Replace missing rows with synthetic ones (only relevant columns)
train_df_fill.loc[nan_rows, user_cols + property_cols] = synth_data[user_cols + property_cols].values
    

In [9]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(test_df[user_cols + property_cols])

# Use only training rows without NaNs for training the GAN
df_test_clean = test_df.dropna(subset=user_cols + property_cols)

# Train GAN on relevant columns only
gan = CTGANSynthesizer(metadata, verbose=True, epochs=50)
gan.fit(df_test_clean[user_cols + property_cols])

Gen. (-0.55) | Discrim. (-0.41): 100%|██████████| 50/50 [42:48<00:00, 51.37s/it]


In [11]:
test_df_fill = test_df.copy()

# Find rows with any NaNs
nan_rows = test_df_fill[user_cols + property_cols].isnull().any(axis=1)

# Generate synthetic samples
synth_data = gan.sample(num_rows=nan_rows.sum())

# Replace missing rows with synthetic ones (only relevant columns)
test_df_fill.loc[nan_rows, user_cols + property_cols] = synth_data[user_cols + property_cols].values
    

In [12]:
# Get a subset of srch_ids
# sampled_srch_ids = train_df["srch_id"].drop_duplicates().sample(n=50000, random_state=42)

# Then get all rows (hotels) for those srch_ids
#train_sample = train_df[train_df["srch_id"].isin(sampled_srch_ids)].reset_index(drop=True)

#sampled_srch_ids_test = test_df["srch_id"].drop_duplicates().sample(n=10000, random_state=42)
#test_sample = test_df[test_df["srch_id"].isin(sampled_srch_ids_test)].reset_index(drop=True)

#small_dataset = {
#    "train": train_sample,
#    "test": test_sample
#}

In [13]:
class ExpediaDataset(Dataset):
    def __init__(self, df, user_cols, hotel_cols):
        self.df = df.reset_index(drop=True)
        self.user_cols = user_cols
        self.hotel_cols = hotel_cols
        self.group_indices = [group.index.to_numpy() for _, group in self.df.groupby("srch_id")]

    def __len__(self):
        return len(self.group_indices)

    def __getitem__(self, idx):
        indices = self.group_indices[idx]
        group = self.df.iloc[indices]

        users = torch.tensor(group[self.user_cols].values, dtype=torch.float32)
        hotels = torch.tensor(group[self.hotel_cols].values, dtype=torch.float32)
        rel = torch.tensor(group["relevance"].values, dtype=torch.float32)
        srch_id = group["srch_id"].iloc[0]

        return users, hotels, rel, srch_id

In [14]:
class ExpediaDataModule(LightningDataModule):
    def __init__(
        self,
        df,
        user_cols,
        hotel_cols,
        batch_size=32,
        val_frac=0.1,
        random_state=42,
        num_workers=4
    ):
        super().__init__()
        self.df = df
        self.user_cols = user_cols
        self.hotel_cols = hotel_cols
        self.batch_size = batch_size
        self.val_frac = val_frac
        self.random_state = random_state
        self.num_workers = num_workers

    def setup(self, stage=None):
        # Split unique srch_ids to avoid leakage
        srch_ids = self.df["srch_id"].unique()
        train_ids, val_ids = train_test_split(srch_ids, test_size=self.val_frac, random_state=self.random_state)

        train_df = self.df[self.df["srch_id"].isin(train_ids)].copy()
        val_df = self.df[self.df["srch_id"].isin(val_ids)].copy()

        self.train_dataset = ExpediaDataset(train_df, self.user_cols, self.hotel_cols)
        self.val_dataset = ExpediaDataset(val_df, self.user_cols, self.hotel_cols)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=self.collate_fn
        )

    @staticmethod
    def collate_fn(batch):
    # batch is a list of (users, hotels, rel, srch_id)
        users, hotels, rels, srch_ids = zip(*batch)
        return list(users), list(hotels), list(rels), list(srch_ids)


In [15]:
def listnet_loss(preds: torch.Tensor, rels: torch.Tensor) -> torch.Tensor:
    """
    Computes ListNet loss between predicted and true relevance scores.
    Both tensors should be 1D and from a single group (query).
    """
    pred_prob = torch.softmax(preds, dim=0)
    rel_prob = torch.softmax(rels, dim=0)
    return -torch.sum(rel_prob * torch.log(pred_prob + 1e-8))


In [16]:
class RankingTower(LightningModule):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout=0.2):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, hidden_dims[0])
        self.blocks = nn.ModuleList()
        self.norms = nn.ModuleList()

        for i in range(len(hidden_dims) - 1):
            self.blocks.append(nn.Linear(hidden_dims[i], hidden_dims[i + 1]))
            self.norms.append(nn.LayerNorm(hidden_dims[i + 1]))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.relu(self.input_proj(x))
        for block, norm in zip(self.blocks, self.norms):
            residual = x
            x = F.relu(block(x))
            x = norm(x)
            x = self.dropout(x)
            x = x + residual[:, :x.size(1)]  # Residual connection (with slicing if dims shrink)
        return F.normalize(x, p=2, dim=1)  # Ready for cosine similarity


In [17]:
def lambda_loss(scores, labels, sigma=1.0):
    """
    scores: Tensor of shape [n_items] (predicted relevance scores)
    labels: Tensor of shape [n_items] (true relevance, e.g., 0/1/2)
    """
    device = scores.device
    n = scores.size(0)

    # Pairwise label differences
    label_diff = labels.unsqueeze(1) - labels.unsqueeze(0)
    S_ij = torch.sign(label_diff)  # +1, 0, -1

    # Ignore pairs with equal labels
    valid_pair_mask = S_ij != 0

    # Predicted score differences
    pred_diff = scores.unsqueeze(1) - scores.unsqueeze(0)

    # ΔNDCG weights
    with torch.no_grad():
        # Compute ideal DCG
        sorted_labels, _ = torch.sort(labels, descending=True)
        ideal_dcg = ((2 ** sorted_labels - 1) / torch.log2(torch.arange(2, 2 + n, device=device).float())).sum()

        # Compute actual ΔDCG from swapping i and j
        gain = 2 ** labels - 1
        rank = torch.argsort(torch.argsort(-scores))  # predicted rank
        inv_log_rank = 1.0 / torch.log2(rank.float() + 2.0)
        dcg = gain * inv_log_rank

        delta_ndcg = torch.abs(
            (gain.unsqueeze(1) - gain.unsqueeze(0)) *
            (inv_log_rank.unsqueeze(1) - inv_log_rank.unsqueeze(0))
        )
        delta_ndcg = delta_ndcg / (ideal_dcg + 1e-10)

    # Logistic loss
    loss_matrix = delta_ndcg * F.softplus(-sigma * S_ij * pred_diff)  # softplus(x) = log(1 + exp(x))
    return loss_matrix[valid_pair_mask].mean()


In [18]:
import torch
import pytorch_lightning as L
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchmetrics.retrieval import RetrievalNormalizedDCG


class TTranker(LightningModule):
    def __init__(self, hotel_dim, user_dim, embedding_dim=64, lr=1e-3, dropout=0.5):
        super().__init__()
        self.save_hyperparameters()

        self.tower_user = RankingTower(user_dim)
        self.tower_property = RankingTower(hotel_dim)
        #self.scorer = scorer_block(embedding_dim, dropout)

        self.ndcg5 = RetrievalNormalizedDCG(top_k=5)

    def forward(self, user_features, hotel_features):
        user_embed = self.tower_user(user_features)
        hotel_embed = self.tower_property(hotel_features)

        if torch.isnan(user_embed).any() or torch.isnan(hotel_embed).any():
            print("NaNs in embeddings!")

        if torch.all(user_embed == 0) or torch.all(hotel_embed == 0):
            print("Warning: all-zero embeddings!")

        user_embed = F.normalize(user_embed, dim=1)
        hotel_embed = F.normalize(hotel_embed, dim=1)
    
        return F.cosine_similarity(user_embed, hotel_embed)


    def training_step(self, batch, batch_idx):
        user_groups, hotel_groups, relevance_groups, srch_ids = batch
        losses = []

        for users, hotels, rels in zip(user_groups, hotel_groups, relevance_groups):
            users = users.to(self.device)
            hotels = hotels.to(self.device)
            rels = rels.to(self.device)

            preds = self(users, hotels)
            loss = lambda_loss(preds, rels)
            losses.append(loss)

        total_loss = torch.stack(losses).mean()
        self.log("train_loss", total_loss, prog_bar=True)
        return total_loss

    def validation_step(self, batch, batch_idx):
        user_groups, hotel_groups, relevance_groups, srch_ids = batch

        for users, hotels, rel, srch_id in zip(user_groups, hotel_groups, relevance_groups, srch_ids):
            preds = self(users, hotels)
            group_size = len(rel)

        # Create an index tensor with the same srch_id value for all items
            group_index = torch.full((group_size,), fill_value=srch_id, dtype=torch.long, device=rel.device)

        # Add group to NDCG@5 metric
            self.ndcg5.update(preds, rel, indexes=group_index)

        # Optionally log per-group loss
            loss = F.mse_loss(preds, rel)
            self.log("val_loss", loss, prog_bar=False, on_step=False, on_epoch=True, batch_size=group_size)
    
    def predict_step(self, batch, batch_idx):
        user_groups, hotel_groups, prop_ids_groups, srch_ids = batch
        results = []

        for users, hotels, prop_ids, srch_id in zip(user_groups, hotel_groups, prop_ids_groups, srch_ids):
            users = users.to(self.device)
            hotels = hotels.to(self.device)
            scores = self(users, hotels).detach().cpu().numpy()

            for prop_id, score in zip(prop_ids, scores):
                results.append({
                    "srch_id": int(srch_id),
                    "prop_id": int(prop_id),
                    "score": float(score)
                })
        return results

    def on_validation_epoch_end(self):
        val_ndcg = self.ndcg5.compute()
        self.log("val_ndcg@5", val_ndcg, prog_bar=True)
        self.ndcg5.reset()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-3, weight_decay=1e-4)



In [19]:
dm = ExpediaDataModule(
    df=train_df_fill,
    batch_size=128,
    user_cols = user_cols,
    hotel_cols = property_cols
)

In [None]:
from lightning.pytorch import Trainer, LightningModule, LightningDataModule
from lightning.pytorch.callbacks import EarlyStopping

model = TTranker(user_dim=len(user_cols), hotel_dim=len(property_cols), embedding_dim=128, dropout=0.5)

early_stop_callback = EarlyStopping(
    monitor="val_ndcg@5",       # Metric to monitor (could also be "val_ndcg@5")
    min_delta=0.0001,         # Minimum change to qualify as an improvement
    patience=3,               # Number of epochs with no improvement after which training will stop
    verbose=True,             # Print out messages when stopping
    mode="max",               # 'min' for loss, 'max' for metric like NDCG
)

trainer = Trainer(
    max_epochs=5,
    log_every_n_steps=50,     # useful for progress visibility
    callbacks=[early_stop_callback]
)
trainer.fit(model, datamodule=dm)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
d:\UvA\Semester II\Period II\Data Mining Techniques\DataMiningTechniques\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name           | Type                   | Params | Mode 
------------------------------------------------------------------
0 | tower_user     | 

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

d:\UvA\Semester II\Period II\Data Mining Techniques\DataMiningTechniques\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


                                                                           

d:\UvA\Semester II\Period II\Data Mining Techniques\DataMiningTechniques\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 1405/1405 [10:39<00:00,  2.20it/s, v_num=0, train_loss=0.113, val_ndcg@5=0.159]

Metric val_ndcg@5 improved. New best score: 0.159


Epoch 1:   2%|▏         | 23/1405 [00:10<10:40,  2.16it/s, v_num=0, train_loss=0.103, val_ndcg@5=0.159]  

In [None]:
class ExpediaTestDataset(Dataset):
    def __init__(self, df, user_cols, hotel_cols):
        self.df = df.reset_index(drop=True)
        self.user_cols = user_cols
        self.hotel_cols = hotel_cols
        self.group_indices = [group.index.to_numpy() for _, group in self.df.groupby("srch_id")]

    def __len__(self):
        return len(self.group_indices)

    def __getitem__(self, idx):
        indices = self.group_indices[idx]
        group = self.df.iloc[indices]

        users = torch.tensor(group[self.user_cols].values, dtype=torch.float32)
        hotels = torch.tensor(group[self.hotel_cols].values, dtype=torch.float32)
        prop_ids = torch.tensor(group["prop_id"].values, dtype=torch.long)
        srch_id = torch.tensor(group["srch_id"].iloc[0], dtype=torch.long)

        return users, hotels, prop_ids, srch_id



In [None]:
test_dataset = ExpediaTestDataset(test_df, user_cols, property_cols)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
raw_preds = trainer.predict(model, test_loader)
flat_preds = [item for sublist in raw_preds for item in sublist]
df = pd.DataFrame(flat_preds)


c:\Users\NaniComputationalSci\miniconda3\envs\IntroComputationalScience\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0:   0%|          | 0/199549 [00:00<?, ?it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 1/199549 [00:00<12:46:37,  4.34it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 2/199549 [00:00<7:40:10,  7.23it/s] NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 3/199549 [00:00<5:25:45, 10.21it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 4/199549 [00:00<4:12:24, 13.18it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 5/199549 [00:00<3:27:27, 16.03it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 6/199549 [00:00<2:58:26, 18.64it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 7/199549 [00:00<2:37:17, 21.14it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 8/199549 [00:00<2:22:16, 23.38it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 9/199549 [00:00<2:10:27, 25.49it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|        


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
raw_preds = trainer.predict(model, test_loader)
flat_preds = [item for sublist in raw_preds for item in sublist]
df = pd.DataFrame(flat_preds)


c:\Users\NaniComputationalSci\miniconda3\envs\IntroComputationalScience\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting DataLoader 0:   0%|          | 0/199549 [00:00<?, ?it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 1/199549 [00:00<12:46:37,  4.34it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 2/199549 [00:00<7:40:10,  7.23it/s] NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 3/199549 [00:00<5:25:45, 10.21it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 4/199549 [00:00<4:12:24, 13.18it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 5/199549 [00:00<3:27:27, 16.03it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 6/199549 [00:00<2:58:26, 18.64it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 7/199549 [00:00<2:37:17, 21.14it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 8/199549 [00:00<2:22:16, 23.38it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|          | 9/199549 [00:00<2:10:27, 25.49it/s]NaNs in embeddings!
Predicting DataLoader 0:   0%|        


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
df

Unnamed: 0,srch_id,prop_id,score
0,163,3255,
1,163,12100,
2,163,36817,
3,163,48336,
4,163,103641,
...,...,...,...
25013,332598,117741,
25014,332598,120604,
25015,332598,126469,
25016,332598,128566,


In [None]:
df["rank"] = df.groupby("srch_id")["score"].rank(method="first", ascending=False).astype(int)
df = df.sort_values(by=["srch_id", "rank"])
df_submission = df[["srch_id", "prop_id"]].rename(columns={
    "srch_id": "SearchId",
    "prop_id": "PropertyId"
})


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
df_submission.to_csv("submission.csv", index=False, header=False)