In [None]:
!pip install polars

In [None]:
import polars as pl
import torch
from PIL import Image
import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

class ItemEncoder:
    """Frozen CLIP wrapper that encodes (image, title) pairs into one item vector."""

    def __init__(self,
                device,
                encoder_model_name: str = "openai/clip-vit-base-patch32",
    ):
        """
        Args:
            device: torch device the CLIP model is moved to and inputs are sent to.
            encoder_model_name: name passed to ``from_pretrained`` for both the
                CLIP model and its processor.
        """
        self.device = device
        self.model = CLIPModel.from_pretrained(encoder_model_name)
        self.processor = CLIPProcessor.from_pretrained(encoder_model_name)

        # Inference only: place on the target device, freeze every weight,
        # and switch to eval mode.
        self.model.to(self.device)
        self.model.requires_grad_(False)
        self.model.eval()

    def encode_items(self, images, titles):
        """
        Encode images and titles with CLIP and concatenate both embeddings.

        Args:
            images: PIL Image or list of PIL Images
            titles: str or list of str
        Returns:
            torch.Tensor: item embeddings [batch_size, hidden_dim], where
            hidden_dim is the image embedding width plus the text embedding
            width (512 + 512 = 1024 for the default checkpoint).
        """
        with torch.no_grad():
            # Titles are truncated to 77 tokens, CLIP's maximum sequence length.
            encoded = self.processor(
                text=titles,
                images=images,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=77,
            )

            # Move every processor output onto the model's device.
            model_inputs = {}
            for name, tensor in encoded.items():
                model_inputs[name] = tensor.to(self.device)

            clip_out = self.model(**model_inputs)

            # Image features first, text features second, joined on the feature axis.
            return torch.cat((clip_out.image_embeds, clip_out.text_embeds), dim=1)


def main():
    """Encode every item with CLIP, reduce to 128 dims with PCA, and save the result.

    Pipeline:
      1. Read the cleaned item features (uses the ``item_id``, ``item_title`` and
         ``image_path`` columns) and the ``item_info`` table.
      2. Encode each non-padding item (image + title) with ``ItemEncoder`` in
         batches. Rows with ``item_id == 0`` are padding and keep an all-zero vector.
         Images that fail to load are replaced by a black 224x224 image.
      3. Fit PCA on the non-padding embeddings only and project them to 128 dims.
      4. Attach the vectors to ``item_info`` as ``item_clip_emb_d128`` (items with
         no embedding get zeros) and write the table to parquet.
    """
    embed_dim = 1024   # width of ItemEncoder output: image + text embeddings concatenated
    pca_dim = 128      # target dimensionality after PCA
    batch_size = 32    # Adjust based on your GPU memory
    pca_seed = 42      # fixed seed so the (possibly randomized) PCA solver is reproducible

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load data
    print("Loading data...")
    items_features = pl.read_parquet("/kaggle/working/item_feature_clean.parquet")
    items_info = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/item_info.parquet")

    print(f"Items features shape: {items_features.shape}")
    print(f"Items info shape: {items_info.shape}")

    # Initialize encoder
    print("Initializing CLIP encoder...")
    encoder = ItemEncoder(device=device)

    all_embeddings = []

    # Item IDs in the same row order as the embeddings computed below
    item_ids = items_features["item_id"].to_list()

    print(f"Encoding {len(item_ids)} items...")

    # Process in batches
    for i in tqdm(range(0, len(items_features), batch_size)):
        batch_end = min(i + batch_size, len(items_features))
        batch_items = items_features[i:batch_end]

        images = []
        titles = []

        for row in batch_items.iter_rows(named=True):
            # Padding row (item_id == 0): mark with None, it keeps a zero vector
            if row["item_id"] == 0:
                images.append(None)
                titles.append("")
                continue

            # Normalise the title once, outside the try block, so the image
            # fallback path cannot leave images/titles misaligned or un-stringified.
            title = str(row["item_title"]) if row["item_title"] else ""

            try:
                # The context manager closes the underlying file handle;
                # convert() returns an independent in-memory copy.
                with Image.open(row["image_path"]) as raw_img:
                    img = raw_img.convert("RGB")
            except Exception as e:
                print(f"Error loading image for item {row['item_id']}: {e}")
                # Use a blank image as fallback
                img = Image.new("RGB", (224, 224), color=(0, 0, 0))

            images.append(img)
            titles.append(title)

        # Positions inside this batch that hold real (non-padding) items
        non_padding_indices = [idx for idx, img in enumerate(images) if img is not None]

        # Start from zeros so padding rows need no further handling
        batch_embeddings = torch.zeros((len(images), embed_dim), device=device)

        # Encode non-padding items and scatter them back to their batch positions
        if non_padding_indices:
            non_padding_images = [images[idx] for idx in non_padding_indices]
            non_padding_titles = [titles[idx] for idx in non_padding_indices]

            embeddings = encoder.encode_items(non_padding_images, non_padding_titles)

            for local_idx, global_idx in enumerate(non_padding_indices):
                batch_embeddings[global_idx] = embeddings[local_idx]

        all_embeddings.append(batch_embeddings.cpu().numpy())

    # Concatenate all embeddings
    all_embeddings = np.vstack(all_embeddings)
    print(f"All embeddings shape: {all_embeddings.shape}")  # Should be [91718, 1024]

    # Apply PCA to reduce from 1024 to 128 dimensions
    print("Applying PCA to reduce dimensions to 128...")

    # Separate padding and non-padding for PCA
    padding_mask = (np.array(item_ids) == 0)
    non_padding_embeddings = all_embeddings[~padding_mask]

    # Fit PCA on non-padding embeddings only; the seed makes reruns reproducible
    pca = PCA(n_components=pca_dim, random_state=pca_seed)
    pca.fit(non_padding_embeddings)

    print(f"PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

    # Transform the real items; padding rows remain zeros
    embeddings_pca = np.zeros((len(all_embeddings), pca_dim))
    embeddings_pca[~padding_mask] = pca.transform(non_padding_embeddings)

    print(f"PCA embeddings shape: {embeddings_pca.shape}")  # Should be [91718, 128]

    # Convert to list of Python lists for Polars
    embeddings_list = [emb.tolist() for emb in embeddings_pca]

    # Create a mapping from item_id to embedding
    item_id_to_embedding = dict(zip(item_ids, embeddings_list))

    # Add new column to items_info
    print("Adding new column to item_info...")

    # Map embeddings to items_info based on item_id
    items_info_ids = items_info["item_id"].to_list()

    # Items without an embedding fall back to zeros; report how many so the
    # fallback is not silent.
    missing = sum(1 for iid in items_info_ids if iid not in item_id_to_embedding)
    if missing:
        print(f"Warning: {missing} item_info rows have no CLIP embedding; using zero vectors")

    clip_embeddings = [item_id_to_embedding.get(iid, [0.0] * pca_dim) for iid in items_info_ids]

    # Add the new column
    items_info = items_info.with_columns(
        pl.Series("item_clip_emb_d128", clip_embeddings)
    )

    print(f"Updated items_info shape: {items_info.shape}")
    print(f"Columns: {items_info.columns}")

    # Save the updated dataframe
    output_path = "/kaggle/working/item_info_with_clip.parquet"
    items_info.write_parquet(output_path)
    print(f"Saved updated item_info to {output_path}")

    # Show sample
    print("\nSample of the data:")
    print(items_info.head(5))

    # Verify padding row
    padding_row = items_info.filter(pl.col("item_id") == 0)
    if len(padding_row) > 0:
        padding_emb = padding_row["item_clip_emb_d128"][0]
        print(f"\nPadding embedding (first 10 values): {padding_emb[:10]}")
        print(f"Padding embedding is all zeros: {all(x == 0.0 for x in padding_emb)}")


# Run the full encoding pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

In [None]:
import polars as pl 

# Reload the parquet file written by main() to check that the new
# CLIP embedding column was saved.
items = pl.read_parquet("/kaggle/working/item_info_with_clip.parquet")
print(items)

In [None]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt

import seaborn as sns
import os

# Apply seaborn's "whitegrid" style to the plots drawn in the cells below.
sns.set_style("whitegrid")

In [None]:
# train = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/train.parquet")
# valid = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/valid.parquet")
# test = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/test.parquet")

# item_info = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/item_info.parquet")
# print("item info : ", item_info)
# item_feature = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/item_feature.parquet")



# selection of only features columns
# item_feature_clean = item_feature.select([
#     "item_id",
#     "item_title",
#     "item_tags",
#     "likes_level",
#     "views_level"
# ])
# # adding image paths 
# item_feature_clean = item_feature_clean.with_columns([
#     (pl.lit("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/item_images/item_images/") +
#     pl.col("item_id").cast(str) + 
#     pl.lit(".jpg")).alias("image_path")
# ])
# # adding the padding row (idx 0)
# padding_row = pl.DataFrame({
#     "item_id":pl.Series([0], dtype=pl.Int64),
#     "item_title":"",
#     "item_tags":pl.Series([[]], dtype=pl.List(pl.Int64)),
#     "likes_level":pl.Series([0], dtype=pl.Int64),
#     "views_level":pl.Series([0], dtype=pl.Int64),
#     "image_path": ""
# })

# item_feature_clean = pl.concat([padding_row, item_feature_clean])
# # replacing the tags with padded tags from item_info
# item_tags_padded = item_info.select(["item_id", "item_tags"])
# item_feature_clean = item_feature_clean.drop("item_tags").join(item_tags_padded, on="item_id", how="left")
# print("item_feature_clean : ", item_feature_clean)

# item_feature_clean.write_parquet("/kaggle/working/item_feature_clean.parquet")
# NOTE(review): `df` is not defined anywhere in the visible notebook — presumably
# it is the item_info frame; confirm and load it explicitly before running this cell.
# Convert `item_tags` to a List column (assumes it is stored as a fixed-size Array)
# so the `.list` namespace can be used on it.
df = df.with_columns(pl.col("item_tags").arr.to_list())
# Largest tag id over all items. Stored as `max_tag_id` rather than `max` so the
# builtin max() is not shadowed for every cell executed afterwards.
max_tag_id = df["item_tags"].list.max().max()
print(max_tag_id)
# item_seq = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/item_seq.parquet")
# print(item_seq)
# item_seq_clean = item_seq.unique(subset=["user_id"], keep="first").sort("user_id")
# print("item_seq_cleaned : ", item_seq_clean)
# item_seq_clean.write_parquet("/kaggle/working/item_seq_clean.parquet")

In [None]:
# Load the cleaned item features (the same file main() reads as its input).
item_feature = pl.read_parquet("/kaggle/working/item_feature_clean.parquet")
print("item_feature : ", item_feature)

# Load the competition's original item_info table.
item_info = pl.read_parquet("/kaggle/input/www2025-mmctr-data/MicroLens_1M_MMCTR/MicroLens_1M_x1/item_info.parquet")
print("item_info : ", item_info)

In [None]:
# Label distribution of the training set: counts, rates, imbalance ratio and a bar chart.
# Requires `train_with_full_seq` to be loaded in the kernel.
print("\n=== Label Distribution (Train) ===")
# pl.len() replaces the deprecated pl.count(); the alias keeps the "count" column name.
label_counts = train_with_full_seq.group_by("label").agg(pl.len().alias("count")).sort("label")
print(label_counts)

total = train_with_full_seq.shape[0]
# Look counts up by label value so that a class which is absent from the data
# yields 0 instead of raising IndexError on an empty filter result.
count_by_label = dict(zip(label_counts["label"].to_list(), label_counts["count"].to_list()))
positive = count_by_label.get(1, 0)
negative = count_by_label.get(0, 0)

print(f"\nPositive rate: {positive/total*100:.2f}%")
print(f"Negative rate: {negative/total*100:.2f}%")
if positive:
    print(f"Class imbalance ratio: 1:{negative/positive:.2f}")
else:
    # Avoid ZeroDivisionError when there are no positive samples.
    print("Class imbalance ratio: undefined (no positive samples)")

plt.figure(figsize=(8, 5))
labels = ['Negative (0)', 'Positive (1)']
counts = [negative, positive]
plt.bar(labels, counts, color=['#ff6b6b', '#51cf66'])
plt.title('Label Distribution in Training Set')
plt.ylabel('Count')
# Annotate each bar with its absolute count and its share of the training set.
for i, v in enumerate(counts):
    plt.text(i, v, f'{v:,}\n({v/total*100:.1f}%)', 
             ha='center', va='bottom')
plt.tight_layout()
plt.show()

In [None]:
# Interactions per user and per item in the training set, with log-scaled histograms.
# Requires `train_with_full_seq` to be loaded in the kernel.
print("\n=== User Frequency Distribution ===")
# pl.len() replaces the deprecated pl.count().
user_freq = train_with_full_seq.group_by("user_id").agg(
    pl.len().alias("user_interaction_count")
)
print(user_freq.select(pl.col("user_interaction_count")).describe())

print("\n=== Item Frequency Distribution ===")
item_freq = train_with_full_seq.group_by("item_id").agg(
    pl.len().alias("item_interaction_count")
)
print(item_freq.select(pl.col("item_interaction_count")).describe())

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: how many users have a given number of interactions.
axes[0].hist(user_freq["user_interaction_count"], bins=50, 
             color='skyblue', edgecolor='black')
axes[0].set_xlabel('Interactions per User')
axes[0].set_ylabel('Number of Users')
axes[0].set_title('User Interaction Distribution')
axes[0].set_yscale('log')

# Right panel: how many items have a given number of interactions.
axes[1].hist(item_freq["item_interaction_count"], bins=50, 
             color='coral', edgecolor='black')
axes[1].set_xlabel('Interactions per Item')
# This histogram counts items, not users (the label was previously copy-pasted).
axes[1].set_ylabel('Number of Items')
axes[1].set_title('Item Interaction Distribution')
axes[1].set_yscale('log')

plt.tight_layout()
plt.show()



In [None]:
# Length statistics of the per-user item sequences.
# Requires `item_seq_clean` and `train_with_full_seq` to be loaded in the kernel.
print("\n=== Item Sequence Length Analysis ===")

seq_stats = item_seq_clean.select(
    pl.col("item_seq").list.len().alias("original_length")
).describe()
print(seq_stats)

# Count non-zero elements per training row. list.eval() with an aggregation
# returns a one-element list per row, so unwrap it with .list.first() to get a
# plain integer column that describe() can summarise (mean/min/max/...).
train_seq_nonzero = train_with_full_seq.select(
    pl.col("item_seq").cast(pl.List(pl.Int64)).list.eval(
        pl.element().filter(pl.element() != 0).len()
    ).list.first().alias("non_zero_count")
)
print("\nPadded sequences in train (non-zero items):")
print(train_seq_nonzero.describe())

plt.figure(figsize=(10, 5))
seq_lengths_list = item_seq_clean.select(
    pl.col("item_seq").list.len()
).to_numpy().flatten()

# Compute the summary statistics once and reuse them for lines and legend.
seq_mean = seq_lengths_list.mean()
seq_median = np.median(seq_lengths_list)

plt.hist(seq_lengths_list, bins=100, color='mediumpurple', 
         edgecolor='black', alpha=0.7)
plt.xlabel('Sequence Length')
plt.ylabel('Number of Users')
plt.title('Distribution of User Sequence Lengths')
plt.axvline(seq_mean, color='red', 
            linestyle='--', label=f'Mean: {seq_mean:.1f}')
plt.axvline(seq_median, color='green', 
            linestyle='--', label=f'Median: {seq_median:.1f}')
plt.legend()
plt.tight_layout()
plt.show()

# Users whose full history is longer than 100 items.
long_sequences = (seq_lengths_list > 100).sum()
print(f"\n{long_sequences:,} users ({long_sequences/len(seq_lengths_list)*100:.2f}%) have >100 items (truncated in train/test)")

In [None]:
# Distribution of the likes/views engagement levels in the training set and
# their relation to the label. Requires `train_with_full_seq` to be loaded.
print("\n=== Engagement Levels Distribution ===")

# Aggregate each level once and reuse the result for both printing and plotting.
# pl.len() replaces the deprecated pl.count(); the alias keeps the "count" column name.
likes_dist = train_with_full_seq.group_by("likes_level").agg(pl.len().alias("count")).sort("likes_level")
views_dist = train_with_full_seq.group_by("views_level").agg(pl.len().alias("count")).sort("views_level")

# Likes level
print("Likes Level:")
print(likes_dist)

# Views level
print("\nViews Level:")
print(views_dist)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (axes[0], likes_dist, "likes_level", 'Likes Level', 'lightcoral'),
    (axes[1], views_dist, "views_level", 'Views Level', 'lightblue'),
]
for ax, dist, column, pretty_name, bar_color in panels:
    ax.bar(dist[column], dist["count"], color=bar_color)
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Count')
    ax.set_title(f'{pretty_name} Distribution')

plt.tight_layout()
plt.show()

# Correlation with label
print("\n=== Engagement vs Label ===")
engagement_label = train_with_full_seq.group_by(["likes_level", "label"]).agg(pl.len().alias("count"))
print(engagement_label.sort(["likes_level", "label"]))

In [None]:
###########################################  Training ###########################################

In [None]:
# Load the train/validation splits.
# NOTE: the EDA cells earlier in this notebook read `train_with_full_seq`, so on a
# fresh kernel this cell has to be executed before them.
train_with_full_seq = pl.read_parquet("/kaggle/working/train_with_full_seq.parquet")
valid_with_full_seq = pl.read_parquet("/kaggle/working/valid_with_full_seq.parquet")

print(train_with_full_seq)
print(valid_with_full_seq)

In [13]:
# Delete any previous checkout of the repository (presumably so the `git clone`
# in the next cell does not fail on an existing directory).
!rm -rf /kaggle/working/recommender_CTR

In [14]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("github")

# Use token as username with empty password
clone_url = f"https://{token}:x-oauth-basic@github.com/HajarHAMDOUCH01/recommender_CTR.git"
!git clone $clone_url

Cloning into 'recommender_CTR'...
remote: Enumerating objects: 264, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 264 (delta 90), reused 124 (delta 53), pack-reused 100 (from 1)[K
Receiving objects: 100% (264/264), 8.87 MiB | 37.70 MiB/s, done.
Resolving deltas: 100% (121/121), done.


In [11]:
# Run the repository's task2/model.py (presumably the training entry point — confirm).
!python /kaggle/working/recommender_CTR/task2/model.py

In [15]:
# Run the repository's test-set prediction script (its output is shown below).
!python /kaggle/working/recommender_CTR/task2/test.py


TEST PREDICTION SCRIPT

Loading test dataset...
Dataset loaded: 379,142 samples
Has labels: False
Columns: ['ID', 'user_id', 'item_seq', 'item_id', 'likes_level', 'views_level']
✓ Test dataset loaded: 379,142 samples

LOADING ITEM DATA: item_clip_emb_d128

Loaded item_info shape: (91718, 4)
Columns: ['item_id', 'item_tags', 'item_emb_d128', 'item_clip_emb_d128']

Loading embeddings from: item_clip_emb_d128
Embeddings shape: (91718, 128)
Embeddings dtype: float32
Embeddings range: [-0.552900, 0.597740]
Embeddings mean: 0.000000, std: 0.064451

Loading item tags...
Item tags shape: (91718, 5)
Item tags dtype: int64
Item tags range: [0, 11739]

Number of items: 91,718
Number of unique tags: 11,740
Max tag ID: 11739
✓ Row 0 (item_id=0) is properly zeroed for padding

✓ DATA LOADED SUCCESSFULLY


Initializing model architecture...

Trainable params: 177,349,905
Item embedding dim: 208
Seq output dim: 7072
Learning rate: 0.0005


Loading checkpoint: /kaggle/working/model_21.pth
✓ Loaded che