In [None]:
!git clone https://github.com/icedarold/through-pages.git
%cd through-pages


# Phase 1 & 2: Item & User Encoding on Kaggle

This notebook runs the full pipeline to:
1. Preprocess interactions and items.
2. Generate Content Embeddings for books (Phase 2).
3. Train Multi-Interest User Encoder (Phase 1).
4. Generate 6 interest vectors for every user.

In [None]:
# 1. Install dependencies
!pip install -q sentence-transformers pyarrow fastparquet tqdm

In [None]:
# 2. Setup Directories
# Create the directories this notebook expects, relative to the current
# working directory.
import os

PIPELINE_DIRS = ('experiments/data_v1', 'experiments/models_v1', 'data', 'submit')
for directory in PIPELINE_DIRS:
    # exist_ok=True keeps the cell re-runnable when the folders already exist.
    os.makedirs(directory, exist_ok=True)

# Show the resolved locations: the folders created above are relative to the
# working directory, not the filesystem-root /data and /submit.
print(
    "Ensure you have uploaded interactions.csv, editions.csv, etc. to "
    f"{os.path.abspath('data')} and targets.csv to {os.path.abspath('submit')}"
)

### Step 3: Preprocessing

In [None]:
!python3 src/preprocess.py
!python3 src/data/items.py
!python3 src/data/sequences.py

### Step 4: Phase 2 - Item Content Embeddings
Uses a multilingual MPNet model to encode book titles and descriptions.

In [None]:
# Run the item encoder script as a separate process, with a batch size of 128.
!python3 src/models/item_encoder.py --batch_size 128

### Step 5: Phase 1 - Multi-Interest User Encoder
Trains the Transformer model with centroid-based attention.

In [None]:
# Run the item enrichment script first, then launch training.
!python3 src/data/enrich_items.py
# Train for 10 epochs with a batch size of 256.
# NOTE(review): this flag is spelled --batch-size (hyphen) while the item
# encoder cell passes --batch_size (underscore) — confirm each script's
# argument parser accepts the spelling used here.
!python3 src/train.py --epochs 10 --batch-size 256

### Step 6: Inference - Generate User Vectors
Saves six 256-dimensional interest vectors for each user.

In [None]:
!python3 src/inference_user.py

import numpy as np
data = np.load('experiments/data_v1/user_interests.npy', allow_pickle=True).item()
print(f"Final result: Generated interests for {len(data['user_ids'])} users.")
print(f"Interests shape: {data['interests'].shape}")