# Create Raw Embeddings for Validation and Test Sets

This notebook creates SBERT embeddings for validation and test subjects.

Input:
- `retrieved_test_validation.csv` - Validation posts (from 1b_Build_Test_Manifest)
- `retrieved_test_test.csv` - Test posts (from 1b_Build_Test_Manifest)

Output:
- `subject_features_raw_validation.npz` - Validation embeddings
- `subject_features_raw_test.npz` - Test embeddings

In [None]:
import numpy as np
import pandas as pd
import os
from sentence_transformers import SentenceTransformer

In [None]:
# Load the SBERT sentence encoder used for every post in this notebook.
# Per the original note, this is the same model name that was used when
# embedding the training set; keep it in sync so the feature spaces match.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)
# Width of each sentence vector, as reported by the loaded model.
embed_dim = model.get_sentence_embedding_dimension()
# The separator is an em dash; it was previously stored mis-encoded ("â€”").
print(f"Loaded SBERT '{MODEL_NAME}' — embedding dim = {embed_dim}")

In [None]:
# Locate the processed-data directory relative to the notebook's working
# directory (one level up, then data/processed) and derive every input and
# output file path from it.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
data_dir = os.path.join(project_root, "data/processed")

# Inputs first (validation, test), then the matching outputs.
val_csv, test_csv, val_output, test_output = (
    os.path.join(data_dir, filename)
    for filename in (
        "retrieved_test_validation.csv",
        "retrieved_test_test.csv",
        "subject_features_raw_validation.npz",
        "subject_features_raw_test.npz",
    )
)

print(
    f"Validation CSV: {val_csv}",
    f"Test CSV: {test_csv}",
    f"Output directory: {data_dir}",
    sep="\n",
)

In [None]:
def create_embeddings_from_csv(csv_path, output_path, dataset_name,
                               encoder=None, batch_size=64):
    """
    Encode every post in a CSV and save the embeddings grouped by subject.

    The CSV must have the columns ``subject_id``, ``label`` and ``text``
    (one row per post). The result is written with ``np.savez_compressed``
    under three keys:

    - ``X_raw``: 1-D object array with one entry per subject; each entry is
      the 2-D matrix of that subject's post embeddings, in CSV row order.
    - ``y``: int64 array holding the label of each subject's first post.
    - ``subject_ids``: subject ids in order of first appearance in the CSV.

    Args:
        csv_path: Path of the input CSV.
        output_path: Path of the ``.npz`` archive to write.
        dataset_name: Label used only in the progress messages.
        encoder: Object exposing a SentenceTransformer-style ``encode``
            method. Defaults to the notebook-level ``model``.
        batch_size: Batch size forwarded to ``encoder.encode``.

    Returns:
        None. If ``csv_path`` does not exist an error is printed and nothing
        is written.
    """
    print(f"\n{'='*60}")
    print(f"Processing {dataset_name}")
    print(f"{'='*60}")

    # Load CSV
    if not os.path.exists(csv_path):
        print(f"Error: Input file not found at {csv_path}")
        return

    if encoder is None:
        # Fall back to the encoder loaded at notebook level.
        encoder = model

    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} posts for {df['subject_id'].nunique()} subjects")

    # Posts without a subject id cannot be assigned to any subject; skip them
    # up front instead of failing later on an empty group.
    missing_sid = df["subject_id"].isna()
    if missing_sid.any():
        print(f"Warning: skipping {int(missing_sid.sum())} posts with missing subject_id")
        df = df.loc[~missing_sid]

    # A clean 0..n-1 index makes row labels equal to positions in `embeddings`.
    df = df.reset_index(drop=True)

    # NOTE(review): astype(str) turns a missing text into the literal "nan",
    # which is then encoded like any other post. Kept as-is so features stay
    # comparable with previously generated sets - confirm this is intended.
    texts = df["text"].astype(str).tolist()

    print("Encoding posts...")
    embeddings = np.asarray(
        encoder.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
    )
    print(f"Encoded shape: {embeddings.shape}")

    # One pass over the frame; sort=False keeps subjects in order of first
    # appearance, and rows inside a group keep their CSV order.
    grouped = df.groupby("subject_id", sort=False)
    print(f"Grouping embeddings for {grouped.ngroups} subjects...")

    matrices = []
    subject_list = []
    labels = []
    for sid, group in grouped:
        matrices.append(embeddings[group.index.to_numpy()])
        subject_list.append(sid)
        # The subject label is taken from the subject's first post.
        labels.append(int(group["label"].iloc[0]))

    # Fill a pre-allocated 1-D object array element by element. Calling
    # np.array(matrices, dtype=object) would instead build a single 3-D array
    # of boxed Python floats whenever all subjects have the same number of
    # posts (including the single-subject case).
    X_raw = np.empty(len(matrices), dtype=object)
    for position, matrix in enumerate(matrices):
        X_raw[position] = matrix
    y = np.array(labels, dtype=np.int64)
    subject_ids_array = np.array(subject_list)

    print(f"X_raw shape: {X_raw.shape}")
    if X_raw.shape[0] > 0:
        print(f"First subject embedding matrix shape: {X_raw[0].shape}")
    print(f"y shape: {y.shape}")
    print(f"Label distribution: {np.bincount(y)}")

    # Save
    np.savez_compressed(
        output_path,
        X_raw=X_raw,
        y=y,
        subject_ids=subject_ids_array
    )

    print(f"Saved to {output_path}")
    print(f"Done with {dataset_name}!\n")

print("Embedding function defined")

In [None]:
# Embed the validation posts and write the per-subject archive.
create_embeddings_from_csv(
    csv_path=val_csv,
    output_path=val_output,
    dataset_name="VALIDATION",
)

In [None]:
# Embed the test posts and write the per-subject archive.
create_embeddings_from_csv(
    csv_path=test_csv,
    output_path=test_output,
    dataset_name="TEST",
)

In [None]:
print("\n" + "="*60)
print("SUMMARY")
print("="*60)


def _report_saved_embeddings(title, npz_path):
    """Print a short summary of one saved embedding archive.

    Returns the archive contents as a plain ``{key: array}`` dict, or None
    when the file does not exist.
    """
    if not os.path.exists(npz_path):
        print(f"\n{title}: output file not found at {npz_path}")
        return None

    # allow_pickle is needed because X_raw is stored as an object array.
    # Unpickling executes arbitrary code, so only load archives written by
    # this notebook. The arrays are materialised inside the `with` block so
    # the file handle is closed before returning.
    with np.load(npz_path, allow_pickle=True) as archive:
        data = {key: archive[key] for key in archive.files}

    X_raw = data["X_raw"]
    print(f"\n{title}:")
    print(f"  - Subjects: {len(data['y'])}")
    print(f"  - X_raw shape: {X_raw.shape}")
    if len(X_raw) > 0:
        print(f"  - First subject posts: {X_raw[0].shape[0]}")
        print(f"  - Embedding dim: {X_raw[0].shape[1]}")
    else:
        # Nothing to index into; avoid an IndexError on an empty archive.
        print("  - No subjects stored")
    return data


# Load and verify saved files
val_data = _report_saved_embeddings("Validation set", val_output)
test_data = _report_saved_embeddings("Test set", test_output)

# Only claim success when both archives are actually present on disk.
if val_data is not None and test_data is not None:
    print("\nAll embeddings created successfully!")
else:
    print("\nSome embedding files are missing; see the messages above.")