In [2]:
import numpy as np
import pandas as pd
import os
from sentence_transformers import SentenceTransformer

def create_raw_embeddings():
    """
    This script encodes posts using a sentence transformer and saves the raw, 
    unaggregated embeddings for each subject.
    The output is a .npz file containing:
    - X_raw: A NumPy object array where each element is a (num_posts, embedding_dim) array for a subject.
    - y: The corresponding labels.
    - subject_ids: The subject IDs.
    """
    print("Starting: Create raw embeddings")

    # Paths
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    retrieved_path = os.path.join(project_root, "data/processed/retrieved_noise_dataset.csv")
    out_dir = os.path.join(project_root, "data/processed")
    feat_path = os.path.join(out_dir, "subject_features_raw.npz")

    # Load model
    MODEL_NAME = "all-MiniLM-L6-v2"
    model = SentenceTransformer(MODEL_NAME)
    embed_dim = model.get_sentence_embedding_dimension()
    print(f"Loaded SBERT '{MODEL_NAME}' — embedding dim = {embed_dim}")

    # Load dataset
    if not os.path.exists(retrieved_path):
        print(f"Error: Input file not found at {retrieved_path}")
        return
    retrieved_df = pd.read_csv(retrieved_path)
    print(f"Loaded dataset with {len(retrieved_df)} posts.")

    # Encode all posts (batched)
    texts = retrieved_df["text"].astype(str).tolist()
    batch_size = 64
    print("Encoding posts (this may take a moment)...")
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    print("Encoded:", embeddings.shape)

    # Attach embeddings back to retrieved_df
    retrieved_df = retrieved_df.reset_index(drop=True)
    retrieved_df["emb"] = list(embeddings)

    # Group embeddings by subject, but do not aggregate
    subject_ids = retrieved_df["subject_id"].unique()
    n_subjects = len(subject_ids)

    X_raw = []
    subject_list = []
    print(f"Grouping embeddings for {n_subjects} subjects...")

    for sid in subject_ids:
        group_embs = np.vstack(retrieved_df.loc[retrieved_df["subject_id"] == sid, "emb"].values)
        X_raw.append(group_embs)
        subject_list.append(sid)

    # Convert to a numpy object array
    X_raw = np.array(X_raw, dtype=object)

    print("Built X_raw object array with shape:", X_raw.shape)
    if X_raw.shape[0] > 0:
        print("Shape of first subject's embedding matrix:", X_raw[0].shape)

    # Build y (label vector)
    y = []
    for sid in subject_list:
        y_val = int(retrieved_df.loc[retrieved_df["subject_id"] == sid, "label"].iloc[0])
        y.append(y_val)
    y = np.array(y, dtype=np.int64)
    print("Built y:", y.shape)

    # Save the raw features to disk
    os.makedirs(out_dir, exist_ok=True)
    np.savez_compressed(feat_path, X_raw=X_raw, y=y, subject_ids=np.array(subject_list))
    print(f"Saved raw features to {feat_path}")
    print("Finished: Create raw embeddings")

if __name__ == '__main__':
    create_raw_embeddings()

Starting: Create raw embeddings
Loaded SBERT 'all-MiniLM-L6-v2' — embedding dim = 384
Loaded dataset with 9720 posts.
Encoding posts (this may take a moment)...


Batches:   0%|          | 0/152 [00:00<?, ?it/s]

Encoded: (9720, 384)
Grouping embeddings for 486 subjects...
Built X_raw object array with shape: (486, 20, 384)
Shape of first subject's embedding matrix: (20, 384)
Built y: (486,)
Saved raw features to /Users/gualtieromarencoturi/Desktop/thesis/Master-Thesis-CEM-Depression-etc-case-study/data/processed/subject_features_raw.npz
Finished: Create raw embeddings
