In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import os


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the sentence-embedding model identified by MODEL_NAME
MODEL_NAME = "all-MiniLM-L6-v2"   
model = SentenceTransformer(MODEL_NAME)
# Length of a single sentence embedding; later cells use it to size the feature matrix
embed_dim = model.get_sentence_embedding_dimension()
# Report which model was loaded and its embedding dimension
print(f"Loaded SBERT '{MODEL_NAME}' embedding dim = {embed_dim}")

Loaded SBERT 'all-MiniLM-L6-v2' — embedding dim = 384


In [5]:
# Read the processed dataset CSV into a DataFrame
dataset_csv = "../data/processed/retrieved_noise_dataset.csv"
retrieved_df = pd.read_csv(dataset_csv)

In [6]:
# Encode all posts (batched)
# Missing texts are replaced by "" before the string cast: a bare astype(str)
# would turn NaN into the literal string "nan" and embed that word as if it
# were real post content.
texts = retrieved_df["text"].fillna("").astype(str).tolist()
batch_size = 64  # number of texts sent to the encoder per batch
print("Encoding posts (this may take a moment)...")
embeddings = model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,  # returns a numpy array shape (n_posts, embed_dim)
)
# The embeddings are attached to the DataFrame by position afterwards, so there
# must be exactly one embedding row per input text.
assert embeddings.shape[0] == len(texts), "encoder returned a different number of rows than input texts"
print("Encoded:", embeddings.shape)

Encoding posts (this may take a moment)...


Batches: 100%|██████████| 152/152 [00:28<00:00,  5.33it/s]

Encoded: (9720, 384)





In [7]:
# Store each post's embedding next to its row: the index is first reset to
# 0..n-1, then the rows of `embeddings` are attached in order as column "emb"
# (one numpy vector per row).
retrieved_df = retrieved_df.reset_index(drop=True).assign(emb=list(embeddings))


In [8]:
# Aggregate per subject: mean, max, std pooling (concatenate -> fixed-length vector)
def aggregate_embeddings_for_group(emb_stack):
    """
    Pool one subject's post embeddings into a single fixed-length vector.

    Parameters
    ----------
    emb_stack : array-like, shape (n_posts_for_subject, embed_dim)
        Stacked post embeddings. A single embedding of shape (embed_dim,)
        is also accepted and treated as a one-post subject.

    Returns
    -------
    numpy.ndarray, shape (3 * embed_dim,)
        Concatenation [mean; max; std] of the element-wise statistics taken
        over the posts axis.

    Raises
    ------
    ValueError
        If ``emb_stack`` contains no posts or is not 1-D / 2-D.
    """
    emb_stack = np.asarray(emb_stack)
    if emb_stack.ndim == 1:
        # A lone embedding: promote it to a (1, embed_dim) stack.
        emb_stack = emb_stack[np.newaxis, :]
    if emb_stack.ndim != 2:
        raise ValueError(
            f"emb_stack must be 1-D or 2-D, got an array with {emb_stack.ndim} dimensions"
        )
    if emb_stack.shape[0] == 0:
        # Fail up front instead of emitting NaN means and an opaque reduction error.
        raise ValueError("emb_stack must contain at least one embedding")

    mean = emb_stack.mean(axis=0)
    max_ = emb_stack.max(axis=0)
    # Population std (ddof=0): a single-post subject yields all zeros, not NaN,
    # so no special-casing is needed for that case.
    std = emb_stack.std(axis=0)
    return np.concatenate([mean, max_, std])

# Pool the per-post embeddings of every subject into one row of X.
# A missing subject_id cannot be matched to any group, so fail with a clear message.
if retrieved_df["subject_id"].isna().any():
    raise ValueError("retrieved_df contains rows with a missing subject_id")

subject_ids = retrieved_df["subject_id"].unique()
n_subjects = len(subject_ids)
agg_factors = 3  # mean, max, std
feature_dim = embed_dim * agg_factors

X = np.zeros((n_subjects, feature_dim), dtype=np.float32)
subject_list = []
print(f"Aggregating embeddings for {n_subjects} subjects...")

# Split the frame once with groupby instead of building a full-column boolean
# mask for every subject. sort=False yields groups in order of first appearance,
# which is the same order Series.unique() returns, so the two can be zipped.
emb_by_subject = retrieved_df.groupby("subject_id", sort=False)["emb"]
for i, (sid, (_, subject_embs)) in enumerate(zip(subject_ids, emb_by_subject)):
    group_embs = np.vstack(subject_embs.values)  # (n_posts_for_subject, embed_dim)
    agg_vec = aggregate_embeddings_for_group(group_embs)
    X[i] = agg_vec
    subject_list.append(sid)

print("Built X:", X.shape)

Aggregating embeddings for 486 subjects...
Built X: (486, 1152)


In [9]:
# Derive the label vector y: one integer per subject, in the order of subject_list.
# Each subject's label is read from the first of its rows in retrieved_df.
y = np.array(
    [
        int(retrieved_df.loc[retrieved_df["subject_id"] == sid, "label"].iloc[0])
        for sid in subject_list
    ],
    dtype=np.int64,
)


In [10]:
# Persist the feature matrix, labels and subject ids in a single compressed
# .npz archive so a later training step can load them from disk.
out_dir = "../data/processed"
feat_path = os.path.join(out_dir, "subject_features.npz")
os.makedirs(out_dir, exist_ok=True)  # ensure the target directory exists before writing
np.savez_compressed(
    feat_path,
    X=X,
    y=y,
    subject_ids=np.array(subject_list),
)
print(f"Saved features to {feat_path}")

Saved features to ../data/processed/subject_features.npz
