In [1]:
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
import os

In [2]:
# File locations (relative paths)
out_path       = "../data/processed/concat_subject_texts.csv"      # CSV written at the end
concepts_path  = "../data/processed/merged_questionnaires.csv"     # questionnaire
retrieved_path = "../data/processed/retrieved_noise_dataset.csv"   # your retrieved posts
# Read both input tables into DataFrames
retrieved_df = pd.read_csv(retrieved_path)
symptoms_df  = pd.read_csv(concepts_path)


In [3]:
# --- 1. Clean and normalize IDs ---
# Questionnaire side: build `subject_id` from `Subject`, removing a leading "train_".
symptoms_df["subject_id"] = symptoms_df["Subject"].astype(str).str.replace(
    "^train_", "", regex=True
)
# Retrieved-posts side: the key already exists, only cast it to str.
retrieved_df["subject_id"] = retrieved_df["subject_id"].astype(str)

In [4]:
# --- 2. Merge diagnosis & concepts ---
# Every questionnaire column that is not an identifier or the label is kept as a concept column.
concept_cols = [
    c for c in symptoms_df.columns if c not in ["Subject", "Diagnosis", "subject_id"]
]
symptoms_df = symptoms_df[["subject_id", "Diagnosis"] + concept_cols]

# Inner join: posts whose subject_id has no questionnaire row are dropped.
# validate="many_to_one" makes pandas raise MergeError if the questionnaire
# contains more than one row for a subject_id; without it such duplicates
# would silently duplicate every post of that subject.
retrieved_df = retrieved_df.merge(
    symptoms_df, on="subject_id", how="inner", validate="many_to_one"
)
print(f"Merged retrieved posts: {retrieved_df.shape}")

Merged retrieved posts: (9720, 25)


In [5]:
# --- 3. Prepare tokenizer (for token-length-aware truncation) ---
MAX_TOKENS = 4096  # token budget; presumably the checkpoint's context length — confirm
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
# Use the tokenizer's own separator token; fall back to "[SEP]" if it has none.
SEP = tokenizer.sep_token or "[SEP]"

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [6]:
# --- 4. Concatenate posts per subject ---
def concat_posts(posts, max_tokens=MAX_TOKENS):
    """
    Concatenate post texts into one string, each post followed by the SEP token.

    Posts are added whole, in the given order, until the next one would push
    the running token count past the budget; that post and all later ones are
    left out. If the very first non-blank post already exceeds the budget it
    is clipped to fit instead of being dropped, so a subject with one very
    long post does not end up with an empty text.

    Args:
        posts: iterable of post texts (str), in the order they should appear.
        max_tokens: total token limit. Two tokens are held back from it
            (presumably for the start/end special tokens added when the text
            is encoded for the model — confirm), and each post costs its
            token count plus one for the separator.

    Returns:
        The stripped posts joined as "post SEP post SEP ... post SEP", or ""
        when no post could be included.

    Note:
        Token counts are taken per post, so the count of the final joined
        string may differ slightly from the running total.
    """
    budget = max_tokens - 2
    pieces = []
    used = 0
    for text in posts:
        cleaned = text.strip()
        if not cleaned:
            # Blank posts would only add back-to-back separators and waste budget.
            continue
        # Count tokens on exactly the text that is emitted (the stripped post).
        tokens = tokenizer.tokenize(cleaned)
        cost = len(tokens) + 1  # +1 for the separator that follows the post
        if used + cost > budget:
            if not pieces:
                # First post alone is too long: keep as many leading tokens as fit.
                room = budget - 1
                if room > 0:
                    clipped = tokenizer.convert_tokens_to_string(tokens[:room]).strip()
                    if clipped:
                        pieces.append(clipped)
            break
        pieces.append(cleaned)
        used += cost
    if not pieces:
        return ""
    # Single join instead of repeated string +=; same layout as before:
    # every post, including the last, is followed by " SEP".
    return f" {SEP} ".join(pieces) + f" {SEP}"

# Build one record per subject: concatenated text, diagnosis label and concept values.
grouped = []
for sid, group in retrieved_df.groupby("subject_id", sort=False):
    # Keep retrieval order (your DF already has top-15 + 5 random in order)
    # Drop missing texts before casting: astype(str) would otherwise turn NaN
    # into the literal post "nan" and feed it into the concatenation.
    posts = group["text"].dropna().astype(str).tolist()
    merged_text = concat_posts(posts)
    # Diagnosis and concept columns come from the per-subject questionnaire merge,
    # so the first row of the group is used for them.
    label = group["Diagnosis"].iloc[0]
    concepts = group[concept_cols].iloc[0].to_dict()
    grouped.append({"subject_id": sid, "text": merged_text, "label": label, **concepts})

# Build the frame once from the list of records.
concat_df = pd.DataFrame(grouped)
print("Created concatenated dataset:", concat_df.shape)

Created concatenated dataset: (486, 24)


In [7]:
# --- 5. Save ---
# os.path.dirname returns "" for a bare filename and os.makedirs("") raises
# FileNotFoundError, so fall back to the current directory in that case.
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
concat_df.to_csv(out_path, index=False)
print(f"Saved concatenated texts to {out_path}")

Saved concatenated texts to ../data/processed/concat_subject_texts.csv
