# Build Manifest

In [33]:
import os
import glob
import re
import json
from collections import Counter
import pandas as pd
import xml.etree.ElementTree as ET
from sentence_transformers import SentenceTransformer, util
import numpy as np   

In [6]:
# --- Paths ---
POS_BASE = "positive_examples_anonymous_chunks"
NEG_BASE = "negative_examples_anonymous_chunks"

# Using regex to extract subject and chunk number
SUBJECT_RE = re.compile(r"subject(\d+)", re.IGNORECASE)
CHUNK_RE   = re.compile(r"_(\d+)$")   # trailing _<chunkid>

In [7]:
def parse_name(filename):
    """
    Parse filenames like 'train_subject7488_1.xml'
    Returns:
        tuple: (subject_id, chunk_id)
    """
    basename_no_ext = os.path.splitext(filename)[0]  # e.g. 'train_subject7488_1'
    parts = basename_no_ext.split("_")               # ['train', 'subject7488', '1']

    subject_id = parts[1]   # 'subject7488'
    chunk_id = int(parts[2])  # '1' -> 1

    return subject_id, chunk_id

In [8]:
def build_manifest(base_folder, label):
    """
    Build a manifest DataFrame:
    - Each row = one subject
    - Columns: subject_id, chunks (list of file paths), label
    """
    manifest = {}

    # find ALL xml files recursively
    pattern = os.path.join(base_folder, "**", "*.xml")
    all_files = glob.glob(pattern, recursive=True)

    for filepath in all_files:
        filename = os.path.basename(filepath)
        subject_id, chunk_id = parse_name(filename)

        if subject_id not in manifest:
            manifest[subject_id] = []
        manifest[subject_id].append(filepath)

    # build DataFrame
    df = pd.DataFrame([
        {"subject_id": subject, "chunks": sorted(files), "label": label}
        for subject, files in manifest.items()
    ])

    return df


In [9]:
df_pos = build_manifest(POS_BASE, "positive")
df_neg = build_manifest(NEG_BASE, "negative")

df_all = pd.concat([df_pos, df_neg], ignore_index=True)

print(df_all.head())
print(df_all.info())



    subject_id                                             chunks     label
0  subject6760  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
1   subject127  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
2  subject7326  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
3  subject2712  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
4  subject2252  [positive_examples_anonymous_chunks/chunk 1/tr...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  486 non-null    object
 1   chunks      486 non-null    object
 2   label       486 non-null    object
dtypes: object(3)
memory usage: 11.5+ KB
None


In [None]:
#Visualize Sample for debug

# Pick first row
row = df_all.iloc[0]
print("Subject:", row["subject_id"])
print("Label:", row["label"])
print("Chunks:", row["chunks"])

# Load first XML file for this subject
first_chunk_path = row["chunks"][0]
tree = ET.parse(first_chunk_path)
root = tree.getroot()

# For quick inspection, print raw XML text
with open(first_chunk_path, "r") as f:
    print(f.read()[:1000])  # print first 1000 chars only


Subject: subject6760
Label: positive
Chunks: ['positive_examples_anonymous_chunks/chunk 1/train_subject6760_1.xml', 'positive_examples_anonymous_chunks/chunk 10/train_subject6760_10.xml', 'positive_examples_anonymous_chunks/chunk 2/train_subject6760_2.xml', 'positive_examples_anonymous_chunks/chunk 3/train_subject6760_3.xml', 'positive_examples_anonymous_chunks/chunk 4/train_subject6760_4.xml', 'positive_examples_anonymous_chunks/chunk 5/train_subject6760_5.xml', 'positive_examples_anonymous_chunks/chunk 6/train_subject6760_6.xml', 'positive_examples_anonymous_chunks/chunk 7/train_subject6760_7.xml', 'positive_examples_anonymous_chunks/chunk 8/train_subject6760_8.xml', 'positive_examples_anonymous_chunks/chunk 9/train_subject6760_9.xml']
<INDIVIDUAL>
<ID>train_subject6760</ID>
<WRITING>
	<TITLE>   </TITLE>
	<DATE> 2014-07-03 20:10:46 </DATE>
	<INFO> reddit post </INFO>
	<TEXT> I have to admit that Facebook seemed to have increased the quality of my life in the past. At this point, ever

In [None]:

#Functions for getting the cleaned text from the chunks
WHITESPACE_RE = re.compile(r"\s+")

def _normalize_text(t: str) -> str:
    t = t or ""
    t = t.replace("\u0000", "")
    t = WHITESPACE_RE.sub(" ", t).strip()
    return t

def extract_texts_from_xml(path, min_chars=10):
    """
    Given one subject chunk XMLs, extract posts.
    Each <WRITING> becomes a post: TITLE + TEXT (concatenated).
    Returns a list of strings.
    """
    try:
        tree = ET.parse(path)
        root = tree.getroot()
    except Exception as e:
        print(f"[XML-Parse-Error] {path}: {e}")
        return []

    posts = []
    for writing in root.findall("WRITING"):
        title = writing.findtext("TITLE") or ""
        text  = writing.findtext("TEXT") or ""

        combined = _normalize_text(f"{title} {text}".strip())
        if len(combined) >= min_chars:
            posts.append(combined)

    return posts


In [17]:
def explode_manifest_to_posts(df_manifest):
    """
    Expand manifest DataFrame into a DataFrame of posts.
    Each row = one post with subject_id, label, text.
    """
    rows = []

    for _, row in df_manifest.iterrows():
        subject_id = row["subject_id"]
        label = row["label"]
        chunk_paths = row["chunks"]

        for file_path in chunk_paths:
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
            except Exception as e:
                print(f"[XML-Parse-Error] {file_path}: {e}")
                continue

            for writing in root.findall("WRITING"):
                title = (writing.findtext("TITLE") or "").strip()
                text = (writing.findtext("TEXT") or "").strip()

                full_text = f"{title}\n{text}" if title else text
                full_text = full_text.strip()

                if full_text:
                    rows.append({
                        "subject_id": subject_id,
                        "label": label,
                        "text": full_text
                    })

    return pd.DataFrame(rows)


In [None]:

posts_df = explode_manifest_to_posts(df_all)
#Visual inspection for debug
print(posts_df.head())
print(len(posts_df), "posts extracted")



    subject_id     label                                               text
0  subject6760  positive  I have to admit that Facebook seemed to have i...
1  subject6760  positive  The deterrent for deleting the account is that...
2  subject6760  positive                        That being gay is a choice.
3  subject6760  positive  I'm from India where there is not too much awa...
4  subject6760  positive  I completely agree. I have not accessed Facebo...
294977 posts extracted


In [None]:
#Getting the excel
# Path relative to notebook
symptom_file = "../data/processed/merged_questionnaires.csv"

symptoms_df = pd.read_csv(symptom_file)

#visual inspection
print(symptoms_df.head())
print(symptoms_df.info())

             Subject  Sadness  Pessimism  Past failure  Loss of pleasure  \
0  train_subject4550        0          0             0                 0   
1  train_subject4181        0          0             0                 0   
2  train_subject8202        0          0             0                 0   
3  train_subject6783        0          0             0                 0   
4  train_subject1642        0          0             0                 0   

   Guilty feelings  Punishment feelings  Self-dislike  Self-criticalness  \
0                0                    0             0                  0   
1                0                    0             0                  0   
2                0                    0             0                  0   
3                0                    0             0                  0   
4                0                    0             0                  0   

   Suicidal thoughts or wishes  ...  Indecisiveness  Worthlessness  \
0               

In [None]:
# Function to create symptom queries for each subject (only symptoms = 1)
def subject_symptom_queries(row):
    return [
        col.replace("_", " ")  # nicer text
        for col in symptoms_df.columns
        if col != "subject_id" and row[col] == 1
    ]

symptoms_df["queries"] = symptoms_df.apply(subject_symptom_queries, axis=1)

# Visual inspection
print(symptoms_df[["Subject", "queries"]].head())
print(symptoms_df.loc[[122], ["Subject", "queries"]])

             Subject queries
0  train_subject4550      []
1  train_subject4181      []
2  train_subject8202      []
3  train_subject6783      []
4  train_subject1642      []
               Subject                                            queries
122  train_subject7329  [Changes in sleeping pattern, Changes in appet...


In [None]:
#MERGE
# Fix column name
symptoms_df = symptoms_df.rename(columns={"Subject": "subject_id"})

# Strip the "train_" prefix
symptoms_df["subject_id"] = symptoms_df["subject_id"].str.replace(r"^train_", "", regex=True)

#merge on user id
merged_df = posts_df.merge(
    symptoms_df[["subject_id"] + [c for c in symptoms_df.columns if c not in ["subject_id"]]],
    on="subject_id",
    how="left"
)
print(merged_df.head())
print(symptoms_df["subject_id"].nunique())
print(merged_df["subject_id"].nunique())

    subject_id     label                                               text  \
0  subject6760  positive  I have to admit that Facebook seemed to have i...   
1  subject6760  positive  The deterrent for deleting the account is that...   
2  subject6760  positive                        That being gay is a choice.   
3  subject6760  positive  I'm from India where there is not too much awa...   
4  subject6760  positive  I completely agree. I have not accessed Facebo...   

   Sadness  Pessimism  Past failure  Loss of pleasure  Guilty feelings  \
0        0          1             1                 0                1   
1        0          1             1                 0                1   
2        0          1             1                 0                1   
3        0          1             1                 0                1   
4        0          1             1                 0                1   

   Punishment feelings  Self-dislike  ...  Worthlessness  Loss of energy  \
0   

In [32]:
# Load SBERT model (can change later to another one)
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

In [35]:
def retrieve_posts_for_subject(subject_id, symptoms_df, posts_df, k=5, n_fallback=5):
    """
    Retrieve posts for one subject based on SBERT similarity with symptoms or fallback to random sampling if no symptoms are present.
    """
    # --- 1. get subject's queries
    subj_row = symptoms_df[symptoms_df["subject_id"] == subject_id].iloc[0]
    queries = subj_row["queries"]

    # --- 2. get subject's posts
    subj_posts = posts_df[posts_df["subject_id"] == subject_id]["text"].tolist()
    if len(subj_posts) == 0:
        return []  # no posts at all for this subject

    # --- 3. if no queries → fallback
    if len(queries) == 0:
        return list(np.random.choice(subj_posts, size=min(n_fallback, len(subj_posts)), replace=False))

    # --- 4. embed queries + posts
    query_embs = sbert_model.encode(queries, convert_to_tensor=True)
    post_embs = sbert_model.encode(subj_posts, convert_to_tensor=True)

    # --- 5. cosine similarity
    cos_scores = util.cos_sim(query_embs, post_embs)  # shape [n_queries, n_posts]

    # --- 6. select top-k posts per query
    retrieved_posts = set()
    for i, q in enumerate(queries):
        top_results = np.argpartition(-cos_scores[i].cpu().numpy(), range(k))[:k]
        for idx in top_results:
            retrieved_posts.add(subj_posts[idx])

    return list(retrieved_posts)


In [37]:
retrieved_data = []

for subject_id in symptoms_df["subject_id"]:
    posts = retrieve_posts_for_subject(subject_id, symptoms_df, posts_df, k=5, n_fallback=5)
    for p in posts:
        retrieved_data.append({
            "subject_id": subject_id,
            "label": symptoms_df.loc[symptoms_df["subject_id"] == subject_id, "Diagnosis"].values[0],
            "text": p
        })

retrieved_df = pd.DataFrame(retrieved_data)
print(retrieved_df.head())

    subject_id  label                                               text
0  subject4550      0                                 Amazing Street Art
1  subject4550      0  I want to say just one thing after this that "...
2  subject4550      0               Autorickshaw Strike Hits Mumbai Hard
3  subject4550      0  Oscar Pistorius will be released from prison f...
4  subject4550      0  My Greatest fear in my life is - being jobless...


In [39]:
#save
retrieved_df.to_csv("../data/processed/retrieved_dataset.csv", index=False)

In [42]:
# Count of unique subject IDs
retrieved_df["subject_id"].nunique()



486

Now I'll do the same but introducing some random post (possibly not relevant) as well, which should be better for learning

In [51]:
def retrieve_posts_for_subject_noise(subject_id, symptoms_df, posts_df, k_relevant=15, n_random=5):
    """
    Retrieve posts for one subject:
    - Top-k SBERT-relevant posts
    - Plus n_random random posts from the remaining pool
    Always pads so total = k_relevant + n_random (if enough posts exist).
    """
    target_total = k_relevant + n_random

    # --- 1. get subject's queries
    subj_row = symptoms_df[symptoms_df["subject_id"] == subject_id].iloc[0]
    queries = subj_row["queries"]

    # --- 2. get subject's posts
    subj_posts = posts_df[posts_df["subject_id"] == subject_id]["text"].tolist()
    if len(subj_posts) == 0:
        return []  # no posts at all

    retrieved_posts = set()

    # --- 3. if no queries → only random
    if len(queries) > 0:
        # --- embed queries + posts
        query_embs = sbert_model.encode(queries, convert_to_tensor=True)
        post_embs = sbert_model.encode(subj_posts, convert_to_tensor=True)

        # --- cosine similarity
        cos_scores = util.cos_sim(query_embs, post_embs).cpu().numpy()

        # --- select top-k posts per query
        for i in range(len(queries)):
            top_results = np.argpartition(-cos_scores[i], range(min(k_relevant, len(subj_posts))))[:k_relevant]
            for idx in top_results:
                retrieved_posts.add(subj_posts[idx])

    # --- 4. sample random from remaining posts
    remaining_posts = list(set(subj_posts) - retrieved_posts)
    if remaining_posts and n_random > 0:
        n_to_sample = min(n_random, len(remaining_posts))
        random_posts = np.random.choice(remaining_posts, size=n_to_sample, replace=False)
        retrieved_posts.update(random_posts)

    # --- 5. pad if fewer than target_total
    retrieved_posts = list(retrieved_posts)
    if len(retrieved_posts) < target_total:
        # sample with replacement if necessary
        extra_needed = target_total - len(retrieved_posts)
        padding = np.random.choice(subj_posts, size=extra_needed, replace=True)
        retrieved_posts.extend(padding.tolist())

    # --- 6. if more than target_total (possible if many relevant overlap across queries), truncate
    if len(retrieved_posts) > target_total:
        retrieved_posts = retrieved_posts[:target_total]

    return retrieved_posts


In [None]:
retrieved_data_noise = []

for subject_id in symptoms_df["subject_id"]:
    posts = retrieve_posts_for_subject_noise(subject_id, symptoms_df, posts_df,
                                       k_relevant=15,  # top-15 relevant (padded with random if <15)
                                       n_random=5)    # plus 5 random 
    for p in posts:
        retrieved_data_noise.append({
            "subject_id": subject_id,
            "label": symptoms_df.loc[symptoms_df["subject_id"] == subject_id, "Diagnosis"].values[0],
            "text": p
        })

retrieved_noise_df = pd.DataFrame(retrieved_data_noise)
print(retrieved_noise_df.head())
print(f"Total retrieved posts: {len(retrieved_noise_df)}")
print(f"Unique subjects: {retrieved_noise_df['subject_id'].nunique()}")


    subject_id  label                                               text
0  subject4550      0                                             Pocket
1  subject4550      0                                      So beautiful!
2  subject4550      0  Australia's Chris Rogers to miss another Test ...
3  subject4550      0  He need to go through gastic bypass surgery an...
4  subject4550      0  Good gravy, I love this woman! She's as gifted...
Total retrieved posts: 9720
Unique subjects: 486


In [54]:
#save
retrieved_noise_df.to_csv("../data/processed/retrieved_noise_dataset.csv", index=False)

In [57]:
# Check balance of patients
print("=== Class distribution in retrieved_df ===")
print(retrieved_df.groupby("label")["subject_id"].nunique())

print("=== Class distribution in retrieved_noise_df ===")
print(retrieved_noise_df.groupby("label")["subject_id"].nunique())


=== Class distribution in retrieved_df ===
label
0    403
1     83
Name: subject_id, dtype: int64
=== Class distribution in retrieved_noise_df ===
label
0    403
1     83
Name: subject_id, dtype: int64


In [55]:
retrieved_noise_df["subject_id"].nunique()

486