<a href="https://colab.research.google.com/github/KravitzLab/PsygeneAnalyses/blob/PCA_analysis/rdoc_heiarchy_create.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:

import os, re, zipfile
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import files
import csv


In [None]:
from numpy import empty  # NOTE(review): not referenced in the visible cells; kept in case a later cell uses it

# --- Step 1: Upload multiple HTML files ---
# Accumulator for the parsed rows; one dict is appended per RDoC element found.
all_data = []

print("Upload one or more RDoC HTML files (hold Ctrl or Shift to select multiple)")
uploaded = files.upload()

# The keys of the upload result are the file names; they are opened as paths
# in the parsing step below.
filenames = list(uploaded)

### Create function to clean the names ###
def clean_name(name):
    """Convert a file name or page label into a lowercase snake_case identifier.

    The text is processed in this order:
      1. a trailing ``.htm`` / ``.html`` extension is dropped (any letter case),
      2. everything from "- National Institute of Mental Health" onward is cut,
      3. straight/curly double quotes and parentheses are deleted outright
         (so ``"Foo(bar)"`` becomes ``"foobar"``, not ``"foo_bar"``),
      4. every remaining run of non-alphanumeric characters becomes one ``_``,
      5. the result is lowercased and leading/trailing underscores are removed.

    Args:
        name: Text to normalize. ``None`` is treated as missing and yields an
            empty string, so a label that was not found on a page does not
            abort the whole run.

    Returns:
        The normalized identifier, or ``""`` if nothing alphanumeric remains.
    """
    if name is None:
        return ""

    # remove extension (case-insensitive, matching the NIMH split below)
    name = re.sub(r"\.html?$", "", name, flags=re.I)
    # remove NIMH and trailing text
    name = re.split(r"-\s*National Institute of Mental Health", name, flags=re.I)[0]
    # remove quotes, parentheses, etc.
    name = re.sub(r"[“”\"()]", "", name)
    # normalize whitespace and special characters
    name = re.sub(r"[^a-zA-Z0-9]+", "_", name)
    # lowercase, strip leading/trailing underscores
    return name.lower().strip("_")



# --- Step 2: Parse each file ---
# For every uploaded page, read the domain/construct breadcrumb and then emit
# one row per element link found inside each "rdoc-unit" section.
for html_path in filenames:
    print(f"Parsing {html_path}...")

    # The cleaned file name is used as the subconstruct name.
    src_name = clean_name(html_path)

    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    # Breadcrumb text, e.g.
    # "Domain: Positive Valence Systems>Construct: Reward Responsiveness"
    # find() returns None when the paragraph is absent, so guard before get_text().
    system_section = soup.find("p", class_="rdoc-construct-slug")
    system_text = system_section.get_text(strip=True) if system_section is not None else ""

    system_name = None
    construct_name = None

    # Breadcrumb levels are separated by '>'; each level is "<Label>: <value>".
    for part in system_text.split(">"):
        label, colon, value = part.strip().partition(":")
        if not colon:
            continue
        # Compare the label case-insensitively and take the value from the
        # same split, so "domain:" and "Domain:" are handled identically.
        label = label.lower()
        if label == "domain":
            system_name = value.strip()
        elif label == "construct":
            construct_name = value.strip()

    # if no construct then construct == subconstruct
    if not construct_name:
        construct_name = src_name

    if not system_name:
        print(f"  Warning: no domain found in {html_path}; 'system' left blank")

    # These are constant for the whole file, so clean them once up front.
    system_clean = clean_name(system_name) if system_name else ""
    construct_clean = clean_name(construct_name)

    # --- Extract unit sections (Molecules, Behavior, etc.) ---
    for sec in soup.find_all("section", class_="rdoc-unit"):
        # Get the main category (like "Behavior" or "Molecules")
        unit_name_tag = sec.find("h2", class_="rdoc-unit__heading--unit-name")
        if unit_name_tag is None:
            continue
        element_type = clean_name(unit_name_tag.get_text(strip=True))

        # Get all elements listed under this unit
        for el in sec.find_all("a", class_="rdoc-unit__el-link"):
            all_data.append({
                "system": system_clean,
                # NOTE(review): "constrct" looks like a typo for "construct";
                # left unchanged because the column name is part of the output.
                "constrct": construct_clean,
                "subconstruct": src_name,
                "element_type": element_type,
                "element_name": clean_name(el.get_text(strip=True)),
            })

# --- Step 3: Combine and export ---
# Column names of the row dicts built in Step 2. They are only needed when no
# rows were collected: pandas would otherwise produce a frame with no columns
# at all, and later lookups such as df["element_name"] would raise KeyError.
HIERARCHY_COLUMNS = ["system", "constrct", "subconstruct", "element_type", "element_name"]

if all_data:
    df = pd.DataFrame(all_data)
else:
    print("Warning: no RDoC elements were parsed; the hierarchy table is empty")
    df = pd.DataFrame(columns=HIERARCHY_COLUMNS)

# Set to True to write the table to CSV and trigger a browser download.
write_download = False
if write_download:
    output_file = "rdoc_full_hierarchy.csv"
    df.to_csv(output_file, index=False)

    files.download(output_file)

In [83]:
# Show each distinct cleaned element name once, in order of first appearance.
print(pd.unique(df["element_name"]))

['mirror_neurons' 'inferior_parietal_cortex' 'superior_temporal_sulcus'
 'ventral_dorsal_premotor' 'cortico_spinal_facilitation_tms'
 'mu_suppression' 'ability_to_identify_what_actions_an_agent_is_executing'
 'gaze_following' 'imitation' 'mimicry' 'balanced_emotional_empathy_scale'
 'empathy_quotient'
 'perspective_taking_and_empathic_concern_subscales_of_the_interpersonal_reactivity_index'
 'how_why_task' 'posterior_parietal_cortex' 'premotor_cortex'
 'supplementary_motor_area' 'conceptual_apraxia' 'ideational_apraxia'
 'ideomotor_apraxia' 'limb_kinetic_apraxia' 'go_before_you_know' 'd1'
 'dopamine' 'gaba' 'glutamate' 'nmda'
 'distinct_types_of_inhibitory_neurons' 'pyramidal_cells'
 'pfc_parietal_cingulate_dorsal_thalamus_dorsal_striatum' 'vlpfc' 'delta'
 'eeg_gamma_rhythms' 'eeg_theta_rhythms' 'ax_cpt_dpx'
 'change_detection_tasks' 'complex_span_tasks'
 'delayed_match_to_non_sample' 'delayed_match_to_sample' 'keep_track_task'
 'letter_memory_running_memory' 'letter_number_sequencing'