In [103]:
import time

import pandas as pd
from Bio import Entrez

In [101]:
def _mesh_section_lines(node, marker):
    """Return the stripped, non-empty lines that follow `marker` in `node`.

    The section ends at the first blank line after the marker. If the marker
    does not occur in `node`, an empty list is returned instead of raising.
    """
    parts = node.split(marker)
    if len(parts) < 2:
        return []
    section = parts[1].split('\n\n')[0]
    return [line.strip() for line in section.split('\n') if line.strip()]


def parse_mesh_nodes(text):
    """Parse the plain text of one or more MeSH records into a list of dicts.

    Parameters
    ----------
    text : str
        Raw text as read from an ``Entrez.efetch(db="mesh", ...)`` handle.
        Records are expected to be separated by ``'\\n\\n\\n'``.

    Returns
    -------
    list of dict
        One dict per record with the keys ``'Title'`` (str),
        ``'Entry Terms'`` (list of str) and ``'All MeSH Categories'``
        (list of str). Records whose header line contains
        "Supplementary Concept" are skipped. A record that lacks a section
        gets an empty list for that key.
    """
    nodes = []
    for node in text.split('\n\n\n'):
        # Skip empty chunks (e.g. the one after a trailing separator) instead of
        # blindly dropping the last chunk, which would lose a real record when
        # the text does not end with a separator.
        if not node.strip():
            continue

        # The header ("<n>: <title>") is the first non-empty line, whether or
        # not the chunk still carries its leading newline.
        title_section = node.lstrip('\n').split('\n', 1)[0]
        if "Supplementary Concept" in title_section:
            continue

        # Split only on the first ': ' so titles that contain ': ' stay intact;
        # fall back to the whole line if the delimiter is absent.
        _, found, node_title = title_section.partition(': ')
        if not found:
            node_title = title_section

        nodes.append({
            'Title': node_title.strip(),
            'Entry Terms': _mesh_section_lines(node, 'Entry Terms:\n'),
            'All MeSH Categories': _mesh_section_lines(node, 'All MeSH Categories\n'),
        })

    return nodes

# Search for Nodes
1. Perform a search for each node (by its MeSH tree number) using the Entrez E-utilities,
2. extract the ID list from the search results and fetch the records for those IDs,
3. parse the fetched records with the `parse_mesh_nodes` function defined above, and
4. finally combine the results into a single pandas DataFrame.

Relevant nodes:
- Behavioral Symptoms [F01.145.126]
- Neurobehavioral Manifestations [F01.700]
- Mental Disorders [F03]

In [105]:
from urllib.error import HTTPError

In [109]:
def fetch_and_parse_node(tree_number, retries=3, retry_delay=10):
    """Search the MeSH database for `tree_number` and parse every matching record.

    Parameters
    ----------
    tree_number : str
        Search term passed to ``Entrez.esearch`` (e.g. ``'F01.145.126'``).
    retries : int, optional
        Maximum number of attempts when an ``HTTPError`` occurs (default 3).
    retry_delay : int or float, optional
        Seconds to wait between attempts (default 10).

    Returns
    -------
    list of dict
        The output of ``parse_mesh_nodes`` for the fetched records, or an
        empty list when the search returns no IDs.

    Raises
    ------
    ValueError
        If `retries` is smaller than 1.
    urllib.error.HTTPError
        Re-raised when the last attempt also fails.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        try:
            # Search for the node using the Tree Number (at most 1000 IDs)
            handle = Entrez.esearch(db="mesh", term=tree_number, retmax=1000)
            try:
                search_results = Entrez.read(handle)
            finally:
                # Close the handle even when parsing the response fails
                handle.close()

            id_list = search_results['IdList']
            if not id_list:
                return []

            # Fetch all matching records in a single request
            handle = Entrez.efetch(db="mesh", id=','.join(id_list))
            try:
                records = handle.read()
            finally:
                handle.close()
            return parse_mesh_nodes(records)
        except HTTPError as e:
            if attempt < retries - 1:  # Not the last attempt yet
                print(f"Attempt {attempt + 1} failed with error {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"Attempt {attempt + 1} failed with error {e}. No more retries.")
                raise  # Re-raise the exception if all retries failed

# MeSH tree numbers of the root nodes to collect
tree_numbers = ['F01.145.126', 'F01.700', 'F03']

# Query every tree number and flatten the per-node results into one list
all_nodes_data = [
    node
    for tree_number in tree_numbers
    for node in fetch_and_parse_node(tree_number)
]

# One row per parsed MeSH record
df = pd.DataFrame(all_nodes_data)

                            Title  \
0             Behavioral Symptoms   
1  Neurobehavioral Manifestations   
2                Mental Disorders   

                                         Entry Terms  \
0  [Behavioral Symptom, Symptom, Behavioral, Symp...   
1  [Manifestation, Neurobehavioral, Manifestation...   
2  [Mental Disorder, Psychiatric Illness, Psychia...   

                                 All MeSH Categories  
0  [Psychiatry and Psychology Category, Behavior ...  
1  [Diseases Category, Nervous System Diseases, N...  
2  [Psychiatry and Psychology Category, Mental Di...  


In [113]:
# Turn each list entry of 'All MeSH Categories' into its own row
df_exploded = df.explode(column='All MeSH Categories')
# Keep only the first row seen for every distinct category name
df_no_duplicates = df_exploded.drop_duplicates(subset='All MeSH Categories', keep='first')


In [111]:
# Size of the exploded frame: one row per (record, category) pair
df_exploded.shape

(210, 3)

In [114]:
# Size after dropping rows whose category name was already seen
df_no_duplicates.shape

(196, 3)

In [115]:
# Persist the de-duplicated category table as CSV.
# NOTE(review): the DataFrame index is presumably written as the first column
# (no index=False is passed) and ./data presumably has to exist already — confirm.
df_no_duplicates.to_csv("./data/mesh_psychiatry_psychology.csv")

### Fetch MeSH Categories based on ID

In [98]:
id_list = ['68001526']

# No retmode is passed; the MeSH database answers with the plain-text record
# shown in the output below.
handle = Entrez.efetch(db="mesh", id=id_list)
try:
    records = handle.read()  # Read the whole response as a single string
finally:
    handle.close()  # Release the connection even if reading fails
records

'\n1: Behavioral Symptoms\nObservable manifestations of impaired psychological functioning.\nYear introduced: 1998\n\nSubheadings:\n    blood\n    cerebrospinal fluid\n    chemically induced\n    classification\n    complications\n    diagnosis\n    diagnostic imaging\n    diet therapy\n    drug therapy\n    economics\n    enzymology\n    epidemiology\n    ethnology\n    etiology\n    genetics\n    history\n    immunology\n    metabolism\n    microbiology\n    mortality\n    nursing\n    parasitology\n    pathology\n    physiopathology\n    prevention and control\n    psychology\n    rehabilitation\n    surgery\n    therapy\n    urine\n    virology\n\nTree Number(s): F01.145.126\nEntry Terms:\n    Behavioral Symptom\n    Symptom, Behavioral\n    Symptoms, Behavioral\n\n    All MeSH Categories\n        Psychiatry and Psychology Category\n            Behavior and Behavior Mechanisms\n                Behavior\n                    Behavioral Symptoms\n                        Aberrant Motor

In [99]:
# Parse the text fetched in the previous cell and display the resulting list of dicts
parse_mesh_nodes(records)

1: Behavioral Symptoms


[{'Title': 'Behavioral Symptoms',
  'Entry Terms': ['Behavioral Symptom',
   'Symptom, Behavioral',
   'Symptoms, Behavioral'],
  'All MeSH Categories': ['Psychiatry and Psychology Category',
   'Behavior and Behavior Mechanisms',
   'Behavior',
   'Behavioral Symptoms',
   'Aberrant Motor Behavior in Dementia',
   'Aggression',
   'Psychomotor Agitation',
   'Wandering Behavior',
   'Affective Symptoms',
   'Aggression',
   'Agonistic Behavior',
   'Bullying +',
   'Catatonia',
   'Child Reactive Disorders',
   'Delusions',
   'Depersonalization',
   'Depression',
   'Encopresis',
   'Enuresis',
   'Diurnal Enuresis',
   'Nocturnal Enuresis',
   'Hearing Loss, Functional',
   'Human Coprophagia',
   'Malingering',
   'Mental Fatigue',
   'Alert Fatigue, Health Personnel',
   'Compassion Fatigue',
   'Obsessive Behavior',
   'Stalking',
   'Paranoid Behavior',
   'Polydipsia, Psychogenic',
   'Problem Behavior',
   'Schizophrenic Language',
   'Self-Injurious Behavior',
   'Excoriation

### Search for Terminology

In [88]:
descriptor = "Depression"
# The [mh] tag limits the query to MeSH headings (see 'QueryTranslation' in the output)
handle = Entrez.esearch(db="mesh", term=f"{descriptor}[mh]", retmax=1000)
try:
    record = Entrez.read(handle)
finally:
    handle.close()  # Release the connection even if parsing fails
record

{'Count': '2', 'RetMax': '2', 'RetStart': '0', 'IdList': ['68003866', '68003863'], 'TranslationSet': [{'From': 'Depression[mh]', 'To': '"depressive disorder"[MeSH Terms] OR "depression"[MeSH Terms]'}], 'TranslationStack': [{'Term': '"depressive disorder"[MeSH Terms]', 'Field': 'MeSH Terms', 'Count': '1', 'Explode': 'N'}, {'Term': '"depression"[MeSH Terms]', 'Field': 'MeSH Terms', 'Count': '1', 'Explode': 'N'}, 'OR', 'GROUP'], 'QueryTranslation': '"depressive disorder"[MeSH Terms] OR "depression"[MeSH Terms]'}

In [89]:
id_list = ['68003866']

# No retmode is passed; the MeSH database answers with the plain-text record
# shown in the output below.
handle = Entrez.efetch(db="mesh", id=id_list)
try:
    records = handle.read()  # Read the whole response as a single string
finally:
    handle.close()  # Release the connection even if reading fails
records

'\n1: Depressive Disorder\nAn affective disorder manifested by either a dysphoric mood or loss of interest\nor pleasure in usual activities. The mood disturbance is prominent and relatively\npersistent.\nYear introduced: 1981\n\nSubheadings:\n    blood\n    cerebrospinal fluid\n    chemically induced\n    classification\n    complications\n    diagnosis\n    diagnostic imaging\n    diet therapy\n    drug therapy\n    economics\n    enzymology\n    epidemiology\n    ethnology\n    etiology\n    genetics\n    history\n    immunology\n    metabolism\n    microbiology\n    mortality\n    nursing\n    parasitology\n    pathology\n    physiopathology\n    prevention and control\n    psychology\n    rehabilitation\n    surgery\n    therapy\n    urine\n    virology\n\nTree Number(s): F03.600.300\nEntry Terms:\n    Depressive Disorders\n    Disorder, Depressive\n    Disorders, Depressive\n    Neurosis, Depressive\n    Depressive Neuroses\n    Depressive Neurosis\n    Neuroses, Depressive\n    D

In [90]:
# Parse the text fetched in the previous cell and display the resulting list of dicts
parse_mesh_nodes(records)

1: Depressive Disorder


[{'Title': 'Depressive Disorder',
  'Entry Terms': ['Depressive Disorders',
   'Disorder, Depressive',
   'Disorders, Depressive',
   'Neurosis, Depressive',
   'Depressive Neuroses',
   'Depressive Neurosis',
   'Neuroses, Depressive',
   'Depression, Endogenous',
   'Depressions, Endogenous',
   'Endogenous Depression',
   'Endogenous Depressions',
   'Depressive Syndrome',
   'Depressive Syndromes',
   'Syndrome, Depressive',
   'Syndromes, Depressive',
   'Depression, Neurotic',
   'Depressions, Neurotic',
   'Neurotic Depression',
   'Neurotic Depressions',
   'Melancholia',
   'Melancholias',
   'Unipolar Depression',
   'Depression, Unipolar',
   'Depressions, Unipolar',
   'Unipolar Depressions'],
  'All MeSH Categories': ['Psychiatry and Psychology Category',
   'Mental Disorders',
   'Mood Disorders',
   'Depressive Disorder',
   'Depression, Postpartum',
   'Depressive Disorder, Major',
   'Depressive Disorder, Treatment-Resistant',
   'Dysthymic Disorder',
   'Premenstrual 

In [57]:
# Scratch check: split the fetched text into record chunks (dropping the last chunk)
nodes_raw = records.split('\n\n\n')[:-1]
# Show the first chunk as [text before first newline, second line, remainder]
nodes_raw[0].split('\n', maxsplit=2)

['',
 '1: Depressive Disorder',
 'An affective disorder manifested by either a dysphoric mood or loss of interest\nor pleasure in usual activities. The mood disturbance is prominent and relatively\npersistent.\nYear introduced: 1981\n\nSubheadings:\n    blood\n    cerebrospinal fluid\n    chemically induced\n    classification\n    complications\n    diagnosis\n    diagnostic imaging\n    diet therapy\n    drug therapy\n    economics\n    enzymology\n    epidemiology\n    ethnology\n    etiology\n    genetics\n    history\n    immunology\n    metabolism\n    microbiology\n    mortality\n    nursing\n    parasitology\n    pathology\n    physiopathology\n    prevention and control\n    psychology\n    rehabilitation\n    surgery\n    therapy\n    urine\n    virology\n\nTree Number(s): F03.600.300\nEntry Terms:\n    Depressive Disorders\n    Disorder, Depressive\n    Disorders, Depressive\n    Neurosis, Depressive\n    Depressive Neuroses\n    Depressive Neurosis\n    Neuroses, Depressive

In [76]:
# NOTE: parse_mesh_nodes is also defined earlier in this notebook; whichever
# definition cell ran last is the one in effect.
def _mesh_section_lines(node, marker):
    """Return the stripped, non-empty lines that follow `marker` in `node`.

    The section ends at the first blank line after the marker. If the marker
    does not occur in `node`, an empty list is returned instead of raising.
    """
    parts = node.split(marker)
    if len(parts) < 2:
        return []
    section = parts[1].split('\n\n')[0]
    return [line.strip() for line in section.split('\n') if line.strip()]


def parse_mesh_nodes(text):
    """Parse the plain text of one or more MeSH records into a list of dicts.

    Parameters
    ----------
    text : str
        Raw text as read from an ``Entrez.efetch(db="mesh", ...)`` handle.
        Records are expected to be separated by ``'\\n\\n\\n'``.

    Returns
    -------
    list of dict
        One dict per record with the keys ``'Title'`` (str),
        ``'Entry Terms'`` (list of str) and ``'All MeSH Categories'``
        (list of str). Records whose header line contains
        "Supplementary Concept" are skipped. A record that lacks a section
        gets an empty list for that key.
    """
    nodes = []
    for node in text.split('\n\n\n'):
        # Skip empty chunks (e.g. the one after a trailing separator) instead of
        # blindly dropping the last chunk, which would lose a real record when
        # the text does not end with a separator.
        if not node.strip():
            continue

        # The header ("<n>: <title>") is the first non-empty line, whether or
        # not the chunk still carries its leading newline.
        title_section = node.lstrip('\n').split('\n', 1)[0]
        if "Supplementary Concept" in title_section:
            continue

        # Split only on the first ': ' so titles that contain ': ' stay intact;
        # fall back to the whole line if the delimiter is absent.
        _, found, node_title = title_section.partition(': ')
        if not found:
            node_title = title_section

        nodes.append({
            'Title': node_title.strip(),
            'Entry Terms': _mesh_section_lines(node, 'Entry Terms:\n'),
            'All MeSH Categories': _mesh_section_lines(node, 'All MeSH Categories\n'),
        })

    return nodes
