In [156]:
from Bio import Entrez
import pandas as pd
import time

In [101]:
def parse_mesh_nodes(text):
    # Split the text into nodes. The first element is empty because of the leading '\n', so we discard it.
    nodes_raw = text.split('\n\n\n')[0:-1]  # Ignore the last split as it's empty after the final '\n\n\n'
    
    nodes = []
    for node in nodes_raw:
        node_dict = {}
        # Extract the title
        title_section = node.split('\n', 2)[1]
        #print(title_section)
        if "Supplementary Concept" in title_section:
            continue
        node_title = title_section.split(': ')[1].strip()
        node_dict['Title'] = node_title
        
        # Extract Entry Terms
        entry_terms_section = node.split('Entry Terms:\n')[1].split('\n\n')[0]  # Split at 'Entry Terms:\n' and get the first part
        entry_terms_list = [term.strip() for term in entry_terms_section.split('\n') if term.strip()]  # Remove any empty lines
        node_dict['Entry Terms'] = entry_terms_list
        
        # Extract All MeSH Categories
        all_mesh_categories_section = node.split('All MeSH Categories\n')[1].split('\n\n')[0]  # Split at 'All MeSH Categories' and get the first part
        all_mesh_categories_list = [category.strip() for category in all_mesh_categories_section.split('\n') if category.strip()]  # Remove any empty lines
        node_dict['All MeSH Categories'] = all_mesh_categories_list
        
        nodes.append(node_dict)
    
    return nodes

# Search from Start Nodes
1. Perform a search for each node using the Entrez E-utilities,
2. extract the ID list from the search results, fetch the records for those IDs,
3. parse the fetched records with a function you'll define (parse_mesh_nodes), and
4. finally concatenate the results into a pandas DataFrame.
 
Relevant nodes: 
- Behavioral Symptoms [F01.145.126]
- Neurobehavioral Manifestations [F01.700]
- Mental Disorders [F03]

In [105]:
from urllib.error import HTTPError

In [109]:
def fetch_and_parse_node(tree_number):
    retries = 3  # Maximum number of retries
    for attempt in range(retries):
        try:
            # Search for the node using the Tree Number
            handle = Entrez.esearch(db="mesh", term=tree_number, retmax=1000)
            search_results = Entrez.read(handle)
            handle.close()
            
            # Extract the ID list from the results
            id_list = search_results['IdList']
            
            # Use the ID list to fetch the records
            if id_list:
                handle = Entrez.efetch(db="mesh", id=','.join(id_list))
                records = handle.read()
                handle.close()
                return parse_mesh_nodes(records)
            else:
                return []
        except HTTPError as e:
            if attempt < retries - 1:  # Check if it's not the last attempt
                print(f"Attempt {attempt + 1} failed with error {e}. Retrying in 10 seconds...")
                time.sleep(10)  # Wait for 10 seconds before retrying
            else:
                print(f"Attempt {attempt + 1} failed with error {e}. No more retries.")
                raise  # Re-raise the exception if all retries failed

# Tree Numbers for the nodes
tree_numbers = ['F01.145.126','F01.700', 'F03'] #, 'F01.700', 'F03'

# Fetch, parse, and aggregate data for all nodes
all_nodes_data = []
for tree_number in tree_numbers:
    node_data = fetch_and_parse_node(tree_number)
    all_nodes_data.extend(node_data)

# Convert the aggregated data into a pandas DataFrame
df = pd.DataFrame(all_nodes_data)

print(df)

                            Title  \
0             Behavioral Symptoms   
1  Neurobehavioral Manifestations   
2                Mental Disorders   

                                         Entry Terms  \
0  [Behavioral Symptom, Symptom, Behavioral, Symp...   
1  [Manifestation, Neurobehavioral, Manifestation...   
2  [Mental Disorder, Psychiatric Illness, Psychia...   

                                 All MeSH Categories  
0  [Psychiatry and Psychology Category, Behavior ...  
1  [Diseases Category, Nervous System Diseases, N...  
2  [Psychiatry and Psychology Category, Mental Di...  


In [113]:
df_exploded = df.explode('All MeSH Categories')
df_no_duplicates = df_exploded.drop_duplicates(subset=['All MeSH Categories'])


In [111]:
df_exploded.shape

(210, 3)

In [114]:
df_no_duplicates.shape

(196, 3)

In [152]:
df_no_duplicates.head()

Unnamed: 0,Title,Entry Terms,All MeSH Categories
0,Behavioral Symptoms,"[Behavioral Symptom, Symptom, Behavioral, Symp...",Psychiatry and Psychology Category
0,Behavioral Symptoms,"[Behavioral Symptom, Symptom, Behavioral, Symp...",Behavior and Behavior Mechanisms
0,Behavioral Symptoms,"[Behavioral Symptom, Symptom, Behavioral, Symp...",Behavior
0,Behavioral Symptoms,"[Behavioral Symptom, Symptom, Behavioral, Symp...",Behavioral Symptoms
0,Behavioral Symptoms,"[Behavioral Symptom, Symptom, Behavioral, Symp...",Aberrant Motor Behavior in Dementia


In [115]:
df_no_duplicates.to_csv("./data/mesh_psychiatry_psychology.csv")

# Search for Terminology Synonyms

## load reviewed terminology list

In [120]:
reviewed_mesh_terms = pd.read_csv("data/neuro_diseases_terminology/mesh_psychiatry_psychology_BVI.csv")
reviewed_mesh_terms.shape

(193, 4)

In [139]:
reviewed_mesh_terms_included = reviewed_mesh_terms[reviewed_mesh_terms['1=included, 0=excluded']==1]
reviewed_mesh_terms_included.shape

(185, 4)

In [142]:
reviewed_mesh_terms_included.columns = ["ignore", "mesh_category", "term", "inclusion_flag"]
reviewed_mesh_terms_included['term'] = reviewed_mesh_terms_included['term'].str.replace('+', '', regex=False).str.strip()
reviewed_mesh_terms_included = reviewed_mesh_terms_included[['mesh_category','term']]
reviewed_mesh_terms_included.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviewed_mesh_terms_included['term'] = reviewed_mesh_terms_included['term'].str.replace('+', '', regex=False).str.strip()


Unnamed: 0,mesh_category,term
1,Behavioral Symptoms,Aberrant Motor Behavior in Dementia
2,Behavioral Symptoms,Aggression
3,Behavioral Symptoms,Psychomotor Agitation
4,Behavioral Symptoms,Wandering Behavior
5,Behavioral Symptoms,Affective Symptoms


In [144]:
manual_terms = pd.read_csv("data/neuro_diseases_terminology/bvi_manual_list.csv", header=0, names=['mesh_category','term'])
manual_terms['term'] = manual_terms['term'].str.replace(r'\([^)]*\)', '', regex=True).str.strip()
manual_terms.head(5)

Unnamed: 0,mesh_category,term
0,Psychiatry/mental disorders,Depression
1,Psychiatry/mental disorders,Anxiety disorders
2,Psychiatry/mental disorders,Bipolar disorder
3,Psychiatry/mental disorders,Obsessive-compulsive disorder
4,Psychiatry/mental disorders,Post-traumatic stress disorder


In [147]:
terms_list_combined = pd.concat([reviewed_mesh_terms_included, manual_terms], ignore_index=True)

In [149]:
terms_list_combined.shape

(225, 2)

In [151]:
terms_list_combined_no_duplicates = terms_list_combined.drop_duplicates(subset=['All MeSH Categories'])
terms_list_combined.shape

(221, 2)

In [170]:
last_30_records = terms_list_combined_no_duplicates.tail(30)
last_30_records

Unnamed: 0,mesh_category,term
195,Psychiatry/mental disorders,Phobias
196,Psychiatry/mental disorders,Borderline personality disorder
197,Psychiatry/mental disorders,Narcissistic personality disorder
198,Psychiatry/mental disorders,Antisocial personality disorder
199,Psychiatry/mental disorders,Dissociative disorders
200,Psychiatry/mental disorders,Substance use disorders
201,Psychiatry/mental disorders,Somatoform disorders
202,Psychiatry/mental disorders,Impulse-control disorders
203,Psychiatry/mental disorders,Sleep disorders
204,Psychiatry/mental disorders,Adjustment disorders


In [171]:
# Process each term in the DataFrame
nodes_list_30 = []
for term in last_30_records['term']:
    print(term)
    handle = Entrez.esearch(db="mesh", term=f"{term}[mh]", retmax=1000)
    record = Entrez.read(handle)
    id_list = record['IdList']
    retries = 3  # Maximum number of retries
    if id_list:
        for attempt in range(retries):
            try:
                handle = Entrez.efetch(db="mesh", id=id_list)
                records = handle.read()
                try:
                    nodes = parse_mesh_nodes(records)
                    nodes_list_30.extend(nodes)
                    break  # Success, so break out of the retry loop
                except Exception as parse_error:
                    # Handle parsing error specifically
                    print(f"Error parsing records for term '{term}'. Error: {parse_error}")
                    print(f"Problematic records:\n{records}")
                    break  # Break out of the retry loop, but continue with next term
            except HTTPError as e:
                if attempt < retries - 1:  # Check if it's not the last attempt
                    print(f"Attempt {attempt + 1} failed with error {e}. Retrying in 15 seconds...")
                    time.sleep(15)  # Wait for 10 seconds before retrying
                else:
                    print(f"Attempt {attempt + 1} failed with error {e}. No more retries.")
                    raise  # Re-raise the exception if all retries failed
        

# Assuming `nodes_list` is a list of dictionaries where each dictionary represents parsed information from each MeSH record
final_df_30 = pd.DataFrame(nodes_list_30)

# Display the final DataFrame
print(final_df_30)

Phobias
Borderline personality disorder
Narcissistic personality disorder
Antisocial personality disorder
Dissociative disorders
Substance use disorders
Somatoform disorders
Impulse-control disorders
Sleep disorders
Adjustment disorders
Migraine, headache
Epilepsy
Stroke
Alzheimer's disease
Parkinson's disease
Multiple sclerosis
Amyotrophic lateral sclerosis
Huntington's disease
Bell's palsy
Peripheral neuropathy
Guillain-Barré syndrome
Cerebral palsy
Duchenne muscular systrophy
Brain tumors, e.g., glioma, glioblastoma, meningioma/meningeoma
Spinal cord injury
Hydrocephalus
Meningitis
Encephalitis
Restless legs syndrome
Trigeminal neuralgia
                                                Title  \
0                                    Phobic Disorders   
1                     Borderline Personality Disorder   
2                   Narcissistic Personality Disorder   
3                     Antisocial Personality Disorder   
4                              Dissociative Disorders   
5        

In [163]:
final_df_part1 = pd.DataFrame(nodes_list)

In [166]:
final_df_part1.tail()

Unnamed: 0,Title,Entry Terms,All MeSH Categories
177,"Stress Disorders, Post-Traumatic","[Post-Traumatic Stress Disorder, Stress Disord...","[Psychiatry and Psychology Category, Mental Di..."
178,Feeding and Eating Disorders,"[Eating and Feeding Disorders, Feeding Disorde...","[Psychiatry and Psychology Category, Mental Di..."
179,Autism Spectrum Disorder,"[Autism Spectrum Disorders, Autistic Spectrum ...","[Psychiatry and Psychology Category, Mental Di..."
180,Panic Disorder,"[Disorder, Panic, Disorders, Panic, Panic Diso...","[Psychiatry and Psychology Category, Mental Di..."
181,Phobic Disorders,"[Disorder, Phobic, Phobic Disorder, Neuroses, ...","[Psychiatry and Psychology Category, Mental Di..."


In [232]:
processed_terms_combined = pd.concat([final_df_part1, final_df_30], ignore_index=True)

# Extract the first element from 'All MeSH Categories' and rename the column to 'Disease Class'
processed_terms_combined['Disease Class'] = processed_terms_combined['All MeSH Categories'].apply(lambda x: x[0] if x else None)

# Keep the 'Title', 'Entry Terms', and 'Disease Class' columns
processed_terms_combined = processed_terms_combined[['Title', 'Entry Terms', 'Disease Class']]

# Explode the 'Entry Terms' column to create separate rows for each term
exploded_df = processed_terms_combined.explode('Entry Terms')

# Combine 'Title' and exploded 'Entry Terms' into one column, while keeping 'Disease Class' associated
combined_terms = pd.concat([
    exploded_df[['Entry Terms', 'Disease Class']].rename(columns={'Entry Terms': 'Neurological Disease'}),
    processed_terms_combined[['Title', 'Disease Class']].rename(columns={'Title': 'Neurological Disease'})
], ignore_index=True)

In [233]:
combined_terms[combined_terms['Neurological Disease']=="Depression"]

Unnamed: 0,Neurological Disease,Disease Class
2899,Depression,Psychiatry and Psychology Category


In [194]:
combined_terms['Source'] = "MeSH"

In [195]:
combined_terms.head()

Unnamed: 0,Neurological Disease,Disease Class,Source
0,Disturbance in Motor Function in Dementia,Psychiatry and Psychology Category,MeSH
1,Aberrant Motor Behaviors in Dementia,Psychiatry and Psychology Category,MeSH
2,Disturbances in Motor Function in Dementia,Psychiatry and Psychology Category,MeSH
3,Agitation in Dementia,Psychiatry and Psychology Category,MeSH
4,Dementia Agitation,Psychiatry and Psychology Category,MeSH


In [196]:
combined_terms.to_csv("data/neuro_diseases_terminology/diseases_dictionary_mesh_psych.csv")

### terms not processed successfully

In [175]:
df2_renamed = terms_list_combined_no_duplicates.rename(columns={'term': 'Title'})
df2_renamed.head()

Unnamed: 0,mesh_category,Title
0,Behavioral Symptoms,Aberrant Motor Behavior in Dementia
1,Behavioral Symptoms,Aggression
2,Behavioral Symptoms,Psychomotor Agitation
3,Behavioral Symptoms,Wandering Behavior
4,Behavioral Symptoms,Affective Symptoms


In [176]:
df1 = processed_terms_combined.copy()
# Perform an outer join and use an indicator to track the source of each row
merged_df = pd.merge(df1, df2_renamed, on='Title', how='outer', indicator=True)

# Filter to keep only the rows from df2 (where the merge indicator is 'right_only')
non_matching_rows_df2 = merged_df[merged_df['_merge'] == 'right_only']

# Drop the merge indicator column and any columns from df1 that were added to the result
final_df = non_matching_rows_df2.drop(columns=['_merge'] + list(df1.columns.difference(['Title'])))


In [177]:
final_df

Unnamed: 0,Title,mesh_category
3,Adjustment disorders,Psychiatry/mental disorders
11,Alice in Wonderland Syndrome,Neurobehavioral Manifestations
14,Alzheimer's disease,Neurology
18,Amyotrophic lateral sclerosis,Neurology
23,Antisocial personality disorder,Psychiatry/mental disorders
26,Anxiety disorders,Psychiatry/mental disorders
30,Attention-deficit/hyperactivity disorder,Psychiatry/mental disorders
33,Autism spectrum disorder,Psychiatry/mental disorders
36,Bell's palsy,Neurology
39,Bipolar and Related Disorders,Mental Disorders


## combine with icd and mesh dictionary

In [205]:
psych_df = pd.read_csv("data/neuro_diseases_terminology/diseases_dictionary_mesh_psych.csv",index_col=0)

In [206]:
psych_df.shape

(3097, 3)

In [207]:
psych_df.head(5)

Unnamed: 0,Neurological Disease,Disease Class,Source
0,Disturbance in Motor Function in Dementia,Psychiatry and Psychology Category,MeSH
1,Aberrant Motor Behaviors in Dementia,Psychiatry and Psychology Category,MeSH
2,Disturbances in Motor Function in Dementia,Psychiatry and Psychology Category,MeSH
3,Agitation in Dementia,Psychiatry and Psychology Category,MeSH
4,Dementia Agitation,Psychiatry and Psychology Category,MeSH


In [208]:
neuro_df = pd.read_csv("data/neuro_diseases_terminology/diseases_dictionary_mesh_icd_flat.csv")

In [209]:
neuro_df.shape

(16519, 3)

In [210]:
neuro_df.head()

Unnamed: 0,Neurological Disease,Source,Disease Class
0,"1-methyl-4-phenyl-1,2,3,6-tetrahydropyridine i...",ICD,Diseases of the nervous system
1,11p Partial Monosomy Syndrome,MeSH,Neurologic Manifestations
2,"3s, Spinocerebellar Ataxia",MeSH,Neurodegenerative Diseases
3,"47,XX,+21",MeSH,Neurologic Manifestations
4,"47,XY,+21",MeSH,Neurologic Manifestations


In [211]:
combined_all_df = pd.concat([neuro_df, psych_df], ignore_index=True)

In [212]:
combined_all_df.shape

(19616, 3)

In [213]:
combined_all_df.head()

Unnamed: 0,Neurological Disease,Source,Disease Class
0,"1-methyl-4-phenyl-1,2,3,6-tetrahydropyridine i...",ICD,Diseases of the nervous system
1,11p Partial Monosomy Syndrome,MeSH,Neurologic Manifestations
2,"3s, Spinocerebellar Ataxia",MeSH,Neurodegenerative Diseases
3,"47,XX,+21",MeSH,Neurologic Manifestations
4,"47,XY,+21",MeSH,Neurologic Manifestations


In [214]:
combined_all_df_no_dups = combined_all_df.drop_duplicates(subset=['Neurological Disease'])
combined_all_df_no_dups.shape

(18830, 3)

In [215]:
combined_all_df_no_dups[combined_all_df_no_dups['Neurological Disease'] == 'Depression']

Unnamed: 0,Neurological Disease,Source,Disease Class
19418,Depression,MeSH,Psychiatry and Psychology Category


In [216]:
combined_all_df_no_dups.to_csv("data/neuro_diseases_terminology/diseases_dictionary_mesh_icd_flat_20240313.csv")

### hieararchical representation

In [256]:
processed_terms_combined = pd.concat([final_df_part1, final_df_30], ignore_index=True)

# Extract the first element from 'All MeSH Categories' and rename the column to 'Disease Class'
processed_terms_combined['Disease Class'] = processed_terms_combined['All MeSH Categories'].apply(lambda x: x[0] if x else None)

# Keep the 'Title', 'Entry Terms', and 'Disease Class' columns
processed_terms_combined = processed_terms_combined[['Title', 'Entry Terms', 'Disease Class']]

In [257]:
processed_terms_combined = processed_terms_combined.rename(columns={'Title': 'MeSH Common name', 'Entry Terms': 'MeSH Synonyms', 'Disease Class': 'MeSH Disease Class'})
processed_terms_combined['MeSH Synonyms'] = processed_terms_combined['MeSH Synonyms'].apply(lambda x: " | ".join(x))
processed_terms_combined.head()

Unnamed: 0,MeSH Common name,MeSH Synonyms,MeSH Disease Class
0,Aberrant Motor Behavior in Dementia,Disturbance in Motor Function in Dementia | Ab...,Psychiatry and Psychology Category
1,Aggression,Aggressions,Psychiatry and Psychology Category
2,Psychomotor Agitation,"Psychomotor Hyperactivity | Hyperactivity, Psy...",Diseases Category
3,Wandering Behavior,"Behavior, Wandering",Psychiatry and Psychology Category
4,Affective Symptoms,"Affective Symptom | Symptom, Affective | Sympt...",Psychiatry and Psychology Category


In [258]:
neuro_df_hierarachy = pd.read_csv("data/neuro_diseases_terminology/diseases_dictionary_mesh_icd.csv")

In [259]:
neuro_df_hierarachy.head()

Unnamed: 0,ICD Node URI,ICD Parent URI,Mesh ID,MeSH Tree Number,ICD Title,MeSH Common name,MeSH Disease Class,ICD Disease Class,MeSH Synonyms
0,,,D000069279,C10.228.140.490.125,,Drug Resistant Epilepsy,Central Nervous System Diseases,,"Refractory Epilepsy, Drug| Epilepsies, Drug Re..."
1,,,D000069281,C10.228.140.617.738.275.500,,Autoimmune Hypophysitis,Central Nervous System Diseases,,"Autoimmune Hypophysitides| Hypophysitis, Lymph..."
2,,,D000069544,C10.228.228.399| C10.228.140.430.520,,Infectious Encephalitis,Central Nervous System Diseases,,"Infectious Encephalitis| Encephalitis, Infectious"
3,,,D000070607,C10.668.829.600.375,,Morton Neuroma,Neuromuscular Diseases,,"Morton's Metatarsalgia| Neuroma, Morton| Neuro..."
4,http://id.who.int/icd/entity/1289874444,http://id.who.int/icd/entity/699015578,D000070624,C10.228.140.199.444.375| C10.900.300.087.235.375,Brain contusion,Brain Contusion,Central Nervous System Diseases,Diseases of the nervous system,"Contusion, Brain| Contusio Cerebri| Brain Cont..."


In [260]:
neuro_df_hierarachy.shape, processed_terms_combined.shape

((8130, 9), (209, 3))

In [261]:
processed_terms_combined[processed_terms_combined['MeSH Common name']=='Depression']

Unnamed: 0,MeSH Common name,MeSH Synonyms,MeSH Disease Class
11,Depression,Depressive Symptoms | Depressive Symptom | Sym...,Psychiatry and Psychology Category


In [262]:
combined_all_df_hierarchical = pd.concat([neuro_df_hierarachy, processed_terms_combined], ignore_index=True)

In [263]:
combined_all_df_hierarchical.head()

Unnamed: 0,ICD Node URI,ICD Parent URI,Mesh ID,MeSH Tree Number,ICD Title,MeSH Common name,MeSH Disease Class,ICD Disease Class,MeSH Synonyms
0,,,D000069279,C10.228.140.490.125,,Drug Resistant Epilepsy,Central Nervous System Diseases,,"Refractory Epilepsy, Drug| Epilepsies, Drug Re..."
1,,,D000069281,C10.228.140.617.738.275.500,,Autoimmune Hypophysitis,Central Nervous System Diseases,,"Autoimmune Hypophysitides| Hypophysitis, Lymph..."
2,,,D000069544,C10.228.228.399| C10.228.140.430.520,,Infectious Encephalitis,Central Nervous System Diseases,,"Infectious Encephalitis| Encephalitis, Infectious"
3,,,D000070607,C10.668.829.600.375,,Morton Neuroma,Neuromuscular Diseases,,"Morton's Metatarsalgia| Neuroma, Morton| Neuro..."
4,http://id.who.int/icd/entity/1289874444,http://id.who.int/icd/entity/699015578,D000070624,C10.228.140.199.444.375| C10.900.300.087.235.375,Brain contusion,Brain Contusion,Central Nervous System Diseases,Diseases of the nervous system,"Contusion, Brain| Contusio Cerebri| Brain Cont..."


In [264]:
combined_all_df_hierarchical.shape

(8339, 9)

In [265]:
combined_all_df_hierarchical.to_csv("data/neuro_diseases_terminology/diseases_dictionary_mesh_icd_2024.csv")

In [266]:
combined_all_df_hierarchical[combined_all_df_hierarchical['MeSH Common name']=='Depression']

Unnamed: 0,ICD Node URI,ICD Parent URI,Mesh ID,MeSH Tree Number,ICD Title,MeSH Common name,MeSH Disease Class,ICD Disease Class,MeSH Synonyms
8141,,,,,,Depression,Psychiatry and Psychology Category,,Depressive Symptoms | Depressive Symptom | Sym...
