# About this notebook:

- We are creating dictionaries for subsequent article labelling.
- We will be using BeautifulSoup to scrape the [MedlinePlus](https://medlineplus.gov) website.

**About MedlinePlus**<br>
MedlinePlus is an online information service produced by the United States National Library of Medicine (NLM). We use its glossary to find the terms associated with each disease group.

# Import Libraries

In [1]:
import numpy as np 
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import re


from tqdm import tqdm
tqdm.pandas()

In [2]:
from bs4 import BeautifulSoup
import requests
import json


# Part 1.1: Obtaining disease type and its url

In [3]:
# Download the MedlinePlus health-topics index page and parse it for the cells below.
url = "https://medlineplus.gov/healthtopics.html"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')


In [4]:
# Inspect the parsed index page (the bare `soup` repr prints the whole HTML document).
soup

<!DOCTYPE html>
<html class="nojs us" data-root="https://medlineplus.gov/" id="health_topics_landing" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="_top" http-equiv="window-target"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://medlineplus.gov/healthtopics.html" rel="canonical"/>
<link href="https://medlineplus.gov/healthtopics.html" hreflang="en" rel="alternate"/>
<link href="https://medlineplus.gov/spanish/healthtopics.html" hreflang="es" rel="alternate"/>
<meta content="medlineplus-ac-dictionary" name="ac-dictionary"/>
<meta content="Information on symptoms, causes, treatment and prevention for over 1000 diseases, illnesses, health conditions and wellness issues." name="description"/>
<link href="https://medlineplus.gov/images/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="http

In [5]:
 # Take the second "section-body" <div> of the index page; its <li> entries are
 # the groups scraped in Part 1 (first entry: 'Blood, Heart and Circulation').
 diseas_grp = soup.find_all('div', class_='section-body')[1]

In [6]:
# Collect the <li> entries of the second "section-body" <div>.
# They are derived from `soup` rather than from `diseas_grp` itself so that this
# cell can be re-run safely: the original `diseas_grp = diseas_grp.find_all('li')`
# fails on a second run because `diseas_grp` is by then a ResultSet, not a Tag.
diseas_grp = soup.find_all('div', {'class':'section-body'})[1].find_all('li')

In [7]:
# Sanity check: the first entry's text, with newlines removed, is the group's display name.
diseas_grp[0].text.replace('\n','')

'Blood, Heart and Circulation'

In [8]:
# Sanity check: the first entry's <a> tag carries the absolute URL of the group's page.
diseas_grp[0].find('a')['href']

'https://medlineplus.gov/bloodheartandcirculation.html'

In [9]:
# One record per group: its display name (newlines removed) and the URL of its page.
dict_ls = [
    {
        'disease_type': group_li.text.replace('\n', ''),
        'url': group_li.find('a')['href'],
    }
    for group_li in diseas_grp
]
dict_ls

[{'disease_type': 'Blood, Heart and Circulation',
  'url': 'https://medlineplus.gov/bloodheartandcirculation.html'},
 {'disease_type': 'Bones, Joints and Muscles',
  'url': 'https://medlineplus.gov/bonesjointsandmuscles.html'},
 {'disease_type': 'Brain and Nerves',
  'url': 'https://medlineplus.gov/brainandnerves.html'},
 {'disease_type': 'Digestive System',
  'url': 'https://medlineplus.gov/digestivesystem.html'},
 {'disease_type': 'Ear, Nose and Throat',
  'url': 'https://medlineplus.gov/earnoseandthroat.html'},
 {'disease_type': 'Endocrine System',
  'url': 'https://medlineplus.gov/endocrinesystem.html'},
 {'disease_type': 'Eyes and Vision',
  'url': 'https://medlineplus.gov/eyesandvision.html'},
 {'disease_type': 'Immune System',
  'url': 'https://medlineplus.gov/immunesystem.html'},
 {'disease_type': 'Kidneys and Urinary System',
  'url': 'https://medlineplus.gov/kidneysandurinarysystem.html'},
 {'disease_type': 'Lungs and Breathing',
  'url': 'https://medlineplus.gov/lungsandbrea

In [10]:
# Tabulate the scraped groups: one row per group, columns `disease_type` and `url`.
Disease_df = pd.DataFrame.from_records(dict_ls)
Disease_df

Unnamed: 0,disease_type,url
0,"Blood, Heart and Circulation",https://medlineplus.gov/bloodheartandcirculati...
1,"Bones, Joints and Muscles",https://medlineplus.gov/bonesjointsandmuscles....
2,Brain and Nerves,https://medlineplus.gov/brainandnerves.html
3,Digestive System,https://medlineplus.gov/digestivesystem.html
4,"Ear, Nose and Throat",https://medlineplus.gov/earnoseandthroat.html
5,Endocrine System,https://medlineplus.gov/endocrinesystem.html
6,Eyes and Vision,https://medlineplus.gov/eyesandvision.html
7,Immune System,https://medlineplus.gov/immunesystem.html
8,Kidneys and Urinary System,https://medlineplus.gov/kidneysandurinarysyste...
9,Lungs and Breathing,https://medlineplus.gov/lungsandbreathing.html


# Part 1.2: Obtaining terms associated with each disease type

In [18]:
# Add an empty (all-NaN) `Associated_words` column; it is filled in after scraping.
Disease_df = Disease_df.assign(Associated_words=np.nan)
Disease_df

Unnamed: 0,disease_type,url,Associated_words
0,"Blood, Heart and Circulation",https://medlineplus.gov/bloodheartandcirculati...,
1,"Bones, Joints and Muscles",https://medlineplus.gov/bonesjointsandmuscles....,
2,Brain and Nerves,https://medlineplus.gov/brainandnerves.html,
3,Digestive System,https://medlineplus.gov/digestivesystem.html,
4,"Ear, Nose and Throat",https://medlineplus.gov/earnoseandthroat.html,
5,Endocrine System,https://medlineplus.gov/endocrinesystem.html,
6,Eyes and Vision,https://medlineplus.gov/eyesandvision.html,
7,Immune System,https://medlineplus.gov/immunesystem.html,
8,Kidneys and Urinary System,https://medlineplus.gov/kidneysandurinarysyste...,
9,Lungs and Breathing,https://medlineplus.gov/lungsandbreathing.html,


In [19]:
# Build {group name: [topic names]} by scraping each group's MedlinePlus page.
Associated_words_dictionary = {}

# Everything from the first "see" onward is removed from an entry's text.
# Compiled once here instead of on every loop iteration.
# NOTE(review): the pattern is not word-anchored, so a topic whose name merely
# contains the letters "see" would also be cut short -- confirm against the page
# markup before tightening it.
see_suffix = re.compile(r'see.*')

for index, row in tqdm(Disease_df.iterrows(), total=len(Disease_df)):
    # Step 1: download and parse the group's page. The timeout keeps a stalled
    # connection from hanging the loop; raise_for_status() stops an HTTP error
    # page from being parsed into a silently empty term list.
    url = row['url']
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'lxml')

    # Step 2: every <li class="item"> is one entry. Remove newlines and outer
    # whitespace, then cut the "see ..." remainder.
    _ls = soup.find_all('li', {'class': 'item'})
    _synonym_ls = [
        see_suffix.sub("", item.text.replace('\n', '').strip()).strip()
        for item in _ls
    ]

    # Step 3: store the terms in the dictionary under the group's name.
    name = row['disease_type']
    Associated_words_dictionary[f'{name}'] = _synonym_ls

Associated_words_dictionary

100%|██████████| 14/14 [00:19<00:00,  1.43s/it]


{'Blood, Heart and Circulation': ['AAA',
  'Abdominal Aortic Aneurysm',
  'ABO Blood Groups',
  'Acute Lymphoblastic Leukemia',
  'Acute Lymphocytic Leukemia',
  'Acute Myeloblastic Leukemia',
  'Acute Myeloid Leukemia',
  'AF',
  'ALL',
  'AML',
  'Anatomy',
  'Anemia',
  'Aneurysms',
  'Angiitis',
  'Angina',
  'Angioplasty',
  'Anti-platelet drugs',
  'Anticoagulants',
  'Antihypertensive Medicines',
  'Aortic Aneurysm',
  'Aortic Dissection',
  'Aortic Stenosis',
  'Aplastic Anemia',
  'Arrhythmia',
  'Arteriosclerosis',
  'Arteriosclerosis of Extremities',
  'Arteriosclerosis, Coronary',
  'Arteriovenous Malformations',
  'Atherosclerosis',
  'Atherosclerosis, Coronary',
  'Atrial Fibrillation',
  'Automated External Defibrillators',
  'AVM',
  "Behcet's Syndrome",
  'Berry Aneurysm',
  'Bleeding',
  'Bleeding Disorders',
  'Blood',
  'Blood Cells',
  'Blood Clots',
  'Blood Coagulation Disorders',
  'Blood Count Tests',
  'Blood Disorders',
  'Blood Donation',
  'Blood Platelet D

In [20]:
# Look up each group's scraped term list by its `disease_type` name.
Disease_df = Disease_df.assign(
    Associated_words=Disease_df['disease_type'].map(Associated_words_dictionary)
)
Disease_df

Unnamed: 0,disease_type,url,Associated_words
0,"Blood, Heart and Circulation",https://medlineplus.gov/bloodheartandcirculati...,"[AAA, Abdominal Aortic Aneurysm, ABO Blood Gro..."
1,"Bones, Joints and Muscles",https://medlineplus.gov/bonesjointsandmuscles....,"[Achilles Tendon Injuries, Achondroplasia, ACL..."
2,Brain and Nerves,https://medlineplus.gov/brainandnerves.html,"[Acoustic Neuroma, Acromegaly, Acute Flaccid M..."
3,Digestive System,https://medlineplus.gov/digestivesystem.html,"[Abdominal Pain, Achalasia, Acid Reflux, Acute..."
4,"Ear, Nose and Throat",https://medlineplus.gov/earnoseandthroat.html,"[Achalasia, Acoustic Neuroma, Adenoidectomy, A..."
5,Endocrine System,https://medlineplus.gov/endocrinesystem.html,"[Achondroplasia, Acromegaly, Addison Disease, ..."
6,Eyes and Vision,https://medlineplus.gov/eyesandvision.html,"[Age-Related Macular Degeneration, Amblyopia, ..."
7,Immune System,https://medlineplus.gov/immunesystem.html,"[Acquired Immunodeficiency Syndrome, Acute Lym..."
8,Kidneys and Urinary System,https://medlineplus.gov/kidneysandurinarysyste...,"[Anatomy, Bladder Cancer, Bladder Control, Bla..."
9,Lungs and Breathing,https://medlineplus.gov/lungsandbreathing.html,"[Acute Bronchitis, Acute Respiratory Distress ..."


In [21]:
# Group names in row order; paired positionally with `PubMed_disease` further down.
disease_ls = Disease_df['disease_type'].tolist()

In [22]:
# Load the MeSH term table from the working directory and preview its first rows.
mesh_df = pd.read_csv(filepath_or_buffer="MeSH_terms.csv")
mesh_df.head(5)

Unnamed: 0,Term,term_full_ID,Pri_Disease_ID,Sec_Disease_ID,Pri_disease_term,Sec_disease_term
0,Infections,C01,C01,C01,Infections,
1,"Aneurysm, Infected",C01.069,C01,C01.069,Infections,"Aneurysm, Infected"
2,"Arthritis, Infectious",C01.100,C01,C01.100,Infections,"Arthritis, Infectious"
3,"Arthritis, Reactive",C01.100.500,C01,C01.100,Infections,"Arthritis, Infectious"
4,Asymptomatic Infections,C01.125,C01,C01.125,Infections,Asymptomatic Infections


# Part 1.3: Matching each disease type to PubMed Primary diseases

In [23]:
# Keep only the rows whose tree ID is shorter than four characters (e.g. "C01"),
# then map each of those IDs to its term.
is_short_id = mesh_df["term_full_ID"].str.len() < 4
_df = mesh_df[is_short_id]
Pri_disease_dict = {
    tree_id: term
    for tree_id, term in zip(_df['term_full_ID'], _df['Term'])
}
Pri_disease_dict

{'C01': 'Infections',
 'C04': 'Neoplasms',
 'C05': 'Musculoskeletal Diseases',
 'C06': 'Digestive System Diseases',
 'C07': 'Stomatognathic Diseases',
 'C08': 'Respiratory Tract Diseases',
 'C09': 'Otorhinolaryngologic Diseases',
 'C10': 'Nervous System Diseases',
 'C11': 'Eye Diseases',
 'C12': 'Urogenital Diseases',
 'C14': 'Cardiovascular Diseases',
 'C15': 'Hemic and Lymphatic Diseases',
 'C16': 'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'C17': 'Skin and Connective Tissue Diseases',
 'C18': 'Nutritional and Metabolic Diseases',
 'C19': 'Endocrine System Diseases',
 'C20': 'Immune System Diseases',
 'C21': 'Disorders of Environmental Origin',
 'C22': 'Animal Diseases',
 'C23': 'Pathological Conditions, Signs and Symptoms',
 'C24': 'Occupational Diseases',
 'C25': 'Chemically-Induced Disorders',
 'C26': 'Wounds and Injuries'}

In [24]:
# Distinct primary disease terms, in order of first appearance in the table.
Pri_disease = mesh_df['Pri_disease_term'].unique().tolist()
Pri_disease

['Infections',
 'Neoplasms',
 'Musculoskeletal Diseases',
 'Digestive System Diseases',
 'Stomatognathic Diseases',
 'Respiratory Tract Diseases',
 'Otorhinolaryngologic Diseases',
 'Nervous System Diseases',
 'Eye Diseases',
 'Urogenital Diseases',
 'Cardiovascular Diseases',
 'Hemic and Lymphatic Diseases',
 'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'Skin and Connective Tissue Diseases',
 'Nutritional and Metabolic Diseases',
 'Endocrine System Diseases',
 'Immune System Diseases',
 'Disorders of Environmental Origin',
 'Animal Diseases',
 'Pathological Conditions, Signs and Symptoms',
 'Occupational Diseases',
 'Chemically-Induced Disorders',
 'Wounds and Injuries']

In [25]:
# Hand-picked primary disease category for each MedlinePlus group.
# The list is paired POSITIONALLY with `disease_ls` in the next cell, so the order
# here must match the row order of `Disease_df`; the trailing comments name the
# group each entry is paired with.
# NOTE(review): 'Mouth and Teeth' is mapped to 'Otorhinolaryngologic Diseases';
# 'Stomatognathic Diseases' is also in the primary-category list and may be the
# closer match -- confirm the intended mapping.
PubMed_disease = [
'Cardiovascular Diseases',  # Blood, Heart and Circulation
'Musculoskeletal Diseases',  # Bones, Joints and Muscles
'Nervous System Diseases',  # Brain and Nerves
'Digestive System Diseases',  # Digestive System
'Otorhinolaryngologic Diseases',  # Ear, Nose and Throat
'Endocrine System Diseases',  # Endocrine System
'Eye Diseases',  # Eyes and Vision
'Immune System Diseases',  # Immune System
'Urogenital Diseases',  # Kidneys and Urinary System
'Respiratory Tract Diseases',  # Lungs and Breathing
'Otorhinolaryngologic Diseases',  # Mouth and Teeth
'Skin and Connective Tissue Diseases',  # Skin, Hair and Nails
'Urogenital Diseases',  # Female Reproductive System
'Urogenital Diseases'  # Male Reproductive System
]

In [27]:
# Pair each MedlinePlus group (`disease_ls`) with its hand-picked category
# (`PubMed_disease`). The pairing is purely positional, and zip() silently drops
# the unmatched tail when the lists differ in length (for example if the site
# adds or removes a group), which would leave groups unmapped. Fail loudly instead.
# A length check cannot detect a reordering, so the printed mapping below should
# still be inspected.
if len(disease_ls) != len(PubMed_disease):
    raise ValueError(
        f"{len(disease_ls)} disease groups but {len(PubMed_disease)} PubMed categories; "
        "the positional mapping would be truncated"
    )
new_dict = dict(zip(disease_ls, PubMed_disease))
new_dict

{'Blood, Heart and Circulation': 'Cardiovascular Diseases',
 'Bones, Joints and Muscles': 'Musculoskeletal Diseases',
 'Brain and Nerves': 'Nervous System Diseases',
 'Digestive System': 'Digestive System Diseases',
 'Ear, Nose and Throat': 'Otorhinolaryngologic Diseases',
 'Endocrine System': 'Endocrine System Diseases',
 'Eyes and Vision': 'Eye Diseases',
 'Immune System': 'Immune System Diseases',
 'Kidneys and Urinary System': 'Urogenital Diseases',
 'Lungs and Breathing': 'Respiratory Tract Diseases',
 'Mouth and Teeth': 'Otorhinolaryngologic Diseases',
 'Skin, Hair and Nails': 'Skin and Connective Tissue Diseases',
 'Female Reproductive System': 'Urogenital Diseases',
 'Male Reproductive System': 'Urogenital Diseases'}

In [28]:
# Attach the mapped category to every group row.
Disease_df = Disease_df.assign(
    PubMed_disease=Disease_df['disease_type'].map(new_dict)
)
Disease_df

Unnamed: 0,disease_type,url,Associated_words,PubMed_disease
0,"Blood, Heart and Circulation",https://medlineplus.gov/bloodheartandcirculati...,"[AAA, Abdominal Aortic Aneurysm, ABO Blood Gro...",Cardiovascular Diseases
1,"Bones, Joints and Muscles",https://medlineplus.gov/bonesjointsandmuscles....,"[Achilles Tendon Injuries, Achondroplasia, ACL...",Musculoskeletal Diseases
2,Brain and Nerves,https://medlineplus.gov/brainandnerves.html,"[Acoustic Neuroma, Acromegaly, Acute Flaccid M...",Nervous System Diseases
3,Digestive System,https://medlineplus.gov/digestivesystem.html,"[Abdominal Pain, Achalasia, Acid Reflux, Acute...",Digestive System Diseases
4,"Ear, Nose and Throat",https://medlineplus.gov/earnoseandthroat.html,"[Achalasia, Acoustic Neuroma, Adenoidectomy, A...",Otorhinolaryngologic Diseases
5,Endocrine System,https://medlineplus.gov/endocrinesystem.html,"[Achondroplasia, Acromegaly, Addison Disease, ...",Endocrine System Diseases
6,Eyes and Vision,https://medlineplus.gov/eyesandvision.html,"[Age-Related Macular Degeneration, Amblyopia, ...",Eye Diseases
7,Immune System,https://medlineplus.gov/immunesystem.html,"[Acquired Immunodeficiency Syndrome, Acute Lym...",Immune System Diseases
8,Kidneys and Urinary System,https://medlineplus.gov/kidneysandurinarysyste...,"[Anatomy, Bladder Cancer, Bladder Control, Bla...",Urogenital Diseases
9,Lungs and Breathing,https://medlineplus.gov/lungsandbreathing.html,"[Acute Bronchitis, Acute Respiratory Distress ...",Respiratory Tract Diseases


# Part 1.4 Create dict asso_dict

In [29]:
# Keep only the two columns needed for the term -> category dictionary.
asso_df = Disease_df[["Associated_words", "PubMed_disease"]]
asso_df

Unnamed: 0,Associated_words,PubMed_disease
0,"[AAA, Abdominal Aortic Aneurysm, ABO Blood Gro...",Cardiovascular Diseases
1,"[Achilles Tendon Injuries, Achondroplasia, ACL...",Musculoskeletal Diseases
2,"[Acoustic Neuroma, Acromegaly, Acute Flaccid M...",Nervous System Diseases
3,"[Abdominal Pain, Achalasia, Acid Reflux, Acute...",Digestive System Diseases
4,"[Achalasia, Acoustic Neuroma, Adenoidectomy, A...",Otorhinolaryngologic Diseases
5,"[Achondroplasia, Acromegaly, Addison Disease, ...",Endocrine System Diseases
6,"[Age-Related Macular Degeneration, Amblyopia, ...",Eye Diseases
7,"[Acquired Immunodeficiency Syndrome, Acute Lym...",Immune System Diseases
8,"[Anatomy, Bladder Cancer, Bladder Control, Bla...",Urogenital Diseases
9,"[Acute Bronchitis, Acute Respiratory Distress ...",Respiratory Tract Diseases


In [30]:
# Unpack each term list into one row per term; the category is repeated on every row.
asso_df = asso_df.explode(column='Associated_words')

In [31]:
# Lower-case both columns, column by column.
asso_df = asso_df.apply(lambda column: column.str.lower(), axis=0)
asso_df

Unnamed: 0,Associated_words,PubMed_disease
0,aaa,cardiovascular diseases
0,abdominal aortic aneurysm,cardiovascular diseases
0,abo blood groups,cardiovascular diseases
0,acute lymphoblastic leukemia,cardiovascular diseases
0,acute lymphocytic leukemia,cardiovascular diseases
...,...,...
13,testicular disorders,urogenital diseases
13,undescended testicle,urogenital diseases
13,vasectomy,urogenital diseases
13,venereal disease,urogenital diseases


In [34]:
# For every term, gather the categories of all groups that list it.
categories_by_term = asso_df.groupby('Associated_words')['PubMed_disease']
asso_dict = categories_by_term.apply(list).to_dict()
asso_dict

{'aaa': ['cardiovascular diseases'],
 'abdominal aortic aneurysm': ['cardiovascular diseases'],
 'abdominal pain': ['digestive system diseases'],
 'abdominal pregnancy': ['urogenital diseases'],
 'abo blood groups': ['cardiovascular diseases'],
 'abortion': ['urogenital diseases'],
 'accidents': ['skin and connective tissue diseases'],
 'achalasia': ['digestive system diseases', 'otorhinolaryngologic diseases'],
 'achilles tendon injuries': ['musculoskeletal diseases'],
 'achondroplasia': ['musculoskeletal diseases', 'endocrine system diseases'],
 'acid reflux': ['digestive system diseases'],
 'acl injuries': ['musculoskeletal diseases'],
 'acne': ['skin and connective tissue diseases'],
 'acne rosacea': ['skin and connective tissue diseases'],
 'acoustic neuroma': ['nervous system diseases',
  'otorhinolaryngologic diseases'],
 'acquired immunodeficiency syndrome': ['immune system diseases'],
 'acromegaly': ['nervous system diseases', 'endocrine system diseases'],
 'actinic keratosis'

In [35]:
# Turn each term's category list into a set, which removes repeated categories.
asso_dict = {term: set(categories) for term, categories in asso_dict.items()}

In [36]:
asso_dict

{'aaa': {'cardiovascular diseases'},
 'abdominal aortic aneurysm': {'cardiovascular diseases'},
 'abdominal pain': {'digestive system diseases'},
 'abdominal pregnancy': {'urogenital diseases'},
 'abo blood groups': {'cardiovascular diseases'},
 'abortion': {'urogenital diseases'},
 'accidents': {'skin and connective tissue diseases'},
 'achalasia': {'digestive system diseases', 'otorhinolaryngologic diseases'},
 'achilles tendon injuries': {'musculoskeletal diseases'},
 'achondroplasia': {'endocrine system diseases', 'musculoskeletal diseases'},
 'acid reflux': {'digestive system diseases'},
 'acl injuries': {'musculoskeletal diseases'},
 'acne': {'skin and connective tissue diseases'},
 'acne rosacea': {'skin and connective tissue diseases'},
 'acoustic neuroma': {'nervous system diseases',
  'otorhinolaryngologic diseases'},
 'acquired immunodeficiency syndrome': {'immune system diseases'},
 'acromegaly': {'endocrine system diseases', 'nervous system diseases'},
 'actinic keratosis'

# Part 2: Repeat the steps to create another dictionary asso_dict_2 for other disease types

<font size=5>Obtaining Disease types & url</font>

In [37]:
# Download and parse the MedlinePlus health-topics index page again for Part 2.
url = "https://medlineplus.gov/healthtopics.html"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [38]:
soup

<!DOCTYPE html>
<html class="nojs us" data-root="https://medlineplus.gov/" id="health_topics_landing" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="_top" http-equiv="window-target"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://medlineplus.gov/healthtopics.html" rel="canonical"/>
<link href="https://medlineplus.gov/healthtopics.html" hreflang="en" rel="alternate"/>
<link href="https://medlineplus.gov/spanish/healthtopics.html" hreflang="es" rel="alternate"/>
<meta content="medlineplus-ac-dictionary" name="ac-dictionary"/>
<meta content="Information on symptoms, causes, treatment and prevention for over 1000 diseases, illnesses, health conditions and wellness issues." name="description"/>
<link href="https://medlineplus.gov/images/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="http

In [39]:
# Target markup: <div class="section-body"> ... <ul class="indent">.
# Take the second <ul class="indent"> on the page; its <li> entries are the
# groups scraped in this part (first entry: Cancers).
_soup = soup.find_all('ul', class_='indent')[1]

In [40]:
# Every <li> inside the selected list is one group entry.
diseas_grp_2 = _soup.find_all(name='li')

In [41]:
diseas_grp_2

[<li type="groups"><a href="https://medlineplus.gov/cancers.html">Cancers</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/diabetesmellitus.html">Diabetes Mellitus</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/geneticsbirthdefects.html">Genetics/Birth Defects</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/infections.html">Infections</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/injuriesandwounds.html">Injuries and Wounds</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/mentalhealthandbehavior.html">Mental Health and Behavior</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/metabolicproblems.html">Metabolic Problems</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/poisoningtoxicologyenvironmentalhealth.html">Poisoning, Toxicology, Environmental Health</a></li>,
 <li type="groups"><a href="https://medlineplus.gov/pregnancyandreproduction.html">Pregnancy and Reproduction</a></li>,
 <li type=

In [42]:
# One record per group: its display name and the URL of its page.
dict_ls_2 = [
    {
        'disease_type': group_li.text,
        'url': group_li.find('a')['href'],
    }
    for group_li in diseas_grp_2
]
dict_ls_2

[{'disease_type': 'Cancers', 'url': 'https://medlineplus.gov/cancers.html'},
 {'disease_type': 'Diabetes Mellitus',
  'url': 'https://medlineplus.gov/diabetesmellitus.html'},
 {'disease_type': 'Genetics/Birth Defects',
  'url': 'https://medlineplus.gov/geneticsbirthdefects.html'},
 {'disease_type': 'Infections',
  'url': 'https://medlineplus.gov/infections.html'},
 {'disease_type': 'Injuries and Wounds',
  'url': 'https://medlineplus.gov/injuriesandwounds.html'},
 {'disease_type': 'Mental Health and Behavior',
  'url': 'https://medlineplus.gov/mentalhealthandbehavior.html'},
 {'disease_type': 'Metabolic Problems',
  'url': 'https://medlineplus.gov/metabolicproblems.html'},
 {'disease_type': 'Poisoning, Toxicology, Environmental Health',
  'url': 'https://medlineplus.gov/poisoningtoxicologyenvironmentalhealth.html'},
 {'disease_type': 'Pregnancy and Reproduction',
  'url': 'https://medlineplus.gov/pregnancyandreproduction.html'},
 {'disease_type': 'Substance Abuse Problems',
  'url': 'h

In [43]:
# Tabulate the second batch of groups: one row per group, columns `disease_type` and `url`.
Disease_df_2 = pd.DataFrame.from_records(dict_ls_2)
Disease_df_2

Unnamed: 0,disease_type,url
0,Cancers,https://medlineplus.gov/cancers.html
1,Diabetes Mellitus,https://medlineplus.gov/diabetesmellitus.html
2,Genetics/Birth Defects,https://medlineplus.gov/geneticsbirthdefects.html
3,Infections,https://medlineplus.gov/infections.html
4,Injuries and Wounds,https://medlineplus.gov/injuriesandwounds.html
5,Mental Health and Behavior,https://medlineplus.gov/mentalhealthandbehavio...
6,Metabolic Problems,https://medlineplus.gov/metabolicproblems.html
7,"Poisoning, Toxicology, Environmental Health",https://medlineplus.gov/poisoningtoxicologyenv...
8,Pregnancy and Reproduction,https://medlineplus.gov/pregnancyandreproducti...
9,Substance Abuse Problems,https://medlineplus.gov/substanceabuseproblems...


<font size=5>Obtaining terms associated with each disease type</font>

In [44]:
# Add an empty (all-NaN) `Associated_words` column; it is filled in after scraping.
Disease_df_2 = Disease_df_2.assign(Associated_words=np.nan)
Disease_df_2

Unnamed: 0,disease_type,url,Associated_words
0,Cancers,https://medlineplus.gov/cancers.html,
1,Diabetes Mellitus,https://medlineplus.gov/diabetesmellitus.html,
2,Genetics/Birth Defects,https://medlineplus.gov/geneticsbirthdefects.html,
3,Infections,https://medlineplus.gov/infections.html,
4,Injuries and Wounds,https://medlineplus.gov/injuriesandwounds.html,
5,Mental Health and Behavior,https://medlineplus.gov/mentalhealthandbehavio...,
6,Metabolic Problems,https://medlineplus.gov/metabolicproblems.html,
7,"Poisoning, Toxicology, Environmental Health",https://medlineplus.gov/poisoningtoxicologyenv...,
8,Pregnancy and Reproduction,https://medlineplus.gov/pregnancyandreproducti...,
9,Substance Abuse Problems,https://medlineplus.gov/substanceabuseproblems...,


In [45]:
# Build {group name: [topic names]} for the Part 2 groups by scraping each group's page.
Associated_words_dictionary = {}

# Everything from the first "see" onward is removed from an entry's text.
# Compiled once here instead of on every loop iteration.
# NOTE(review): the pattern is not word-anchored, so a topic whose name merely
# contains the letters "see" would also be cut short -- confirm against the page
# markup before tightening it.
see_suffix = re.compile(r'see.*')

for index, row in tqdm(Disease_df_2.iterrows(), total=len(Disease_df_2)):
    # Step 1: download and parse the group's page. The timeout keeps a stalled
    # connection from hanging the loop; raise_for_status() stops an HTTP error
    # page from being parsed into a silently empty term list.
    url = row['url']
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'lxml')

    # Step 2: every <li class="item"> is one entry. Remove newlines and outer
    # whitespace, then cut the "see ..." remainder.
    _ls = soup.find_all('li', {'class': 'item'})
    _synonym_ls = [
        see_suffix.sub("", item.text.replace('\n', '').strip()).strip()
        for item in _ls
    ]

    # Step 3: store the terms in the dictionary under the group's name.
    name = row['disease_type']
    Associated_words_dictionary[f'{name}'] = _synonym_ls

Associated_words_dictionary

100%|██████████| 10/10 [00:13<00:00,  1.33s/it]


{'Cancers': ['Actinic Keratosis',
  'Acute Lymphoblastic Leukemia',
  'Acute Lymphocytic Leukemia',
  'Acute Myeloblastic Leukemia',
  'Acute Myeloid Leukemia',
  'Adenoma',
  'Adrenal Gland Cancer',
  'ALL',
  'Alternative Therapy for Cancer',
  'AML',
  'Anal Cancer',
  'Basal Cell Carcinoma',
  'Benign Tumors',
  'Bile Duct Cancer',
  'Biopsy',
  'Bladder Cancer',
  'Bone Cancer',
  'Brachytherapy',
  'Brain Cancer',
  'Brain Tumors',
  'Breast Cancer',
  'Breast Cancer, Male',
  'Bronchogenic Carcinoma',
  'Cancer',
  'Cancer Alternative Therapies',
  'Cancer and Pregnancy',
  'Cancer Chemotherapy',
  'Cancer Immunotherapy',
  'Cancer in Children',
  'Cancer--Living with Cancer',
  'Carcinoid Tumors',
  'Carcinoma',
  'Cervical Cancer',
  'Cervical Cancer Screening',
  'Chemotherapy',
  'Childhood Brain Tumors',
  'Childhood Cancer',
  'Childhood Leukemia',
  'Cholangiocarcinoma',
  'Chronic Granulocytic Leukemia',
  'Chronic Lymphocytic Leukemia',
  'Chronic Myelogenous Leukemia',

In [46]:
Disease_df_2

Unnamed: 0,disease_type,url,Associated_words
0,Cancers,https://medlineplus.gov/cancers.html,
1,Diabetes Mellitus,https://medlineplus.gov/diabetesmellitus.html,
2,Genetics/Birth Defects,https://medlineplus.gov/geneticsbirthdefects.html,
3,Infections,https://medlineplus.gov/infections.html,
4,Injuries and Wounds,https://medlineplus.gov/injuriesandwounds.html,
5,Mental Health and Behavior,https://medlineplus.gov/mentalhealthandbehavio...,
6,Metabolic Problems,https://medlineplus.gov/metabolicproblems.html,
7,"Poisoning, Toxicology, Environmental Health",https://medlineplus.gov/poisoningtoxicologyenv...,
8,Pregnancy and Reproduction,https://medlineplus.gov/pregnancyandreproducti...,
9,Substance Abuse Problems,https://medlineplus.gov/substanceabuseproblems...,


In [47]:
# Look up each group's scraped term list by its `disease_type` name.
Disease_df_2 = Disease_df_2.assign(
    Associated_words=Disease_df_2['disease_type'].map(Associated_words_dictionary)
)
Disease_df_2

Unnamed: 0,disease_type,url,Associated_words
0,Cancers,https://medlineplus.gov/cancers.html,"[Actinic Keratosis, Acute Lymphoblastic Leukem..."
1,Diabetes Mellitus,https://medlineplus.gov/diabetesmellitus.html,"[A1C, Blood Glucose, Blood Sugar, Children and..."
2,Genetics/Birth Defects,https://medlineplus.gov/geneticsbirthdefects.html,"[Abnormalities, Achondroplasia, Adrenoleukodys..."
3,Infections,https://medlineplus.gov/infections.html,"[ABPA, Abscess, Acquired Immunodeficiency Synd..."
4,Injuries and Wounds,https://medlineplus.gov/injuriesandwounds.html,"[Abuse, Accidents, Achilles Tendon Injuries, A..."
5,Mental Health and Behavior,https://medlineplus.gov/mentalhealthandbehavio...,"[ADD, ADHD, Adolescent Development, Agoraphobi..."
6,Metabolic Problems,https://medlineplus.gov/metabolicproblems.html,"[Adrenoleukodystrophy, Amino Acid Metabolism D..."
7,"Poisoning, Toxicology, Environmental Health",https://medlineplus.gov/poisoningtoxicologyenv...,"[Air Pollution, Arsenic, Asbestos, Asbestosis,..."
8,Pregnancy and Reproduction,https://medlineplus.gov/pregnancyandreproducti...,"[Abdominal Pregnancy, Abortion, Adolescent Pre..."
9,Substance Abuse Problems,https://medlineplus.gov/substanceabuseproblems...,"[Alcohol, Alcohol Abuse, Alcohol Abuse in Preg..."


In [48]:
# Mental health is outside the scope of this project, so that group's row is removed.
is_mental_health = Disease_df_2['disease_type'] == 'Mental Health and Behavior'
Disease_df_2 = Disease_df_2.drop(index=Disease_df_2.index[is_mental_health])
Disease_df_2

Unnamed: 0,disease_type,url,Associated_words
0,Cancers,https://medlineplus.gov/cancers.html,"[Actinic Keratosis, Acute Lymphoblastic Leukem..."
1,Diabetes Mellitus,https://medlineplus.gov/diabetesmellitus.html,"[A1C, Blood Glucose, Blood Sugar, Children and..."
2,Genetics/Birth Defects,https://medlineplus.gov/geneticsbirthdefects.html,"[Abnormalities, Achondroplasia, Adrenoleukodys..."
3,Infections,https://medlineplus.gov/infections.html,"[ABPA, Abscess, Acquired Immunodeficiency Synd..."
4,Injuries and Wounds,https://medlineplus.gov/injuriesandwounds.html,"[Abuse, Accidents, Achilles Tendon Injuries, A..."
6,Metabolic Problems,https://medlineplus.gov/metabolicproblems.html,"[Adrenoleukodystrophy, Amino Acid Metabolism D..."
7,"Poisoning, Toxicology, Environmental Health",https://medlineplus.gov/poisoningtoxicologyenv...,"[Air Pollution, Arsenic, Asbestos, Asbestosis,..."
8,Pregnancy and Reproduction,https://medlineplus.gov/pregnancyandreproducti...,"[Abdominal Pregnancy, Abortion, Adolescent Pre..."
9,Substance Abuse Problems,https://medlineplus.gov/substanceabuseproblems...,"[Alcohol, Alcohol Abuse, Alcohol Abuse in Preg..."


<font size=5>Matching each disease type to PubMed Primary Diseases</font>

In [49]:
# Remaining group names in row order; paired positionally with `PubMed_disease` below.
disease_ls = Disease_df_2['disease_type'].tolist()
disease_ls

['Cancers',
 'Diabetes Mellitus',
 'Genetics/Birth Defects',
 'Infections',
 'Injuries and Wounds',
 'Metabolic Problems',
 'Poisoning, Toxicology, Environmental Health',
 'Pregnancy and Reproduction',
 'Substance Abuse Problems']

In [50]:
Pri_disease

['Infections',
 'Neoplasms',
 'Musculoskeletal Diseases',
 'Digestive System Diseases',
 'Stomatognathic Diseases',
 'Respiratory Tract Diseases',
 'Otorhinolaryngologic Diseases',
 'Nervous System Diseases',
 'Eye Diseases',
 'Urogenital Diseases',
 'Cardiovascular Diseases',
 'Hemic and Lymphatic Diseases',
 'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'Skin and Connective Tissue Diseases',
 'Nutritional and Metabolic Diseases',
 'Endocrine System Diseases',
 'Immune System Diseases',
 'Disorders of Environmental Origin',
 'Animal Diseases',
 'Pathological Conditions, Signs and Symptoms',
 'Occupational Diseases',
 'Chemically-Induced Disorders',
 'Wounds and Injuries']

In [51]:
# Hand-picked primary disease category for each remaining Part 2 group.
# The list is paired POSITIONALLY with `disease_ls` in the next cell, so the order
# here must match the row order of `Disease_df_2` after the mental-health row was
# dropped; the trailing comments name the group each entry is paired with.
PubMed_disease = [
'Neoplasms',  # Cancers
'Endocrine System Diseases',  # Diabetes Mellitus
'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',  # Genetics/Birth Defects
'Infections',  # Infections
'Wounds and Injuries',  # Injuries and Wounds
'Nutritional and Metabolic Diseases',  # Metabolic Problems
'Chemically-Induced Disorders',  # Poisoning, Toxicology, Environmental Health
'Urogenital Diseases',  # Pregnancy and Reproduction
'Chemically-Induced Disorders']  # Substance Abuse Problems

In [52]:
# Pair each remaining group (`disease_ls`) with its hand-picked category
# (`PubMed_disease`). The pairing is purely positional, and zip() silently drops
# the unmatched tail when the lists differ in length, which would leave groups
# unmapped (NaN after the map below). Fail loudly instead. A length check cannot
# detect a reordering, so the printed mapping should still be inspected.
if len(disease_ls) != len(PubMed_disease):
    raise ValueError(
        f"{len(disease_ls)} disease groups but {len(PubMed_disease)} PubMed categories; "
        "the positional mapping would be truncated"
    )
new_dict = dict(zip(disease_ls, PubMed_disease))
new_dict
# Attach the mapped category to every group row.
Disease_df_2['PubMed_disease'] = Disease_df_2['disease_type'].map(new_dict)
Disease_df_2

{'Cancers': 'Neoplasms',
 'Diabetes Mellitus': 'Endocrine System Diseases',
 'Genetics/Birth Defects': 'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'Infections': 'Infections',
 'Injuries and Wounds': 'Wounds and Injuries',
 'Metabolic Problems': 'Nutritional and Metabolic Diseases',
 'Poisoning, Toxicology, Environmental Health': 'Chemically-Induced Disorders',
 'Pregnancy and Reproduction': 'Urogenital Diseases',
 'Substance Abuse Problems': 'Chemically-Induced Disorders'}

Unnamed: 0,disease_type,url,Associated_words,PubMed_disease
0,Cancers,https://medlineplus.gov/cancers.html,"[Actinic Keratosis, Acute Lymphoblastic Leukem...",Neoplasms
1,Diabetes Mellitus,https://medlineplus.gov/diabetesmellitus.html,"[A1C, Blood Glucose, Blood Sugar, Children and...",Endocrine System Diseases
2,Genetics/Birth Defects,https://medlineplus.gov/geneticsbirthdefects.html,"[Abnormalities, Achondroplasia, Adrenoleukodys...","Congenital, Hereditary, and Neonatal Diseases ..."
3,Infections,https://medlineplus.gov/infections.html,"[ABPA, Abscess, Acquired Immunodeficiency Synd...",Infections
4,Injuries and Wounds,https://medlineplus.gov/injuriesandwounds.html,"[Abuse, Accidents, Achilles Tendon Injuries, A...",Wounds and Injuries
6,Metabolic Problems,https://medlineplus.gov/metabolicproblems.html,"[Adrenoleukodystrophy, Amino Acid Metabolism D...",Nutritional and Metabolic Diseases
7,"Poisoning, Toxicology, Environmental Health",https://medlineplus.gov/poisoningtoxicologyenv...,"[Air Pollution, Arsenic, Asbestos, Asbestosis,...",Chemically-Induced Disorders
8,Pregnancy and Reproduction,https://medlineplus.gov/pregnancyandreproducti...,"[Abdominal Pregnancy, Abortion, Adolescent Pre...",Urogenital Diseases
9,Substance Abuse Problems,https://medlineplus.gov/substanceabuseproblems...,"[Alcohol, Alcohol Abuse, Alcohol Abuse in Preg...",Chemically-Induced Disorders


<font size=5>Creating dict asso_dict_2</font>

In [53]:
# Keep only the two columns needed for the term -> category dictionary.
asso_df_2 = Disease_df_2[["Associated_words", "PubMed_disease"]]
asso_df_2

Unnamed: 0,Associated_words,PubMed_disease
0,"[Actinic Keratosis, Acute Lymphoblastic Leukem...",Neoplasms
1,"[A1C, Blood Glucose, Blood Sugar, Children and...",Endocrine System Diseases
2,"[Abnormalities, Achondroplasia, Adrenoleukodys...","Congenital, Hereditary, and Neonatal Diseases ..."
3,"[ABPA, Abscess, Acquired Immunodeficiency Synd...",Infections
4,"[Abuse, Accidents, Achilles Tendon Injuries, A...",Wounds and Injuries
6,"[Adrenoleukodystrophy, Amino Acid Metabolism D...",Nutritional and Metabolic Diseases
7,"[Air Pollution, Arsenic, Asbestos, Asbestosis,...",Chemically-Induced Disorders
8,"[Abdominal Pregnancy, Abortion, Adolescent Pre...",Urogenital Diseases
9,"[Alcohol, Alcohol Abuse, Alcohol Abuse in Preg...",Chemically-Induced Disorders


In [54]:
# Unpack each term list into one row per term; the category is repeated on every row.
asso_df_2 = asso_df_2.explode(column='Associated_words')
asso_df_2

Unnamed: 0,Associated_words,PubMed_disease
0,Actinic Keratosis,Neoplasms
0,Acute Lymphoblastic Leukemia,Neoplasms
0,Acute Lymphocytic Leukemia,Neoplasms
0,Acute Myeloblastic Leukemia,Neoplasms
0,Acute Myeloid Leukemia,Neoplasms
...,...,...
9,Teenage Drinking,Chemically-Induced Disorders
9,Tobacco Smoking,Chemically-Induced Disorders
9,"Tobacco, Smokeless",Chemically-Induced Disorders
9,Underage Drinking,Chemically-Induced Disorders


In [55]:
# Lower-case both columns, column by column.
asso_df_2 = asso_df_2.apply(lambda column: column.str.lower(), axis=0)
asso_df_2

Unnamed: 0,Associated_words,PubMed_disease
0,actinic keratosis,neoplasms
0,acute lymphoblastic leukemia,neoplasms
0,acute lymphocytic leukemia,neoplasms
0,acute myeloblastic leukemia,neoplasms
0,acute myeloid leukemia,neoplasms
...,...,...
9,teenage drinking,chemically-induced disorders
9,tobacco smoking,chemically-induced disorders
9,"tobacco, smokeless",chemically-induced disorders
9,underage drinking,chemically-induced disorders


In [56]:
# For every term, gather the categories of all groups that list it, then store
# them as a set so that repeated categories collapse.
categories_by_term_2 = asso_df_2.groupby('Associated_words')['PubMed_disease']
asso_dict_2 = categories_by_term_2.apply(list).to_dict()
asso_dict_2 = {term: set(categories) for term, categories in asso_dict_2.items()}
asso_dict_2

{'a1c': {'endocrine system diseases'},
 'abdominal pregnancy': {'urogenital diseases'},
 'abnormalities': {'congenital, hereditary, and neonatal diseases and abnormalities'},
 'abortion': {'urogenital diseases'},
 'abpa': {'infections'},
 'abscess': {'infections'},
 'abuse': {'wounds and injuries'},
 'accidents': {'wounds and injuries'},
 'achilles tendon injuries': {'wounds and injuries'},
 'achondroplasia': {'congenital, hereditary, and neonatal diseases and abnormalities'},
 'acl injuries': {'wounds and injuries'},
 'acquired immunodeficiency syndrome': {'infections'},
 'actinic keratosis': {'neoplasms'},
 'acute bronchitis': {'infections'},
 'acute flaccid myelitis': {'infections'},
 'acute lymphoblastic leukemia': {'neoplasms'},
 'acute lymphocytic leukemia': {'neoplasms'},
 'acute myeloblastic leukemia': {'neoplasms'},
 'acute myeloid leukemia': {'neoplasms'},
 'adenoma': {'neoplasms'},
 'adenovirus infections': {'infections'},
 'adhesions': {'wounds and injuries'},
 'adolescent 

# Part 3: Further processing of the dictionaries

In [57]:
# Some terms are generic, carry little meaning, or are not unique to a single
# disease group (e.g. "blood test"). List every key of asso_dict to spot them.
asso_dict.keys()

dict_keys(['aaa', 'abdominal aortic aneurysm', 'abdominal pain', 'abdominal pregnancy', 'abo blood groups', 'abortion', 'accidents', 'achalasia', 'achilles tendon injuries', 'achondroplasia', 'acid reflux', 'acl injuries', 'acne', 'acne rosacea', 'acoustic neuroma', 'acquired immunodeficiency syndrome', 'acromegaly', 'actinic keratosis', 'acute bronchitis', 'acute flaccid myelitis', 'acute lymphoblastic leukemia', 'acute lymphocytic leukemia', 'acute myeloblastic leukemia', 'acute myeloid leukemia', 'acute pancreatitis', 'acute respiratory distress syndrome', 'add', 'addison disease', 'adenoidectomy', 'adenoids', 'adenomyosis', 'adenovirus infections', 'adhd', 'adhesions', 'adolescent pregnancy', 'adrenal gland cancer', 'adrenal gland disorders', 'adrenal insufficiency', 'adrenoleukodystrophy', 'adult immunization', 'af', 'age-related macular degeneration', 'ageusia', 'aging skin', 'aids', 'aids and infections', 'aids and pregnancy', 'aids in women', 'aids medicines', 'aids--living wit

In [58]:
# Spot-check which disease groups a few candidate generic terms map to in asso_dict.
# Every bare expression below is displayed because ast_node_interactivity is set
# to "all" in the imports cell. A lookup raises KeyError if the term is not a key.
asso_dict['blood']
asso_dict['all']
asso_dict['gas']
asso_dict['blood tests']
asso_dict['anatomy']


{'cardiovascular diseases'}

{'cardiovascular diseases', 'immune system diseases'}

{'digestive system diseases'}

{'cardiovascular diseases'}

{'cardiovascular diseases',
 'digestive system diseases',
 'endocrine system diseases',
 'eye diseases',
 'immune system diseases',
 'musculoskeletal diseases',
 'nervous system diseases',
 'otorhinolaryngologic diseases',
 'respiratory tract diseases',
 'skin and connective tissue diseases',
 'urogenital diseases'}

In [59]:
# List every key of asso_dict_2 so overly generic terms can be spotted by eye.
asso_dict_2.keys()

dict_keys(['a1c', 'abdominal pregnancy', 'abnormalities', 'abortion', 'abpa', 'abscess', 'abuse', 'accidents', 'achilles tendon injuries', 'achondroplasia', 'acl injuries', 'acquired immunodeficiency syndrome', 'actinic keratosis', 'acute bronchitis', 'acute flaccid myelitis', 'acute lymphoblastic leukemia', 'acute lymphocytic leukemia', 'acute myeloblastic leukemia', 'acute myeloid leukemia', 'adenoma', 'adenovirus infections', 'adhesions', 'adolescent pregnancy', 'adrenal gland cancer', 'adrenoleukodystrophy', 'adult immunization', 'aids', 'aids and infections', 'aids and pregnancy', 'air pollution', 'alcohol', 'alcohol abuse', 'alcohol abuse in pregnancy', 'alcohol and youth', 'alcohol consumption', 'alcohol dependence', 'alcohol use disorder (aud)', 'alcohol use disorder (aud) treatment', 'alcohol withdrawal', 'alcoholism', 'all', 'allergic bronchopulmonary aspergillosis', 'alpha-1 antitrypsin deficiency', 'alternative therapy for cancer', 'amino acid metabolism disorders', 'aml', 

In [60]:
# Spot-check which disease groups a few candidate generic terms map to in asso_dict_2.
# Every bare expression below is displayed because ast_node_interactivity is set
# to "all" in the imports cell. A lookup raises KeyError if the term is not a key.
asso_dict_2['alcohol']
asso_dict_2['all']
asso_dict_2['anatomy']
asso_dict_2['water']

{'chemically-induced disorders'}

{'neoplasms'}

{'urogenital diseases'}

{'chemically-induced disorders'}

In [61]:
# Remove terms that are too generic to point at a single disease group.
# These are the terms spot-checked in asso_dict and asso_dict_2 in the cells above.
#
# Fix: the list previously ended with the single string 'anatomy water' (a missing
# comma), so 'water' was never removed from either dictionary. The duplicate
# 'all' entry was redundant and is dropped.
_ls = ['blood', 'all', 'gas', 'blood tests', 'anatomy', 'alcohol', 'water']
for term in _ls:
    # pop() with a default is a no-op when the term is absent, so a term present
    # in only one dictionary is fine and the cell can be re-run safely.
    asso_dict.pop(term, None)
    asso_dict_2.pop(term, None)
        


# Part 4: Export Dictionaries

In [62]:
# Write both dictionaries to .npy files at paths relative to the working directory.
# NOTE: a dict is not an ndarray, so np.save stores it as a pickled object array;
# reading it back requires np.load(path, allow_pickle=True).item().
for npy_path, word_map in (('asso_dict.npy', asso_dict), ('asso_dict_2.npy', asso_dict_2)):
    np.save(npy_path, word_map)

