In [None]:
import json
import os
import time
import xml.etree.ElementTree as ET

import requests
from Bio import Entrez
from bs4 import BeautifulSoup  # For HTML parsing
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from tqdm import tqdm

___________________________ORPHANET DATA_______________________________

The list of rare diseases is available here  -  https://www.orphadata.com/classifications/

In [None]:
def read_files_from_folder(folder_path):
    """Return the names of the entries directly inside ``folder_path``.

    The result holds bare entry names (no directory prefix), in whatever
    order ``os.listdir`` reports them.
    """
    return os.listdir(folder_path)

In [None]:
def parse_orphanet_xml(file_path):
    """Extract disorder names and Orpha codes from an Orphanet XML file.

    Every ``Disorder`` element at any depth is visited. For each one, the
    first descendant ``Name`` with ``lang="en"`` and the first descendant
    ``OrphaCode`` are looked up; disorders missing either one are skipped.

    Returns:
        A list of ``{'name': ..., 'orpha_code': ...}`` dicts, in the order
        the disorders are found.
    """
    document_root = ET.parse(file_path).getroot()

    def _to_record(disorder_node):
        # Build the record for one disorder, or None when a field is absent.
        name_node = disorder_node.find('.//Name[@lang="en"]')
        code_node = disorder_node.find('.//OrphaCode')
        if name_node is None or code_node is None:
            return None
        return {'name': name_node.text, 'orpha_code': code_node.text}

    candidates = (_to_record(node) for node in document_root.findall('.//Disorder'))
    return [record for record in candidates if record is not None]

In [None]:
# Map each classification file (its name up to the first '.') to the list of
# disorders parsed from it. Only '.xml' entries in Rare_Diseases are read.
diseases_dict = {}
for file in read_files_from_folder('Rare_Diseases'):
    if not file.endswith('.xml'):
        continue
    diseases = parse_orphanet_xml('Rare_Diseases/' + file)
    file_name = file.split('.')[0]
    diseases_dict[file_name] = diseases


--------------------------End Orphanet data------------------------------

In [None]:
# Peek at the first (file name, diseases) pair to sanity-check the parse.
list(diseases_dict.items())[:1]

--------------------Begin Pubmed Data fetch------------------------

In [None]:
# Contact address assigned to Entrez.email before every PubMed request.
# Set the ENTREZ_EMAIL environment variable to replace the placeholder default
# with a real address without editing the notebook.
EMAIL = os.environ.get('ENTREZ_EMAIL', 'abc@gmail.com')

In [None]:
def search_pubmed(query, max_results=10):
    """Search PubMed for ``query`` and return the matching article IDs.

    The search is first restricted to free-full-text articles. If that
    returns no IDs, the unfiltered query is run instead.

    Args:
        query: PubMed search term.
        max_results: Maximum number of IDs to request (sent as ``retmax``).

    Returns:
        The ``IdList`` entry of the parsed esearch response (may be empty).
    """
    Entrez.email = EMAIL  # Always provide your email

    def _run_search(term):
        # One esearch round trip; the handle is closed even if parsing fails
        # so that connections are not leaked across thousands of queries.
        handle = Entrez.esearch(db='pubmed',
                                sort='relevance',
                                retmax=max_results,
                                retmode='xml',
                                term=term)
        try:
            return Entrez.read(handle)['IdList']
        finally:
            handle.close()

    # Adding the free full text filter
    id_list = _run_search(query + " AND free full text[sb]")
    if not id_list:
        # Nothing freely available: fall back to the plain query.
        id_list = _run_search(query)
    return id_list


def fetch_pubmed_details(id_list):
    """Fetch the PubMed records for the given article IDs.

    Args:
        id_list: Iterable of PubMed IDs. ``None`` or an empty collection
            means there is nothing to fetch.

    Returns:
        The parsed efetch response, or ``None`` when ``id_list`` is empty
        or ``None``.
    """
    # Bail out before touching the network; `not id_list` also covers None,
    # which `len()` would reject.
    if not id_list:
        return None
    Entrez.email = EMAIL
    # str() keeps the join working if the IDs are not plain strings.
    ids = ','.join(str(pmid) for pmid in id_list)
    handle = Entrez.efetch(db='pubmed', id=ids, retmode='xml')
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even when parsing raises.
        handle.close()

In [None]:
# Maps disease name -> PubMed ID list returned by search_pubmed.
# NOTE: re-running this cell discards any results collected so far.
disease_articles = {}

In [None]:
# Distinct disease names across every parsed classification file.
all_diseases = {
    entry['name']
    for file_entries in diseases_dict.values()
    for entry in file_entries
}

In [None]:
# Number of distinct disease names collected.
len(all_diseases)

In [None]:
# Request up to 50 PubMed IDs for every disease that has no non-empty result
# yet, so re-running the cell only retries missing or empty entries.
for disease in all_diseases:
    already_found = disease_articles.get(disease, []) != []
    if not already_found:
        disease_articles[disease] = search_pubmed(str(disease), 50)



In [None]:
# logged on 8th Dec - 12:09 AM - json dumped at 12:10 AM
# Persist the search results to disease_articles.json.
with open('disease_articles.json', 'w') as out_file:
    out_file.write(json.dumps(disease_articles))


In [None]:
# Reload the saved disease -> PubMed ID mapping from disease_articles.json.
with open('disease_articles.json', 'r') as in_file:
    disease_articles = json.loads(in_file.read())

In [None]:
# Maps disease name -> result of fetch_pubmed_details for that disease.
papers_list = {}

In [None]:
# Number of diseases that have an entry in disease_articles.
len(disease_articles) 

In [None]:

# Fetch the PubMed records for each disease's ID list. A disease is skipped
# when papers_list already holds anything other than an empty list for it.
for disease, ids in tqdm(disease_articles.items()):
    needs_fetch = disease not in papers_list or papers_list[disease] == []
    if needs_fetch:
        papers_list[disease] = fetch_pubmed_details(ids)
        time.sleep(1)  # pause between consecutive requests



In [None]:
# Keep only the diseases whose ID list is non-empty, then show how many remain.
disease_articles_new = {
    name: pmids
    for name, pmids in disease_articles.items()
    if pmids
}
len(disease_articles_new)

In [None]:
# Scratch list that get_article_details fills and empties once per category.
combined_data = []

In [None]:
def _extract_doi(article_data):
    """Return the DOI among an article's ``ELocationID`` entries, or ``''`` if none is tagged as a DOI."""
    for location in article_data.get('ELocationID', []):
        if location.attributes.get('EIdType') == 'doi':
            return str(location)
    return ''


def get_article_details(diseases_dict:dict, disease_articles:dict,combined_data:list):
    """Write one JSON file per category with up to five PubMed articles per disease.

    Categories that already have a ``combined_data/<category>.json`` file are
    skipped. For every other category, each disease gets a record with its
    name, Orpha code and the PMID, title, abstract and DOI URL of the first
    five IDs listed for it in ``disease_articles``.

    Args:
        diseases_dict: Maps category name -> list of dicts with ``'name'``
            and ``'orpha_code'`` keys.
        disease_articles: Maps disease name -> list of PubMed IDs.
        combined_data: Scratch list used to accumulate one category's
            records. It is emptied before and after each category is written.

    Returns:
        None. Results are written to ``combined_data/<category>.json``.
    """
    for category, diseases in diseases_dict.items():
        print(f"Processing {category}...")
        output_path = f"combined_data/{category}.json"
        if os.path.exists(output_path):
            continue

        # Drop anything left over from an interrupted earlier run so it cannot
        # end up in this category's file.
        combined_data.clear()

        for disease in tqdm(diseases):
            disease_data = {
                'name': disease['name'],
                'orpha_code': disease['orpha_code'],
                'articles': []
            }

            if disease['name'] in disease_articles:
                # Read from the mapping that was passed in, not from a
                # notebook-level global.
                ids = disease_articles[disease['name']][:5]
                papers = fetch_pubmed_details(ids)
                time.sleep(0.5)
                if papers:
                    for paper in papers['PubmedArticle']:
                        pmid = str(paper['MedlineCitation']['PMID'])
                        # Extract necessary details from each paper
                        article_data = paper['MedlineCitation']['Article']
                        # Resolved per paper, so an article without a DOI gets
                        # an empty URL instead of the previous paper's DOI.
                        doi = _extract_doi(article_data)
                        article_url = f"https://doi.org/{doi}" if doi else ''
                        title = article_data['ArticleTitle']
                        # NOTE(review): only the first AbstractText entry is
                        # kept; confirm whether later entries should be included.
                        abstract = article_data['Abstract']['AbstractText'][0] if 'Abstract' in article_data else ''
                        disease_data['articles'].append({'PMID':pmid, 'title': title, 'abstract': abstract, 'article_url': article_url})

            combined_data.append(disease_data)

        # Create the "combined_data" folder if it doesn't exist
        os.makedirs("combined_data", exist_ok=True)

        # Dump this category's records to a JSON file in the "combined_data" folder
        with open(output_path, "w") as f:
            json.dump(combined_data, f)

        combined_data.clear()



In [None]:
# Build the per-category JSON files from the diseases that have at least one PubMed ID.
get_article_details(diseases_dict, disease_articles_new,combined_data)

In [None]:
def get_full_text_from_doi(doi_url):
    """Load ``doi_url`` in headless Chrome and return the page's text.

    A new browser is started for the call and closed once the page source
    has been read. The text of the first ``<article>`` element is returned
    when one exists; otherwise the text of the whole document is returned.
    """
    # Use Selenium to handle JavaScript-enabled requests
    chrome_arguments = (
        'window-size=1920x1080',  # Set the window size
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        '--headless',  # Run Chrome in headless mode
    )
    options = webdriver.ChromeOptions()
    for argument in chrome_arguments:
        options.add_argument(argument)

    with webdriver.Chrome(options=options) as driver:
        driver.get(doi_url)
        time.sleep(2)  # fixed pause before reading the page source
        html = driver.page_source

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    article_tag = soup.find('article')
    text_source = article_tag if article_tag else soup
    return text_source.get_text()



In [None]:
from bs4 import BeautifulSoup
import re

def clean_html(html_content):
    """Reduce an HTML document to compact ASCII-only text.

    Script and style elements are removed, then the text of the first
    ``<article>`` element (or of the whole document when there is none) is
    taken. Lines are stripped, split on double spaces, blank pieces are
    dropped, and the rest is joined with newlines. Finally every run of
    non-ASCII characters is replaced by a single space.
    """
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    article_tag = soup.find('article')

    # Remove script and style elements
    for unwanted in soup(['script', 'style']):
        unwanted.extract()

    # Prefer the <article> element; fall back to the full document
    text_source = article_tag if article_tag else soup
    raw_text = text_source.get_text()

    # Strip each line, split it on double spaces, and keep the non-blank pieces
    pieces = []
    for raw_line in raw_text.splitlines():
        for phrase in raw_line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)

    # Join the pieces and replace runs of non-ascii characters with a space
    return re.sub(r'[^\x00-\x7F]+', ' ', '\n'.join(pieces))


In [None]:
# Chrome options shared by the scraping loop.
options = webdriver.ChromeOptions()
for chrome_argument in (
    'window-size=1920x1080',  # Set the window size
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    '--headless',  # Run Chrome in headless mode
):
    options.add_argument(chrome_argument)

In [None]:
# Folder holding the per-category JSON files written by get_article_details.
combined_data_folder = 'combined_data'
# Names of the files already scraped in this kernel session; the scraping
# loop skips them. NOTE: re-running this cell resets the list.
loaded_files = []

In [None]:
# Scrape the full text of every article URL in each category file and write
# the enriched records to final_data/<file_name>_final.json.

# Create the "final_data" folder if it doesn't exist
os.makedirs("final_data", exist_ok=True)

for file_name in os.listdir(combined_data_folder):
    # Only the category JSON files are inputs; anything else in the folder
    # would make json.load fail.
    if not file_name.endswith('.json'):
        continue
    if file_name in loaded_files:
        continue
    print(f"Processing {file_name}...")

    file_path = os.path.join(combined_data_folder, file_name)
    # Read the input and close it right away instead of holding it open for
    # the whole scrape.
    with open(file_path, 'r') as f:
        final_data = json.load(f)

    with webdriver.Chrome(options=options) as driver:
        for data in tqdm(final_data):
            for article in data['articles']:
                if 'full_text' in article or not article['article_url']:
                    continue
                try:
                    driver.get(article['article_url'])
                    time.sleep(1)
                    html = driver.page_source
                except WebDriverException as exc:
                    # One unreachable page must not discard everything scraped
                    # so far; report it and leave this article without full_text.
                    print(f"Failed to load {article['article_url']}: {exc}")
                    continue
                # Parse the HTML using BeautifulSoup
                article['full_text'] = clean_html(html)

    # Dump the enriched records to a JSON file in the "final_data" folder
    with open(f"final_data/{file_name}_final.json", "w") as f:
        json.dump(final_data, f)

    # Add the loaded file to the list
    loaded_files.append(file_name)
    final_data = []
    

In [None]:
# Manual recovery cell: saves whatever the scraping loop left in final_data
# when it stopped partway through this file.
# The loop resets final_data to [] after each file it completes, so the guard
# keeps a normal top-to-bottom run from overwriting the real output with an
# empty list.
if final_data:
    with open("final_data/Rare_systemic_and_rhumatological_diseases.json_final.json", "w") as f:
        json.dump(final_data, f)

In [None]:
# Record the manually saved file so the scraping loop skips it when re-run.
loaded_files.append("Rare_systemic_and_rhumatological_diseases.json")

In [None]:
# Show which files have been recorded as processed so far.
loaded_files

## Data processing and cleaning 

In [None]:
import pandas as pd