In [3]:
import os
import xml.etree.ElementTree as ET
import json
from Bio import Entrez
import requests
from selenium import webdriver
from bs4 import BeautifulSoup  # For HTML parsing
import time
from tqdm import tqdm


### GET DISEASES DATA FROM ORPHANET FILES

The list of rare diseases are available here  -  https://www.orphadata.com/classifications/

In [None]:
def read_files_from_folder(folder_path):
    files = os.listdir(folder_path)
    return files

def parse_orphanet_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    diseases = []
    for disorder in root.findall('.//Disorder'):
        name = disorder.find('.//Name[@lang="en"]')
        orpha_code = disorder.find('.//OrphaCode')
        if name is not None and orpha_code is not None:
            diseases.append({'name': name.text, 'orpha_code': orpha_code.text})
    return diseases

In [None]:
diseases_dict = {}
for file in read_files_from_folder('Rare_Diseases'):
    if file.endswith('.xml'):
        diseases = parse_orphanet_xml('Rare_Diseases/' + file)
        file_name = file.split('.')[0]
        diseases_dict[file_name] = diseases


--------------------------End Orphanet data------------------------------

### Begin Pubmed Data fetch

In [None]:
EMAIL = 'abc@gmail.com'

In [None]:
def search_pubmed(query, max_results=10):
    Entrez.email = EMAIL  # Always provide your email
    query_with_filter = query + " AND free full text[sb]"  # Adding the free full text filter
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax=max_results,
                            retmode='xml', 
                            term=query_with_filter)
    results = Entrez.read(handle)

    if results['IdList'] == []:
        handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax=max_results,
                            retmode='xml', 
                            term=query)
        
        results = Entrez.read(handle)
    return results['IdList']


def fetch_pubmed_details(id_list):
    Entrez.email = EMAIL 
    if len(id_list) == 0:
        return None
    ids = ','.join(id_list)
    handle = Entrez.efetch(db='pubmed', id=ids, retmode='xml')
    papers = Entrez.read(handle)
    return papers

In [None]:
disease_articles = {}

In [None]:
all_diseases = set()
for disease_name, diseases in diseases_dict.items():
    for disease in diseases:
        all_diseases.add(disease['name'])

In [None]:
for disease in all_diseases:
    if disease in disease_articles and disease_articles[disease] != []:
        continue    
    query = f'{disease}'
    disease_articles[disease] = search_pubmed(query, 50)

In [None]:
# logged on 8th Dec - 12:09 AM - json dumped at 12:10 AM
with open('disease_articles.json', 'w') as f:
    json.dump(disease_articles, f)


In [None]:
# load the json file for disease articles if present else create a new one
with open('disease_articles.json', 'r') as f:
    disease_articles = json.load(f)

In [None]:
papers_list = dict()

In [None]:

for disease, ids in tqdm(disease_articles.items()):
    if disease in papers_list and papers_list[disease] != []:
        continue
    papers_list[disease] = fetch_pubmed_details(ids)
    time.sleep(1)



In [None]:
disease_articles_new = {key: value for key, value in disease_articles.items() if value}
len(disease_articles_new)

In [None]:
combined_data = list()

def get_article_details(diseases_dict:dict, disease_articles:dict,combined_data:list):
    for category, diseases in diseases_dict.items():
        print(f"Processing {category}...")
        if os.path.exists(f"combined_data/{category}.json"):
            continue

        for disease in tqdm(diseases):
            disease_data = {
                'name': disease['name'],
                'orpha_code': disease['orpha_code'],
                'articles': []
            }
            
            if disease['name'] in disease_articles:
                ids = disease_articles_new[disease['name']][:5]
                papers = fetch_pubmed_details(ids)
                time.sleep(0.5)
                if papers:
                    for paper in papers['PubmedArticle']:
                        pmid = paper['MedlineCitation']['PMID'].title()
                        # Extract necessary details from each paper
                        article_data = paper['MedlineCitation']['Article']
                        if article_data['ELocationID'] and article_data['ELocationID'][0].attributes['EIdType'] == 'doi':
                            doi = article_data['ELocationID'][0].title()

                        article_url = f"https://doi.org/{doi}" if doi else ''
                        title = article_data['ArticleTitle']
                        abstract = paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0] if 'Abstract' in paper['MedlineCitation']['Article'] else ''
                        disease_data['articles'].append({'PMID':pmid, 'title': title, 'abstract': abstract, 'article_url': article_url})

            combined_data.append(disease_data)


        # Create the "combined_data" folder if it doesn't exist
        if not os.path.exists("combined_data"):
            os.makedirs("combined_data")

        # Dump the combined_data dictionary to a JSON file in the "combined_data" folder
        with open(f"combined_data/{category}.json", "w") as f:
            json.dump(combined_data, f)

        combined_data.clear()



In [None]:
get_article_details(diseases_dict, disease_articles_new,combined_data)

In [None]:
def get_full_text_from_doi(doi_url):
    # Use Selenium to handle JavaScript-enabled requests
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')  # Set the window size
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')
    options.add_argument('--headless')  # Run Chrome in headless mode
    with webdriver.Chrome(options=options) as driver:
        driver.get(doi_url)
        time.sleep(2)
        html = driver.page_source

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    article_tag = soup.find('article')
    if article_tag:
        article_text = article_tag.get_text()
    else:
        article_text = soup.get_text()

    return article_text



In [2]:
from bs4 import BeautifulSoup
import re

def clean_html(html_content):
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    article_tag = soup.find('article')
 
    # Remove script and style elements
    for script_or_style in soup(['script', 'style']):
        script_or_style.extract()

    # Get text
    text = ''
    if article_tag:
        text = article_tag.get_text()
    else:
        text = soup.get_text()

    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())

    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop blank lines and remove non-ascii characters
    text = '\n'.join(chunk for chunk in chunks if chunk)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    return text


In [3]:
options = webdriver.ChromeOptions()
options.add_argument('window-size=1920x1080')  # Set the window size
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')
options.add_argument('--headless')  # Run Chrome in headless mode

In [None]:
combined_data_folder = 'combined_data'
for file_name in (os.listdir(combined_data_folder)):
    print(f"Processing {file_name}...")
    if os.path.exists(f"final_data/{file_name}"):
        continue
    file_path = os.path.join(combined_data_folder, file_name)
    with open(file_path, 'r') as f:
        final_data = json.load(f)
        # Process the data here
        with webdriver.Chrome(options=options) as driver:
            for data in tqdm(final_data[:250]):
                for article in data['articles']:
                    if 'full_text' in article:
                        continue
                    if article['article_url']:
                        driver.get(article['article_url'])
                        time.sleep(4)
                        html = driver.page_source
                        # Parse the HTML using BeautifulSoup
                        article_text = clean_html(html)
                        article['full_text'] = article_text        # Add the loaded file to the list

         
        # Create the "final_data" folder if it doesn't exist
        if not os.path.exists("final_data"):
            os.makedirs("final_data")

        # Dump the combined_data dictionary to a JSON file in the "final_data" folder
        with open(f"final_data/{file_name}", "w") as f:
            json.dump(final_data, f)  

        final_data = []
    

## Data processing and cleaning 

In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chintanaddoni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:

final_data_folder = 'final_data'
combined_data_df = []
df = pd.DataFrame(columns=['name', 'orpha_code', 'PMID', 'title', 'abstract', 'article_url', 'full_text'])

for file_name in os.listdir(final_data_folder):
    file_path = os.path.join(final_data_folder, file_name)
    with open(file_path, 'r') as f:
        data = json.load(f)
        combined_data_df = []
        for disease_data in data:
            name = disease_data['name']
            orpha_code = disease_data['orpha_code']
            articles = disease_data['articles']
            for article in articles:
                if 'full_text' in article: 
                    if len(article['full_text']) < 1000:
                        article['full_text'] = article['abstract']
                    combined_data_df.append([name, orpha_code, article['PMID'], article['title'], article['abstract'], article['article_url'], article['full_text']])
                else:
                    article['full_text'] = article['abstract']
                    combined_data_df.append([name, orpha_code, article['PMID'], article['title'], article['abstract'], article['article_url'], article['full_text']])

    # Append combined_data_df to df using pd.concat()
    df = pd.concat([df, pd.DataFrame(combined_data_df, columns=df.columns)], ignore_index=True)
    df.drop_duplicates(subset = df.columns.difference(['full_text', 'article_url']),inplace=True)




In [16]:
df.shape

(27507, 7)

In [17]:
df.head()

Unnamed: 0,name,orpha_code,PMID,title,abstract,article_url,full_text
0,Rare teratologic disease,52662,33745447,Diagnostic precision and identification of rar...,Diagnostic precision and the identification of...,https://doi.org/10.1002/Jimd.12306,Journal of Inherited Metabolic DiseaseVolume 4...
1,Rare teratologic disease,52662,36401554,Prevalence and mortality among children with a...,"We examined the total prevalence, trends in pr...",https://doi.org/10.1002/Bdr2.2129,"We examined the total prevalence, trends in pr..."
2,Rare teratologic disease,52662,27126916,Frederik Ruysch (1638-1731): Historical perspe...,The Peter the Great Museum of Anthropology and...,https://doi.org/10.1002/Ajmg.A.37663,The Peter the Great Museum of Anthropology and...
3,Rare teratologic disease,52662,35644130,A Multicountry Analysis of Prevalence and Mort...,Bladder exstrophy (BE) is a rare but severe b...,https://doi.org/10.1055/S-0042-1748318,Subscribe to RSS\nPlease copy the URL and add ...
4,Rare teratologic disease,52662,33253899,Prevalence and mortality in children with cong...,"This study determined the prevalence, mortalit...",https://doi.org/10.1016/J.Annepidem.2020.11.007,"Annals of EpidemiologyVolume 56, April 2021, P..."


In [18]:

# Set of English stopwords
stop_words = set(stopwords.words('english'))

def remove_citations(text):
    # Remove citations references (e.g., [1], [1,2], [1-3])
    text = re.sub(r'\[\d+(,\s?\d+)*(\s?-\s?\d+)?\]', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    return text
 

def remove_references_section(text):
    # Naive approach to remove references section
    text = re.sub(r'\b(references|bibliography)\b.*', '', text, flags=re.IGNORECASE | re.DOTALL)
    return text

def tokenize_and_clean(text):
    # Tokenize into words
    words = word_tokenize(text)

    # Remove stopwords and non-alphabetic words
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

    # # add stemming
    # stemmer = PorterStemmer()
    # words = [stemmer.stem(word) for word in words]

    return words

def preprocess_text(text):
    text = remove_citations(text)
    text = remove_references_section(text)
    words = tokenize_and_clean(text)
    return ' '.join(words)


In [19]:
df['cleaned_text'] = df['full_text'].apply(preprocess_text)

In [20]:
df.to_json('final_data_cleaned.json', orient='records')

In [21]:
df.head()

Unnamed: 0,name,orpha_code,PMID,title,abstract,article_url,full_text,cleaned_text
0,Rare teratologic disease,52662,33745447,Diagnostic precision and identification of rar...,Diagnostic precision and the identification of...,https://doi.org/10.1002/Jimd.12306,Journal of Inherited Metabolic DiseaseVolume 4...,journal inherited metabolic diseasevolume issu...
1,Rare teratologic disease,52662,36401554,Prevalence and mortality among children with a...,"We examined the total prevalence, trends in pr...",https://doi.org/10.1002/Bdr2.2129,"We examined the total prevalence, trends in pr...",examined total prevalence trends prevalence mo...
2,Rare teratologic disease,52662,27126916,Frederik Ruysch (1638-1731): Historical perspe...,The Peter the Great Museum of Anthropology and...,https://doi.org/10.1002/Ajmg.A.37663,The Peter the Great Museum of Anthropology and...,peter great museum anthropology ethnography ku...
3,Rare teratologic disease,52662,35644130,A Multicountry Analysis of Prevalence and Mort...,Bladder exstrophy (BE) is a rare but severe b...,https://doi.org/10.1055/S-0042-1748318,Subscribe to RSS\nPlease copy the URL and add ...,subscribe rss please copy url add rss feed rea...
4,Rare teratologic disease,52662,33253899,Prevalence and mortality in children with cong...,"This study determined the prevalence, mortalit...",https://doi.org/10.1016/J.Annepidem.2020.11.007,"Annals of EpidemiologyVolume 56, April 2021, P...",annals epidemiologyvolume april pages articlep...


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [25]:


# Assuming 'preprocessed_texts' is a list of your preprocessed documents
vectorizer = TfidfVectorizer()
preprocessed_texts = df['cleaned_text'].values
tfidf_matrix = vectorizer.fit_transform(preprocessed_texts)

def find_unique_relevant_documents(query, tfidf_matrix, top_n=5):
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # Use argsort for indices and then unique to filter out duplicates
    unique_indices = np.unique(cosine_similarities.argsort()[::-1], return_index=True)[1]
    
    # Sort unique indices based on original similarity scores
    sorted_unique_indices = unique_indices[np.argsort(-cosine_similarities[unique_indices])]

    # Select top_n indices
    relevant_indices = sorted_unique_indices[:top_n]

    return relevant_indices, cosine_similarities[relevant_indices]


def summarize(text, language="english", sentences_count = 6):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return ' '.join([str(sentence) for sentence in summary])


In [46]:
def search_documents(query, df = df, tfidf_matrix = tfidf_matrix, top_n=5, summary_sentences=5):
    relevant_indices, relevance_scores = find_unique_relevant_documents(query, tfidf_matrix, top_n)
    relevant_docs = df.iloc[relevant_indices]
    relevant_docs['relevance_score'] = relevance_scores
    
    # Keep only the rows with unique PMIDs
    relevant_docs = relevant_docs.drop_duplicates(subset='PMID', ignore_index=True)
    
    # Combine all the full_text
    combined_text = ' '.join(relevant_docs['full_text'].tolist())
    
    # Generate summary of the combined text
    summary = summarize(combined_text, sentences_count=summary_sentences)
    
    return relevant_docs, summary


In [35]:
import warnings
# Ignore warnings
warnings.filterwarnings("ignore")

In [49]:

query = "What are the rare diseases associated with the gene 'BRCA1'?"
relevant_docs, summary = search_documents(query)
print(f"Query: {query}")
print(f"Summary:")
display(summary)
print(f"Relevant Documents:")
display(relevant_docs[['PMID', 'title','article_url','relevance_score']])


Query: What are the rare diseases associated with the gene 'BRCA1'?
Summary:


'Relapsing polychondritis (RP) is a rare autoimmune-related disease and may be associated with other autoimmune diseases. In the European Union (EU) a disease is considered to be rare if not more than 5 of 10,000 people are affected by it. In the present work most of the described diseases of salivary glands and of the facial nerve fall in this category. The work is a compilation of innate andacquired rare salivary gland disorders and of rare facial nerve disorders. Due to the rarity of these diseases, it is recommended to tread these in centers with special expertise for it.'

Relevant Documents:


Unnamed: 0,PMID,title,article_url,relevance_score
0,35549688,Mutation analysis reveals novel and known muta...,https://doi.org/10.1002/Humu.24140,0.281202
1,36983602,Orphan Drugs in Neurology-A Narrative Review.,https://doi.org/10.1093/Brain/Awp294,0.273829
2,30290676,Case report of mixed-type autoimmune hemolytic...,https://doi.org/10.4103/Ajts.Ajts_74_19,0.273469
3,28877977,Rare pulmonary diseases: a common fight.,https://doi.org/10.1038/S41576-022-00478-5,0.26906
4,34352906,Rare Diseases of the Salivary Glands and of Fa...,https://doi.org/10.1055/A-1337-6994,0.241157
