In [1]:
from Bio import Entrez

In [3]:

EMAIL = 'chintansa148@gmail.com'
def search(query):
    Entrez.email = EMAIL  # Always provide your email
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='10',
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = EMAIL
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

if __name__ == '__main__':
    # query = '("rare diseases"[MeSH Terms] OR "rare diseases"[All Fields]) AND "medication"[All Fields]'
    query = 'heart'
    results = search(query)
    id_list = results['IdList']
    papers = fetch_details(id_list)

    # Print the title and abstract of each paper
    for i, paper in enumerate(papers['PubmedArticle']):
        print(f"{i+1}. {paper['MedlineCitation']['Article']['ArticleTitle']}")
        print(f"   {paper['MedlineCitation']['Article'].keys()}")
        if 'Abstract' in paper['MedlineCitation']['Article']:
            print(f"   Abstract: {paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0]}")
        print('\n')


1. Regeneration of the heart.
   dict_keys(['ArticleDate', 'Language', 'ELocationID', 'Journal', 'ArticleTitle', 'Pagination', 'Abstract', 'AuthorList', 'PublicationTypeList'])
   Abstract: The death of cardiac myocytes diminishes the heart's pump function and is a major cause of heart failure, one of the dominant causes of death worldwide. Other than transplantation, there are no therapies that directly address the loss of cardiac myocytes, which explains the current excitement in cardiac regeneration. The field is evolving in two important directions. First, although endogenous mammalian cardiac regeneration clearly seems to decline rapidly after birth, it may still persist in adulthood. The careful elucidation of the cellular and molecular mechanisms of endogenous heart regeneration may therefore provide an opportunity for developing therapeutic interventions that amplify this process. Second, recent breakthroughs have enabled reprogramming of cells that were apparently terminally d

--------------------------------------ORPHANET CODE------------------------------------------------

In [5]:
import xml.etree.ElementTree as ET

def parse_orphanet_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    diseases = []
    for disorder in root.findall('.//Disorder'):
        name = disorder.find('.//Name[@lang="en"]')
        orpha_code = disorder.find('.//OrphaCode')
        if name is not None and orpha_code is not None:
            diseases.append({'name': name.text, 'orpha_code': orpha_code.text})
    return diseases

orphanet_diseases = parse_orphanet_xml('Rare_cardiac_diseases.xml')


In [53]:
disease_articles = {}
for disease in orphanet_diseases[:5]:
    query = f'{disease["name"]}'
    ids = search(query)
    disease_articles[disease['name']] = ids['IdList']


In [54]:
papers_list = dict()
def fetch_pubmed_details(id_list):
    if len(id_list) == 0:
        return None
    ids = ','.join(id_list)
    handle = Entrez.efetch(db='pubmed', id=ids, retmode='xml')
    papers = Entrez.read(handle)
    return papers

for disease, ids in disease_articles.items():
    papers_list[disease] = fetch_pubmed_details(ids)
    # Process and store these papers as needed


In [None]:
import json

combined_data = []
for disease in orphanet_diseases:
    disease_data = {
        'name': disease['name'],
        'orpha_code': disease['orpha_code'],
        'articles': []
    }
    
    if disease['name'] in disease_articles:
        papers = papers_list[disease['name']]
        if papers:
            for paper in papers['PubmedArticle']:
                # Extract necessary details from each paper
                title = paper['MedlineCitation']['Article']['ArticleTitle']
                abstract = paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0] if 'Abstract' in paper['MedlineCitation']['Article'] else ''
                disease_data['articles'].append({'title': title, 'abstract': abstract})

    combined_data.append(disease_data)

# Save to JSON
with open('Rare_cardiac_diseases.json', 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=4)


In [10]:
from Bio import Entrez

def fetch_article_details(pmid):
    Entrez.email = 'your.email@example.com'  # Always provide your email
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    article_records = Entrez.read(handle)
    return article_records

pmid = '22095736'  # Replace with the PMID of the article
article_details = fetch_article_details(pmid)

# Extracting DOI from the article details
for article in article_details['PubmedArticle']:
    article_data = article['MedlineCitation']['Article']
    if 'ELocationID' in article_data:
        dois = [id for id in article_data['ELocationID'] if id.attributes['EIdType'] == 'doi']
        doi = dois[0] if dois else 'No DOI found'
        print(f"DOI: {doi}")
    else:
        print("No DOI information available")


DOI: 10.1002/emmm.201100175


In [34]:
doi = '10.1113/expphysiol.2014.080168'
article_url = f"https://doi.org/{doi}"

In [66]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup  # For HTML parsing
import time

def get_full_text_from_doi(doi_url):
    # Use Selenium to handle JavaScript-enabled requests
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')  # Set the window size
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')
    options.add_argument('--headless')  # Run Chrome in headless mode
    with webdriver.Chrome(options=options) as driver:
        driver.get(doi_url)
        time.sleep(5)
        html = driver.page_source

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # Locate the article text in the HTML and extract it
    # This will vary greatly between different publishers' websites
    article_text = soup.get_text()  # Placeholder; requires specific parsing logic
    return article_text


full_text = get_full_text_from_doi(article_url)


In [70]:
url = 'https://doi.org/10.1002/emmm.201100175'
full_text = get_full_text_from_doi(url)


In [71]:
print(full_text)

















Regeneration of the heart | EMBO Molecular Medicine






































Skip to Article Content
Skip to Article Information




















Log in to EMBO Press 







Email or Customer ID







Password






                                Forgot password?
                            









        You may log in using the username and password that you use to access Wiley Online Library
    



                        
                        NEW USER >


                            
                            INSTITUTIONAL LOGIN >












Change Password







Old Password






New Password




Too Short
Weak
Medium
Strong
Very Strong
Too Long






Password Changed Successfully
Your password has been changed










Create a new account





Email








                    
                    Returning user
                











Forgot your password?
Enter your email address below. 







Email










Please check yo

In [46]:
def download_pmc_pdf(pmc_id, output_filename):
    pmc_url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/pdf/'
    response = requests.get(pmc_url)
    print(response.url)
    if response.status_code == 200:
        with open(output_filename, 'wb') as f:
            f.write(response.content)
        print(f"PDF downloaded: {output_filename}")
    else:
        print("PDF not available or page not accessible")

pmc_id = 'PMC3377117'  # Replace with actual PMC ID
download_pmc_pdf(pmc_id, 'output.pdf')


https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3377117/pdf/emmm0003-0701.pdf
PDF not available or page not accessible


In [58]:
def get_unpaywall_pdf(doi):
    response = requests.get(f"https://api.unpaywall.org/v2/{doi}?email={EMAIL}")
    if response.status_code == 200 and response.json()['is_oa']:
        return response.json()
    return None

pdf_url = get_unpaywall_pdf('10.1002/emmm.201100175')  # Replace with actual DOI
if pdf_url:
    print(f"PDF URL: {pdf_url}")
else:
    print("Free PDF not available")


PDF URL: {'doi': '10.1002/emmm.201100175', 'doi_url': 'https://doi.org/10.1002/emmm.201100175', 'title': 'Regeneration of the heart', 'genre': 'journal-article', 'is_paratext': False, 'published_date': '2011-09-23', 'year': 2011, 'journal_name': 'EMBO Molecular Medicine', 'journal_issns': '1757-4676,1757-4684', 'journal_issn_l': '1757-4676', 'journal_is_oa': True, 'journal_is_in_doaj': True, 'publisher': 'EMBO', 'is_oa': True, 'oa_status': 'green', 'has_repository_copy': True, 'best_oa_location': {'updated': '2022-11-24T09:21:49.838242', 'url': 'https://europepmc.org/articles/pmc3377117?pdf=render', 'url_for_pdf': 'https://europepmc.org/articles/pmc3377117?pdf=render', 'url_for_landing_page': 'https://europepmc.org/articles/pmc3377117', 'evidence': 'oa repository (via OAI-PMH doi match)', 'license': 'unspecified-oa', 'version': 'publishedVersion', 'host_type': 'repository', 'is_best': True, 'pmh_id': 'oai:europepmc.org:Y4X5eynnJjawEb8vjySd', 'endpoint_id': 'b5e840539009389b1a6', 'repos

In [55]:
t = requests.get('https://europepmc.org/articles/pmc3377117?pdf=render')


In [None]:
from selenium import webdriver
import time

def get_page_text_with_javascript(url):
    # Set up the Selenium WebDriver. 
    # Make sure that the chromedriver executable is in your PATH or provide the path to chromedriver.
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')  # Set the window size
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36')
    options.add_argument('--headless')  # Optional argument to make Chrome run in headless mode
    with webdriver.Chrome(options=options) as driver:
        driver.get(url)
        
        # Wait for JavaScript to load. Adjust the sleep time as needed.
        time.sleep(5)
        
        # Get the text of the page
        page_text = driver.find_element_by_tag_name("body").text
        return page_text

url = 'https://doi.org/10.1002/emmm.201100175'
text = get_page_text_with_javascript(url)
print(text)


In [57]:
print(t.content)

b'%PDF-1.4\n%\xe4\xe3\xcf\xd2\n205 0 obj\n<<\n/Linearized 1.0\n/O 209\n/H [ 1985 816 ]\n/L 549638\n/E 79830\n/N 12\n/T 545493\n>>\nendobj\n                                                xref\n205 73\n0000000015 00000 n \n0000001793 00000 n \n0000001899 00000 n \n0000001927 00000 n \n0000002801 00000 n \n0000003125 00000 n \n0000003211 00000 n \n0000003297 00000 n \n0000003383 00000 n \n0000003469 00000 n \n0000003555 00000 n \n0000003641 00000 n \n0000003728 00000 n \n0000003815 00000 n \n0000003901 00000 n \n0000003987 00000 n \n0000004073 00000 n \n0000004159 00000 n \n0000004243 00000 n \n0000004564 00000 n \n0000005039 00000 n \n0000005394 00000 n \n0000010388 00000 n \n0000010864 00000 n \n0000011148 00000 n \n0000012720 00000 n \n0000013308 00000 n \n0000013497 00000 n \n0000013981 00000 n \n0000018144 00000 n \n0000018882 00000 n \n0000019375 00000 n \n0000026420 00000 n \n0000026883 00000 n \n0000027133 00000 n \n0000028859 00000 n \n0000029420 00000 n \n0000029806 00000 n \n0

In [1]:
import pandas as pd

In [3]:
df = pd.read_json('final_data/Rare_allergic_disease.json_final.json')

In [5]:
df.head()

Unnamed: 0,name,orpha_code,articles
0,Rare allergic disease,98050,"[{'PMID': '26564810', 'title': 'Idiopathic non..."
1,Non-histaminic angioedema,658,"[{'PMID': '15582173', 'title': '[Non-histamini..."
2,Hereditary angioedema,91378,"[{'PMID': '36609679', 'title': 'Hereditary Ang..."
3,Hereditary angioedema with C1Inh deficiency,528623,"[{'PMID': '36609679', 'title': 'Hereditary Ang..."
4,Hereditary angioedema type 1,100050,"[{'PMID': '30480729', 'title': 'Effect of Lana..."
