<a href="https://colab.research.google.com/github/Mankind124/RagbasedMCQ/blob/main/Importdatafrompubmed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython

In [None]:
import time
from Bio import Entrez
from datetime import datetime, timedelta

def parse_pub_date(pub_date):
    """Format a PubDate mapping as a ``YYYY-MM-DD`` string.

    Args:
        pub_date: Mapping that may contain 'Year', 'Month', 'Day' and/or
            'MedlineDate' entries. 'Month' may be numeric ('3', '03') or an
            English month name or abbreviation ('Mar', 'March').

    Returns:
        ``"YYYY-MM-DD"`` when a year is present; a missing month or day
        defaults to '01', and numeric values are zero-padded. Without a
        year, the raw 'MedlineDate' text is returned if there is one,
        otherwise ``"Not Available"``.
    """
    month_numbers = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
        'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12',
    }

    if 'Year' not in pub_date:
        # No structured year: fall back to the free-text date when present.
        medline_date = str(pub_date.get('MedlineDate') or '').strip()
        return medline_date or "Not Available"

    year = str(pub_date['Year']).strip()
    month = str(pub_date.get('Month') or '01').strip()
    day = str(pub_date.get('Day') or '01').strip()

    if month.isdigit():
        month = month.zfill(2)
    else:
        # Map names/abbreviations to numbers so the output format is uniform;
        # unrecognised text is kept as-is rather than discarded.
        month = month_numbers.get(month[:3].lower(), month)

    if day.isdigit():
        day = day.zfill(2)

    return f"{year}-{month}-{day}"

def format_article_info(pmid, title, authors, journal, pub_date, url, abstract):
    """Render one article as a labelled, multi-line text record.

    Every field is written on its own ``Label: value`` line, in a fixed
    order, and the record is terminated by a blank line.
    """
    labelled_fields = (
        ("PMID", pmid),
        ("Title", title),
        ("Authors", authors),
        ("Journal", journal),
        ("Publication Date", pub_date),
        ("URL", url),
        ("Abstract", abstract),
    )
    lines = [f"{label}: {value}" for label, value in labelled_fields]
    # The trailing "\n\n" ends the last line and adds the blank separator line.
    return "\n".join(lines) + "\n\n"

# Contact e-mail address handed to the Entrez module
Entrez.email = 'your.email@example.com'

# Subjects to search for
topics = ['cardiovascular disease']

# Search window: from 10 * 365 days before now up to now
end_date = datetime.now()
lookback = timedelta(days=10*365)
start_date = end_date - lookback
window_start = start_date.strftime("%Y/%m/%d")
window_end = end_date.strftime("%Y/%m/%d")
date_range = f'("{window_start}"[Date - Create] : "{window_end}"[Date - Create])'

# One query per topic: the topic in the Title/Abstract field, limited to the window
individual_queries = [
    f'({topic}[Title/Abstract] AND {date_range})'
    for topic in topics
]

# Write one text record per article found by each query to the output file
with open('cardiovascular_disease_papers.txt', 'w', encoding='utf-8') as f:
    # Process each query individually
    for query in individual_queries:
        # Search for matching PMIDs (retmax=1000 requests up to 1000 results)
        search_handle = Entrez.esearch(db='pubmed', term=query, retmax=1000)
        try:
            search_result = Entrez.read(search_handle)
        finally:
            # Close the response even if parsing fails, so it is not left open
            search_handle.close()
        id_list = search_result['IdList']

        # Iterate over each PMID to fetch detailed article information
        for pmid in id_list:
            fetch_handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
            try:
                fetched = Entrez.read(fetch_handle)
            finally:
                fetch_handle.close()

            # Process each article found in the fetched XML
            for pubmed_article in fetched['PubmedArticle']:
                article = pubmed_article['MedlineCitation']['Article']

                title = article.get('ArticleTitle', 'Title Not Available')

                # Build "ForeName LastName" per author; fall back to the
                # CollectiveName entry, and skip entries with no usable name
                # so the joined list has no empty items or stray spaces.
                author_names = []
                for author in article.get('AuthorList', []):
                    full_name = f"{author.get('ForeName', '')} {author.get('LastName', '')}".strip()
                    if not full_name:
                        full_name = str(author.get('CollectiveName', '')).strip()
                    if full_name:
                        author_names.append(full_name)
                authors_list = ', '.join(author_names) or 'Authors Not Available'

                journal = article['Journal'].get('Title', 'Journal Not Available')
                pub_date = parse_pub_date(article['Journal']['JournalIssue']['PubDate'])
                url = f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}"

                # Join the abstract sections; a missing or empty abstract
                # yields the placeholder instead of raising KeyError.
                abstract_parts = article.get('Abstract', {}).get('AbstractText', [])
                abstract = ' '.join(abstract_parts) or 'Abstract Not Available'

                # Format the article information and write to file
                article_info = format_article_info(pmid, title, authors_list, journal, pub_date, url, abstract)
                f.write(article_info)

            time.sleep(0.33)  # Pause to respect PubMed server load

print("Data extraction complete. Results saved in 'cardiovascular_disease_papers.txt'")