In [None]:
# Install the fpdf package; it provides the FPDF class imported in the next cell.
!pip install fpdf

In [None]:
from google.colab import drive
import pandas as pd
import os
import time
import requests
import xml.etree.ElementTree as ET
from fpdf import FPDF

In [None]:
# Mount Google Drive at /content/drive so the notebook can read the input
# spreadsheet and write the generated PDFs under '/content/drive/My Drive/'.
drive.mount('/content/drive')

In [None]:
# Functions to fetch PubMed abstracts through the NCBI E-utilities API.
# Each extracted abstract is stored as a PDF in the given Google Drive folder.

def _abstract_from_xml(xml_data):
    """Return the abstract text found in an efetch XML payload ('' if none)."""
    root = ET.fromstring(xml_data)
    # itertext() also collects text nested inside inline markup such as
    # <i>/<sub>/<sup>; reading element.text alone stops at the first child tag
    # and silently truncates the abstract.
    sections = (
        "".join(node.itertext()).strip()
        for node in root.findall('.//AbstractText')
    )
    return " ".join(section for section in sections if section)


def _write_abstract_pdf(abstract, pdf_filename):
    """Render the abstract text as a one-font PDF stored at pdf_filename."""
    # FPDF's core fonts (Arial here) are Latin-1 only. Characters outside that
    # range are replaced with "?" so the export cannot abort the batch with a
    # UnicodeEncodeError.
    printable = abstract.encode("latin-1", errors="replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, printable)
    pdf.output(pdf_filename)


def extract_pubmed_abstracts(pmids, folder_path, delay=2, timeout=30):
    """Download the abstract of every PMID and save it as ``<pmid>.pdf``.

    Args:
        pmids: Iterable of PubMed IDs. Entries that are empty or not purely
            numeric are skipped without issuing a request.
        folder_path: Existing directory that receives the PDF files.
        delay: Seconds to wait after each request (default 2, as before).
        timeout: Seconds before an unanswered HTTP request is abandoned.

    Returns:
        None. Progress and errors are reported with ``print``. A failure for
        one PMID (HTTP error, network error, malformed XML) is reported and the
        loop continues with the next PMID.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    #sample format: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&rettype=abstract&id=34469212

    for pmid in pmids:
        if not (pmid and str(pmid).isdigit()):
            continue

        #the api request structure from the website: https://www.ncbi.nlm.nih.gov/home/develop/api/ also the parameters required
        params = {
            "db": "pubmed",
            "id": pmid,
            "retmode": "xml",
            "rettype": "abstract"
        }

        try:
            response = requests.get(url, params=params, timeout=timeout)
            if response.status_code == 200:
                abstract = _abstract_from_xml(response.text)
                if not abstract:
                    print(f"No abstract text found for PMID {pmid}; the PDF will be blank.")

                pdf_filename = os.path.join(folder_path, f"{pmid}.pdf")
                _write_abstract_pdf(abstract, pdf_filename)
                print(f"Abstract for PMID {pmid} written to {pdf_filename}")
            else:
                print(f"Error fetching PMID {pmid}: {response.status_code}")
        except (requests.RequestException, ET.ParseError) as exc:
            # One unreachable or malformed record must not stop the whole batch.
            print(f"Error fetching PMID {pmid}: {exc}")

        # NCBI E-utilities accept about 3 requests per second without an API key
        # (10 per second with one), so pause between consecutive requests.
        time.sleep(delay)

In [None]:
# Location of the input workbook on the mounted Google Drive.
excel_path = '/content/drive/My Drive/Abstract_PMID.xlsx'
# Load the workbook into a DataFrame; the next cell reads the
# 'Authorized_Claim_Number' and 'Only Abstract PMIDS' columns from each row.
df = pd.read_excel(excel_path)

In [None]:

def _cell_to_text(value):
    """Return a spreadsheet cell as text ('' for an empty/NaN cell).

    Excel cells holding a single number arrive from pandas as int or float,
    so they are converted to text here; whole-number floats lose the trailing
    '.0' (34469212.0 becomes '34469212').
    """
    if pd.isnull(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value)


# Create one output folder per claim and download the abstracts listed for it.
# The 0: slice covers every row; presumably it is kept so the start index can
# be raised to resume a partially completed run.
for index, row in df.iloc[0:].iterrows():
    folder_name = _cell_to_text(row['Authorized_Claim_Number'])
    if not folder_name.strip():
        # Without a claim number there is no folder to write into.
        print(f"Row {index} has no Authorized_Claim_Number. Row skipped.")
        continue

    folder_path = os.path.join('/content/drive/My Drive/Data_Collection/', folder_name)
    os.makedirs(folder_path, exist_ok=True)

    pmid_cell = _cell_to_text(row['Only Abstract PMIDS'])
    if pmid_cell.strip() == "":
        print(f"No PMIDs listed for {folder_name}. Folder created but left empty.")
        continue

    # The cell holds a comma-separated list, optionally wrapped in double quotes.
    tokens = [token.strip() for token in pmid_cell.replace('"', '').split(',')]
    pmids = [token for token in tokens if token.isdigit()]

    rejected = [token for token in tokens if token and not token.isdigit()]
    if rejected:
        # Report malformed entries instead of dropping them silently.
        print(f"Ignoring non-numeric PMID entries for {folder_name}: {rejected}")

    extract_pubmed_abstracts(pmids, folder_path)
