In [3]:
import csv
import json
import os
import time
import xml.etree.ElementTree as ET

import pandas as pd
import requests

In [2]:
# Seed list of ten PubMed IDs, stored as strings.
abstract_ids = [
    '39333227',
    '39330834',
    '39327961',
    '39327433',
    '39326941',
    '39326936',
    '39324708',
    '39320566',
    '39318033',
    '39315544',
]

In [3]:
def fetch_mesh_terms(pmid):
    """Return the ``meshheadinglist`` field of a PubMed ESummary record.

    Args:
        pmid: PubMed ID as ``str`` or ``int``. It is normalised to ``str``
            because the record is looked up by string key in the JSON body.

    Returns:
        The value stored under ``meshheadinglist`` for this record, or ``[]``
        when the ID is blank, the request fails, the status is not 200, the
        body is not valid JSON, or the record / field is missing.
    """
    pmid = str(pmid).strip()
    if not pmid:
        return []

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    query = {'db': 'pubmed', 'id': pmid, 'retmode': 'json'}
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=query, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve MeSH terms for PubMed ID {pmid}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve MeSH terms for PubMed ID {pmid}. Status code: {response.status_code}")
        return []

    try:
        data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return []

    # Error payloads may lack 'result', so every level is looked up defensively.
    result = data.get('result') if isinstance(data, dict) else None
    record = result.get(pmid) if isinstance(result, dict) else None
    if not isinstance(record, dict):
        return []
    return record.get('meshheadinglist') or []

In [4]:
def filter_mesh_terms(mesh_terms, keywords):
    """Return the names of the MeSH terms that contain any of ``keywords``.

    Matching is a case-sensitive substring test.

    Args:
        mesh_terms: Iterable of terms. Each term is either a dict with a
            ``'name'`` entry or a plain string. Entries of any other shape
            (or dicts without a string ``'name'``) are skipped instead of
            raising. ``None`` is treated as an empty list.
        keywords: Iterable of substrings to look for.

    Returns:
        list[str]: Matching term names, in input order.
    """
    relevant_terms = []
    for term in mesh_terms or []:
        name = term.get('name') if isinstance(term, dict) else term
        if not isinstance(name, str):
            continue
        if any(keyword in name for keyword in keywords):
            relevant_terms.append(name)
    return relevant_terms

In [5]:
# Function to search PubMed using E-utilities API for specific keywords
def search_pubmed_for_ids(keywords, retmax=100):
    """Search PubMed through the E-utilities ESearch endpoint.

    Args:
        keywords: Iterable of search terms, combined with ``' AND '``. A
            single string is treated as one term rather than being split into
            characters.
        retmax: Maximum number of IDs to request (default 100, the value that
            was previously hard-coded).

    Returns:
        list: The ``idlist`` from the ``esearchresult`` object, or ``[]`` when
        there are no keywords, the request fails, the status is not 200, or
        the body lacks the expected structure.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    query = ' AND '.join(keywords)
    if not query.strip():
        return []

    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Passing the query via params lets requests escape it, so characters such
    # as '&' or '#' inside a keyword cannot corrupt the query string.
    search_params = {'db': 'pubmed', 'term': query, 'retmode': 'json', 'retmax': retmax}

    try:
        response = requests.get(search_url, params=search_params, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve PubMed IDs. Request error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve PubMed IDs. Status code: {response.status_code}")
        return []

    try:
        data = response.json()
    except ValueError:
        print("Failed to retrieve PubMed IDs. Response was not valid JSON.")
        return []

    search_result = data.get('esearchresult') if isinstance(data, dict) else None
    if not isinstance(search_result, dict):
        return []
    return search_result.get('idlist', [])

# Search terms, also reused later to filter MeSH term names.
keywords = [
    "Neoplasms",
    "Antineoplastic Agents",
    "Adverse Effects",
    "Toxicity",
]

# Rebind abstract_ids to whatever the PubMed search returns for these terms.
abstract_ids = search_pubmed_for_ids(keywords)

# Cell output: the first ten IDs, as a quick sanity check.
abstract_ids[:10]


['39333227',
 '39330834',
 '39327961',
 '39327433',
 '39326941',
 '39326936',
 '39324708',
 '39320566',
 '39318033',
 '39315544']

In [6]:
# Base URL for fetching PubMed abstracts
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Query parameters shared by every EFetch request; the per-article 'id' is
# supplied by the code that issues the request.
params = {
    'db': 'pubmed',
    'rettype': 'abstract',
    'retmode': 'xml',
}

# The NCBI API key is a credential, so it is read from the NCBI_API_KEY
# environment variable instead of being stored in the notebook. When the
# variable is unset or blank, no 'api_key' parameter is sent at all.
# NOTE(review): the key that used to be hard-coded here has been published
# with the notebook and should be revoked and regenerated.
_ncbi_api_key = os.environ.get('NCBI_API_KEY', '').strip()
if _ncbi_api_key:
    params['api_key'] = _ncbi_api_key

In [7]:
def _abstract_text_from_xml(xml_payload):
    """Join the text of every ``AbstractText`` element in an EFetch XML document.

    Raises ``ET.ParseError`` when ``xml_payload`` is not well-formed XML.
    """
    root = ET.fromstring(xml_payload)
    sections = []
    for element in root.findall('.//AbstractText'):
        # element.text stops at the first child tag, which would truncate
        # abstracts containing inline markup; itertext() walks the whole
        # subtree, including text that follows nested tags.
        section = ''.join(element.itertext()).strip()
        if section:
            sections.append(section)
    return ' '.join(sections)


# Open the CSV file to store PubMed IDs, abstracts, and filtered MeSH terms
with open('pubmed_abstracts_with_mesh.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['PubMedID', 'Abstract', 'Filtered MeSH Terms'])

    # Iterate through provided PubMed IDs
    for pubmed_id in abstract_ids:
        # Per-request copy: the shared params dict is left unmodified.
        request_params = {**params, 'id': pubmed_id}

        response = None
        try:
            response = requests.get(base_url, params=request_params, timeout=30)
        except requests.RequestException as e:
            # One failed request should not abort the remaining IDs.
            print(f"Request failed for PubMed ID {pubmed_id}: {e}")

        if response is None:
            pass
        elif response.status_code == 200:
            try:
                # Parse the raw bytes so the XML parser decodes them itself
                # rather than relying on the encoding requests guessed for .text.
                abstract_texts = _abstract_text_from_xml(response.content)
            except ET.ParseError as e:
                print(f"XML parsing error for PubMed ID {pubmed_id}: {e}")
            else:
                # Fetch and filter MeSH terms
                # NOTE(review): the EFetch XML presumably already carries MeSH
                # descriptors (MeshHeading/DescriptorName); confirm whether the
                # extra ESummary request made by fetch_mesh_terms is needed.
                mesh_terms = fetch_mesh_terms(pubmed_id)
                filtered_mesh_terms = filter_mesh_terms(mesh_terms, keywords)

                # Write the PubMed ID, abstract, and filtered MeSH terms to the CSV
                writer.writerow([pubmed_id, abstract_texts, ', '.join(filtered_mesh_terms)])
        else:
            print(f"Failed to fetch abstract for PubMed ID {pubmed_id}. Status code: {response.status_code}")

        # Pause between IDs to keep the request rate low.
        time.sleep(1)

print("Extraction completed. Data saved to 'pubmed_abstracts_with_mesh.csv'.")

Extraction completed. Data saved to 'pubmed_abstracts_with_mesh.csv'.


In [11]:
from transformers import pipeline

# Hugging Face checkpoint used for both the model weights and the tokenizer.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"

# Build the "ner" pipeline from that checkpoint.
# NOTE(review): the output recorded for this cell warns that classifier.weight
# and classifier.bias were newly initialized and that the model should be
# trained on a downstream task, so the LABEL_0 / LABEL_1 tags it emits are
# presumably not meaningful entity labels; confirm, or switch to a checkpoint
# fine-tuned for NER.
ner_model = pipeline("ner", model=BIOBERT_CHECKPOINT, tokenizer=BIOBERT_CHECKPOINT)

def extract_entities(text):
    """Run the module-level ``ner_model`` pipeline on ``text`` and return its output unchanged."""
    return ner_model(text)

# Example text (replace with abstracts from CSV)
abstract_text = "Although antibody-drug conjugate (ADC) or immune checkpoint inhibitors (ICIs) alone fosters hope for the treatment of cancer, the effect of single drug treatment is limited and the safety profile of ADC and ICI therapy remains unclear. This meta-analysis aimed to examine the efficacy and safety of the combination of ADC and ICI therapy. This study type is a systematic review and meta-analysis. Literature retrieval was carried out through PubMed, Embase, Cochrane from inception to Jun. 5, 2024. Then, after data extraction, overall response rate (ORR) and adverse effects (AEs) were used to study its efficiency and safety. Publication bias was also calculated through Funnel plot, Begg's Test and Egger's test. Heterogeneity was investigated through subgroup and sensitivity analysis. The research protocol was registered with the PROSPERO (CRD42023375601). A total of 12 eligible clinical studies with 584 patients were included. The pooled ORR was 58% (95%CI 46%, 70%). Subgroup analysis showed an ORR of 77% (95%CI 63%, 91%) in classical Hodgkin lymphoma (cHL) and an ORR of 73% (95%CI 56%, 90%) in non-Hodgkin lymphoma (NHL). The most common AEs was peripheral neuropathy (38.0%). Meanwhile, AEs on skin (13.1-20.0%) and digestive system (9.0-36.0%) was hard be overlooked. ADC + ICI therapy may be recommended in cancer treatment, especially in cHL and NHL. However, strategies to manage toxicities warranted further exploration."

# Run the NER pipeline on the sample abstract, then print one line per
# returned item with its token text and predicted label.
entities = extract_entities(abstract_text)
for token in entities:
    word, label = token['word'], token['entity']
    print(f"Entity: {word}, Label: {label}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entity: although, Label: LABEL_1
Entity: anti, Label: LABEL_0
Entity: ##body, Label: LABEL_0
Entity: -, Label: LABEL_0
Entity: drug, Label: LABEL_0
Entity: con, Label: LABEL_0
Entity: ##ju, Label: LABEL_1
Entity: ##gate, Label: LABEL_1
Entity: (, Label: LABEL_0
Entity: ad, Label: LABEL_0
Entity: ##c, Label: LABEL_0
Entity: ), Label: LABEL_0
Entity: or, Label: LABEL_0
Entity: immune, Label: LABEL_0
Entity: check, Label: LABEL_0
Entity: ##point, Label: LABEL_0
Entity: inhibitor, Label: LABEL_1
Entity: ##s, Label: LABEL_0
Entity: (, Label: LABEL_0
Entity: i, Label: LABEL_0
Entity: ##cis, Label: LABEL_0
Entity: ), Label: LABEL_0
Entity: alone, Label: LABEL_0
Entity: foster, Label: LABEL_1
Entity: ##s, Label: LABEL_0
Entity: hope, Label: LABEL_0
Entity: for, Label: LABEL_0
Entity: the, Label: LABEL_0
Entity: treatment, Label: LABEL_0
Entity: of, Label: LABEL_1
Entity: cancer, Label: LABEL_0
Entity: ,, Label: LABEL_0
Entity: the, Label: LABEL_0
Entity: effect, Label: LABEL_0
Entity: of, Labe

In [23]:
pip install --upgrade openai


Collecting openai
  Downloading openai-1.50.2-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.50.2-py3-none-any.whl (382 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.0/383.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━

In [1]:
import openai

In [28]:
pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.50.2
    Uninstalling openai-1.50.2:
      Successfully uninstalled openai-1.50.2
Successfully installed openai-0.28.0


In [5]:

# Read the key from the OPENAI_API_KEY environment variable so the credential
# never has to be pasted into the notebook; falls back to an empty string.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")


# Function to extract entities and relationships using ChatGPT
def extract_entities_relationships(abstracts, model="gpt-3.5-turbo"):
    """Ask a chat model to extract entities and relationships from each abstract.

    Args:
        abstracts: Iterable of abstract strings. One API call is made per item.
        model: Chat model name passed to ``openai.ChatCompletion.create``
            (defaults to the previously hard-coded ``"gpt-3.5-turbo"``).

    Returns:
        list[str]: The raw message content of the first choice of each
        response, in the same order as ``abstracts``. The content is returned
        unparsed; callers decide how to decode it.
    """
    # The format example uses double quotes so that it is itself valid JSON.
    # A single-quoted example invites replies that json.loads cannot parse.
    output_format = (
        '{"entities": [{"id": "Entity1", "type": "Type1"}, '
        '{"id": "Entity2", "type": "Type2"}], '
        '"relationships": [{"source": "Entity1", "target": "Entity2", '
        '"relation": "RELATION_TYPE"}]}'
    )

    responses = []
    for abstract in abstracts:
        # Define the prompt for ChatGPT
        prompt = (
            "Extract the entities and relationships from the following abstract:\n\n"
            f"{abstract}\n\n"
            "Provide the output as JSON in this format:\n"
            f"{output_format}"
        )

        # Make an API call
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=1500
        )

        # Append the response
        responses.append(response['choices'][0]['message']['content'])

    return responses

# Example abstracts (replace with your actual data)
abstracts = [
    "This convergent parallel-design mixed-methods process evaluation of the QUARTET USA (Quadruple Ultra-Low-Dose Treatment for Hypertension USA) clinical trial (NCT03640312) explores patient and health care professional perceptions about the use of low-dose quadruple therapy (LDQT) as a novel strategy for hypertension management. A survey of all 62 patients enrolled in the QUARTET USA trial was conducted. A subsample of 13 patients and 11 health care professionals, recruited via purposive sampling, took part in semistructured interviews. At enrollment, 68% of participants (mean [SD] age, 51.7 [11.5] years; 56% self-identified as Hispanic: Mexican ethnicity, 16% as Hispanic: other ethnicity, 16% as Black race, 8% as White race, and 1.6% as South Asian race) reported that their current health depended on blood pressure medications, and 48% were concerned about blood pressure medications. At trial completion, 80% were satisfied with LDQT, 96% were certain the benefits of taking LDQT outweighed the disadvantages, and 96% reported that LDQT was convenient to take. Both patients and health care professionals found LDQT acceptable because it reduced patients' perceived pill burden and facilitated medication adherence. Health care professionals stated that a perceived limitation of LDQT was the inability to titrate doses. Steps to facilitate LDQT implementation include introducing stepped-care combinations and treatment protocols, inclusion in clinical practice guidelines, and eliminating patient cost barriers. LDQT was an acceptable strategy for hypertension treatment among patients and health care professionals involved in the QUARTET USA clinical trial. Although LDQT was generally perceived as beneficial for maintaining patients' blood pressure control and facilitating adherence, some clinicians perceived limitations in titration inflexibility, adverse effects, and costs. URL: https://www.clinicaltrials.gov; Unique identifier: NCT03640312."
]

# Call the function to extract entities and relationships
results = extract_entities_relationships(abstracts)

# Print the extracted results
for result in results:
    try:
        print(json.loads(result))  # Parsing the JSON response
    except json.JSONDecodeError as e:
        # The reply is model-generated text and is not guaranteed to be valid
        # JSON; show the raw reply instead of aborting the remaining results.
        print(f"Could not parse response as JSON ({e}); raw response follows:")
        print(result)


{'entities': [{'id': 'QUARTET USA', 'type': 'Clinical Trial'}, {'id': 'QUARTET USA trial participants', 'type': 'Patients'}, {'id': 'LDQT', 'type': 'Therapy'}, {'id': 'health care professionals', 'type': 'Health Care Professionals'}], 'relationships': [{'source': 'QUARTET USA', 'target': 'QUARTET USA trial participants', 'relation': 'Enrollment'}, {'source': 'QUARTET USA', 'target': 'LDQT', 'relation': 'Exploration'}, {'source': 'QUARTET USA trial participants', 'target': 'LDQT', 'relation': 'Satisfaction'}, {'source': 'QUARTET USA trial participants', 'target': 'LDQT', 'relation': 'Perceptions'}, {'source': 'QUARTET USA trial participants', 'target': 'health care professionals', 'relation': 'Interviews'}, {'source': 'LDQT', 'target': 'Patients', 'relation': 'Acceptability'}, {'source': 'LDQT', 'target': 'Health Care Professionals', 'relation': 'Acceptability'}, {'source': 'LDQT', 'target': 'Blood Pressure Control', 'relation': 'Beneficial'}, {'source': 'LDQT', 'target': 'Medication Ad

In [4]:
pip install boto3

Collecting boto3
  Downloading boto3-1.35.29-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.36.0,>=1.35.29 (from boto3)
  Downloading botocore-1.35.29-py3-none-any.whl.metadata (5.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.35.29-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.29-py3-none-any.whl (12.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.6/12.6 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.10.2-py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.7/82.7 kB[0m [31m6.0 MB/s[0m eta [36m0:0

In [9]:
import os
import boto3

In [21]:
def upload_file_to_s3(file_name, bucket_name, object_name=None):
    """Upload a local file to an S3 bucket, reporting the outcome on stdout.

    Args:
        file_name: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        object_name: Key to store the file under. Defaults to the base name
            of ``file_name``.

    Returns:
        bool: ``True`` when the upload call completed, ``False`` when any
        error was raised (the error is printed, not re-raised).
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    try:
        # Client creation sits inside the try block so that a failure there is
        # reported the same way as a failed upload.
        s3_client = boto3.client('s3')
        s3_client.upload_file(file_name, bucket_name, object_name)
    except Exception as e:
        # Deliberately broad: this helper is best-effort and reports every
        # failure as a message plus a False return value.
        print(f"Error occurred while uploading file: {e}")
        return False

    print(f"File {file_name} uploaded to S3 bucket {bucket_name} as {object_name}")
    return True

# Example usage
# Use the same relative path the extraction step writes the CSV to, so the
# upload does not depend on the working directory being /content.
file_name = 'pubmed_abstracts_with_mesh.csv'  # Local path to your generated CSV file
bucket_name = 'gmu-cec-sagemaker-daen690-or'  # Name of your S3 bucket

upload_file_to_s3(file_name, bucket_name)

Error occurred while uploading file: Unable to locate credentials


In [18]:
# Sanity check: report whether the generated CSV is present at this path.
file_name = "/content/pubmed_abstracts_with_mesh.csv"
csv_is_present = os.path.exists(file_name)
print(csv_is_present)

True
