## Import Libraries

In [1]:
import requests
import xml.etree.ElementTree as ET
import csv
import pandas as pd
import os
import logging
import time

In [2]:
import pandas as pd
import re

In [3]:
pip install openai==0.28

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [5]:
from fuzzywuzzy import process



In [6]:
pip install "numpy<2"

Note: you may need to restart the kernel to use updated packages.


In [7]:
import openai
import json
from neo4j import GraphDatabase
import pandas as pd
from collections import defaultdict

In [8]:
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans

In [9]:
pip install torch transformers

Note: you may need to restart the kernel to use updated packages.


## Import Data

In [None]:
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
def fetch_pubmed_count(query, timeout=30):
    """Return the number of PubMed records matching ``query``.

    Issues a single ESearch request with ``retmax=1`` and reads
    ``esearchresult.count`` from the JSON response.

    Args:
        query: Search term, interpolated into the request URL as given.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        int: The match count, or 0 when the request fails or the response
        does not carry a usable count (the failure is logged).
    """
    search_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={query}&retmode=json&retmax=1"
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(search_url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return int(data['esearchresult']['count'])
    except requests.RequestException as e:
        logging.error(f"Failed to retrieve PubMed count: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # A successful HTTP response can still lack the expected keys or
        # hold a non-numeric count; treat that as "no results".
        logging.error(f"Unexpected PubMed count response: {e!r}")
    return 0


In [None]:
def search_pubmed_for_ids(query, retstart=0, retmax=10000, timeout=30):
    """Return one page of PubMed IDs matching ``query``.

    Args:
        query: Search term, interpolated into the request URL as given.
        retstart: Zero-based offset of the first record to return.
        retmax: Maximum number of IDs to return for this page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        list: The ``esearchresult.idlist`` entries, or an empty list when the
        request fails or the response has no ID list (the failure is logged).
    """
    search_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={query}&retmode=json&retmax={retmax}&retstart={retstart}"
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(search_url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return data['esearchresult']['idlist']
    except requests.RequestException as e:
        logging.error(f"Failed to retrieve PubMed IDs: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # A successful HTTP response can still lack the expected keys
        # (for example when the service reports an error in the payload).
        logging.error(f"Unexpected PubMed ID response: {e!r}")
    return []


In [None]:
def fetch_abstract(pubmed_id, session, params):
    """Fetch the abstract text and MeSH headings for one PubMed ID.

    Args:
        pubmed_id: The PubMed ID to request.
        session: Object exposing ``get(url, params=..., timeout=...)``, such
            as a ``requests.Session``.
        params: Base query parameters for the EFetch request. The dict is
            copied, so the caller's object is left untouched.

    Returns:
        tuple: ``(abstract_text, mesh_headings)``. Abstract sections are
        joined with single spaces and MeSH descriptors with ``", "``.
        ``(None, None)`` is returned when the request or XML parsing fails
        (the failure is logged).
    """
    # Copy instead of writing into the caller's dict, so a shared params
    # object does not carry the previous ID into unrelated requests.
    request_params = dict(params, id=pubmed_id)
    try:
        response = session.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
            params=request_params,
            timeout=30,
        )
        response.raise_for_status()
        root = ET.fromstring(response.text)

        # Filter on the full inner text rather than `element.text`: a section
        # that begins with inline markup (e.g. <i>...</i>) has `text is None`
        # but still contains content.
        sections = (
            ''.join(element.itertext()).strip()
            for element in root.findall('.//AbstractText')
        )
        abstract_text = ' '.join(section for section in sections if section)

        # Extract MeSH headings
        mesh_headings = ', '.join(
            element.text
            for element in root.findall('.//MeshHeading/DescriptorName')
            if element.text
        )

        return abstract_text, mesh_headings

    except (requests.RequestException, ET.ParseError) as e:
        logging.error(f"Error fetching data for PubMed ID {pubmed_id}: {e}")
        return None, None

In [None]:
# EFetch returns full PubMed records (abstract text and MeSH headings).
# ESummary only returns document summaries, which contain neither, so the
# AbstractText / MeshHeading lookups below would never match anything.
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

def clean_response_text(response_text):
    """Remove characters that are not legal in an XML 1.0 document.

    Only the characters the XML parser would reject are dropped: C0 control
    characters other than tab, newline and carriage return, lone surrogates,
    and U+FFFE / U+FFFF. Ordinary whitespace and non-ASCII text (for example
    non-breaking spaces) are preserved, so tokens separated by a line break
    are not fused together.

    Args:
        response_text: Raw response body as a string.

    Returns:
        str: The text with XML-illegal characters removed.
    """
    return re.sub(
        '[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]',
        '',
        response_text,
    )

def write_abstracts_to_csv(abstract_ids, api_key, output_path='pubmed_abstracts_with_mesh.csv'):
    """Fetch abstracts for a list of PubMed IDs and append them to a CSV file.

    IDs are requested in batches of 200. Each returned article is parsed on
    its own, so every CSV row holds the abstract and MeSH headings of exactly
    the PubMed ID in its first column. IDs without an abstract are counted
    but not written.

    Args:
        abstract_ids: Sequence of PubMed IDs.
        api_key: NCBI API key sent with every request.
        output_path: CSV file to append to; the header row is written only
            when the file does not exist yet.

    Returns:
        None. Totals are reported through ``logging.info``.
    """
    # NOTE(review): the parsing below expects EFetch-style XML (PubmedArticle
    # elements); confirm that the module-level `base_url` points at efetch.fcgi.
    file_exists = os.path.isfile(output_path)
    params = {
        'db': 'pubmed',
        'rettype': 'abstract',
        'retmode': 'xml',
        'api_key': api_key,
    }
    batch_size = 200
    total_abstract_count = 0
    valid_abstract_count = 0
    null_abstract_count = 0

    # Mode 'a' creates the file when it is missing, so one mode covers both cases.
    with open(output_path, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        # Write the header row only if the file is being created
        if not file_exists:
            writer.writerow(['PubMedID', 'Abstract', 'Mesh Headings'])

        for i in range(0, len(abstract_ids), batch_size):
            batch_ids = [str(pubmed_id) for pubmed_id in abstract_ids[i:i + batch_size]]
            params['id'] = ','.join(batch_ids)

            # Add a sleep to avoid hitting rate limits
            time.sleep(0.5)

            try:
                response = requests.get(base_url, params=params, timeout=60)
            except requests.RequestException as e:
                # One failed batch should not abort the whole download.
                logging.error(f"Request failed for batch starting with PubMed ID {batch_ids[0]}: {e}")
                continue

            if response.status_code != 200:
                logging.error(f"Failed to fetch data for batch starting with PubMed ID {batch_ids[0]}. Status code: {response.status_code}")
                continue

            total_abstract_count += len(batch_ids)

            # Clean and parse the XML content
            cleaned_response = clean_response_text(response.text)
            try:
                records = _parse_pubmed_articles(cleaned_response)
            except ET.ParseError as e:
                logging.error(f"XML parsing error for batch starting with PubMed ID {batch_ids[0]}: {e}")
                logging.error(f"Response content: {cleaned_response[:500]}")  # Log first 500 chars of the response
                continue

            # Look every requested ID up individually; an ID missing from the
            # response is counted as a null abstract.
            for pubmed_id in batch_ids:
                abstract_text, mesh_headings = records.get(pubmed_id, ('', ''))
                if abstract_text:  # Only write if abstract is not null
                    writer.writerow([pubmed_id, abstract_text, mesh_headings])
                    valid_abstract_count += 1
                else:
                    null_abstract_count += 1

    # Log the results
    logging.info(f'Total Accessed Abstracts: {total_abstract_count}')
    logging.info(f'Valid Abstracts: {valid_abstract_count}')
    logging.info(f'Null Abstracts: {null_abstract_count}')


def _parse_pubmed_articles(xml_text):
    """Map each PMID found in the XML to its ``(abstract, mesh_headings)`` pair."""
    root = ET.fromstring(xml_text)
    records = {}
    for article in root.iter('PubmedArticle'):
        pmid = (article.findtext('MedlineCitation/PMID') or '').strip()
        if not pmid:
            continue
        # Join the fragments without a separator so inline markup does not
        # introduce extra spaces; sections are then joined with one space.
        sections = (
            ''.join(element.itertext()).strip()
            for element in article.findall('.//AbstractText')
        )
        abstract_text = ' '.join(section for section in sections if section)
        mesh_headings = ', '.join(
            element.text
            for element in article.findall('.//MeshHeading/DescriptorName')
            if element.text
        )
        records[pmid] = (abstract_text, mesh_headings)
    return records

In [None]:
def download_all_pubmed_abstracts(keywords, api_key):
    """Download abstracts for every record matching the keywords, page by page.

    The keywords are joined with ``+`` into one query. The total hit count is
    fetched first, then IDs are requested in pages of 10,000 and each page is
    handed to ``write_abstracts_to_csv``.

    Args:
        keywords: Iterable of search terms.
        api_key: NCBI API key forwarded to ``write_abstracts_to_csv``.

    Returns:
        None.
    """
    query = '+'.join(keywords)
    total_count = fetch_pubmed_count(query)
    batch_size = 10000

    if total_count == 0:
        logging.warning(f"No PubMed records to download for query: {query}")
        return

    # NOTE(review): NCBI documents a cap on how many PubMed records ESearch
    # will page through for one query; pages past that cap presumably come
    # back empty and are skipped below. Confirm against the E-utilities docs.
    for start in range(0, total_count, batch_size):
        # Clamp the upper bound so the log does not overshoot the real total.
        end = min(start + batch_size, total_count)
        logging.info(f"Fetching records {start} to {end}")
        abstract_ids = search_pubmed_for_ids(query, retstart=start, retmax=batch_size)
        if not abstract_ids:
            # Nothing to fetch for this page; avoid an empty writer pass.
            logging.warning(f"No PubMed IDs returned for records {start} to {end}; skipping")
            continue
        write_abstracts_to_csv(abstract_ids, api_key)


In [None]:

keywords = ["Neoplasms", "Antineoplastic Agents", "Adverse Effects", "Toxicity"]
# Read the NCBI API key from the environment instead of committing it to the
# notebook. api_key is None when NCBI_API_KEY is unset.
# NOTE(review): the key previously hard-coded here has been exposed and
# should be revoked.
api_key = os.environ.get('NCBI_API_KEY')
download_all_pubmed_abstracts(keywords, api_key)

## Preprocess data 

In [None]:
def preprocess_text(text):
    """Normalise one abstract for downstream processing.

    Steps, in order: lowercase, drop URLs, drop parenthesised spans, replace
    every remaining character that is not an ASCII letter, digit or
    whitespace with a space, then trim the ends.

    URLs and parentheses are removed *before* punctuation is stripped;
    otherwise the ``://`` and ``(`` ``)`` characters those patterns rely on
    would already be gone and the two steps could never match.

    Args:
        text: The raw abstract. Non-string values (e.g. NaN from an empty
            CSV cell) are treated as missing.

    Returns:
        str: The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Convert to lowercase first so the URL pattern also matches "HTTP://..."
    text = text.lower()

    # Remove any HTTP requests
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove formulas (anything in parentheses)
    text = re.sub(r'\(.*?\)', '', text)

    # Remove special characters (keeping letters, numbers, and whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Trim whitespace
    return text.strip()

# Load the CSV file
# NOTE(review): the download step writes 'pubmed_abstracts_with_mesh.csv'
# (no trailing "1"); confirm which file is the intended input.
df = pd.read_csv('pubmed_abstracts_with_mesh1.csv')

# Apply preprocessing to the 'Abstract' column. Empty cells are read back as
# NaN (a float), which the regex calls cannot handle, so blank them first.
df['Abstract'] = df['Abstract'].fillna('').apply(preprocess_text)

# Save the cleaned data back to a new CSV file
df.to_csv('cleaned_pubmed_abstracts_with_mesh.csv', index=False)

## Neo4j Connection

In [11]:
# Function to insert entities and relationships into Neo4j
def insert_into_neo4j(entities, relationships):
    """Merge entities and relationships into Neo4j using the global ``driver``.

    Values are sent as query parameters instead of being formatted into the
    Cypher text, so quotes or backslashes in LLM-produced names can neither
    break the statement nor inject Cypher. Relationship types cannot be
    parameterised, so they are backtick-quoted with embedded backticks doubled.

    Args:
        entities: Iterable of dicts with ``'id'`` and ``'type'`` keys.
        relationships: Iterable of dicts with ``'source'``, ``'target'`` and
            ``'relation'`` keys. The relation is upper-cased and its spaces
            replaced with underscores to form the relationship type.

    Returns:
        None. Items missing a required value are skipped with a warning.
    """
    with driver.session() as session:
        # Insert entities
        for entity in entities:
            entity_id = entity.get('id')
            entity_type = entity.get('type')
            if not entity_id or not entity_type:
                logging.warning(f"Skipping entity without id/type: {entity!r}")
                continue
            session.run(
                "MERGE (e:Entity {id: $id, type: $type})",
                {"id": str(entity_id), "type": str(entity_type)},
            )

        # Insert relationships
        for relationship in relationships:
            source = relationship.get('source')
            target = relationship.get('target')
            relation = str(relationship.get('relation') or '').strip()
            if not source or not target or not relation:
                logging.warning(f"Skipping incomplete relationship: {relationship!r}")
                continue
            # Backtick-quote the type; a backtick inside a quoted identifier
            # is written by doubling it.
            relation_type = relation.upper().replace(" ", "_").replace("`", "``")
            query = (
                "MATCH (source:Entity {id: $source}), "
                "(target:Entity {id: $target}) "
                f"MERGE (source)-[:`{relation_type}`]->(target)"
            )
            session.run(query, {"source": str(source), "target": str(target)})

## OpenAI Prompt

In [12]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret is never stored in the notebook; it stays "" when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
# Initialize dynamic sets for known entities and relationships
known_entities = set()
known_relationships = set()
# Function to extract entities and relationships from an abstract using OpenAI with temperature variation
def extract_entities_relationships_multiple_runs(abstract, num_runs=3):
    """Extract entities and relationships from an abstract over several LLM runs.

    The same prompt is sent ``num_runs`` times with the temperature spread
    linearly from 0.3 (first run) to 0.7 (last run). A single run uses 0.3.
    Results from all runs that return parseable JSON are pooled, then
    normalised and de-duplicated.

    Args:
        abstract: Text of the abstract to analyse.
        num_runs: Number of completions to request.

    Returns:
        tuple: ``(entities, relationships)`` as returned by
        ``normalize_and_deduplicate_entities`` and
        ``normalize_and_deduplicate_relationships``.
    """
    combined_entities = []
    combined_relationships = []

    # The prompt does not depend on the run index, so build it once.
    prompt = f"""
        Extract the entities and relationships from the following abstract:
        {abstract}

        Provide the output as a JSON in this format:
        {{
          "entities": [
            {{"id": "Entity1", "type": "Type1"}},
            {{"id": "Entity2", "type": "Type2"}}
          ],
          "relationships": [
            {{"source": "Entity1", "target": "Entity2", "relation": "RELATION_TYPE"}}
          ]
        }}
        Ensure the output uses double quotes for property names and values.
        """

    for i in range(num_runs):
        # With a single run there is no range to interpolate over; dividing
        # by (num_runs - 1) would raise ZeroDivisionError.
        if num_runs > 1:
            temperature = 0.3 + (0.4 * (i / (num_runs - 1)))
        else:
            temperature = 0.3

        # Make API call to OpenAI
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=1500
        )

        # Parse the response
        result = response['choices'][0]['message']['content']
        try:
            parsed_result = json.loads(result)
        except json.JSONDecodeError:
            print(f"Error parsing JSON for temperature {temperature}: {result}")
            continue

        # Valid JSON that is not an object (e.g. a bare list) has no .get().
        if not isinstance(parsed_result, dict):
            print(f"Error parsing JSON for temperature {temperature}: {result}")
            continue

        combined_entities.extend(parsed_result.get('entities', []))
        combined_relationships.extend(parsed_result.get('relationships', []))

    # Deduplicate and normalize entities and relationships
    combined_entities = normalize_and_deduplicate_entities(combined_entities)
    combined_relationships = normalize_and_deduplicate_relationships(combined_relationships)

    return combined_entities, combined_relationships

In [13]:
# Function to deduplicate and normalize entities
def normalize_and_deduplicate_entities(entities):
    """Normalise entity IDs and keep the first entity seen for each ID.

    Each ID is passed through ``normalize_entity_name``. Kept entities are
    returned as new dicts, so the input dicts are not modified. Every kept ID
    is also added to the module-level ``known_entities`` set.

    Args:
        entities: Iterable of dicts carrying an ``'id'`` key.

    Returns:
        list: Copies of the unique entities with ``'id'`` replaced by its
        normalised form. Items that are not dicts or have no non-empty
        string ``'id'`` are skipped.
    """
    seen = set()
    unique_entities = []

    for entity in entities:
        raw_id = entity.get('id') if isinstance(entity, dict) else None
        if not isinstance(raw_id, str) or not raw_id.strip():
            # LLM output is not guaranteed to follow the requested schema.
            continue

        normalized_entity = normalize_entity_name(raw_id)
        if normalized_entity in seen:
            continue

        seen.add(normalized_entity)
        # Copy rather than writing into the caller's dict.
        unique_entities.append({**entity, 'id': normalized_entity})
        # Update the known_entities set dynamically
        known_entities.add(normalized_entity)

    return unique_entities

# Function to normalize and deduplicate relationships
def normalize_and_deduplicate_relationships(relationships):
    """Normalise relationships and drop duplicate (source, target, relation) triples.

    Source and target go through ``normalize_entity_name`` and the relation
    through ``normalize_relationship_type``. Every kept relation is also added
    to the module-level ``known_relationships`` set.

    Args:
        relationships: Iterable of dicts with ``'source'``, ``'target'`` and
            ``'relation'`` keys.

    Returns:
        list: New dicts holding only the three normalised keys, in first-seen
        order. Items that are not dicts, or where any of the three values is
        not a non-empty string, are skipped.
    """
    seen = set()
    unique_relationships = []

    for relationship in relationships:
        if not isinstance(relationship, dict):
            continue
        raw_values = [relationship.get(key) for key in ('source', 'target', 'relation')]
        if not all(isinstance(value, str) and value.strip() for value in raw_values):
            # LLM output is not guaranteed to follow the requested schema.
            continue

        source = normalize_entity_name(raw_values[0])
        target = normalize_entity_name(raw_values[1])
        relation = normalize_relationship_type(raw_values[2])

        rel_tuple = (source, target, relation)
        if rel_tuple in seen:
            continue

        seen.add(rel_tuple)
        unique_relationships.append({
            "source": source,
            "target": target,
            "relation": relation
        })
        # Update the known_relationships set dynamically
        known_relationships.add(relation)

    return unique_relationships

In [14]:
# Function to normalize entity names using fuzzy matching
def normalize_entity_name(entity_name):
    """Map an entity name onto an already-known name when they are similar.

    The lower-cased name is compared against the module-level
    ``known_entities`` set with ``process.extractOne`` and the
    ``process.fuzz.ratio`` scorer. A match scoring above 80 replaces the name;
    otherwise the name is returned unchanged.

    Args:
        entity_name: Candidate entity name.

    Returns:
        The matching known entity, or ``entity_name`` itself when nothing is
        known yet or no candidate scores above 80.
    """
    # Nothing to compare against yet: keep the name as given.
    if not known_entities:
        return entity_name

    # The first two items of the result are read as (choice, score).
    candidate = process.extractOne(
        entity_name.lower(),
        known_entities,
        scorer=process.fuzz.ratio,
    )
    is_close_enough = bool(candidate) and candidate[1] > 80  # similarity threshold
    return candidate[0] if is_close_enough else entity_name

# Function to normalize relationship types using fuzzy matching
def normalize_relationship_type(relation):
    """Map a relationship type onto an already-known type when they are similar.

    The lower-cased relation is compared against the module-level
    ``known_relationships`` set with ``process.extractOne`` and the
    ``process.fuzz.ratio`` scorer. A match scoring above 80 replaces the
    relation; otherwise the relation is returned unchanged.

    Args:
        relation: Candidate relationship type.

    Returns:
        The matching known relationship type, or ``relation`` itself when
        nothing is known yet or no candidate scores above 80.
    """
    # Nothing to compare against yet: keep the relation as given.
    if not known_relationships:
        return relation

    # The first two items of the result are read as (choice, score).
    candidate = process.extractOne(
        relation.lower(),
        known_relationships,
        scorer=process.fuzz.ratio,
    )
    is_close_enough = bool(candidate) and candidate[1] > 80  # similarity threshold
    return candidate[0] if is_close_enough else relation

In [15]:
# Sample input: one abstract, already lower-cased and stripped of punctuation.
abstract = """research on the cardiovascular toxicity of angiogenesis inhibitors among patients with cancer in taiwan is lacking this observational study explored the risk of major adverse cardiovascular events maces associated with angiogenesis inhibitors in taiwan we conducted a nested casecontrol study using the tcr taiwan cancer registry linked with the taiwan national insurance claim database we matched every case with 4 controls using riskset sampling by index date age sex cancer type and cancer diagnosis date conditional logistic regression was used to evaluate the risks of maces and different cardiovascular events using propensity score adjustment or matching sensitivity analyses were used to evaluate the risks matched by cancer stages or exposure within 1 year among a cohort of 284 292 after the exclusion of prevalent cases the incidences of maces among the overall cohort and those exposed to angiogenesis inhibitors were 225 and 325 events per 1000 personyears respectively we matched 17 817 cases with 70 740 controls with a mean age of 749 years and 568 of patients were men after propensity score adjustment angiogenesis inhibitors were associated with increased risks of maces odds ratio 456 95 ci 1781159 significantly increased risks were noted for heart failure hospitalization myocardial infarction cerebrovascular accident and venous thromboembolism but not for newonset atrial fibrillation similar results were observed after matching by cancer stage or restriction of 1year exposure angiogenesis inhibitors were associated with increased risks of maces among patients with various malignancies in taiwan but were not associated with newonset atrial fibrillation
"""

# Run the multi-temperature extraction on the sample abstract.
entities, relationships = extract_entities_relationships_multiple_runs(
    abstract,
    num_runs=3,
)

# Show the pooled, de-duplicated results.
print(f"Entities: {entities}")
print(f"Relationships: {relationships}")

# Connect to Neo4j
# Connection settings come from the environment so the password is never
# stored in the notebook; the password is prompted for when it is not set.
import getpass

uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")  # Adjust for your Neo4j instance
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(username, password))

# Function to escape special characters for Cypher queries
def escape_special_chars(value):
    """Escape a string for use inside a single-quoted Cypher string literal.

    Cypher string literals use backslash escapes, so a single quote becomes
    ``\\'``. Backslashes are escaped first, so that existing backslashes are
    not read as the start of an escape sequence.

    Args:
        value: The raw string.

    Returns:
        str: The string with ``\\`` and ``'`` backslash-escaped.
    """
    return value.replace("\\", "\\\\").replace("'", "\\'")

# Insert deduplicated entities and relationships into Neo4j.
# The driver is closed in `finally` so the connection is released even when
# the insert raises.
try:
    insert_into_neo4j(entities, relationships)
finally:
    # Close Neo4j connection
    driver.close()

Entities: [{'id': 'Cardiovascular toxicity', 'type': 'Medical condition'}, {'id': 'Angiogenesis inhibitors', 'type': 'Medication'}, {'id': 'Patients with cancer', 'type': 'Patient group'}, {'id': 'Taiwan', 'type': 'Location'}, {'id': 'Observational study', 'type': 'Study type'}, {'id': 'Major adverse cardiovascular events (MACEs)', 'type': 'Medical condition'}, {'id': 'Nested case-control study', 'type': 'Study design'}, {'id': 'Taiwan Cancer Registry', 'type': 'Registry'}, {'id': 'Taiwan National Insurance Claim Database', 'type': 'Database'}, {'id': 'Cases', 'type': 'Data subset'}, {'id': 'Controls', 'type': 'Data subset'}, {'id': 'Age', 'type': 'Demographic factor'}, {'id': 'Sex', 'type': 'Demographic factor'}, {'id': 'Cancer type', 'type': 'Medical condition'}, {'id': 'Cancer diagnosis date', 'type': 'Date'}, {'id': 'Logistic regression', 'type': 'Statistical analysis'}, {'id': 'Propensity score adjustment', 'type': 'Statistical analysis'}, {'id': 'Sensitivity analyses', 'type': 'S

## LangChain Implementation

In [16]:
pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [17]:
!pip install --upgrade --quiet  langchain langchain-community langchain-groq neo4j


In [18]:
#Graph DB
# Connection settings come from the environment so the password is never
# stored in the notebook; the password is prompted for when it is not set.
import getpass

NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

In [19]:
import os

# Publish the Neo4j connection settings as process environment variables.
os.environ.update(
    NEO4J_URI=NEO4J_URI,
    NEO4J_USERNAME=NEO4J_USERNAME,
    NEO4J_PASSWORD=NEO4J_PASSWORD,
)

In [20]:
from langchain_community.graphs import Neo4jGraph

# Create the LangChain graph wrapper from the connection settings defined above.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

In [21]:
# Display the Neo4jGraph object's repr to confirm it was created.
graph

<langchain_community.graphs.neo4j_graph.Neo4jGraph at 0x28f7a7d90>

In [32]:
# Read the Groq API key from the environment (prompting when it is not set)
# so the secret is never stored in the notebook.
# NOTE(review): the key previously hard-coded here has been exposed and
# should be revoked.
import getpass

groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")

In [33]:
from langchain_groq import ChatGroq

In [34]:
# Chat model used by the graph transformer and the Cypher QA chain below.
llm = ChatGroq(
    model_name="llama3-8b-8192",
    groq_api_key=groq_api_key,
)
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x29404e620>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x29404fa60>, model_name='llama3-8b-8192', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [35]:
!pip install --upgrade --quiet langchain_experimental

In [36]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Graph transformer backed by the chat model created above.
llm_transformer = LLMGraphTransformer(llm=llm)

In [37]:
from langchain.chains import GraphCypherQAChain

# Question-answering chain over the Neo4j graph. verbose=True prints the
# generated Cypher and the retrieved context with each answer.
# NOTE(review): allow_dangerous_requests=True presumably acknowledges that
# LLM-generated Cypher is run against the database; confirm the connection
# uses narrowly scoped credentials.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    allow_dangerous_requests=True,
)
chain

GraphCypherQAChain(verbose=True, graph=<langchain_community.graphs.neo4j_graph.Neo4jGraph object at 0x28f7a7d90>, cypher_generation_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['question', 'schema'], input_types={}, partial_variables={}, template='Task:Generate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\nSchema:\n{schema}\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\n{question}'), llm=ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x29404e620>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x29404fa60>, model_name='llama3-8b-8192', mod

In [54]:
# Ask the chain a natural-language question; with verbose=True the generated
# Cypher and the rows it returned are printed before the final answer.
response = chain.invoke({"query": "What are different Medical condition"})
response



[1m> Entering new GraphCypherQAChain chain...[0m




Generated Cypher:
[32;1m[1;3mMATCH (e:Entity)-[:FOCUSES_ON|:EXPLORED_RISK_OF]->(m:Entity) WHERE m.type = "Medical condition" RETURN DISTINCT e.id AS id, e.type AS type, m.id AS medical_condition_id, m.type AS medical_condition_type;[0m
Full Context:
[32;1m[1;3m[{'id': 'Observational study', 'type': 'Study type', 'medical_condition_id': 'Major adverse cardiovascular events (MACEs)', 'medical_condition_type': 'Medical condition'}, {'id': 'Observational study', 'type': 'Study type', 'medical_condition_id': 'Cardiovascular toxicity', 'medical_condition_type': 'Medical condition'}][0m

[1m> Finished chain.[0m


{'query': 'What are different Medical condition',
 'result': 'Major adverse cardiovascular events (MACEs), Cardiovascular toxicity.'}

In [56]:
# Ask which relationships involve statistical-analysis entities.
# NOTE(review): in the recorded run the generated Cypher did not filter on the
# 'Statistical analysis' type; it only matched two relationship types.
response = chain.invoke({"query": "Statistical analysis relation"})
response



[1m> Entering new GraphCypherQAChain chain...[0m




Generated Cypher:
[32;1m[1;3mMATCH p=(e1:Entity)-[r:USED_TO_EVALUATE|RISK_OF]->(e2:Entity) RETURN p;[0m
Full Context:
[32;1m[1;3m[{'p': [{'id': 'Logistic regression', 'type': 'Statistical analysis'}, 'USED_TO_EVALUATE', {'id': 'Major adverse cardiovascular events (MACEs)', 'type': 'Medical condition'}]}][0m

[1m> Finished chain.[0m


{'query': 'Statistical analysis relation',
 'result': 'Logistic regression is used to evaluate Major adverse cardiovascular events (MACEs).'}

In [97]:
# Ask how the 'Malignancies' condition relates to 'Angiogenesis inhibitors'.
# NOTE(review): in the recorded run the generated Cypher used an undirected
# shortestPath, so the answer's direction ("Malignancies is associated with
# increased risks of ...") may be inverted relative to the stored edge;
# verify against the graph before relying on it.
response = chain.invoke({"query": "How is Medical condition Malignancies to Medication Angiogenesis inhibitors"})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH p = shortestPath((:Entity {type: "Medical condition", id: "Malignancies"})-[*..10]-(e:Entity {type: "Medication", id: "Angiogenesis inhibitors"})) RETURN p;[0m
Full Context:
[32;1m[1;3m[{'p': [{'id': 'Malignancies', 'type': 'Medical condition'}, 'ASSOCIATED_WITH_INCREASED_RISKS_OF', {'id': 'Angiogenesis inhibitors', 'type': 'Medication'}]}][0m

[1m> Finished chain.[0m


{'query': 'How is Medical condition Malignancies to Medication Angiogenesis inhibitors',
 'result': 'Malignancies is associated with increased risks of Angiogenesis inhibitors.'}