In [25]:
! pip install llama-index-graph-stores-neo4j # Install Neo4j with LlamaIndex.
! pip install llama-index-embeddings-openai
! pip install llama-index-llms-openai
! pip install llama-index-readers-file
! pip install --upgrade llama-index

Collecting llama-index
  Downloading llama_index-0.12.23-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.1 (from llama-index)
  Downloading llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.8-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-multi-modal-llms-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_multi_modal_llms_openai-0.4.3-py3-none-any.whl.metadata (726 bytes)
Collecting llama-index-program-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_program_openai-0.3.1-py3-none-any.whl.metadata (764 bytes)
Collecting llama-index-question-gen-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_question_gen_openai-0.

In [11]:
from google.colab import drive, userdata
from llama_index.core import SimpleDirectoryReader
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
import os
import openai
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
import nest_asyncio
import asyncio
from transformers import AutoTokenizer, DistilBertModel
import torch
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from llama_index.core import StorageContext, load_index_from_storage

In [12]:
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/Colab Notebooks/Other/RAG'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Other/RAG


In [4]:
# Apply nest_asyncio for colab/jupyter notebook, not asyncio. This helps with running tasks concurrently to stop long execution times
nest_asyncio.apply()

# Defining Functions to perform Automated Search

## Search Functions

In [5]:
def search_with_google_api(topic):
  """
  Performs a Google search programmatically using the Google Custom Search API.
  Returns a list of search results.
  """
  google_api = userdata.get('Google_API')
  search_engine_id = userdata.get('search_engine_id')
  url = f"https://www.googleapis.com/customsearch/v1?q={topic}&key={google_api}&cx={search_engine_id}"

  response = requests.get(url)
  if response.status_code == 200:
    return response.json().get("items", [])
  else:
    print(f"Error: {response.status_code}, {response.text}")
    return []

## Using OpenAI to generate the search query

In [6]:
def gpt_answers(topic):
    """
    Generates a list of 5-7 detailed and relevant search queries for financial sentiment analysis
    based on the user's input, such as a target sector, field, or region.
    """
    API_key = userdata.get('ChatGPT')
    prompt = f"""
                You are a research analyst and search query expert. Based on the following topic, generate a list of 5-7 search queries
                for identifying approachable but relevant scientific research trends.
                Ensure the queries cover recent publications, breakthroughs, applications, and reviews, and are only relevant to recent findings.
                Do not mention the year in your queries.
                The queries should focus on extracting data relevant to academic and industry advancements.
                Output the queries as a plain Python list of strings without any Markdown formatting or code block markers.
                Example:

                Topic:
                AI-driven drug discovery in antibiotic resistance

                ['AI applications in recent antibiotic resistance research breakthroughs', 'AI drug discovery in antibiotic resistance recent publications', 'machine learning applications antibiotic resistance drug development',
                 'recent synthetic biology antibiotic resistance AI-driven approaches', 'new novel algorithms for drug discovery in antibiotic resistance']

                End of example.
                Topic: {topic} """

    client = OpenAI(api_key = API_key)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": "You are an expert in generating search queries for scientific research trends."},
            {"role": "user",
             "content": prompt}
        ],
        max_tokens=400
    )

    # Extract and clean up the list of queries
    queries =  response.choices[0].message.content.strip()

    # Remove Markdown code block markers if present
    if queries.startswith("```"):
      # Remove the first code fence and the language hint, then the closing code fence
      queries = queries.split("```")[1].strip()
    return eval(queries)

## Parsing page results to paragraph components

In [7]:
def fetch_full_content(url):
  """
  Fetches the full content of a webpage given its URL.
  """
  headers = {
      "User-Agent": (
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
      )
  }
  try:
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
      soup = BeautifulSoup(response.text, "html.parser")
      paragraphs = soup.find_all("p")
      full_text = "\n".join([p.get_text() for p in paragraphs])
      return full_text.strip() if full_text else None
    else:
      print(f"Error: Unable to fetch content from {url} (Status Code: {response.status_code})")
      return None
  except Exception as e:
    print(f"Error fetching content from {url}: {e}")
    return None

## Creating documents for loading to database

In [14]:
def create_dataset_from_queries(queries, directory):
  """
  Processes search queries and save results as text files in the same directory.
  """
  if not os.path.exists(directory):
      os.makedirs(directory)

  file_count = 1  # To ensure unique filenames across all queries

  for query in queries:
    print(f"Processing query: {query}")
    valid_count = 0
    page_number = 1

    while valid_count < 10:
      print(f"Fetching search results, page {page_number}...")
      results = search_with_google_api(query + f"&start={page_number * 10}")

      if not results:
        print("No more results found. Try refining the query.")
        break

      for result in results:
        if valid_count >= 10:
            break  # Stop when 10 valid documents are saved

        title = result["title"]
        link = result["link"]
        snippet = result.get("snippet", "No snippet")

        # Fetch full content of the link
        full_content = fetch_full_content(link)
        if full_content:  # Save only if content is valid
          filename = f"{directory}/doc_{file_count}.txt"
          with open(filename, "w", encoding="utf-8") as f:
            f.write(f"Query: {query}\n")
            f.write(f"Title: {title}\n")
            f.write(f"Link: {link}\n")
            f.write(f"Snippet: {snippet}\n\n")
            f.write(f"Full Content:\n{full_content}")
          print(f"Saved: {filename}")
          valid_count += 1
          file_count += 1
        else:
          print(f"Skipped: {link} (No valid content)")

      page_number += 1  # Move to the next page of results

  print(f"Finished processing all queries. Total files saved: {file_count - 1}")

# Running this section of Code

In [16]:
def run_main(input_name, directory_name):
  if not os.path.exists(directory_name):
    queries = gpt_answers(input_name)
    create_dataset_from_queries(queries, directory=directory_name)
  documents = SimpleDirectoryReader(directory_name).load_data()
  return documents

In [17]:
input_name = "AI-driven drug discovery in antibiotic resistance"
documents = run_main(input_name, "AI_antibiotic_resistance_dataset")

# Building GraphRAG

In [33]:
from llama_index.llms.openai import OpenAI

OPENAI_API_KEY = userdata.get('ChatGPT')
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
neo_4j = userdata.get('neo_4j')

# Instantiate property_graph_store
graph_store = Neo4jPropertyGraphStore(
    username="neo4j",
    password=neo_4j,
    url="neo4j+s://1a90ca9c.databases.neo4j.io"
)

In [44]:
index = PropertyGraphIndex.from_documents(  #indices structure the data in intermediate reprezentations that are easy and performant for LLMs to consume.
    documents,
    embed_model = OpenAIEmbedding(model_name="text-embedding-3-small"),
    kg_extractors=[
        SchemaLLMPathExtractor(
            llm=OpenAI(model="gpt-3.5-turbo", temperature=0.0) #Using GPT 3.5 turbo to extract the subject predicate object triples
        )
    ],
    property_graph_store=graph_store,
    show_progress=True,
    use_async=False
)

Parsing nodes:   0%|          | 0/70 [00:00<?, ?it/s]


Extracting paths from text with schema:   0%|          | 0/834 [00:00<?, ?it/s][A
Extracting paths from text with schema:   0%|          | 1/834 [00:02<33:43,  2.43s/it][A
Extracting paths from text with schema:   0%|          | 2/834 [00:02<15:10,  1.09s/it][A
Extracting paths from text with schema:   0%|          | 3/834 [00:04<22:36,  1.63s/it][A
Extracting paths from text with schema:   0%|          | 4/834 [00:05<15:48,  1.14s/it][A
Extracting paths from text with schema:   1%|          | 5/834 [00:06<17:05,  1.24s/it][A
Extracting paths from text with schema:   1%|          | 6/834 [00:06<12:30,  1.10it/s][A
Extracting paths from text with schema:   1%|          | 7/834 [00:09<19:01,  1.38s/it][A
Extracting paths from text with schema:   1%|          | 8/834 [00:10<19:52,  1.44s/it][A
Extracting paths from text with schema:   1%|          | 9/834 [00:12<21:57,  1.60s/it][A
Extracting paths from text with schema:   1%|          | 10/834 [00:13<17:34,  1.28s/it][A
Extra

Generating embeddings:   0%|          | 0/834 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/6350 [00:00<?, ?it/s]

In [45]:
# save and load using locally stored graphs if you dont want to implement neo4j, you can directly use locally stored index in storage folder
index.storage_context.persist(persist_dir="./storage")

index = load_index_from_storage(
    StorageContext.from_defaults(persist_dir="./storage")
)

In [46]:
# load from existing graph/vector store
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store
)

# Querying GraphRAG

In [48]:
# Define retriever
retriever = index.as_retriever(
    include_text=False,  # Default is true
)
results = retriever.retrieve("What is the summary of the texts?")
for record in results:
    print(record.text)

Senescau A -> WORKED_ON -> Innovative DendrisChipsÂ® Technology for a Syndromic Approach of In Vitro Diagnosis: Application to the Respiratory Infectious Diseases
Hsueh, P.R. -> WORKED_ON -> Decreasing rates of resistance to pencillin, but not erythromycin, in Streptococcus pneumoniae after introduction of a policy to restrict antibiotic use in Taiwan
Hsueh, P. -> WORKED_ON -> Decreased erythromycin use after antimicrobial reimbursement restriction for undocumented bacterial upper respiratory tract infections significantly reduced erythromycin resistance in Streptococcus pyogenes in Taiwan
Hsueh, P.R. -> WORKED_ON -> Changes in macrolide resistance among respiratory pathogens after decreased erythromycin consumption in Taiwan


In [50]:
query_engine = index.as_query_engine(include_text=True)
response = query_engine.query("Create a summary of 5 notable AI-driven drugs discovered for antibiotic resistance")
print(response)

1. Cesar de la Fuente and team have utilized AI and machine learning to accelerate the discovery and development of antimicrobial peptides, reducing the timeline significantly.
2. Berman and Krysan have worked on drug resistance and tolerance in fungi, showcasing the potential of AI in addressing antifungal resistance.
3. L. Zurek and A. Ghosh have collaborated on research linking insects to the spread of antibiotic resistance traits, highlighting the importance of understanding diverse pathways of resistance.
4. R. Laxminarayan has contributed to the global need for solutions in antibiotic resistance, emphasizing the urgency for innovative approaches in combating resistance.
5. Cuomo and Rogers have delved into the molecular and genetic basis of antifungal resistance in emerging pathogens like Candida auris, demonstrating the application of AI in understanding resistance mechanisms.


In [51]:
query_engine = index.as_query_engine(include_text=True)
response = query_engine.query("Give an example of AI being used to discover drugs for antibiotic resistance")
print(response)

de Heer K worked on the detection of invasive pulmonary aspergillosis using electronic nose technology, which is an example of AI being used to discover drugs for antibiotic resistance.


In [52]:
query_engine = index.as_query_engine(include_text=True)
response = query_engine.query("What is invasive pulmonary aspergillosis?")
print(response)

Invasive pulmonary aspergillosis is a severe fungal infection caused by the Aspergillus species that primarily affects the lungs.


In [54]:
query_engine = index.as_query_engine(include_text=True)
response = query_engine.query("What contribution has Regina Barzilay made to AI-driven drugs discovery?")
print(response)

Regina Barzilay has not been mentioned in the provided context information.


In [55]:
query_engine = index.as_query_engine(include_text=True)
response = query_engine.query("What contribution has Alexander Fleming made to AI-driven drugs discovery?")
print(response)

Alexander Fleming's contribution to AI-driven drug discovery is not mentioned in the provided context information.
