# RAG

## Retrieve and Generate

In [35]:
from langchain_voyageai import VoyageAIEmbeddings
import os
import boto3
from urllib.parse import urlparse
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import ChatOpenAI
import openai
from langchain.chains import LLMChain, RetrievalQA
import time
import re
from langchain_pinecone import PineconeVectorStore
from langchain.memory import ConversationBufferMemory
from langchain.schema import HumanMessage
from IPython.display import Markdown, display
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Fail fast with a readable message when a required key is absent.
# Writing os.getenv(...) straight back into os.environ is a no-op when the
# variable exists and raises "TypeError: str expected, not NoneType" when it
# does not, which hides which key is actually missing.
_missing_keys = [
    key for key in ("VOYAGE_AI_API_KEY", "PINECONE_API_KEY") if not os.getenv(key)
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# Embedding model used to embed queries for the vector store lookups below.
model_name = "voyage-large-2"
embeddings = VoyageAIEmbeddings(
    model=model_name,
    voyage_api_key=os.environ["VOYAGE_AI_API_KEY"]
)

batch size None


In [36]:
# Function to generate pre-signed URL
def generate_presigned_url(s3_uri, expires_in=3600, s3_client=None):
    """Return a time-limited download URL for an ``s3://<bucket>/<key>`` URI.

    Args:
        s3_uri: Object location written as ``s3://<bucket>/<key>``.
        expires_in: Lifetime of the generated URL in seconds. Defaults to
            3600, the value that was previously hard-coded.
        s3_client: Optional object exposing ``generate_presigned_url``.
            When omitted, a new ``boto3.client('s3')`` is created. Pass a
            shared client to avoid building one per call.

    Returns:
        The value returned by the client's ``generate_presigned_url`` call
        for the ``get_object`` operation.

    Raises:
        ValueError: If ``s3_uri`` is not a string, does not use the ``s3``
            scheme, or lacks a bucket name or object key.
    """
    if not isinstance(s3_uri, str):
        raise ValueError(f"Expected an s3:// URI string, got {s3_uri!r}")

    # Parse the S3 URI: the netloc is the bucket, the path is the object key.
    parsed_url = urlparse(s3_uri)
    bucket_name = parsed_url.netloc
    object_key = parsed_url.path.lstrip('/')

    # Reject malformed input here instead of sending an empty bucket/key to AWS.
    if parsed_url.scheme != 's3' or not bucket_name or not object_key:
        raise ValueError(
            f"Expected an S3 URI of the form s3://<bucket>/<key>, got {s3_uri!r}"
        )

    # Create an S3 client only when the caller did not supply one.
    if s3_client is None:
        s3_client = boto3.client('s3')

    # Generate a pre-signed URL for the S3 object
    return s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': bucket_name, 'Key': object_key},
        ExpiresIn=expires_in  # URL expiration time in seconds
    )

# Function to retrieve documents, generate URLs, and format the response
def retrieve_and_format_response(query, retriever, llm):
    """Answer ``query`` from retrieved documents and attach a source link.

    Args:
        query: The user's question.
        retriever: Object whose ``invoke(query)`` returns a sequence of
            documents exposing ``page_content`` and ``metadata['id']``.
            The ``id`` is passed to ``generate_presigned_url``, so it is
            expected to be an ``s3://`` URI.
        llm: Chat model whose ``invoke(messages)`` returns an object with a
            ``content`` attribute.

    Returns:
        ``{"answer": <str>}``. When nothing is retrieved, the answer is the
        fixed "I don't know..." sentence and the LLM is not called.
    """
    no_data_answer = "I don't know, I did not find the relevant data in the knowledge base."

    docs = retriever.invoke(query)

    # With no documents there is no context and no link to attach. Previously
    # this path crashed with UnboundLocalError because `s3_gen_url` was only
    # assigned inside the loop below.
    if not docs:
        return {"answer": no_data_answer}

    formatted_docs = []
    for doc in docs:
        content_data = doc.page_content
        s3_uri = doc.metadata['id']
        s3_gen_url = generate_presigned_url(s3_uri)
        formatted_docs.append(f"{content_data}\n\n[More Info]({s3_gen_url})")

    combined_content = "\n\n".join(formatted_docs)

    # Create a prompt for the LLM to generate an explanation based on the retrieved content.
    # NOTE(review): `s3_gen_url` here is the link of the LAST retrieved document,
    # as in the original code; confirm whether the top-ranked document's link
    # was intended instead.
    prompt = f"Instruction: Based on the following information, provide a summarized & concise explanation using a couple of sentences. \
               Only respond with the information relevant to the user query {query}, if there are none, make sure you say 'I don't know, I did not find the relevant data in the knowledge base.' \
               In the event that there's relevant info, make sure to attach the download button at the very end: \n\n[More Info]({s3_gen_url}) \
               Context: {combined_content}"

    # Create the messages for the LLM input
    messages = [HumanMessage(content=prompt)]

    # Generate the response using the LLM (`invoke` replaces the deprecated
    # direct call `llm(messages=...)`).
    response = llm.invoke(messages)
    return {"answer": response.content}

In [37]:
# PINECONE
# Client handle; used further down to open the raw index for inspection.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "test"

# Retriever: vector store backed by the existing index, embedding queries
# with the `embeddings` model configured above.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

# Initialize LLM
# `openai.api_key` is not assigned anywhere in the visible cells, so the
# original call passed None. Keep honouring it if it was set elsewhere, but
# make the environment-variable fallback explicit.
llm = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=openai.api_key or os.environ.get("OPENAI_API_KEY"),
)
# Initialize Memory
# NOTE(review): `memory` is not used by any visible cell; confirm it is needed.
memory = ConversationBufferMemory()

In [40]:
# Sample questions used to exercise the retriever and the full RAG helper.
query1 = "What is Cyclacillin?"
query2 = "What drugs are being used to treat HIV?"
# `invoke` is the current retriever entry point; `get_relevant_documents` is deprecated.
docs = docsearch.as_retriever().invoke(query1)
docs

[Document(page_content='DrugBank ID: DB01000\nName: Cyclacillin\nDescription: A cyclohexylamido analog of penicillanic acid.\nMechanism of Action: The bactericidal activity of cyclacillin results from the inhibition of cell wall synthesis via affinity for penicillin-binding proteins (PBPs). Cyclacillin is stable in the presence of a variety of b-lactamases, including penicillinases and some cephalosporinases.\nTargetActionsOrganismAPenicillin-binding protein 1AinhibitorStreptococcus pneumoniae (strain ATCC BAA-255 / R6)APenicillin-binding protein 3inhibitorStreptococcus pneumoniaeAPenicillin binding protein 2ainhibitorStaphylococcus aureusAPenicillin-binding proteininhibitorGram positive and gram negative bacteria\nIndication: For the treatment of bacterial infections caused by susceptible organisms.\nReduce drug development failure ratesBuild, train, & validate machine-learning modelswith evidence-based and structured datasets.See how  Build, train, & validate predictive machine-learn

In [41]:
# Example usage: run the full retrieve-then-generate helper on the second question.
rag_retriever = docsearch.as_retriever()
response = retrieve_and_format_response(query2, rag_retriever, llm=llm)

# Render the model's answer (it is Markdown, including the source link).
answer_markdown = Markdown(response["answer"])
display(answer_markdown)

The drugs used to treat HIV include Fosamprenavir, Darunavir, Atazanavir, and Lopinavir. These are all HIV protease inhibitors used in combination with other antiretroviral agents to manage HIV-1 infection.

[Download](https://data-chunking-us.s3.amazonaws.com/staging/DB01319_parag_00000_b5941031.json?AWSAccessKeyId=AKIA5FTZBA2QHXXSXKKA&Signature=GQdLw3bLFPhcYlp9t0%2FQsOYCxZ8%3D&Expires=1718979710)

---

In [42]:
# Open the raw Pinecone index and list a few stored entries by querying
# with an all-zero vector.
index = pc.Index(index_name)
probe_vector = [0] * 1536  # Adjust dimensions as per your embeddings
response = index.query(vector=probe_vector, top_k=5)
for match in response['matches']:
    print(match)


{'id': 's3://data-chunking-us/staging/Autoimmune_disorders_parag_00000_6b75fc7f.json',
 'score': 0.0,
 'values': []}
{'id': 's3://data-chunking-us/staging/Autoimmune_disorders_parag_00001_4b5d59c2.json',
 'score': 0.0,
 'values': []}
{'id': 's3://data-chunking-us/staging/Atrial_septal_defect_ASD_parag_00001_3e681e3a.json',
 'score': 0.0,
 'values': []}
{'id': 's3://data-chunking-us/staging/Addison_disease_parag_00000_5b48cc00.json',
 'score': 0.0,
 'values': []}
{'id': 's3://data-chunking-us/staging/DB01382_parag_00000_74417f4a.json',
 'score': 0.0,
 'values': []}
