# **NOTEBOOK FOR RAG TASK**

In [None]:
# Install the libraries used by this notebook (Colab shell commands).

!pip install -qU \
    langchain==0.0.354 \
    langchain-community\
    openai==1.6.1 \
    pinecone-client==3.1.0 \
    tiktoken==0.5.2 \
    gradio==3.40.0
# NOTE(review): this second install is unpinned. It names `langchain` again (pinned above) and
# adds `pinecone` next to `pinecone-client==3.1.0`; it may override the pins above - confirm
# which versions actually end up installed.
!pip install pinecone python-dotenv langchain sentence-transformers



In [None]:
# Create the Pinecone client used to build and query the vector database.
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Never hardcode the API key in the notebook: read it from the environment and fall back to an
# interactive prompt (useful in Colab). A key that was ever committed in this cell must be
# treated as leaked and rotated in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [None]:
# Make sure the serverless Pinecone index exists, then open a handle to it and print its stats.
import os

index_name = "rag-hybrid-search-langchain-gradio2"

existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,        # dimensionality of the dense vectors
        metric="dotproduct",  # sparse values are supported only for dotproduct
        spec=serverless_spec,
    )

index = pc.Index(index_name)
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 18}},
 'total_vector_count': 18}


In [None]:
pip install pinecone-text



In [None]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer through LangChain's HuggingFaceEmbeddings
# wrapper; it is used to generate dense embeddings for the text documents and queries.
# Per the recorded repr below, the model reports word_embedding_dimension=384 (the same value as
# the index `dimension`) and max_seq_length=256.
# NOTE(review): inputs longer than 256 tokens are presumably truncated by the model - confirm.
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')  # sentence transformer
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# BM25 encoder that produces the sparse vectors needed for hybrid search.
from pinecone_text.sparse import BM25Encoder

# `default()` is a static factory that returns a ready-to-use encoder, so call it on the class
# instead of first constructing a throwaway `BM25Encoder()` instance.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7ec80540fc40>

In [None]:
#This code defines a custom hybrid retriever that performs both dense (using transformer embeddings) and sparse (using BM25) retrieval.

from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_community.vectorstores import Pinecone
from langchain.docstore.document import Document

class CustomPineconeHybridSearchRetriever(PineconeHybridSearchRetriever):
    """Hybrid (dense + sparse) Pinecone retriever that exposes the stored 'text' metadata as content.

    The public ``get_relevant_documents(query)`` entry point is inherited unchanged; this class
    customises the ``_get_relevant_documents`` hook that the base retriever calls.

    NOTE(review): overriding the public ``get_relevant_documents`` and calling ``super()`` from it
    (as this class used to) is treated by LangChain as a legacy override and can end up calling
    itself recursively - confirm against the installed langchain-core version.
    NOTE(review): the base hybrid retriever appears to read page content from a "context" metadata
    key, while this notebook stores passages under "text" - confirm the index metadata schema.
    """

    def _get_relevant_documents(self, query: str, *, run_manager):
        """Run the hybrid search and rebuild each hit with its 'text' metadata as page content.

        Args:
            query: The search query string.
            run_manager: Callback manager supplied by the LangChain retriever machinery.

        Returns:
            A list of ``Document`` objects, one per hit, in the order returned by the base class.
        """
        docs = super()._get_relevant_documents(query, run_manager=run_manager)
        rebuilt = []
        for doc in docs:
            metadata = doc.metadata or {}
            # `metadata` is a dict, so it must be read with `.get`; `getattr(metadata, 'text', ...)`
            # never finds the key and always falls back to the default.
            content = metadata.get('text') or doc.page_content or str(metadata)
            rebuilt.append(Document(page_content=content, metadata=metadata))
        return rebuilt

# Instantiate the hybrid retriever over the Pinecone index, combining the dense embedding model
# with the BM25 sparse encoder.
# NOTE(review): `retriever` is reassigned further down to `vectorstore.as_retriever(...)`, so when
# the notebook is run top to bottom this hybrid retriever is not the one used by the later cells -
# confirm which retriever is intended.
retriever = CustomPineconeHybridSearchRetriever(
    index=index,
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    alpha=0.5  # Adjust this value to balance between sparse and dense retrieval
)

In [None]:
# Wrap the Pinecone index in a LangChain vector store that reads each passage from the "text"
# metadata field of the stored vectors.
# Note: this import rebinds the name `Pinecone`, which earlier referred to the Pinecone client class.
from langchain_community.vectorstores import Pinecone

# Pass the Embeddings object itself: handing over the bare `embeddings.embed_query` callable is
# the deprecated form and makes the vector store emit a warning.
vectorstore = Pinecone(index, embeddings, "text")



In [None]:
# Similarity-search retriever over the vector store, configured with k=5 results per query.
# NOTE(review): this rebinds `retriever`, replacing the CustomPineconeHybridSearchRetriever
# instance created earlier; every later cell therefore uses this retriever, not the hybrid one.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [None]:
# Print the retriever's configuration (troubleshooting aid only).
print(retriever)


tags=['Pinecone'] vectorstore=<langchain_community.vectorstores.pinecone.Pinecone object at 0x7ec805441bd0> search_kwargs={'k': 5}


In [None]:
# Mount Google Drive at /content/drive so the data files can be read directly from it.
# This cell requires the `google.colab` package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Location (on the mounted Drive) of the directory holding the textbook text files
textbooks_dir = '/content/drive/My Drive/data_clean/textbooks/en'

# Report whether that directory is present
print("Directory exists." if os.path.exists(textbooks_dir) else "Directory does not exist.")


Directory exists.


In [None]:
# Show the directory listing: a header line followed by one entry per line
files = os.listdir(textbooks_dir)
listing_lines = ["Files in the textbooks directory:", *files]
print("\n".join(listing_lines))


Files in the textbooks directory:
Anatomy_Gray.txt
Biochemistry_Lippincott.txt
Cell_Biology_Alberts.txt
First_Aid_Step1.txt
First_Aid_Step2.txt
Gynecology_Novak.txt
Histology_Ross.txt
Immunology_Janeway.txt
InternalMed_Harrison.txt
Neurology_Adams.txt
Obstentrics_Williams.txt
Pathology_Robbins.txt
Pathoma_Husain.txt
Pediatrics_Nelson.txt
Pharmacology_Katzung.txt
Physiology_Levy.txt
Psichiatry_DSM-5.txt
Surgery_Schwartz.txt


In [None]:
documents = []

# Read every .txt textbook into memory as one string per file.
# `os.listdir` returns entries in arbitrary order, so the names are sorted: the position of a
# document in `documents` is later used to build its vector ID and must be stable across runs.
for filename in sorted(os.listdir(textbooks_dir)):
    if not filename.endswith('.txt'):  # Ensure we're only reading .txt files
        continue
    filepath = os.path.join(textbooks_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as file:  # Use utf-8 encoding for reading
        documents.append(file.read())

print(f"Loaded {len(documents)} textbooks.")


Loaded 18 textbooks.


In [None]:
# Sanity-check the first few documents and their embeddings.
# Only a short preview is printed: each document is an entire textbook, and printing it in full
# (plus every value of its embedding vector) floods the notebook output.
preview_chars = 300
for i, doc in enumerate(documents[:5]):  # Check first 5 documents
    vector = embeddings.embed_query(doc)
    print(f"Document {i} ({len(doc)} chars): {doc[:preview_chars]!r}...")
    print(f"Embedding {i}: dim={len(vector)}, first values={vector[:5]}\n")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Manage renal or cardiac failure.

The mortality rate associated with TSS is 3–6%.

Three major causes of death are ARDS, intractable hypotension, and hemorrhage 2° to DIC.

TSS is a rare but potentially fatal reaction to S. aureus toxin. Diagnosis is clinical because reaction is to the toxin produced by the bacteria, not to the bacterium itself. The f rst steps in treatment are rapid rehydration and antibiotic treatment.

Gynecologic cancers include uterine, endometrial, ovarian, cervical, and vulvar neoplasms. Ovarian cancer carries the highest mortality.

The most common benign neoplasm of the female genital tract. The tumor is discrete, round, frm, and often multiple and is composed of smooth muscle and connective tissue. Tumors are estrogen and progesterone sensitive, so they often ↑ in size during pregnancy and ↓ after menopause. Malignant transformation to leiomyosarcoma is rare (0.1–0.5%). Prevalence is 25% among 

In [None]:
# Split every textbook into overlapping chunks, embed the chunks in batches and upsert
# (insert/update) them into the Pinecone index.
#
# Each textbook used to be embedded as ONE vector with only its first 1000 characters stored as
# metadata, so retrieval could only ever return the opening page of a book. Embedding passage-sized
# chunks is what lets a query land on the passage that actually answers it.
# NOTE(review): vectors upserted by earlier runs under the ids "doc_0", "doc_1", ... are not
# removed by this cell; delete them from the index if they should no longer be retrieved.
from tqdm.auto import tqdm

chunk_size = 1000     # characters per chunk (same size the metadata text was previously capped at)
chunk_overlap = 100   # characters shared by consecutive chunks so sentences are not lost at borders
batch_size = 100      # chunks embedded and upserted per request


def _chunk_text(text, size, overlap):
    """Split `text` into windows of at most `size` characters that overlap by `overlap` characters.

    Whitespace-only windows are dropped. Raises ValueError for a non-positive `size` or an
    `overlap` outside [0, size).
    """
    if size <= 0:
        raise ValueError("size must be positive")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")
    step = size - overlap
    chunks = []
    start = 0
    while start < len(text):
        piece = text[start:start + size]
        if piece.strip():
            chunks.append(piece)
        if start + size >= len(text):
            break  # the window already reached the end; avoid a redundant tail chunk
        start += step
    return chunks


# (vector id, chunk text) pairs; ids are derived from the document and chunk positions
chunk_records = [
    (f"doc_{doc_idx}_chunk_{chunk_idx}", chunk)
    for doc_idx, doc in enumerate(documents)
    for chunk_idx, chunk in enumerate(_chunk_text(doc, chunk_size, chunk_overlap))
]

# Generate embeddings and upsert to Pinecone, one batch at a time
for start in tqdm(range(0, len(chunk_records), batch_size)):
    batch = chunk_records[start:start + batch_size]
    embeds = embeddings.embed_documents([chunk for _, chunk in batch])

    # The chunk text is stored under "text", the key the vector store reads passages from
    to_upsert = [
        {"id": vector_id, "values": embed, "metadata": {"text": chunk}}
        for (vector_id, chunk), embed in zip(batch, embeds)
    ]

    # Upsert to Pinecone
    _ = index.upsert(vectors=to_upsert)

print(f"Upserted {len(chunk_records)} chunks from {len(documents)} documents.")
print("Finished upserting all documents.")

  0%|          | 0/1 [00:00<?, ?it/s]

Finished upserting all documents.


In [None]:
# Load the question datasets for later use (note: they are not used yet in this notebook because of unresolved issues)
import json
from pathlib import Path

def read_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path (str or path-like) of a UTF-8 text file holding one JSON value per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON; the message names the
            file and the 1-based line number.
        OSError: If the file cannot be opened.
    """
    data = []
    # 'utf-8-sig' reads plain UTF-8 unchanged and also drops a leading byte-order mark,
    # which would otherwise make the first line fail to parse.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError as exc:
                raise json.JSONDecodeError(
                    f"{file_path}, line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return data

# Load the datasets
# Paths of the US question files (question bank plus train/dev/test splits) on the mounted Drive.
us_qbank_path = Path('/content/drive/My Drive/data_clean/questions/US/US_qbank.jsonl')
us_train_path = Path('/content/drive/My Drive/data_clean/questions/US/train.jsonl')
us_dev_path = Path('/content/drive/My Drive/data_clean/questions/US/dev.jsonl')
us_test_path = Path('/content/drive/My Drive/data_clean/questions/US/test.jsonl')

# Each variable holds the list returned by read_jsonl for the corresponding file.
questions_data = read_jsonl(us_qbank_path)
train_data = read_jsonl(us_train_path)
dev_data = read_jsonl(us_dev_path)
test_data = read_jsonl(us_test_path)


In [None]:
# Retrieve the index description (its printed form includes the host URL, metric, spec and
# readiness status) and print it.
index_info = pc.describe_index(index_name)
print(index_info)


{'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'rag-hybrid-search-langchain-gradio2-ur093ky.svc.aped-4627-b74a.pinecone.io',
 'metric': 'dotproduct',
 'name': 'rag-hybrid-search-langchain-gradio2',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [None]:
# Install the Hugging Face libraries used by the text-generation cells below.
# NOTE(review): both packages are unpinned - consider pinning versions for reproducibility.
!pip install transformers accelerate




In [None]:
# Quick retrieval smoke test: run a handful of queries and print whatever comes back
test_queries = [
    "diabetes symptoms",
    "symptoms of diabetes",
    "diabetes",
    "common symptoms",
]

for query in test_queries:
    try:
        hits = retriever.get_relevant_documents(query)
        if not hits:
            print(f"No relevant documents found for '{query}'.")
            continue
        print(f"Results for '{query}':")
        print("\n".join(hit.page_content for hit in hits))
    except Exception as e:
        # Report the failure for this query and carry on with the next one
        print(f"Error occurred for '{query}': {str(e)}")

Results for 'diabetes symptoms':
Jonathan S. Berek Paula J. Adams Hillard

We are all products of our environment, our background, and our culture. The importance of ascertaining the patient’s general, social, and familial situation cannot be overemphasized. The physician should avoid being judgmental, particularly with respect to questions about sexual practices and sexual orientation.

Good communication is essential to patient assessment and treatment. The foundation of communication is based on key skills: empathy, attentive listening, expert knowledge, and rapport. These skills can be learned and refined.

The Hippocratic Oath demands that physicians be circumspect with all patient-related information. For physician–patient communication to be effective, the patient must feel that she is able to discuss her problems in depth and in confidence.

Different styles of communication may affect the physician’s ability to perceive the patient’s status and to achieve the goal of optimal a

In [None]:
#This code integrates the GPT-Neo model from Hugging Face with Langchain to create a RetrievalQA chain that answers questions by retrieving relevant documents from your Pinecone index and generating text based on those documents.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Use a free model like GPT-Neo (non-gated)
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a Hugging Face text-generation pipeline.
# Only `max_new_tokens` bounds the generation: also passing `max_length` makes Transformers warn
# that both were set (with `max_new_tokens` taking precedence).
# NOTE(review): in recent Transformers versions `max_length` combined with `truncation=True` may
# additionally truncate the *prompt*; with the "stuff" chain the question comes after the
# retrieved context, so it could be cut off entirely - confirm against the installed version.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,  # GPT-Neo defines no pad token; reuse end-of-sequence
    max_new_tokens=50,
    truncation=True,
)


# Wrap the pipeline in a HuggingFacePipeline for use in Langchain
llm = HuggingFacePipeline(pipeline=pipe)

# Create the RetrievalQA chain with the free GPT-Neo model.
# "stuff" places all retrieved documents into a single prompt for the model.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,  # Keep your existing retriever
    return_source_documents=True
)
def generate_answer_with_rag(question, max_source_chars=300):
    """Answer `question` with the RetrievalQA chain and list short excerpts of the sources used.

    Args:
        question: The user's question text.
        max_source_chars: Maximum number of characters shown per retrieved source. Longer
            passages are cut and suffixed with "..."; pass None to show them in full.

    Returns:
        A string of the form "Answer: <answer>\\n\\nSources:\\n[1] <excerpt>\\n[2] ...", a short
        prompt to enter a question when `question` is empty, or "An error occurred: <message>"
        if the chain raises.
    """
    if not question or not question.strip():
        return "Please enter a question."
    try:
        result = qa_chain({"query": question})
        answer = (result.get('result') or "No answer returned.").strip()

        # Show a compact, numbered excerpt per source instead of the full retrieved passages,
        # which otherwise drown the generated answer in the UI.
        excerpts = []
        for rank, doc in enumerate(result.get('source_documents', []), start=1):
            excerpt = " ".join(doc.page_content.split())  # collapse newlines / repeated spaces
            if max_source_chars is not None and len(excerpt) > max_source_chars:
                excerpt = excerpt[:max_source_chars].rstrip() + "..."
            excerpts.append(f"[{rank}] {excerpt}")

        return f"Answer: {answer}\n\nSources:\n" + "\n".join(excerpts)
    except Exception as e:
        # Best-effort: surface the failure as text so the UI shows it instead of crashing
        return f"An error occurred: {str(e)}"

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [None]:
# Smoke-test the bare generation pipeline with a direct prompt (no retrieval involved).
# Author's note: this direct test gives the correct answer, but the UI is not able to reproduce it.
test_output = pipe("What is hemoglobin?")
print(test_output)


Both `max_new_tokens` (=150) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'What is hemoglobin?\n\nHemoglobin is the most common protein found in red blood cells, and is responsible for transporting oxygen to tissues. In normal conditions, an adult healthy person contains about 8 to 10 grams of hemoglobin, but about a third of the adult hemoglobin is made up of different combinations. Some of the most common hemoglobin combinations include:\n\n-   A.1 hemoglobin. The A.1 hemoglobin mixture consists of the common A and B subtypes and the rare A.6 subtype.\n\n-   A.2 hemoglobin. The A.2 hemoglobin mixture consists of the common A and B subtypes and the rare A.5, A.6, and A.9 subtypes.'}]


In [None]:
# UI setup, used gradio
import gradio as gr

def gradio_interface(question):
    """Gradio callback: return the RAG answer text for `question`, or an error message string."""
    try:
        # Delegate straight to the RAG helper and hand its text back to the UI
        return generate_answer_with_rag(question)
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Define Gradio interface with inputs and outputs as text: one text box in, one text box out,
# wired to gradio_interface. launch() starts the UI.
iface = gr.Interface(fn=gradio_interface, inputs="text", outputs="text")
iface.launch()



IMPORTANT: You are using gradio version 3.40.0, however version 4.29.0 is available, please upgrade.
--------
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



I was not able to get the chatbot running properly. The direct test output gave correct answers; however, the UI was not integrated properly: it returns the whole textbook/chapter passage where it found the keyword instead of an exact answer. I tried to fix it, but without much knowledge of LLMs or chatbots, this was all I was able to do as of today :(