Step 1: Read the docs

Step 2: Convert that docs into chunks so that it will be easy for LLMs to read

Step 3: COnvert these chunks into vectors using OpenAIEmbeddings

Step 4: Store these in VectorSearchDB (Using Pinecone)

Step 5: Apply any kind of similarity search to get the docs

### Importing the libraries

In [2]:
import openai 
import langchain
import pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAI


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

openai_key = os.getenv('OPENAI_API_KEY')

### Reading the Document

In [3]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [4]:
doc = read_doc('Document/')
doc

[Document(metadata={'source': 'Document\\USMLERxStep1.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'Document\\USMLERxStep1.pdf', 'page': 1}, page_content='FOR  \nTHE® \nNew York / Chicago / San Francisco / Athens / London / Madrid / Mexico City  \nMilan / New Delhi / Singapore / Sydney / TorontoUSMLE  \nSTEP 1  \n2023FIRST  AID\nTAO LE, MD, MHS\nFounder, ScholarRx\nAssociate Clinical Professor, Department of MedicineUniversity of Louisville School of MedicineVIKAS BHUSHAN, MD\nFounder, First Aid for the USMLE Step 1Boracay, Philippines\nCONNIE QIU, MD, P hD\nResident, Department of Dermatology\nJohns Hopkins Hospital\nPANAGIOTIS KAPARALIOTIS, MD\nUniversity of Athens Medical School, Greece \nKIMBERLY KALLIANOS, MD\nAssistant Professor, Department of Radiology and Biomedical ImagingUniversity of California, San Francisco School of MedicineANUP CHALISE, MBBS, MS, MRCSE d\nKathmandu, Nepal \nCAROLINE COLEMAN, MD\nResident, Department of MedicineEmory University School

In [5]:
len(doc)

849

### Let's divide our doc into text chunks

In [6]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return docs

In [7]:
documents = chunk_data(docs=doc)
documents

[Document(metadata={'source': 'Document\\USMLERxStep1.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'Document\\USMLERxStep1.pdf', 'page': 1}, page_content='FOR  \nTHE® \nNew York / Chicago / San Francisco / Athens / London / Madrid / Mexico City  \nMilan / New Delhi / Singapore / Sydney / TorontoUSMLE  \nSTEP 1  \n2023FIRST  AID\nTAO LE, MD, MHS\nFounder, ScholarRx\nAssociate Clinical Professor, Department of MedicineUniversity of Louisville School of MedicineVIKAS BHUSHAN, MD\nFounder, First Aid for the USMLE Step 1Boracay, Philippines\nCONNIE QIU, MD, P hD\nResident, Department of Dermatology\nJohns Hopkins Hospital\nPANAGIOTIS KAPARALIOTIS, MD\nUniversity of Athens Medical School, Greece \nKIMBERLY KALLIANOS, MD\nAssistant Professor, Department of Radiology and Biomedical ImagingUniversity of California, San Francisco School of MedicineANUP CHALISE, MBBS, MS, MRCSE d\nKathmandu, Nepal \nCAROLINE COLEMAN, MD\nResident, Department of MedicineEmory University School

In [8]:
len(documents)

849

### Embedding

In [9]:
embeddings =OpenAIEmbeddings(api_key=openai_key)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001C7A13D8820>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001C7A300D1E0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [10]:
# Let's test the vectors
vectors = embeddings.embed_query("How are you?")
vectors

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Let's create Vector Search DB in Pinecone

from pinecone import Pinecone
pinecone_api = os.getenv('PINECONE_API_KEY')
index = Pinecone.index='langchainvectors'

In [None]:
from langchain_pinecone import PineconeVectorStore   
vectorstore = PineconeVectorStore(index_name=index, 
                                  embedding=embeddings, 
                                  pinecone_api_key=pinecone_api)



In [None]:
vectorstore_db = PineconeVectorStore.from_documents(documents=documents,
                                                    embedding=embeddings,
                                                    index_name=index)

In [None]:
# Cosine Similarity Retrieve Results from VectorDB

def retrieve_query(query, k=2):
    matching_results = vectorstore_db.similarity_search(query, k=k)
    return matching_results

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0.5)
chain =load_qa_chain(llm=llm, chain_type='stuff')


In [None]:
# Search ans from Vector DB

def retrieve_ans(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    response = chain.run(
        input_documents=doc_search,
        question=query
    )
    return response

In [None]:
# Testing the query and search

our_query = "Where does Osteoma located?"
answer = retrieve_ans(our_query)
print(answer)

[Document(metadata={'page': 490.0, 'source': 'Document\\FA 2023 - @USMLERxStep1.pdf'}, page_content='Musculoskeletal, skin, and c onnective tissue  `\u2009pathology Musculoskeletal, skin, and connective  tissue  `\u2009pathol ogy section  iii 470\nPrimary bone tumors Metastatic disease is more common than 1° bone tumors. Benign b one tumors that start with o are \nmore common in b oys.\ntUm o R t yp E Ep idEmi ology loCat ion ChaRaCtERis tiCs\nBenign tumors\nOsteochondroma Most common benign \nbone tumor\nMales < 25 years oldMetaphysis of long bones Lateral bony projection of growth \nplate (continuous with marrow space) \ncovered by cartilaginous cap A\nRarely transforms to chondrosarcoma\nOsteoma Middle age Surface of facial bones Associated with Gardner syndrome\nOsteoid osteoma Adults < 25 years oldMales > femalesCortex of long bones Presents as bone pain (worse at night) \nthat is relieved by NSAIDs\nBony mass (< 2 cm) with radiolucent \nosteoid core \nB\nOsteoblastoma Males > fem