## Install All the Required Packages

In [None]:
!pip install langchain
!pip install pinecone-client
!pip install pypdf

In [None]:
!pip install openai
!pip install tiktoken

## Import All the Required Libraries

In [1]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

  from tqdm.autonotebook import tqdm


## Load the PDF Files

In [None]:
!mkdir pdfs

In [None]:
!gdown 1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE -O pdfs/yolov7paper.pdf
!gdown 1vILwiv6nS2wI3chxNabMgry3qnV67TxM -O pdfs/rachelgreecv.pdf

## Extract the Text from the PDF's

In [2]:
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

In [3]:
data

[Document(metadata={'producer': '', 'creator': 'PyPDF', 'creationdate': '2025-02-02T02:45:44+05:30', 'source': 'pdfs\\Keplr pitch deck v1.pdf', 'total_pages': 21, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': '', 'creator': 'PyPDF', 'creationdate': '2025-02-02T02:45:44+05:30', 'source': 'pdfs\\Keplr pitch deck v1.pdf', 'total_pages': 21, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': '', 'creator': 'PyPDF', 'creationdate': '2025-02-02T02:45:44+05:30', 'source': 'pdfs\\Keplr pitch deck v1.pdf', 'total_pages': 21, 'page': 2, 'page_label': '3'}, page_content=''),
 Document(metadata={'producer': '', 'creator': 'PyPDF', 'creationdate': '2025-02-02T02:45:44+05:30', 'source': 'pdfs\\Keplr pitch deck v1.pdf', 'total_pages': 21, 'page': 3, 'page_label': '4'}, page_content=''),
 Document(metadata={'producer': '', 'creator': 'PyPDF', 'creationdate': '2025-02-02T02:45:44+05:30', 'source': 'pdfs\\Keplr pitch deck v1.pdf', 'total_p

## Split the Extracted Data into Text Chunks

In [4]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

In [5]:
text_chunks = text_splitter.split_documents(data)

In [6]:
text_chunks

[Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H1615) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20241029231321Z00'00'", 'title': 'OWASP LLM/GenAI Security Solutions Reference Guide v1.0', 'moddate': "D:20241029231321Z00'00'", 'source': 'pdfs\\OWASP_LLMGenAI_SecuritySolutions_ReferenceGuide.pdf', 'total_pages': 34, 'page': 1, 'page_label': '2'}, page_content='| GenAIƬLLMSecOpsandSecķriĴřSolķĴionLandscape\nPķblishedƭųŲƸŴŶƸŴŶ\nRe˩i˦ionHi˦˧orˬ\nRevision Date Authors Description\nƫ01 6Ƹ4Ƹ2024 ScottClinton InitialDraftƬCharter\nƫ05 8Ƹ10Ƹ2024 ScottClintonƬ\nContributorsInputs\nUpdatedwithinitial\nfeedback\nƫ06 10Ƹ15Ƹ2024 ScottClintonƬ\nContributorsƬReviewer\nInputs\nReǀfactorSolutions\nLandscapecategoriesƬ\n1ƫ0 10Ƹ15Ƹ2024 ContributorsƬReviewers FinalReleaseCandidate\nTheinformationprovidedinthisdocumentdoesnotƬandisnotintendedtoƬconstitutelegal'),
 Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H1615) Quartz PDFContext', 'creator': 'PyPDF', 'c

In [7]:
len(text_chunks)

132

In [8]:
text_chunks[1]

Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H1615) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20241029231321Z00'00'", 'title': 'OWASP LLM/GenAI Security Solutions Reference Guide v1.0', 'moddate': "D:20241029231321Z00'00'", 'source': 'pdfs\\OWASP_LLMGenAI_SecuritySolutions_ReferenceGuide.pdf', 'total_pages': 34, 'page': 1, 'page_label': '2'}, page_content='adviceƫAllinformationisforgeneralinformationalpurposesonlyƫThisdocumentcontainslinksto\notherthirdǀpartywebsitesƫSuchlinksareonlyforconvenienceƬandOWASPdoesnotrecommend\norendorsethecontentsofthethirdǀpartysitesƫ\nLicen˦eandU˦age\nTh˜˦dˢc˨ˠeˡ˧˜˦˟˜ceˡ˦ed˨ˡde˥C˥ea˧˜˩eCˢˠˠˢˡ˦,CCBY-SA4.0\nYouarefreetoƭ\nƔ Shareǃcopyandredistributethematerialinanymediumorformat\nƔ AdaptǃremixƬtransformƬandbuilduponthematerialforanypurposeƬeven\ncommerciallyƫ\nƔ Underthefollowingtermsƭ')

In [9]:
text_chunks[2]

Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H1615) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20241029231321Z00'00'", 'title': 'OWASP LLM/GenAI Security Solutions Reference Guide v1.0', 'moddate': "D:20241029231321Z00'00'", 'source': 'pdfs\\OWASP_LLMGenAI_SecuritySolutions_ReferenceGuide.pdf', 'total_pages': 34, 'page': 1, 'page_label': '2'}, page_content='ż AttributionǃYoumustgiveappropriatecreditƬprovidealinktothelicenseƬ\nandindicateifchangesweremadeƫYoumaydosoinanyreasonablemanner\nbutnotinanywaythatsuggeststhelicensorendorsesyouoryouruseƫ\nż AttributionGuidelinesǀmustincludetheprojectnameaswellasthename\noftheassetReferenced\nŶ OWASPTop10forLLMsǀLLMSecOpsSolutionsLandscape\nŶ OWASPTop10forLLMsǀCyberSecuritySolutionandLLMSecOps\nLandscapeGuide\nƔ ShareAlikeǃIfyouremixƬtransformƬorbuilduponthematerialƬyoumustdistribute')

In [10]:
text_chunks[3]

Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H1615) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20241029231321Z00'00'", 'title': 'OWASP LLM/GenAI Security Solutions Reference Guide v1.0', 'moddate': "D:20241029231321Z00'00'", 'source': 'pdfs\\OWASP_LLMGenAI_SecuritySolutions_ReferenceGuide.pdf', 'total_pages': 34, 'page': 1, 'page_label': '2'}, page_content='yourcontributionsunderthesamelicenseastheoriginalƫ\nLinkĴďfķlllicenĮeĴeŘĴƭhttpsƭƸƸcreativecommons.orgƸlicensesƸbyǀsaƸ4.0Ƹlegalcode\nTheinformationprovidedinthisdocumentdoesnotƬandisnotintendedtoƬconstitutelegaladviceƫAllinformationisfor\ngeneralinformationalpurposesonlyƫThisdocumentcontainslinkstootherthirdǀpartywebsitesƫSuchlinksareonlyfor\nconvenienceandOWASPdoesnotrecommendorendorsethecontentsofthethirdǀpartysitesƫ\nVersion1ƫ0 1of34')

## Downlaod the Embeddings

In [None]:
import os

os.environ['OPENAI_API_KEY'] = "API_KEY HERE"

In [None]:
embeddings = OpenAIEmbeddings()

In [None]:
result = embeddings.embed_query("How are you!")

In [None]:
len(result)

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")


  embedding = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")





In [12]:
result = embedding.embed_query("How are you!")

In [14]:
len(result)
result

[0.03249841183423996,
 -0.0028303347062319517,
 -0.02268761396408081,
 -0.04591881111264229,
 0.0375046543776989,
 -0.026747148483991623,
 0.003607534570619464,
 0.030987821519374847,
 0.025472017005085945,
 -0.04854434356093407,
 0.022451164200901985,
 0.013826059177517891,
 -0.062417905777692795,
 -0.046441491693258286,
 -0.02380739338696003,
 -0.04958273842930794,
 -0.009265409782528877,
 0.010306350886821747,
 0.01101015880703926,
 -0.005084506701678038,
 0.007220338564366102,
 0.005014514084905386,
 -0.017288735136389732,
 -0.030236100777983665,
 -0.004849307704716921,
 -0.0006354793440550566,
 -0.06904599815607071,
 -0.015170414000749588,
 0.0039957063272595406,
 -0.04811578989028931,
 -0.004840983543545008,
 0.020140668377280235,
 -0.015129863284528255,
 -0.010669797658920288,
 -0.027719825506210327,
 0.04947809875011444,
 0.021604644134640694,
 0.01565464586019516,
 -0.0247923843562603,
 0.038189295679330826,
 -0.042693473398685455,
 0.044052258133888245,
 -0.021930186077952385

## Initializing the Pinecone

In [None]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'API_KEY HERE')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

NameError: name 'os' is not defined

In [2]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "test"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        deletion_protection="enabled",  # Defaults to "disabled"
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)




  from tqdm.autonotebook import tqdm


NameError: name 'PINECONE_API_KEY' is not defined

In [17]:
index.describe_index_stats()  

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
def process_chunks(text_chunks, batch_size=32):
    """Process text chunks in batches with error handling"""
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i+batch_size]
        vectors = []
        
        try:
            for chunk in batch:
                # Generate embedding
                embedding = embedding.embed_query(chunk.page_content)
                
                # Prepare vector data
                vector_data = {
                    "id": f"page_{chunk.metadata['page']}",
                    "values": embedding,
                    "metadata": {
                        **chunk.metadata,
                        "text": chunk.page_content  # Store original text
                    }
                }
                vectors.append(vector_data)
            
            # Upsert batch
            index.upsert(
                vectors=vectors,
                namespace="owasp-documents"
            )
            print(f"Upserted batch {i//batch_size + 1}/{len(text_chunks)//batch_size}")
            
        except Exception as e:
            print(f"Error processing batch {i//batch_size + 1}: {str(e)}")
            time.sleep(10)  # Backoff for rate limits

## Create Embeddings for each of the Text Chunk

In [18]:
from langchain_pinecone import PineconeVectorStore  
 
vectorstore = PineconeVectorStore(  
    index, embedding, index_name)


## If you already have an index, you can load it like this

In [1]:
docsearch = Pinecone.from_texts([chunk.page_content for chunk in text_chunks],embedding,index_name=index_name,client=pc)
docsearch

NameError: name 'Pinecone' is not defined

## Similarity Search

In [19]:
query = "Tell me about LLM security"

In [20]:
docs = vectorstore.similarity_search(query, k=3)

In [21]:
docs

[]

## Creating a LLM Model Wrapper

In [None]:
llm = OpenAI()

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())


## Q/A

In [None]:
query = "YOLOv7 outperforms which models"

In [None]:
qa.run(query)

In [None]:
query = "Rachel Green Experience"

In [None]:
qa.run(query)

In [None]:
import sys

In [None]:
while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")