In [1]:
from dotenv import load_dotenv
import os
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from pinecone import ServerlessSpec , Pinecone
import fitz
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [2]:
load_dotenv()

True

In [3]:
pine_cone = os.getenv('pine_cone')
groq = os.getenv('groq')
hugging_face = os.getenv("hugging_face")

In [4]:
files = ["Miller & Freund's Probability and Statistics for Engineers.pdf",
        "vdoc.pub_probability-statistics-and-random-processes.pdf"]

In [5]:
text = ""
for file in files:
    doc = fitz.open(file)
    for page in doc:
        text += page.get_text()

In [6]:
text[:500]

'GLOBAL \nEDITION\nMiller & Freund’s\nProbability and Statistics \nfor Engineers\nNINTH EDITION\nRichard A. Johnson\nMILLER & FREUND’S\nPROBABILITY AND STATISTICS\nFOR ENGINEERS\nNINTH EDITION\nGlobal Edition\nRichard A. Johnson\nUniversity of Wisconsin–Madison\nBoston\nColumbus\nIndianapolis\nNew York\nSan Francisco\nAmsterdam\nCape Town\nDubai\nLondon\nMadrid\nMilan\nMunich\nParis\nMontréal\nToronto\nDelhi\nMexico City\nSão Paulo\nSydney\nHong Kong\nSeoul\nSingapore\nTaipei\nTokyo\nEditorial Director, Mathematics: Christine Hoag\nEd'

In [7]:
docs = [text[i:i+1000] for i in range(0, len(text), 1000)]

In [8]:
docs[1]

'ke Smith\nField Marketing Manager: Evan St. Cyr\nSenior Author Support/Technology Specialist: Joe Vetere\nMedia Production Manager, Global Edition: Vikram Kumar\nSenior Procurement Specialist: Carol Melville\nSenior Manufacturing Controller, Global Editions: Kay Holman\nInterior Design, Production Management, and Answer Art:\niEnergizer Aptara Limited/Falls Church\nCover Image: © MOLPIX/Shutterstock.com\nFor permission to use copyrighted material, grateful acknowledgement is made to these copyright holders: Screenshots from Minitab. Courtesy of\nMinitab Corporation. SAS Output Created with SAS® software. Copyright © 2013, SAS Institute Inc., Cary, NC, USA. All rights Reserved.\nReproduced with permission of SAS Institute Inc., Cary, NC.\nPEARSON AND ALWAYS LEARNING are exclusive trademarks in the U.S. and/or other countries owned by Pearson Education, Inc. or its affiliates.\nPearson Education Limited\nEdinburgh Gate\nHarlow\nEssex CM20 2JE\nEngland\nand Associated Companies throughou

In [9]:
embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-base-en-v1.5',
                                     model_kwargs={'token': hugging_face})

  embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-base-en-v1.5',


In [10]:
pc = Pinecone(api_key=pine_cone)

In [12]:
index_name = "probandstat"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        spec=ServerlessSpec(cloud="AWS", region="us-east-1"),
        dimension=768,
        metric='cosine'
    )

In [13]:
pinecone_index = pc.Index(name=index_name, host=os.getenv("host_bg"))

In [14]:
docx = [Document(page_content=docss) for docss in docs]

In [15]:
vector = PineconeVectorStore(
    index=pinecone_index,
    embedding=embeddings,
    text_key="page_content"
)

In [16]:
from tqdm import tqdm
batch_size = 32
for i in tqdm(range(0, len(docx), batch_size)):
    batch = docx[i:i+batch_size]
    try:
        vector.add_documents(batch)
    except Exception as e:
        print(f"Error in batch {i}-{i+batch_size}: {e}")

100%|██████████████████████████████████████████████████████████████████████████████████| 68/68 [13:47<00:00, 12.17s/it]


In [19]:
retriever = vector.as_retriever()

In [20]:
retriever.invoke("what is probability")

[Document(id='596f3007-2845-41e1-bc2d-62ca58389657', metadata={}, page_content='ch throws, where we predict the result with the help of the previous knowledge. So here is how the \nterm ‘probability’ can be defined with the help of some other terms given below.\nProbability\u2003\n\ue06a\u2003 5\nSome Definitions of Probability\n1.\u2003 Random experiment: An experiment whose outcome or results are not unique and which cannot \nbe predicted with certainty is called random experiment.\nExample: Tossing of a coin, throwing a dice, etc.\n2.\u2003 Sample space: The set of all outcomes of a random experiment is called sample space and is \ndenoted by S.\nExample: When a dice is thrown the sample space is, S = {1, 2, 3, 4, 5, 6}.\n3.\u2003 Event: It is a subset of sample space.\nExample: E1 = {1, 3, 5}.\nThe UNION of two events A and B denoted by A\nB\n∪\n is the event containing all the elements that \nbelong to A or B or both.\nExample: Let A = {1, 2, 3}, B = {3, 4, 5, 6}\nThen, A\nB\n∪\n 