In [1]:
import os

import fitz
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

In [2]:
# Load settings from a .env file so the os.getenv calls in the following cells can read them.
# NOTE(review): the recorded output `True` presumably means a .env file was found — confirm.
load_dotenv()

True

In [3]:
# Credentials are read from the environment; each name is None when its variable is unset.
_env = os.environ
pine_cone = _env.get('pine_cone')        # Pinecone API key (passed to Pinecone(...) below)
groq = _env.get('groq')                  # presumably the Groq API key — not used in the cells shown here
hugging_face = _env.get("hugging_face")  # Hugging Face token forwarded to the embedding model

In [4]:
# PDF sources for the corpus; relative names, so they are looked up in the working directory.
files = [
    "toc-klp-mishra.pdf",
    "Hopcroft-Motwani-Ullman-2001.pdf",
]

In [5]:
# Extract the plain text of every page of every PDF into one string, in file order.
# Each document is opened in a `with` block so its file handle is released as soon as
# its pages have been read, and the page texts are joined once at the end instead of
# growing a string with `+=` inside the nested loop.
page_texts = []
for file in files:
    with fitz.open(file) as doc:
        for page in doc:
            page_texts.append(page.get_text())
text = "".join(page_texts)
    

In [6]:
# Preview the first 5000 characters of the extracted text as a sanity check.
text[:5000]

"http://engineeringbooks.net\nTHEORY OF COMPUTER SCIENCE\nAutomata, Languages and Computation\nTHIRD EDITION\nK.l.P. MISHRA\nFormerly Professor\nDepartment of Electrical and Electronics Engineering\nand Principal/ Regional Engineering College\nTiruchirapal/i\nN. CHANDRASEKARAN\nProfessor\nDepartment of Mathematics\nSt. Joseph/s College\nTiruchirapalli\nPrentice'Hall of India [P[?lmGJD@ LsOWJov8d]\nNew Delhi - 110 '001\n2008\nhttp://engineeringbooks.net\nPreface\nNotations\nContents\nix\nXl\n1.\nPROPOSITIONS AND PREDICATES\n1-35\n1.1\nPropositions (or Statements)\n1\n1.1.1\nConnectives (Propositional Connectives\nor Logical Connectives)\n2\n1.1.2\nWell-formed Formulas\n6\n1.1.3\nTruth Table for a Well-formed Formula\n7\n1.1.4 Equivalence of Well-formed Formulas\n9\n1.1.5\nLogical Identities\n9\n1.2\nNormal Forms of Well-formed Formulas\n11\n1.2.1\nConstruction to Obtain a Disjunctive Normal\nForm of a Given Formula\nII\n1.2.2\nConstruction to Obtain the Principal\nDisjunctive Normal For

In [7]:
# Cut the extracted text into consecutive, non-overlapping pieces of 1000 characters.
# The split is purely positional, so a piece can begin or end mid-word, and the last
# piece holds whatever remains (possibly fewer than 1000 characters).
chunk_size = 1000
chunk_starts = range(0, len(text), chunk_size)
docs = [text[start:start + chunk_size] for start in chunk_starts]

In [8]:
# Inspect the third chunk to see what a single 1000-character piece looks like.
docs[2] 

'.2\nDescription of a Finite Automaton\n73\n3.3\nTransition Systems\n74\n3.4\nPropeliies of Transition Functions\n75\n3.5\nAcceptability of a String by a Finite Automaton\n77\n3.6\nNondeterministic Finite State Machines\n78\n3.7\nThe Equivalence of DFA and NDFA\n80\n3.8\nMealy and Moore Models\n84\n3.8.1\nFinite Automata with Outputs\n84\n3.8.2\nProcedure for Transforming a Mealy Machine\ninto a Moore Machine\n85\n3.8.3\nProcedure for Transforming a Moore Machine\ninto a Mealy Machine\n87\n3.9\nMinimization of Finite Automata\n91\n3.9.1\nConstruction of Minimum Automaton\n92\n3.10 Supplementary Examples\n97\nSelf-Test\n103\nExercises\n]04\nhttp://engineeringbooks.net\nContents\n!O!l\nv\n4.\nFORMAL LANGUAGES\n4.1\nBasic Definitions and Examples\n107\n4.1.1\nDefinition of a Grammar\n109\n4.1.2\nDerivations and the Language Generated by a\nGrammar\n110\n4.2\nChomsky Classification of Languages\n120\n4.3\nLanguages and Their Relation\n123\n4.4\nRecursive and Recursively Enumerable Sets\n12

In [9]:
# Embedding model used to vectorise the chunks; the Hugging Face token read from the
# environment is forwarded to the model loader through model_kwargs.
# NOTE(review): the recorded output echoes this call, which looks like a library
# warning (possibly a deprecation notice) — confirm against the installed version.
bge_model_kwargs = {'token': hugging_face}
embeddings = HuggingFaceBgeEmbeddings(
    model_name='BAAI/bge-base-en-v1.5',
    model_kwargs=bge_model_kwargs,
)

  embeddings = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-base-en-v1.5',


In [10]:
# Pinecone client, authenticated with the API key read from the environment above.
pc = Pinecone(api_key=pine_cone)

In [11]:
# Name of the Pinecone index that the following cells create (if missing) and connect to.
index = "toc"

In [12]:
# Create the serverless index only when it does not exist yet, so re-running this
# cell is harmless once the index is in place.
if index not in pc.list_indexes().names():
    pc.create_index(
        name=index,
        # Pinecone documents the cloud provider identifiers in lowercase
        # ("aws", "gcp", "azure"); the previous "AWS" is not one of them.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        # Must equal the length of the vectors produced by the embedding model.
        # NOTE(review): 768 is assumed to match BAAI/bge-base-en-v1.5 — verify.
        dimension=768,
        metric="cosine",
    )

In [13]:
# Handle to the index. The host comes from the `host_bg` environment variable and is
# None when that variable is unset.
index_host = os.getenv("host_bg")
pinecone_index = pc.Index(name=index, host=index_host)

In [14]:
# Wrap every text chunk in a Document (content only, no metadata) for the vector store.
docx = []
for chunk in docs:
    docx.append(Document(page_content=chunk))

In [15]:
# Vector store that ties the Pinecone index to the embedding model defined earlier.
# NOTE(review): text_key is presumably the metadata field under which each chunk's
# raw text is stored alongside its vector — confirm against the library docs.
vector = PineconeVectorStore(
    index=pinecone_index,
    embedding=embeddings,
    text_key="page_content"
)

In [16]:
# Embed and upload the documents in batches of `batch_size`, with a progress bar.
# The upload stays best-effort: a failing batch is reported and skipped so the rest
# still go through, but every failure is also recorded so that nothing is lost in the
# progress-bar output and the affected ranges can be retried afterwards.
# NOTE(review): re-running this cell presumably inserts the same chunks again under
# new ids — confirm before executing it a second time against a populated index.
batch_size = 32
failed_batches = []  # (start, end) slices of `docx` whose upload raised
for i in tqdm(range(0, len(docx), batch_size)):
    batch = docx[i:i+batch_size]
    try:
        vector.add_documents(batch)
    except Exception as e:
        # Use the real end of the slice: the last batch is usually shorter than batch_size.
        batch_end = i + len(batch)
        failed_batches.append((i, batch_end))
        print(f"Error in batch {i}-{batch_end}: {e}")

if failed_batches:
    print(f"{len(failed_batches)} batch(es) were not uploaded: {failed_batches}")

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [01:27<00:00,  3.79s/it]


In [17]:
# Retriever view over the vector store, created with the library's default settings.
retriever =  vector.as_retriever()

In [18]:
# Smoke-test the retriever with a sample question and display the returned documents.
retriever.invoke("what is TOC")

[Document(id='2a740e93-fa65-49c5-89cf-a0d05d63f917', metadata={}, page_content='be used\nfor proving many theorems throughout the book.\n2.1\nSETS, RELATIONS AND FUNCTIONS\n2.1.1\nSETS AND SUBSETS\nA set is a well-defined collection of objects, for example, the set of all students\nin a college. Similarly. the collection of all books in a college library is also a\nset. The individual objects are called members or elements of the set.\nWe use the capital letters A, B, C, ... for denoting sets. The small letters\na, b, c, ... are used to denote the elements of any set. When a is an element\nof the set A. we write a E A. "\\Then a is not an element of A, we write a rl. A.\nVarious Ways of Describing a Set\n(i) By listing its elements. We write all the elements of the set (without\nrepetition) and enclose them within braces. We can write the elements\nin any order. For example, the set of all positive integers divisible by\n15 and less than 100 can be wlitten as {IS. 30, 45. 60. 75. 90}.\