In [1]:
from dotenv import load_dotenv
from langchain_groq import ChatGroq
import os
from pinecone import Pinecone , ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import fitz
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.schema import Document

In [2]:
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups further down can find the API keys and host.
load_dotenv()

True

In [3]:
# Textbooks to index. The paths are relative, so the PDFs are expected next to
# wherever the notebook is run from.
pdf_files = [
    "Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf",
    "MachineLearningTomMitchell.pdf",
]

In [4]:
# Extract the plain text of every page of every PDF into one long string.
# Each document is opened in a `with` block so it is closed as soon as its
# pages have been read, and the page texts are collected in a list and joined
# once instead of growing a string with repeated `+=`.
page_texts = []
for file in pdf_files:
    with fitz.open(file) as doc:
        print(f"--- {file} ---")
        # The generator is fully consumed here, while the document is still open.
        page_texts.extend(page.get_text() for page in doc)
text = "".join(page_texts)

--- Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf ---
--- MachineLearningTomMitchell.pdf ---


In [5]:
# Sanity check: preview the first 500 characters of the extracted text.
text[:500]

'Information Science and Statistics\nSeries Editors:\nM. Jordan\nJ. Kleinberg\nB. Scho¨lkopf\nInformation Science and Statistics \nAkaike and Kitagawa: The Practice of Time Series Analysis. \nBishop:  Pattern Recognition and Machine Learning. \nCowell, Dawid, Lauritzen, and Spiegelhalter: Probabilistic Networks and\nExpert Systems. \nDoucet, de Freitas, and Gordon: Sequential Monte Carlo Methods in Practice. \nFine: Feedforward Neural Network Methodology. \nHawkins and Olwell: Cumulative Sum Charts and Chart'

In [6]:
# Cut the full text into consecutive, non-overlapping pieces of CHUNK_SIZE
# characters (the final piece may be shorter). Cuts land on a fixed character
# count, so a piece can begin or end in the middle of a word.
CHUNK_SIZE = 1000
chunk = [
    text[start:start + CHUNK_SIZE]
    for start in range(0, len(text), CHUNK_SIZE)
]

In [7]:
# Spot-check a single piece (position 100 in the list) to see what a chunk looks like.
chunk[100]

'), β−1\x0b\n(1.60)\nwhere, for consistency with the notation in later chapters, we have deﬁned a preci-\nsion parameter β corresponding to the inverse variance of the distribution. This is\nillustrated schematically in Figure 1.16.\n1.2. Probability Theory\n29\nFigure 1.16\nSchematic illustration of a Gaus-\nsian conditional distribution for t given x given by\n(1.60), in which the mean is given by the polyno-\nmial function y(x, w), and the precision is given\nby the parameter β, which is related to the vari-\nance by β−1 = σ2.\nt\nx\nx0\n2σ\ny(x0, w)\ny(x, w)\np(t|x0, w, β)\nWe now use the training data {x, t} to determine the values of the unknown\nparameters w and β by maximum likelihood. If the data are assumed to be drawn\nindependently from the distribution (1.60), then the likelihood function is given by\np(t|x, w, β) =\nN\n\x0e\nn=1\nN\n\ntn|y(xn, w), β−1\x0b\n.\n(1.61)\nAs we did in the case of the simple Gaussian distribution earlier, it is convenient to\nmaximize the logar

In [8]:
# Read the service credentials from the environment in one pass. Each name is
# None when its variable is not set.
pine_cone, groq, hugging_face = (
    os.getenv(var_name) for var_name in ("pine_cone", "groq", "hugging_face")
)

In [16]:
# Embedding model used for both the stored chunks and the queries. The
# Hugging Face access token read from the environment is handed over through
# model_kwargs.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"token": hugging_face},
)

In [17]:
# Pinecone client, authenticated with the API key read from the environment.
pc = Pinecone(api_key = pine_cone)

In [18]:
# Create the serverless Pinecone index on the first run; later runs find the
# name in the index list and skip creation.
# NOTE: the spelling of the index name is kept as-is on purpose, because a
# different string would address a different index.
index = "intoductionml"
if index not in pc.list_indexes().names():
    pc.create_index(
        name=index,
        # Has to equal the embedding model's vector size
        # (768 for BAAI/bge-base-en-v1.5).
        dimension=768,
        metric="cosine",
        # Pinecone documents the cloud provider in lowercase ("aws"); the
        # previous "AWS" only went unnoticed while this branch was skipped.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [19]:
# Open a handle to the index. From this cell on, `index` no longer holds the
# index name (a str) but whatever pc.Index returns.
# NOTE(review): the host is read from the "host_bg" environment variable and
# is None when that variable is unset — confirm pc.Index copes with that.
index = pc.Index(name = index , host = os.getenv("host_bg"))

In [20]:
# Wrap each text piece in a Document; only page_content is set, no metadata.
docs = [Document(page_content=piece) for piece in chunk]

In [21]:
# Vector store on top of the Pinecone index handle, using `embeddings` to
# vectorise text and "page_content" as the text key.
vector = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="page_content",
)

In [22]:
from tqdm import tqdm

# Embed and upload the documents in batches so that a failure costs at most
# one batch. Uploading stays best-effort (a failing batch does not stop the
# loop), but every failed range is recorded in `failed_batches` so it can be
# inspected or retried instead of being lost in the scrolled output.
# NOTE(review): no ids are passed to add_documents, so re-running this cell
# presumably inserts the same chunks again under new ids — confirm before
# re-running against a populated index.
batch_size = 32
failed_batches = []  # (start, end) slice bounds into `docs` that were not uploaded
for i in tqdm(range(0, len(docs), batch_size)):
    batch = docs[i:i+batch_size]
    try:
        vector.add_documents(batch)
    except Exception as e:
        # Report the real end of the slice; the last batch can be shorter
        # than batch_size.
        end = i + len(batch)
        failed_batches.append((i, end))
        print(f"Error in batch {i}-{end}: {e}")

if failed_batches:
    print(f"{len(failed_batches)} batch(es) failed and are missing from the index: {failed_batches}")

100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [09:50<00:00,  6.71s/it]


In [23]:
# Retriever over the vector store that asks for the TOP_K closest chunks per query.
TOP_K = 3
retrive = vector.as_retriever(search_kwargs={"k": TOP_K})

In [24]:
# Smoke-test the retriever with a sample question; the cell output lists the
# Documents that come back.
retrive.invoke("what is machine learning")

[Document(id='0cda2617-9da4-458f-9906-7a0aca5b4a8e', metadata={}, page_content=' \nBook Info: Presents the key algorithms and theory that form the core of machine learning. \nDiscusses such theoretical issues as How does learning performance vary with the number of \ntraining examples presented? and Which learning algorithms are most appropriate for various \ntypes of learning tasks? DLC: Computer algorithms.  \nBook Description: This book covers the field of machine learning, which is the study of \nalgorithms that allow computer programs to automatically improve through experience. The \nbook is intended to support upper level undergraduate and introductory level graduate courses in \nmachine learning \nPREFACE \nThe field of machine learning is concerned with the question of how to construct \ncomputer programs that automatically improve with experience. In recent years \nmany successful machine learning applications have been developed, ranging from \ndata-mining programs that lear