In [None]:
%pip install PyMuPDF faiss-cpu sentence-transformers requests


In [None]:
import fitz  # PyMuPDF
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import requests
import json
from dotenv import load_dotenv
import os

In [None]:
# Let python-dotenv populate the process environment, then read the
# Hugging Face token from it (None when the variable is absent).
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

# Define functions
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        str: The text of all pages concatenated in page order (empty string
        for a document without pages).
    """
    # The context manager closes the document even when a page fails to
    # extract; the previous explicit close() was skipped on any exception.
    with fitz.open(pdf_path) as pdf_document:
        # Join once instead of growing a string with repeated ``+=``.
        return "".join(page.get_text() for page in pdf_document)

def chunk_text(text, chunk_size=1000, overlap=0):
    """Split text into consecutive fixed-size character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between the end of one chunk and
            the start of the next. Must satisfy ``0 <= overlap < chunk_size``.
            The default of 0 yields plain, non-overlapping slices.

    Returns:
        list[str]: The chunks in order. The final chunk may be shorter than
        ``chunk_size``; an empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out
            of range.
    """
    # Fail loudly: a zero step makes range() raise a cryptic error and a
    # negative one silently produces no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap!r}"
        )

    step = chunk_size - overlap  # distance between consecutive chunk starts
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def get_embeddings(texts, model):
    """Encode a list of texts into embeddings with the supplied model.

    ``model`` only needs an ``encode(texts, convert_to_tensor=True)`` method;
    whatever that call returns is handed back unchanged.
    """
    return model.encode(texts, convert_to_tensor=True)

def create_faiss_index(embeddings):
    """Create an L2 FAISS index and add every embedding with a sequential id.

    Args:
        embeddings: A 2-D collection of vectors, one row per item. May be a
            torch tensor (anything exposing ``.cpu()``) or any array-like
            that numpy can convert.

    Returns:
        A ``faiss.IndexIDMap`` wrapping an ``IndexFlatL2``; row ``i`` of
        ``embeddings`` is stored under id ``i``.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example when it was
            computed from an empty list of texts).
    """
    # Tensors are detached and moved to the CPU before conversion; plain
    # arrays skip this step.
    if hasattr(embeddings, "cpu"):
        embeddings = embeddings.detach().cpu().numpy()

    # FAISS works on contiguous float32 matrices, so normalise the input
    # instead of relying on the encoder's output dtype.
    embeddings_np = np.ascontiguousarray(embeddings, dtype=np.float32)
    if embeddings_np.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D (n_vectors, dim) array, got shape "
            f"{embeddings_np.shape}"
        )

    dim = embeddings_np.shape[1]
    # Explicit int64: np.arange's default integer type is platform dependent,
    # while FAISS ids are 64-bit.
    ids = np.arange(embeddings_np.shape[0], dtype=np.int64)

    index = faiss.IndexFlatL2(dim)
    faiss_index = faiss.IndexIDMap(index)
    faiss_index.add_with_ids(embeddings_np, ids)
    return faiss_index

def query_faiss_index(index, query_embedding, k=5):
    """Search the index for the ``k`` nearest neighbours of the query.

    Args:
        index: Object exposing ``search(vectors, k)`` (e.g. a FAISS index).
        query_embedding: One query vector (1-D) or a batch of them (2-D).
            May be a torch tensor (anything exposing ``.cpu()``) or any
            array-like that numpy can convert.
        k: Number of neighbours requested per query vector.

    Returns:
        tuple: ``(distances, indices)`` exactly as returned by
        ``index.search``. NOTE: per the FAISS documentation, slots that
        cannot be filled (fewer than ``k`` vectors in the index) carry the
        id -1, so callers should skip negative ids.
    """
    # Tensors are detached and moved to the CPU before conversion; plain
    # arrays skip this step.
    if hasattr(query_embedding, "cpu"):
        query_embedding = query_embedding.detach().cpu().numpy()

    # The search call expects a contiguous float32 matrix of queries.
    query_embedding_np = np.ascontiguousarray(query_embedding, dtype=np.float32)
    if query_embedding_np.ndim == 1:
        # A single vector becomes a batch of one.
        query_embedding_np = query_embedding_np.reshape(1, -1)

    distances, indices = index.search(query_embedding_np, k)
    return distances, indices

In [None]:
# Notebook configuration
model_name = "all-MiniLM-L6-v2"  # sentence-transformers model used for the embeddings
chunk_size = 1000  # number of characters per text chunk
data_folder = "data"  # directory that is scanned for PDF files


In [None]:
# Process all PDFs in the data folder.
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   position of each chunk in all_chunks (and therefore the ids assigned when
#   the index is built) stable between runs.
# - lower(): also pick up files whose extension is written as ".PDF".
pdf_files = sorted(f for f in os.listdir(data_folder) if f.lower().endswith('.pdf'))

all_chunks = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(data_folder, pdf_file)
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text, chunk_size)
    all_chunks.extend(chunks)


In [None]:
# Initialize the sentence transformer model
model = SentenceTransformer(model_name)

# all_chunks is already populated by the PDF-processing cell above; the copy
# of that loop that used to live here re-read and re-chunked every PDF for an
# identical result, so it has been dropped.
if not all_chunks:
    # Encoding an empty list gives nothing to index; stop with a clear message
    # instead of failing later while the index is being built.
    raise ValueError(
        f"No text chunks were extracted from the PDFs in {data_folder!r}; nothing to index."
    )

# Get embeddings for all chunks
embeddings = get_embeddings(all_chunks, model)

# Create FAISS index
faiss_index = create_faiss_index(embeddings)


In [None]:
from langchain_huggingface import HuggingFaceEndpoint
import os
from langchain import PromptTemplate, LLMChain

In [None]:
# Set the Hugging Face token
sec_key = os.getenv('HF_TOKEN')
if not sec_key:
    # os.environ only accepts strings: assigning None below would raise an
    # unhelpful TypeError, so report the real problem instead.
    raise RuntimeError(
        "HF_TOKEN is not set; define it in the environment or in a .env file."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = sec_key

# Define the Hugging Face model endpoint.
# NOTE(review): max_new_tokens and huggingfacehub_api_token are the parameter
# names documented for HuggingFaceEndpoint; the previous max_length= / token=
# are not declared parameters of the class — confirm against the installed
# langchain_huggingface version.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=128,
    temperature=0.7,
    huggingfacehub_api_token=sec_key,
)


In [None]:
# Query
query_text = "what is machine learning?"
query_embedding = get_embeddings([query_text], model)

# Query the FAISS index
distances, indices = query_faiss_index(faiss_index, query_embedding)

# Get similar chunks.
# FAISS fills result slots it cannot satisfy (index holds fewer vectors than
# the requested k) with id -1; all_chunks[-1] would silently return the last
# chunk, so negative ids are skipped.
similar_chunks = [all_chunks[i] for i in indices[0] if i >= 0]


In [None]:
# Prompt layout: the retrieved chunks come first, then the question, then a
# cue asking the model to reason step by step.
template = (
    "Based on the following chunks: {similar_chunks}\n"
    "Question: {question}\n"
    "Answer: Let's think step by step."
)

prompt_template = PromptTemplate(
    template=template,
    input_variables=["similar_chunks", "question"],
)


In [None]:
# Combine the prompt template and the endpoint-backed LLM into one chain
llm_chain = LLMChain(prompt=prompt_template, llm=llm)

In [None]:



# Use the LLMChain to answer the question based on similar chunks.
# The chunks are joined into one string first: formatting the raw list into
# the prompt would insert its Python repr (brackets, quotes, escaped
# newlines) rather than readable text.
text = llm_chain.invoke({
    "similar_chunks": "\n\n".join(similar_chunks),
    "question": query_text
})

print(text['question'])
print(text['text'])