# Downloads

In [10]:
#!pip install langchain langchain_community langchain_huggingface faiss-cpu
#!pip install sentence-transformers
#!pip install hf_xet
#!pip install pypdf
#!pip install SpeechRecognition pyttsx3
#!pip install transformers torch faiss-cpu



# Imports

In [43]:
import pandas as pd
import os
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from PIL import Image
import numpy as np
import torch
from langchain.embeddings.base import Embeddings

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

# Loading Data

In [44]:
DATA_PATH = "../Cleaned Data/"


def load_pdf_files(data):
    """Load the PDF files of a directory through LangChain loaders.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each one parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Read the PDFs under DATA_PATH once; the result is kept in `documents`.
documents = load_pdf_files(data=DATA_PATH)

# Creating Chunks

If answers are missing context, increase `chunk_overlap`.
If the chatbot retrieves too much irrelevant information, decrease `chunk_size`.

Test one: `chunk_size` = 500, `chunk_overlap` = 50

`chunk_overlap` makes consecutive chunks share some text, which preserves continuity and context across chunk boundaries.

In [45]:
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    The two tuning knobs are exposed as parameters so that different settings
    can be tried without editing the function; the defaults reproduce the
    original hard-coded configuration (500 / 50).

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 50.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [46]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = create_chunks(documents)
print(f"Length of text chunks {len(text_chunks)}")

Length of text chunks 1915


# Creating Vector Embeddings

In [47]:
def get_ve_model():
    """Build the Hugging Face embedding model used for vector embeddings.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L12-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L12-v2"
    )


In [48]:
# Instantiate the text embedding model once and keep it in `embedding_model`.
embedding_model = get_ve_model()

# Storing Vector Embeddings in FAISS

In [49]:
# Directory (relative to the working directory) where the text index is saved.
FAISS_DB_PATH = 'FAISS Database/'

# Embed the text chunks into a FAISS store, then write it to FAISS_DB_PATH.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
db.save_local(folder_path=FAISS_DB_PATH)

# Handling Images

In [38]:
class CLIPEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a CLIP model and processor.

    ``embed_documents`` interprets every input string as an image file path and
    returns CLIP image features for it, while ``embed_query`` embeds a text
    query with the CLIP text encoder.

    Args:
        model: Object exposing ``get_image_features`` and ``get_text_features``
            (presumably a ``transformers.CLIPModel`` -- confirm with caller).
        processor: Callable preprocessing images/text into model inputs
            (presumably a ``transformers.CLIPProcessor``).
        batch_size: Number of images pushed through the model per forward
            pass in ``embed_documents``. Must be >= 1. Defaults to 32.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, model, processor, batch_size=32):
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.model = model
        self.processor = processor
        self.batch_size = batch_size

    def embed_documents(self, texts):
        """Embed images given by file path.

        Args:
            texts: Iterable of image file paths (the name follows the
                ``Embeddings`` interface; the values are paths, not prose).

        Returns:
            A list with one embedding (list of floats) per input path, in
            input order. An empty input yields an empty list.
        """
        paths = list(texts)
        embeddings = []
        # Work in batches so memory use does not grow with the dataset size.
        for start in range(0, len(paths), self.batch_size):
            pil_images = []
            for path in paths[start:start + self.batch_size]:
                # The context manager closes the file handle; convert() forces
                # the pixel data to be read and returns an independent image.
                with Image.open(path) as img:
                    pil_images.append(img.convert("RGB"))
            # No padding argument: it only applies to tokenized text.
            inputs = self.processor(images=pil_images, return_tensors="pt")
            with torch.no_grad():
                image_features = self.model.get_image_features(**inputs)
            embeddings.extend(image_features.cpu().numpy().tolist())
        return embeddings

    def embed_query(self, text):
        """Embed a text query.

        Args:
            text: Query string.

        Returns:
            The embedding of ``text`` as a flat list of floats.
        """
        # truncation=True keeps over-long queries within the tokenizer's
        # maximum length instead of failing inside the text encoder.
        inputs = self.processor(
            text=text, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
        # The model output is batched; a single query sits in row 0.
        return text_features.cpu().numpy().tolist()[0]

def load_image_files(base_image_path, model, processor):
    """Index every image under a directory tree in a FAISS store.

    Walks ``base_image_path`` recursively, wraps each image path in a
    ``Document`` (the path is both the page content and the ``source``
    metadata) and embeds them through ``CLIPEmbeddings``.

    Args:
        base_image_path: Root directory to search for images.
        model: Model forwarded to ``CLIPEmbeddings``.
        processor: Processor forwarded to ``CLIPEmbeddings``.

    Returns:
        The store returned by ``FAISS.from_documents``, or ``None`` when no
        image file is found.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Case-insensitive match so files such as "photo.JPG" are not skipped;
    # sorting makes the index order independent of filesystem listing order.
    all_image_paths = sorted(
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(base_image_path)
        for filename in files
        if filename.lower().endswith(image_extensions)
    )

    if not all_image_paths:
        print(f"No images found in {base_image_path}")
        return None

    print(f"Found {len(all_image_paths)} images. Processing embeddings...")

    documents = [
        Document(page_content=path, metadata={"source": path})
        for path in all_image_paths
    ]

    clip_embeddings = CLIPEmbeddings(model=model, processor=processor)
    faiss_db = FAISS.from_documents(documents, clip_embeddings)

    print("FAISS image database created successfully.")
    return faiss_db

In [39]:
def load_clip_model(model_name="openai/clip-vit-base-patch32"):
    """Load a pretrained CLIP model together with its processor.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.
            Defaults to ``"openai/clip-vit-base-patch32"``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # Imported locally: the notebook's import cell never brings CLIPModel /
    # CLIPProcessor into scope, so a fresh kernel would raise NameError here.
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    return model, processor

In [40]:
def get_text_embedding_with_clip(text, model, processor):
    """Embed text with the CLIP text encoder.

    Args:
        text: Text (or texts) forwarded to ``processor`` as ``text=``.
        model: Object exposing ``get_text_features``.
        processor: Callable turning the text into model inputs.

    Returns:
        The text features as a NumPy array, exactly as shaped by the model
        output (batched, one row per input text).
    """
    # truncation=True keeps over-long text within the tokenizer's limit.
    inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping, as CLIPEmbeddings does.
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    # .cpu() makes the conversion work when the model is not on the CPU.
    return text_features.detach().cpu().numpy()

In [41]:
def get_image_embedding_with_clip(image_path, model, processor):
    """Embed a single image file with the CLIP image encoder.

    Args:
        image_path: Path of the image to open.
        model: Object exposing ``get_image_features``.
        processor: Callable turning the image into model inputs.

    Returns:
        The image features as a NumPy array, exactly as shaped by the model
        output (batched, a single row for the single image).
    """
    # The context manager releases the file handle; convert() reads the pixel
    # data and returns an independent image that outlives the closed file.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping, as CLIPEmbeddings does.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    # .cpu() makes the conversion work when the model is not on the CPU.
    return image_features.detach().cpu().numpy()

In [42]:
# Input directory of images and output directory of the image index.
BASE_IMAGE_PATH = '../Data/Images'
IMAGE_FAISS_DB_PATH = 'FAISS_Image_DB/'

clip_model, clip_processor = load_clip_model()
image_db = load_image_files(BASE_IMAGE_PATH, clip_model, clip_processor)

# load_image_files returns None when it finds no images; fail with a clear
# message instead of an AttributeError on `None.save_local`.
if image_db is None:
    raise FileNotFoundError(
        f"No images found under {BASE_IMAGE_PATH!r}; image index was not created."
    )

# exist_ok=True replaces the separate existence check.
os.makedirs(IMAGE_FAISS_DB_PATH, exist_ok=True)
image_db.save_local(IMAGE_FAISS_DB_PATH)

Found 205 images. Processing embeddings...
FAISS image database created successfully.
