In [2]:
# Libraries 

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import faiss
import fitz

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Embedding Model

# The same model instance embeds both the document chunks and the queries.
EMBEDDING_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [4]:
# Read the PDF and concatenate the text of all pages, in page order,
# into one string.
pdf_path = "ds_interview_500.pdf"
with fitz.open(pdf_path) as pdf:
    # Iterate the pages directly and join once, instead of indexing by page
    # number and growing the string with repeated `+=`.
    text = "".join(page.get_text() for page in pdf)

In [5]:
# Chunking text 

def chunk_text(text, chunk_size, overlap_size):
    """
    Splits text into chunks of specified size with specified overlap.

    Consecutive chunks start ``chunk_size - overlap_size`` characters apart.
    The last chunk may be shorter than ``chunk_size``. Chunking stops as soon
    as a chunk reaches the end of the text, so no trailing chunk is produced
    whose content is already fully contained in the previous chunk.

    Args:
        text (str): The full text to split into chunks.
        chunk_size (int): The maximum number of characters in each chunk.
            Must be positive.
        overlap_size (int): The number of overlapping characters between
            consecutive chunks. Must satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        list of str: List of text chunks (empty when ``text`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap_size`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap_size < chunk_size:
        # overlap_size >= chunk_size would make the step zero or negative and
        # the loop below would never terminate.
        raise ValueError(
            f"overlap_size must satisfy 0 <= overlap_size < chunk_size, "
            f"got overlap_size={overlap_size}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap_size  # Distance between consecutive chunk starts
    text_length = len(text)

    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)  # Never run past the text
        chunks.append(text[start:end])
        if end == text_length:
            # The text is fully covered; another iteration would only repeat
            # the tail of this chunk.
            break
        start += step

    return chunks

# Chunking configuration; both sizes are measured in characters.
overlap_size = 100  # Characters shared between consecutive chunks
chunk_size = 500  # Upper bound on the length of each chunk

chunks = chunk_text(text, chunk_size=chunk_size, overlap_size=overlap_size)

In [6]:
# Documents to embed
# `documents` is a second name for the `chunks` list (no copy is made);
# retrieve() looks results up in this list by position.

documents = chunks

In [7]:
# Encode the documents
# Presumably yields one embedding row per document: the index cell reads
# `.shape[1]` as the embedding width — confirm against the encoder's docs.

document_embeddings = embedder.encode(documents)

In [8]:
# Initialize a FAISS index

# FAISS expects a C-contiguous float32 matrix. ascontiguousarray enforces
# that explicitly and, unlike np.array(...), does not copy the data when the
# embeddings already have that layout and dtype.
embedding_matrix = np.ascontiguousarray(document_embeddings, dtype=np.float32)

embedding_dim = embedding_matrix.shape[1]  # Width of one embedding vector
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)  # Row i of the index corresponds to documents[i]


In [9]:
# Function for retrieval

def retrieve(query, k=2):
    """
    Return the stored documents closest to ``query`` in embedding space.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level ``index``; hits are mapped back to ``documents`` by
    position.

    Args:
        query (str): The query text to embed and search for.
        k (int): Maximum number of documents to return.

    Returns:
        list of (str, float): ``(document, distance)`` pairs in the order
        returned by ``index.search``, with each distance rounded to two
        decimals. Fewer than ``k`` pairs are returned when the index holds
        fewer than ``k`` vectors; the list is empty when ``k`` is not
        positive or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with id -1, which would silently select documents[-1].
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query])
    distances, indices = index.search(query_embedding, k)
    return [
        (documents[doc_id], round(float(distance), 2))
        for doc_id, distance in zip(indices[0], distances[0])
        if doc_id >= 0  # Defensive: skip any "no result" padding entries
    ]

In [10]:
# Query
# Example question passed to retrieve() in the next cell.
query = "What Is A Recommender System?"

In [11]:
# Retrieval Result

retrieved_docs = retrieve(query, k=2)
retrieved_docs # <- Pass these to the LLM as context alongside the prompt

[('Answer:\nA recommender system is today widely deployed in multiple fields like movie\nrecommendations, music preferences, social tags, research articles, search\nqueries and so on. The recommender systems work as per collaborative and\ncontent-based filte\nring or by deploying a personality-based approach. This type of system works\nbased on a person’s past behavior in order to build a model for the future. This\nwill predict the future product buying, movie viewing or book reading by\npeople. It also c',
  17.7),
 ('unt of money spent\nfor election campaigning of a particular candidate, the amount of time spent in\ncampaigning, etc.\n225. What are Recommender Systems?\nRecommender Systems are a subclass of information filtering systems that are\nmeant to predict the preferences or ratings that a user would give to a\nproduct. Recommender systems are widely used in movies, news, research\narticles, products, social tags, music, etc.\nExamples include movie recommenders in IMDB, Netf