In [1]:
# imports
import pypdf # convert pdf to text
from sentence_transformers import SentenceTransformer # embedding model
from transformers import AutoTokenizer, AutoModelForCausalLM # language model
import torch # helper for hugging face tools
import numpy as np # handling vectores
import faiss # vectore database
import os

In [2]:
# find uploaded document path
def documents_path(directory='/content/'):
  doc_list = os.listdir('/content/')
  doc_list.remove('.config')
  doc_list.remove('sample_data')
  return doc_list

# convert pdf to text
def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = pypdf.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

# saving all texts as 1 text file
def text_of_docs(docs_path):
  full_text = ""
  for p in docs_path:
    full_text += pdf_to_text("/content/" + p)
  return full_text

# chunk documents
def chunk_text(text, n, overlap):
    chunks = []
    for i in range(0, len(text), n - overlap):
        chunks.append(text[i:i + n])
    return chunks

# embedding chunks
def embedding_chunks(chunks, model_name = 'nomic-ai/modernbert-embed-base'):
  embedding_model = SentenceTransformer(model_name)
  embeddings = embedding_model.encode(chunks)
  return embeddings

# vectore database
def vector_db(embeddings, chunks):
  # Create a FAISS index
  index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 is Euclidean distance

  # Add our vectors to the index
  index.add(embeddings.astype(np.float32))  # FAISS requires float32

  # Create a mapping from index position to document chunk for retrieval
  index_to_doc_chunk = {i: doc for i, doc in enumerate(chunks)}

  return index, index_to_doc_chunk

# LLM model
def LLM(model_name = 'Qwen/Qwen2.5-7B-Instruct'):
  model = AutoModelForCausalLM.from_pretrained(
          model_name,
          torch_dtype="auto",
          device_map="auto"
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  return model, tokenizer

# generate prompt according to context and query
def query2prompt(query, index, index_mapping, top_k=10):
  # Step 1: Convert query to embedding
  query_embedding = embedding_chunks([query])
  query_embedding = query_embedding.astype(np.float32)  # Convert to float32 for FAISS

  # Step 2: Search for similar documents
  distances, indices = index.search(query_embedding, 10)

  # Step 3: Retrieve the actual document chunks
  retrieved_docs = [index_mapping[idx] for idx in indices[0]]

  # Create context from retrieved documents
  context = "nn".join(retrieved_docs)

  prompt = f"""
  Context:
     {context}
   <|user|>
   {query}
   <|assistant|>
   """

  return prompt

# generate output by prompt
def generate_message(prompt):
  messages = [
      {"role": "system", "content": '''You are a helpful AI assistant. Answer the question based only on the provided context.
      If you don't know the answer based on the context, say "I don't have enough information to answer this question."'''},
      {"role": "user", "content": prompt}
  ]

  text = tokenizer.apply_chat_template(
          messages,
          tokenize=False,
          add_generation_prompt=True
  )

  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  generated_ids = model.generate(
          **model_inputs,
          max_new_tokens=512
  )

  generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
  ]

  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

  return response

In [3]:
%%time
doc_list = documents_path()

CPU times: user 675 µs, sys: 0 ns, total: 675 µs
Wall time: 543 µs


In [4]:
%time
full_text = text_of_docs(doc_list)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


In [5]:
%%time
chunks = chunk_text(full_text, 500, 50)

CPU times: user 0 ns, sys: 922 µs, total: 922 µs
Wall time: 930 µs


In [6]:
%%time
embeddings = embedding_chunks(chunks)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


CPU times: user 17.4 s, sys: 1.54 s, total: 18.9 s
Wall time: 33.5 s


In [7]:
%%time
vector_database = vector_db(embeddings, chunks)
index = vector_database[0]
index_to_doc_mapping = vector_database[1]

CPU times: user 1.24 ms, sys: 4.02 ms, total: 5.26 ms
Wall time: 5.13 ms


In [8]:
%%time
LLM_model = LLM(model_name = 'Qwen/Qwen2.5-7B-Instruct')
model = LLM_model[0]
tokenizer = LLM_model[1]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



CPU times: user 4.36 s, sys: 4.89 s, total: 9.26 s
Wall time: 51.4 s


In [9]:
query = "who was defence against the dark arts professor"

In [10]:
%%time
prompt = query2prompt(query, index, index_to_doc_mapping, top_k=5)

CPU times: user 1.01 s, sys: 141 ms, total: 1.15 s
Wall time: 3.37 s


In [11]:
%%time
response = generate_message(prompt)

CPU times: user 24.2 s, sys: 2.68 s, total: 26.8 s
Wall time: 39.6 s


In [12]:
response

'According to the context provided, Professor Quirrell was the Defense Against the Dark Arts professor.'