In [1]:
!pip install faiss-cpu
!pip install transformers
!pip install datasets
!pip install PyPDF2
!pip install sentence-transformers
!pip install langchain


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
# Sample text simulating a document
# This hard-coded manual excerpt is the only corpus used by the cells below.
# The triple-quoted literal opens and closes on its own line, so the string
# starts and ends with a newline, and each sentence sits on a separate line.
document_text = """
Welcome to the XYZ Product Manual.
To reset the device, press and hold the power button for 10 seconds.
For troubleshooting connectivity issues, refer to chapter 3.
Ensure that the device firmware is updated regularly.
Customer support can be contacted via support@xyz.com.
"""

# Echo the raw text so the reader can see exactly what will be chunked.
print(document_text)



Welcome to the XYZ Product Manual. 
To reset the device, press and hold the power button for 10 seconds. 
For troubleshooting connectivity issues, refer to chapter 3. 
Ensure that the device firmware is updated regularly. 
Customer support can be contacted via support@xyz.com.



In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the manual into pieces of at most 100 characters, allowing up to 20
# characters of overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
docs = text_splitter.split_text(document_text)

# Print every chunk under a 1-based label to verify the split.
for chunk_number, chunk_text in enumerate(docs, start=1):
    print(f"Chunk {chunk_number}:\n{chunk_text}\n")


Chunk 1:
Welcome to the XYZ Product Manual.

Chunk 2:
To reset the device, press and hold the power button for 10 seconds.

Chunk 3:
For troubleshooting connectivity issues, refer to chapter 3.

Chunk 4:
Ensure that the device firmware is updated regularly.

Chunk 5:
Customer support can be contacted via support@xyz.com.



In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to turn each chunk into a vector
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all chunks in one call; the result is a 2-D array whose shape is
# reported below.
embeddings = model.encode(docs)

# Report the chunk count and the embedding matrix shape, one per line.
print(
    f"Number of chunks: {len(docs)}",
    f"Embedding shape: {embeddings.shape}",
    sep="\n",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of chunks: 5
Embedding shape: (5, 384)


In [5]:
import faiss

# Build a flat L2 FAISS index over the chunk embeddings and retrieve the chunks
# closest to a test query.

# Get embedding dimension (number of columns of the embedding matrix)
dimension = embeddings.shape[1]

# Create FAISS index. FAISS operates on float32 matrices, so coerce explicitly
# (a no-op when the array is already float32) instead of making a blind copy.
index = faiss.IndexFlatL2(dimension)
index.add(np.asarray(embeddings, dtype="float32"))

# Test a query
query = "How do I reset the device?"
query_vec = np.asarray(model.encode([query]), dtype="float32")

# Never ask for more neighbours than there are indexed chunks: FAISS pads
# missing results with the index -1, and docs[-1] would then silently return
# the last chunk as if it were a match.
top_k = min(3, index.ntotal)
D, I = index.search(query_vec, k=top_k)  # D: distances, I: row indices into docs

print("Top chunks for the query:")
for i in I[0]:
    print(docs[i])


Top chunks for the query:
To reset the device, press and hold the power button for 10 seconds.
Ensure that the device firmware is updated regularly.
Customer support can be contacted via support@xyz.com.


In [6]:
from transformers import pipeline

# Generate an answer with a small seq2seq model, grounded in the retrieved chunks.

# Load a text generation model
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")

# Combine top chunks into context. FAISS reports -1 for result slots it could
# not fill; skip those so docs[-1] (the last chunk) is never pulled in by
# accident.
context = " ".join([docs[i] for i in I[0] if i >= 0])

# Ask the model the same question that was used for retrieval, so the question
# and the retrieved context cannot drift apart when the query is edited.
question = query
prompt = f"Answer the question: {question} based on the context: {context}"
answer = qa_pipeline(prompt)

print("Answer from the model:")
print(answer[0]['generated_text'])


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Answer from the model:
Press and hold the power button for 10 seconds. Ensure that the device firmware is updated regularly. Customer support can be contacted via support@xyz.com.
