<a href="https://colab.research.google.com/github/pratheerth/oreilly_slm/blob/main/O'Reilly_SLM_Part2_Document_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Small Language Models Part 2 - Building an Intelligent Q&A System

### Setting Up The Foundation

In [None]:
!pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface sentence-transformers transformers accelerate torch pypdf bitsandbytes --quiet
print("LlamaIndex and related libraries installed successfully!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m128.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from pathlib import Path # Filesystem checks before handing a path to the document loader

import torch # Base PyTorch library, Gemma runs on this
from transformers import AutoModelForCausalLM, AutoTokenizer # For loading Gemma from Hugging Face
from transformers import BitsAndBytesConfig # Quantization settings passed to from_pretrained via `quantization_config`

# LlamaIndex specific imports
from llama_index.core import Settings # This is a central place to configure defaults in LlamaIndex
from llama_index.llms.huggingface import HuggingFaceLLM # A wrapper to make our Hugging Face Gemma model compatible with LlamaIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding # A wrapper for using Hugging Face embedding models

In [None]:
# Authenticate this session with the Hugging Face Hub before downloading models.
# NOTE(review): presumably required because the Gemma checkpoint is access-gated — confirm.
from huggingface_hub import notebook_login
# Renders an interactive login widget in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# --- Load Gemma 2 2B ---
# Downloads (or reuses the cached copy of) the tokenizer and the 4-bit quantized
# model for the checkpoint named below, and binds them to `tokenizer_gemma` and
# `model_gemma` for the cells that follow.
model_id_gemma = "google/gemma-2-2b-it"
tokenizer_gemma = AutoTokenizer.from_pretrained(model_id_gemma)

# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated and
# scheduled for removal; the supported route is a `BitsAndBytesConfig` object
# supplied through `quantization_config`.
quantization_config_gemma = BitsAndBytesConfig(load_in_4bit=True)

model_gemma = AutoModelForCausalLM.from_pretrained(
    model_id_gemma,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config_gemma,
    device_map="auto"  # let the loader decide where to place the weights
)
print("\nGemma 2 2B loaded")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]


Gemma 2 2B loaded


In [None]:
# Instruction text that is handed to the wrapper as its system prompt. It tells
# the model to answer strictly from the supplied context and gives it an exact
# fallback sentence to use when the context has no answer.
system_prompt = "You are a helpful and precise Q&A assistant. Your primary goal is to answer questions based *only* on the contextual information provided to you. If the answer cannot be found within the provided context, please state clearly 'The provided context does not contain the answer to this question.'"

# Wrap the model and tokenizer objects loaded earlier so LlamaIndex can drive them.
llm_gemma = HuggingFaceLLM(
    # What to run
    model=model_gemma,
    tokenizer=tokenizer_gemma,
    # How to prompt it
    system_prompt=system_prompt,
    # Size limits, in tokens: total context window and per-answer generation cap
    context_window=8192,
    max_new_tokens=500,
    # Sampling settings forwarded to generation (low temperature for factual Q&A)
    generate_kwargs={"do_sample": True, "temperature": 0.2},
    # NOTE(review): the model object is passed in already loaded, so this
    # placement hint may be redundant here — confirm against the wrapper's docs.
    device_map="auto",
)

# Make this wrapper the default LLM for every LlamaIndex component created later.
Settings.llm = llm_gemma
print("\n Gemma 2 2B SLM wrapped and set as default in LlamaIndex Settings.")


 Gemma 2 2B SLM wrapped and set as default in LlamaIndex Settings.


In [None]:
# Hugging Face ID of the sentence-embedding model used to vectorize text
embed_model_id = "sentence-transformers/all-MiniLM-L6-v2"

# Build the LlamaIndex embedding object; the model files are downloaded on first use
embed_model = HuggingFaceEmbedding(model_name=embed_model_id)

# Register it as the default embedding model for all LlamaIndex operations
Settings.embed_model = embed_model

# Global default chunk size LlamaIndex uses when splitting documents into pieces.
# 512 is a common starting point; tune it if retrieval quality is poor.
Settings.chunk_size = 512

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Building the Q&A Pipeline

In [None]:
from llama_index.core import SimpleDirectoryReader, Document

# Define the filename exactly as it appears in your Colab files after uploading
pdf_filename = "Artificial intelligence - Wikipedia.pdf" # <--- Make sure this matches your uploaded filename!

# Bind `documents` before anything can fail. Later cells read this name, so it
# must never be undefined, and it must not keep a stale list from an earlier run
# of this cell when the current run does not succeed.
documents = []

print(f"\nAttempting to load the document: {pdf_filename}...")
try:
    # Check for the file ourselves so a missing upload always reaches the
    # FileNotFoundError branch below, regardless of which exception type the
    # reader itself would use to report a bad path.
    if not Path(pdf_filename).is_file():
        raise FileNotFoundError(pdf_filename)

    # SimpleDirectoryReader can take a list of input files.
    # It will use appropriate loaders based on file extensions.
    loader = SimpleDirectoryReader(input_files=[pdf_filename])
    documents = loader.load_data() # This loads and parses the PDF.

    if documents:
        print(f"Successfully loaded {len(documents)} 'Document' object(s) from the PDF.")
    else:
        print("No documents were loaded. Please double-check the filename and ensure it was uploaded correctly.")

except FileNotFoundError:
    print(f"ERROR: The file '{pdf_filename}' was not found in your Colab session storage.")
    print("Please make sure you've uploaded it and the filename matches exactly.")
    # Re-raise so "Run All" stops here instead of failing later with a less
    # helpful error when the index is built from nothing.
    raise
except Exception as e:
    print(f"An unexpected error occurred while loading the PDF: {e}")
    # Same reasoning: surface the real traceback rather than continuing silently.
    raise




Attempting to load the document: Artificial intelligence - Wikipedia.pdf...
Successfully loaded 33 'Document' object(s) from the PDF.


In [None]:
from llama_index.core import VectorStoreIndex

print("\nBuilding the VectorStoreIndex from the loaded document(s)...")
# Build a searchable vector index from the `documents` list loaded in the previous cell.
# LlamaIndex chunks the text and embeds each chunk using the defaults registered
# on `Settings` (Settings.embed_model and Settings.chunk_size).
# Embedding every chunk of a 33-page PDF can take a little while.
index = VectorStoreIndex.from_documents(documents)
print("Index built successfully!")


Building the VectorStoreIndex from the loaded document(s)...
Index built successfully!


In [None]:
# Create a single-shot question-answering interface over the index (default settings).
query_engine = index.as_query_engine()
print("Query engine created and ready to answer questions.")

Query engine created and ready to answer questions.


### Testing the System

In [None]:
# Test 1: a broad factual question the document should be able to answer.
question1 = "What are some of the techniques used in AI research?"
print("Question 1:", question1)

# Send the question to the query engine and keep the response object
response1 = query_engine.query(question1)

# Printing the response object shows its text form
print("\nAnswer 1:", response1)

Question 1: What are some of the techniques used in AI research?

Answer 1: 
The techniques used in AI research include:
- Search and mathematical optimization
- Formal logic
- Artificial neural networks
- Methods based on statistics, operations research, and economics. 
- Also, AI draws upon psychology, linguistics, philosophy, neuroscience, and other fields. 



In [None]:
# Test 2: a comparison question; checks how the assistant behaves when the
# retrieved context may not contain a direct answer.
question2 = "What is the difference between AI and AGI?"
print("Question 2:", question2)
response2 = query_engine.query(question2)
print("\nAnswer 2:", response2)

Question 2: What is the difference between AI and AGI?

Answer 2: 
The provided context does not contain the answer to this question. 



In [None]:
# Test 3: a style-constrained request (simple language, detailed answer).
question3 = "Can you explain NLP to me like I'm 5? Make it detailed"
print("Question 3:", question3)
response3 = query_engine.query(question3)
print("\nAnswer 3:", response3)

Question 3: Can you explain NLP to me like I'm 5? Make it detailed

Answer 3: 
Imagine you have a robot friend who wants to talk to you. But your robot friend doesn't understand human language, it only understands numbers and symbols. 

So, we need to teach our robot friend how to understand human language. We can do this by using "Natural Language Processing" or NLP. 

NLP is like giving our robot friend a big book full of human language, like stories, songs, and poems. This book helps the robot learn how to understand and speak human language. 

We can also teach our robot friend how to understand different words and their meanings. This is called "word-sense disambiguation". 

Finally, we can use "machine learning" to train our robot friend to understand human language even better. 

So, NLP is like teaching a robot friend how to talk to us in human language. 





### Adding Conversational Capabilities

In [None]:
# .as_chat_engine() creates an engine suitable for conversation.
# It typically uses a ChatMemoryBuffer by default to store conversation history.
# chat_mode="condense_question" is a good default for RAG:
# it takes the new question and the recent chat history, condenses them into a
# standalone question, and then queries the index with that rewritten question.
# Conversational interface over the same index. verbose=True makes the engine
# print the question it actually sends to the index ("Querying with: ...").
chat_engine = index.as_chat_engine(chat_mode="condense_question", verbose=True)
print("Chat engine created successfully! It's ready for a conversation.")

Chat engine created successfully! It's ready for a conversation.


In [None]:
# First turn of the conversation: asks for a numbered list so that a later
# follow-up can refer to an item by its number.
q1 = "Please list six current, real-world applications of artificial intelligence, numbering them 1-6."
print("User:", q1)
r1 = chat_engine.chat(q1)
print("Assistant:", r1)

User: Please list six current, real-world applications of artificial intelligence, numbering them 1-6.
Querying with: Please list six current, real-world applications of artificial intelligence, numbering them 1-6.
Assistant: 
1. Advanced web search engines (e.g., Google Search)
2. Recommendation systems (used by YouTube, Amazon, and Netflix)
3. Virtual assistants (e.g., Google Assistant, Siri, and Alexa)
4. Autonomous vehicles (e.g., Waymo)
5. Generative and creative tools (e.g., ChatGPT and AI art)
6. Superhuman play and analysis in strategy games (e.g., chess and Go) 



In [None]:
# Follow-up turn that only makes sense with the chat history ("5" refers to the
# fifth item of the previous answer). The reply is streamed piece by piece.
q2 = "Can you elaborate a little more on 5?"
print("User:", q2)

streaming_r2 = chat_engine.stream_chat(q2)

# Emit each piece as it arrives, without inserting newlines between pieces
for token in streaming_r2.response_gen:
    print(token, end="")

User: Can you elaborate a little more on 5?
Querying with: What are some examples of generative and creative tools powered by AI? 


Some examples of generative and creative tools powered by AI include ChatGPT, and AI art. 


In [None]:
# Clear the conversation history so the next chat turn starts from a clean slate
chat_engine.reset()