In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

access_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

### **Load the data**

In [2]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "data/attention-is-all-you-need.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()
len(docs)

  from .autonotebook import tqdm as notebook_tqdm


11

In [3]:
print(docs[0].page_content[:100])

Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


### **Splitting Documents**

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

In [5]:
len(all_splits)

43

### **Store in a vector database**

In [6]:
### Select an embeddings model
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [7]:
### Define vector store
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [8]:
document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])

['539242ad-86b0-431e-a0c4-8c129ecbad0b', '1329841d-b8ad-4a6b-81c0-481dfdc208e4', 'c51c8706-37ee-4a48-877d-792c23bfb8b6']


### **Download our model Locally**

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

model_name="meta-llama/Llama-3.2-3B-Instruct"

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.57s/it]


In [11]:
tokenizer.save_pretrained(f"tokenizer/{model_name}")
model.save_pretrained(f"models/{model_name}")

In [12]:
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|>


In [13]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.1
)

llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


### **Create a RAG pipeline**

In [14]:
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate

In [15]:
retriever = vector_store.as_retriever(
    search_type="similarity",  # or "mmr"
    search_kwargs={"k": 3}     # number of docs to retrieve per query
)

In [20]:
prompt_template = """
    You are an intelligent and helpful AI assistant designed to provide accurate, reliable, and natural responses.

    Your primary goal is to answer questions based on the provided context.
    If the context is relevant to the question, use it to produce a clear, well-structured answer.
    If the context does not contain the answer, rely on your general knowledge to respond accurately and naturally.
    If neither context nor general knowledge provides a valid answer, say "I don't know."

    You should:
    - Use plain text only (no bullet points, tables, or markdown formatting).
    - Respond naturally, like a human in conversation (e.g., if greeted, respond casually).
    - Provide concise answers when appropriate, but be detailed when necessary.
    - Avoid hallucination and never make up facts.
    - Be able to answer general world questions as well (e.g., “What is the capital of Bangladesh?”).
    - When summarizing or explaining content from the PDF, keep it precise and clear.
    - When unsure, politely express uncertainty.
    - You should act like an human agent, where after provide answer you should ask user that if anything more they want to know or not on that context. Not need to said this as same as this, just ask on similar type of things in a polite way.
    - Your tone should be soft.

    Context (from documents):
    {context}

    Question:
    {question}

    Answer:
    """

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [21]:
from langchain_classic.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,           
    chain_type="stuff", 
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt} 
)


In [22]:
question = "What is the model architecture in given context?"
answer = qa_chain.run(question)
print(answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



    You are an intelligent and helpful AI assistant designed to provide accurate, reliable, and natural responses.

    Your primary goal is to answer questions based on the provided context.
    If the context is relevant to the question, use it to produce a clear, well-structured answer.
    If the context does not contain the answer, rely on your general knowledge to respond accurately and naturally.
    If neither context nor general knowledge provides a valid answer, say "I don't know."

    You should:
    - Use plain text only (no bullet points, tables, or markdown formatting).
    - Respond naturally, like a human in conversation (e.g., if greeted, respond casually).
    - Provide concise answers when appropriate, but be detailed when necessary.
    - Avoid hallucination and never make up facts.
    - Be able to answer general world questions as well (e.g., “What is the capital of Bangladesh?”).
    - When summarizing or explaining content from the PDF, keep it precise and cle