In [5]:
from dotenv import load_dotenv
import os

load_dotenv()

access_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

### **Load the data**

In [6]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "data/attention-is-all-you-need.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()
len(docs)

11

In [7]:
print(docs[0].page_content[:100])

Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


### **Splitting Documents**

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

In [9]:
len(all_splits)

43

### **Store in a vector database**

In [10]:
### Select an embeddings model
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [11]:
### Define vector store
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [12]:
document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])

['0785f3ae-7ee1-433b-8143-700016edf7fb', '90bf0e96-e6d9-4350-9b62-213994ccefe1', '2f4d482e-6c28-429b-a90b-fa48372d9733']


### **Download our model Locally**

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

#model_name="meta-llama/Llama-3.2-3B-Instruct"
#model_name="microsoft/Phi-3-mini-4k-instruct"  # too much time it takes to response
#model_name = "openai-community/gpt2"   # Give error
model_name = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [24]:
tokenizer.save_pretrained(f"tokenizer/{model_name}")
model.save_pretrained(f"models/{model_name}")

In [25]:
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

I am an AI developed by Alibaba Cloud.<|im_end|>


In [26]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.5
)

llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


### **Create a RAG pipeline**

In [27]:
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate

In [28]:
retriever = vector_store.as_retriever(
    search_type="similarity",  # or "mmr"
    search_kwargs={"k": 3}     # number of docs to retrieve per query
)

In [29]:
prompt_template = """
    You are an intelligent and helpful AI assistant designed to provide accurate, reliable, and natural responses.

    Your primary goal is to answer questions based on the provided context.(Do not mention context in response)
    If the context is relevant to the question, use it to produce a clear, well-structured answer.
    If the context does not contain the answer, rely on your general knowledge to respond accurately and naturally.
    If neither context nor general knowledge provides a valid answer, say "I don't know."

    You should:
    - Use plain text only (no bullet points, tables, or markdown formatting).
    - Respond naturally, like a human in conversation (e.g., if greeted, respond casually).
    - Provide concise answers when appropriate, but be detailed when necessary.
    - Avoid hallucination and never make up facts.
    - Be able to answer general world questions as well (e.g., “What is the capital of Bangladesh?”).
    - When summarizing or explaining content from the PDF, keep it precise and clear.
    - When unsure, politely express uncertainty.
    - You should act like an human agent, where after provide answer you should ask user that if anything more they want to know or not on that context. Not need to said this as same as this, just ask on similar type of things in a polite way.
    - Your tone should be soft.

    Context (from documents):
    {context}

    Question:
    {question}

    Answer:
    """

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [30]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,           
    chain_type="stuff", 
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt} 
)


In [32]:
question = "What is this paper about?"
answer = qa_chain.run(question)
print(answer)


    You are an intelligent and helpful AI assistant designed to provide accurate, reliable, and natural responses.

    Your primary goal is to answer questions based on the provided context.(Do not mention context in response)
    If the context is relevant to the question, use it to produce a clear, well-structured answer.
    If the context does not contain the answer, rely on your general knowledge to respond accurately and naturally.
    If neither context nor general knowledge provides a valid answer, say "I don't know."

    You should:
    - Use plain text only (no bullet points, tables, or markdown formatting).
    - Respond naturally, like a human in conversation (e.g., if greeted, respond casually).
    - Provide concise answers when appropriate, but be detailed when necessary.
    - Avoid hallucination and never make up facts.
    - Be able to answer general world questions as well (e.g., “What is the capital of Bangladesh?”).
    - When summarizing or explaining content f