In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

access_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

### **Load the data**

In [4]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "data/attention-is-all-you-need.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()
len(docs)

11

In [5]:
print(docs[0].page_content[:100])

Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


### **Splitting Documents**

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

In [7]:
len(all_splits)

43

### **Store in a vector database**

In [8]:
### Select an embeddings model
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [9]:
### Define vector store
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [10]:
document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])

['00408df4-23ee-4ff0-af03-0306314be719', 'cfb8639b-3231-425d-a97b-1f09db3baf16', '6d842f06-36da-4c5d-ac78-32346e96e579']


### **Download our model Locally**

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

#model_name="meta-llama/Llama-3.2-3B-Instruct"
#model_name="microsoft/Phi-3-mini-4k-instruct"  # too much time it takes to response
#model_name = "openai-community/gpt2"   # Give error
model_name = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [12]:
tokenizer.save_pretrained(f"tokenizer/{model_name}")
model.save_pretrained(f"models/{model_name}")

In [13]:
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

I am Qwen, an Alibaba Cloud developed model designed to assist with answering questions and providing information on a wide range of topics.<|im_end|>


In [34]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.5
)

llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


### **Create a RAG pipeline**

In [35]:
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate

In [36]:
retriever = vector_store.as_retriever(
    search_type="mmr",  # or "similarity"
    search_kwargs={"k": 3}     # number of docs to retrieve per query
)

In [48]:
prompt_template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:
    """

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [49]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,           
    chain_type="stuff", 
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt} 
)

In [51]:
question = "What is the main contribution of this paper?"
answer = qa_chain.run(question)
print(answer)


    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: What is the main contribution of this paper? 
    Context: 2017.
[16] Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks.
In International Conference on Learning Representations, 2017.
[17] Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ICLR, 2015.
[18] Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. arXiv preprint
arXiv:1703.10722, 2017.
[19] Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen
Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. arXiv preprint
arXiv:1703.03130, 2017.
[20] Samy Bengio Łukasz Kaiser. Can active memory replace attention? In Advances in Neural
Information Proces