In [49]:
import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from dotenv import load_dotenv

### Load api key 
load_dotenv()
HUGGINGFACEHUB_API_KEY = os.getenv("HUGGINGFACEHUB_API_TOKEN")

### define llm model
llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    temperature=0.7,
    max_new_tokens=1024,
    huggingfacehub_api_token= HUGGINGFACEHUB_API_KEY
)
model = ChatHuggingFace(llm=llm)

In [50]:
### Select an embeddings model
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [51]:
### Define vector store
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

### **Loading documents**

In [52]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "data/attention-is-all-you-need.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()
len(docs)

11

In [53]:
print(docs[0].page_content[:100])

Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai


### **Splitting documents**

In [54]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

In [55]:
len(all_splits)

43

### **Storing documents**

In [56]:
document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])

['52e81fa8-60f6-479c-9964-47015effc93f', 'f4ff3249-a44d-486b-af2c-a878d6bff025', '64ace985-8974-4258-9820-2573f30554d9']


In [None]:
response= model.invoke("What is Python?")
print(response)

content='**What is Python?**\n\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by Guido van Rossum and is now maintained by the Python Software Foundation.\n\n**Key Features of Python:**\n\n1.  **Easy to Learn:** Python has a simple syntax and is relatively easy to learn, making it a great language for beginners.\n2.  **High-Level Language:** Python is a high-level language, meaning it abstracts away many low-level details, allowing developers to focus on the logic of the program without worrying about the underlying implementation.\n3.  **Interpreted Language:** Python code is interpreted at runtime, which means that the code is not compiled before it is executed. This makes it easier to write and test code quickly.\n4.  **Object-Oriented:** Python is an object-oriented language, which means it supports the

### **RAG QNA**

In [71]:
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate


In [72]:
retriever = vector_store.as_retriever(
    search_type="similarity",  # or "mmr"
    search_kwargs={"k": 3}     # number of docs to retrieve per query
)


In [73]:
prompt_template = """
Use the following context to answer the question.
If the answer is not contained in the context, respond as "I don't know".

Context:
{context}

Question:
{question}
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)


In [74]:
qa_chain = RetrievalQA.from_chain_type(
    llm=model,           # your ChatHuggingFace instance
    chain_type="stuff",  # "stuff", "map_reduce", "refine"
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}  # optional
)


In [79]:
question = "What is the model architecture in given context?"
answer = qa_chain.run(question)
print(answer)


The Transformer model architecture is described as a fully connected feed-forward network. It consists of a stack of identical layers, which includes two sub-layers in each encoder layer and three sub-layers in each decoder layer. 

In the encoder layer, the two sub-layers are connected with residual connections, followed by layer normalization. The output of each sub-layer is calculated as LayerNorm(x+ Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself.

In the decoder layer, a third sub-layer is added, which performs multi-head attention over the output of the encoder stack. The decoder layers also employ residual connections and layer normalization, with the self-attention sub-layer modified to prevent positions from attending to subsequent positions.
