In [9]:
import os
from dotenv import load_dotenv 
# Load variables from a local .env file (if any) into the process environment.
# load_dotenv() writes the values into os.environ itself, so copying the key
# back into os.environ by hand is unnecessary.
load_dotenv()

# Fail fast with an actionable message. The previous
# `os.environ[...] = os.getenv(...)` raised an opaque
# "TypeError: str expected, not NoneType" whenever the key was missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a "
        ".env file before running this notebook."
    )

In [2]:
from langchain_community.document_loaders import PyPDFLoader 
# Read the source PDF (path is relative to the notebook's working directory)
# into LangChain document objects.
pdf_path = "attention.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter 
# Break the loaded documents into chunks of at most 1000 characters with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(docs)

In [5]:
from langchain_openai import OpenAIEmbeddings, OpenAI 
from langchain_community.vectorstores import FAISS 

# Embed every chunk with OpenAI embeddings (library-default model) and index
# the vectors in an in-memory FAISS store. This calls the OpenAI API.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(documents, embeddings)

In [7]:
# Sanity-check the index: run a raw similarity search and display the text of
# the best-ranked chunk.
query = "An attention function can be described as mapping a query"
result = db.similarity_search(query)
top_hit = result[0]
top_hit.page_content

'Instead of performing a single attention function with dmodel-dimensional keys, values and queries,\nwe found it beneficial to linearly project the queries, keys and values h times with different, learned\nlinear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of\nqueries, keys and values we then perform the attention function in parallel, yielding dv-dimensional\n4To illustrate why the dot products get large, assume that the components of q and k are independent random\nvariables with mean 0 and variance 1. Then their dot product, q · k = Pdk\ni=1 qiki, has mean 0 and variance dk.\n4'

In [13]:
# Instantiate the language model with library defaults (no model name or
# sampling parameters are specified here).
# NOTE(review): `OpenAI` is the completion-style wrapper, yet the prompt used
# with it is a ChatPromptTemplate; the recorded answer starting with
# "Answer:" may be a symptom of that pairing — confirm whether a chat model
# is the intended choice.
llm = OpenAI()

In [14]:
from langchain_core.prompts import ChatPromptTemplate 
# Prompt for context-grounded question answering. It has two placeholders:
#   {context} - the retrieved document text
#   {input}   - the user's question
qa_template = """
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer. 
I will reward you, if the user finds the answer helpful
<context>
{context}
</context>
Question: {input}
"""
prompt = ChatPromptTemplate.from_template(qa_template)

In [16]:
from langchain.chains.combine_documents import create_stuff_documents_chain 

# Chain that formats a list of documents into the prompt and sends the
# resulting prompt to the LLM.
document_chain = create_stuff_documents_chain(llm, prompt)

In [19]:
# Expose the FAISS store through the retriever interface; no arguments are
# passed, so the library's default search settings apply.
retriever = db.as_retriever()

In [20]:
from langchain.chains import create_retrieval_chain
# Full RAG pipeline: the retriever fetches documents for the question and the
# document chain produces the answer from them.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [26]:
# Ask a question through the full pipeline; the chain reads it from the
# "input" key.
question = "How many heads are there is Multi-head Self Attention mechanism"
response = retrieval_chain.invoke({"input": question})

In [27]:
# Show the generated answer text; print() renders embedded newlines instead
# of displaying the escaped repr.
answer_text = response["answer"]
print(answer_text)

Answer: There are a total of 8 parallel attention layers, or heads, used in the Multi-head Self Attention mechanism. Each of these heads uses dk = dv = dmodel/h = 64, resulting in a reduced dimension for each head and a constant number of operations for relating signals from two arbitrary input or output positions.
