In [34]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Mirror the Gemini key under GOOGLE_API_KEY, the name this notebook exports for
# the Google GenAI clients created in later cells.
# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises an opaque TypeError, so fail early with an actionable message.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GOOGLE_API_KEY"] = gemini_api_key

In [35]:
# Data ingestion: read the source PDF into LangChain Document objects.
from langchain_community.document_loaders import PyPDFLoader

pdf_path = "attention.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Preview only the first few documents rather than dumping the whole list.
docs[:5]

[Document(metadata={'source': 'attention.pdf', 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network arc

In [36]:
# Break the loaded documents into overlapping chunks for embedding.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound on the size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
)
documents = text_splitter.split_documents(docs)

# Show how many chunks were produced, plus a short preview.
(len(documents), documents[:5])

(52,
 [Document(metadata={'source': 'attention.pdf', 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple netwo

In [37]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "langchainvectors"

# Create the index only on first run; an existing index with this name is reused as-is.
existing_index_names = pc.list_indexes().names()
index_already_exists = index_name in existing_index_names
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=serverless_spec,
    )

In [38]:
# Obtain a handle to the index named by `index_name`; later cells call
# `index.upsert(...)` and `index.query(...)` on it.
index = pc.Index(index_name)
# Bare expression so the notebook displays the handle's repr.
index

<pinecone.data.index.Index at 0x269a7591590>

In [39]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model shared by the document chunks (here) and the query (later cell).
embed_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Pull the raw text out of every chunk, then embed all chunks in one call.
text_chunks = [chunk.page_content for chunk in documents]
embeddings = embed_model.embed_documents(text_chunks)

# Sanity preview: number of vectors and the first few components of the first one.
embedding_count = len(embeddings)
first_vector_head = embeddings[0][:5]
(embedding_count, first_vector_head)

(52,
 [-0.014783564954996109,
  -0.010832996107637882,
  -0.05402161180973053,
  0.03313504159450531,
  0.04836973920464516])

In [40]:
# Build one (id, values, metadata) tuple per chunk. The chunk text is stored as
# metadata so it can be read back from query results.
vectors = []
for chunk_idx, chunk_text in enumerate(text_chunks):
    record = (f"pdf-chunk-{chunk_idx}", embeddings[chunk_idx], {"text": chunk_text})
    vectors.append(record)

# Write all records to the index; ids are deterministic, so re-running overwrites
# the same ids instead of creating new ones.
index.upsert(vectors)

{'upserted_count': 52}

In [41]:
query = "What is Attention?"

# Embed the question with the same model used for the chunks so the vectors are comparable.
query_embedding = embed_model.embed_query(query)

# Fetch the 3 nearest stored vectors together with the metadata saved at upsert time.
results = index.query(vector=query_embedding, top_k=3, include_metadata=True)

# Keep only matches that actually carry stored text. The index is reused when it
# already exists, so it may hold vectors without a "text" field; substituting a
# placeholder string for those would feed junk into the prompt as if it were
# document content.
retrieved_chunks = [
    chunk_text
    for chunk_text in (match["metadata"].get("text") for match in results["matches"])
    if chunk_text
]

# Format prompt for LLM: chunks separated by blank lines.
context_text = "\n\n".join(retrieved_chunks)
context_text

'3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3\n\nin the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes\nit more difficult to learn dependencies between distant positions [ 12]. In the Transformer this is\nreduced to a constant number of operations, albeit at the cost of reduced effective resolution due\nto averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as\ndescribed in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been\nused successfully in a variety of tasks including reading comprehension, abstractive summarization,\ntextual entailment and learning task-indepen

In [42]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used to answer the question from the retrieved context.
# NOTE(review): timeout=10 is presumably in seconds -- confirm it is long enough for full answers.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", timeout=10)

In [43]:
# Prompt template for the answer-generation step. It takes two variables:
# `context_text` (the retrieved chunks) and `query` (the user's question).
# NOTE: everything inside the triple-quoted string, including its leading
# indentation and newlines, is part of the template text.
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages(
    [
        # System message: instructions plus the retrieved text between explicit markers.
        (
            "system",
            """ 
                You are an AI assistant. Use the following extracted text from a document and your own knowledge to answer the query accurately:
                --- Start of Extracted Text ---
                {context_text}
                --- End of Extracted Text ---
            """
        ),
        # Human message: the question itself, filled from `query`.
        ("human", "{query}"),
    ]
)

In [44]:
# Compose the prompt template and the chat model into a single runnable:
# invoking `chain` formats the prompt and passes it on to `llm`.
chain = prompt | llm

In [45]:
# Fill both template variables, run the chain, and display only the text of the reply.
chain_inputs = {"query": query, "context_text": context_text}
chain.invoke(chain_inputs).content

'Attention is a function that maps a query and a set of key-value pairs to an output vector.  The output is a weighted sum of the values, where the weight assigned to each value is determined by a compatibility function of the query and the corresponding key.  Think of it as a way for the model to focus on the most relevant parts of the input when producing the output.\n\nMore specifically, the query, keys, and values are all vectors.  These are packed into matrices (Q, K, and V respectively). The attention mechanism calculates the output as:\n\nAttention(Q, K, V) = softmax(QKT / √dk)V\n\nWhere:\n\n* **Q** represents the matrix of queries.\n* **K** represents the matrix of keys.\n* **dk** is the dimension of the keys.\n* **V** represents the matrix of values.\n* **softmax** normalizes the weights to sum to 1.\n\nThe scaling factor 1/√dk is used to prevent the dot products from growing too large, which can push the softmax function into regions where it has extremely small gradients.  T