In [1]:
import os
from pathlib import Path
import nest_asyncio; nest_asyncio.apply()
import dotenv; dotenv.load_dotenv()
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# Locate the input data relative to the directory the kernel was started in.
current_dir = Path.cwd()
data_dir = current_dir.joinpath('data')
# NOTE: despite the `_dir` suffix this is the path of a single PDF file.
pdf_dir = data_dir.joinpath('attention.pdf')

In [2]:
# Read the LlamaParse API key from the environment; this is None when the variable is unset.
llamaindex_api = os.environ.get('LLAMAINDEX_API')

In [3]:
# Route every ".pdf" under `data_dir` through LlamaParse (markdown output) and load the results.
parser = LlamaParse(result_type='markdown', api_key=llamaindex_api)
pdf_reader = SimpleDirectoryReader(input_dir=data_dir, file_extractor={".pdf": parser})
docs = pdf_reader.load_data(num_workers=6)

In [4]:
# Echo the parsed documents (this displays the whole list, including each document's text).
docs

[Document(id_='df6c50b0-cf01-411b-8b99-1e9efdbd9c44', embedding=None, metadata={'file_path': 'd:\\GDPLabs_AIE_Naufal\\part_3\\data\\attention.pdf', 'file_name': 'attention.pdf', 'file_type': 'application/pdf', 'file_size': 2215244, 'creation_date': '2024-07-07', 'last_modified_date': '2024-07-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='# Attention Is All You Need\n\n|Ashish Vaswani∗|Noam Shazeer∗|Niki Parmar∗|Jakob Uszkoreit∗|\n|---|---|---|---|\n|Google Brain|Google Brain|Google Research|Google Research|\n|avaswani@google.com|noam@google.com|nikip@google.com|usz@google.com|\n\n|Llion Jones∗|Aidan N. Gomez∗ †|Łukasz Kaiser∗|\n|---|---|---|\n|Google Research|University of Toronto|Google Brain|\n|llion@google.com|aidan@cs.toronto.edu|lukaszkaiser

In [5]:
# Number of Document objects returned by the reader.
len(docs)

15

In [6]:
# Show the first document's attributes as a plain dict.
vars(docs[0])

{'id_': 'df6c50b0-cf01-411b-8b99-1e9efdbd9c44',
 'embedding': None,
 'metadata': {'file_path': 'd:\\GDPLabs_AIE_Naufal\\part_3\\data\\attention.pdf',
  'file_name': 'attention.pdf',
  'file_type': 'application/pdf',
  'file_size': 2215244,
  'creation_date': '2024-07-07',
  'last_modified_date': '2024-07-07'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'text': '# Attention Is All You Need\n\n|Ashish Vaswani∗|Noam Shazeer∗|Niki Parmar∗|Jakob Uszkoreit∗|\n|---|---|---|---|\n|Google Brain|Google Brain|Google Research|Google Research|\n|avaswani@google.com|noam@google.com|nikip@google.com|usz@google.com|\n\n|Llion Jones∗|Aidan N. Gomez∗ †|Łukasz Kaiser∗|\n|---|---|---|\n|Google Research|University of Toronto|Google Brain|\n|ll

In [7]:
# Print the first document's text so its newlines are rendered rather than shown as "\n" escapes.
print(docs[0].text)

# Attention Is All You Need

|Ashish Vaswani∗|Noam Shazeer∗|Niki Parmar∗|Jakob Uszkoreit∗|
|---|---|---|---|
|Google Brain|Google Brain|Google Research|Google Research|
|avaswani@google.com|noam@google.com|nikip@google.com|usz@google.com|

|Llion Jones∗|Aidan N. Gomez∗ †|Łukasz Kaiser∗|
|---|---|---|
|Google Research|University of Toronto|Google Brain|
|llion@google.com|aidan@cs.toronto.edu|lukaszkaiser@google.com|

Illia Polosukhin∗ ‡illia.polosukhin@gmail.com

Abstract

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and re

In [8]:
# Add `page` and `source` metadata to every parsed document.
# Pages are counted per source file (1-based) instead of by position in `docs`, so the
# numbering does not run on from one file into the next when `data_dir` holds several files.
# Assumes the reader yields one Document per page, in page order - TODO confirm.
pages_seen = {}
for doc in docs:
    source_path = doc.metadata['file_path']
    pages_seen[source_path] = pages_seen.get(source_path, 0) + 1
    doc.metadata['page'] = pages_seen[source_path]
    doc.metadata['source'] = source_path

In [9]:
# Spot-check the metadata of the last document. `docs` is indexed explicitly so the cell
# does not depend on a loop variable left over from another cell.
docs[-1].metadata

{'file_path': 'd:\\GDPLabs_AIE_Naufal\\part_3\\data\\attention.pdf',
 'file_name': 'attention.pdf',
 'file_type': 'application/pdf',
 'file_size': 2215244,
 'creation_date': '2024-07-07',
 'last_modified_date': '2024-07-07',
 'page': 15,
 'source': 'd:\\GDPLabs_AIE_Naufal\\part_3\\data\\attention.pdf'}

In [10]:
# Page number of the first document via key lookup (raises KeyError if `page` is missing).
docs[0].metadata['page']

1

In [11]:
# Same lookup via `.get`, which yields None instead of raising when `page` is missing.
docs[0].metadata.get('page')

1

In [13]:
# convert llama_index Document so it has .page_content attribute. Needed so that we can use TokenTextSplitter from Langchain
from llama_index.core.schema import Document as BaseDocument

class CustomDocument(BaseDocument):
    """llama_index Document that also exposes its text as `page_content`.

    `page_content` is a read/write alias for `text`; it is added so the documents
    can be handed to LangChain's TokenTextSplitter.
    """

    def __init__(self, page_content: str, **kwargs):
        # Store `page_content` as the base document's `text`; all remaining keyword
        # arguments are forwarded to the base constructor unchanged.
        super().__init__(text=page_content, **kwargs)     

    @property
    def page_content(self):
        """Return the document's text."""
        return self.text

    @page_content.setter
    def page_content(self, value):
        # Assigning to `page_content` writes through to `text`.
        # NOTE(review): BaseDocument looks like a pydantic model - confirm that its
        # attribute handling actually dispatches to this property setter.
        self.text = value

def convert_documents(documents):
    """Wrap each document in a `CustomDocument` so it exposes `page_content`.

    Every attribute of the source document except `text` is forwarded to the
    `CustomDocument` constructor as a keyword argument; `text` is passed as
    `page_content`.

    Args:
        documents: iterable of documents that have a `text` attribute.

    Returns:
        A list of `CustomDocument` objects, in the same order as the input.
    """
    converted = []
    for doc in documents:
        extra_fields = {}
        for name, value in vars(doc).items():
            if name != 'text':
                extra_fields[name] = value
        converted.append(CustomDocument(page_content=doc.text, **extra_fields))
    return converted

In [14]:
# Shallow-copy the list of parsed documents, then wrap each entry as a CustomDocument.
original_documents = list(docs)
converted_documents = convert_documents(documents=original_documents)

In [15]:
from langchain.text_splitter import TokenTextSplitter
# Token-based splitting: chunk_size=1000 with chunk_overlap=100 between neighbouring chunks.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = TokenTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(documents=converted_documents)

In [16]:
# Number of chunks produced by the splitter.
len(chunks)

18

In [17]:
# Metadata attached to the first chunk (check that `page` and `source` are present).
chunks[0].metadata

{'file_path': 'd:\\GDPLabs_AIE_Naufal\\part_3\\data\\attention.pdf',
 'file_name': 'attention.pdf',
 'file_type': 'application/pdf',
 'file_size': 2215244,
 'creation_date': '2024-07-07',
 'last_modified_date': '2024-07-07',
 'page': 1,
 'source': 'd:\\GDPLabs_AIE_Naufal\\part_3\\data\\attention.pdf'}

In [18]:
# Print the text of the third chunk to eyeball the split quality.
print(chunks[2].page_content)

# Softmax

# Linear

# Add & Norm

# Feed Forward

# Add & Norm

|Add & Norm|Multi-Head|
|---|---|
|Feed Forward|Attention|

# Add & Norm

# Add & Norm

|Masked|Multi-Head|
|---|---|
|Multi-Head|Attention|

# Input

# EmbeddingOutput

Figure 1: The Transformer - model architecture.

The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.

# Encoder and Decoder Stacks

Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network. We employ a residual connection [11] around each of the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-la

In [19]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model, selected by its Hugging Face model name.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

  from tqdm.autonotebook import tqdm, trange


In [20]:
from langchain_chroma import Chroma

# Load the chunks into Chroma and persist them to disk.
# Each chunk gets a deterministic id (file name, page, position in `chunks`). Without
# explicit ids, re-running this cell would add the same chunks again under new random ids
# in the persisted store; with stable ids a re-run overwrites the existing entries instead.
# NOTE(review): entries written by earlier runs under auto-generated ids are not removed
# by this change - clear ./chroma_db once if it already contains duplicates.
chunk_ids = [
    f"{chunk.metadata.get('file_name', 'doc')}-p{chunk.metadata.get('page', 'na')}-c{position}"
    for position, chunk in enumerate(chunks)
]
db = Chroma.from_documents(chunks, embedding_model, ids=chunk_ids, persist_directory="./chroma_db")

In [28]:
# Run a similarity search for the question against the vector store.
# NOTE: this rebinds `docs` - until now the parsed documents - to the search results.
query = "What is attention"
docs = db.similarity_search(query=query)

In [29]:
# Print the text of the first search result (`docs` now holds the search results).
print(docs[0].page_content)

Input-Input Layer5

Attention Visualizations

It is in this spirit that a majority of American governments have passed new laws since 2009.

Figure 3: An example of the attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of the verb ‘making’, completing the phrase ‘making...more difficult’. Attentions here shown only for the word ‘making’. Different colors represent different heads. Best viewed in color.

or voting process more difficult.

..

EOS

pad

pad

pad

pad

pad

pad
