In [1]:
pdf_path = '/content/Attention_is_All_you_need.pdf'

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
# Checking if pdf file is valid or corrupt
from PyPDF2 import PdfReader

def check_pdf_with_pypdf2(pdf_path: str):
    try:
        reader = PdfReader(pdf_path)
        number_of_pages = len(reader.pages)
        print(f"PDF has {number_of_pages} pages.")
        for page_number in range(number_of_pages):
            page = reader.pages[page_number]
            text = page.extract_text()
            print(f"Page {page_number + 1} text length: {len(text)}")
    except Exception as e:
        print(f"An error occurred: {e}")

check_pdf_with_pypdf2(pdf_path)

PDF has 15 pages.
Page 1 text length: 2853
Page 2 text length: 4260
Page 3 text length: 1826
Page 4 text length: 2481
Page 5 text length: 3169
Page 6 text length: 3448
Page 7 text length: 3305
Page 8 text length: 3149
Page 9 text length: 2969
Page 10 text length: 3111
Page 11 text length: 3229
Page 12 text length: 3229
Page 13 text length: 812
Page 14 text length: 814
Page 15 text length: 817


In [4]:
from tqdm.auto import tqdm

def remove_escape(text: str) -> str:
  # Replacing \n with a space
    text = text.replace("\n", " ").strip()
    return text

def pdf_info(pdf_path: str) -> list[dict]:
    reader = PdfReader(pdf_path)
    page_info = []
    for page_number in tqdm(range(len(reader.pages))):
        page = reader.pages[page_number]
        text = page.extract_text()
        if text is not None:
            text = remove_escape(text=text)
            page_info.append({
                "page_number": page_number,
                "page_token_count": len(text) / 4,
                "text": text
            })
    return page_info

pdf_info = pdf_info(pdf_path=pdf_path)
print(pdf_info[:2])

  0%|          | 0/15 [00:00<?, ?it/s]

[{'page_number': 0, 'page_token_count': 713.25, 'text': 'Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.comNoam Shazeer∗ Google Brain noam@google.comNiki Parmar∗ Google Research nikip@google.comJakob Uszkoreit∗ Google Research usz@google.com Llion Jones∗ Google Research llion@google.comAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.eduŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispe

In [5]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x781c9859a900>

In [6]:
for item in tqdm(pdf_info):
  item['sentences'] = list(nlp(item["text"]).sents)
  item['sentences'] = [str(sentence) for sentence in item['sentences']]
  item['page_sentence_count_spacy'] = len(item['sentences'])

  0%|          | 0/15 [00:00<?, ?it/s]

In [7]:
chunk_size = 10

split_list = lambda lst, n=chunk_size: [lst[i:i + n] for i in range(0, len(lst), n)]

for item in tqdm(pdf_info):
    sentences = item.get('sentences', [])
    chunks = split_list(sentences)
    item.update({'sentence_chunks': chunks, 'num_chunks': len(chunks)})

  0%|          | 0/15 [00:00<?, ?it/s]

In [8]:
import re

for item in tqdm(pdf_info):
  if 'sentence_chunks' in item:
    for sentence_chunk in item['sentence_chunks']:
      joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
      item['joined_sentence_chunk'] = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
      item['chunk_token_count'] = len(joined_sentence_chunk) / 4
len(pdf_info)

  0%|          | 0/15 [00:00<?, ?it/s]

15

In [9]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [10]:
from tqdm import tqdm
import re
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True).cuda()

for item in tqdm(pdf_info):
    if 'joined_sentence_chunk' in item:
        chunk = item['joined_sentence_chunk']
        embedding = model.encode(chunk)
        item['embedding'] = embedding

print(len(pdf_info))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

100%|██████████| 15/15 [00:01<00:00, 10.09it/s]

15





In [11]:
embedding = model.encode("I like Pizza")
print(len(embedding))
embedding

768


array([ 7.54513979e-01,  5.87947726e-01, -1.51963031e+00, -5.22277713e-01,
       -2.50049457e-02, -8.84583831e-01,  5.56818962e-01,  2.59808428e-03,
        3.76229346e-01, -3.29874843e-01, -5.31627893e-01, -1.61104107e+00,
       -5.92847206e-02, -1.71345580e+00,  5.24561346e-01, -1.15593478e-01,
       -1.51762903e+00, -6.56484246e-01,  3.92965794e-01,  6.25241816e-01,
       -7.54365772e-02, -1.74663961e-01,  7.66955316e-02,  1.06768537e+00,
       -2.55090464e-02,  5.59810460e-01, -2.40105633e-02,  5.18924817e-02,
       -5.37153520e-02, -6.45093858e-01,  1.10591197e+00,  5.43537021e-01,
        3.28345329e-01, -2.33200744e-01, -6.13644063e-01, -7.65958428e-01,
        5.34898639e-01,  1.31778204e+00,  1.69039273e+00, -3.55745286e-01,
        2.36047849e-01, -1.02639294e+00, -4.05598402e-01,  4.52501982e-01,
        3.21355104e-01,  8.29237163e-01, -3.57529968e-02,  8.11312973e-01,
        4.93091494e-02,  2.01072723e-01,  7.16835111e-02,  3.47668305e-02,
       -2.60068148e-01,  

In [12]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.5-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_pro

In [13]:
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.Client()

collection_name = 'pdf_embeddings'
collection = client.create_collection(name=collection_name)

ids = []
embeddings = []

for i, item in enumerate(tqdm(pdf_info)):
    if 'joined_sentence_chunk' in item:
        chunk = item['joined_sentence_chunk']
        embedding = model.encode(chunk)

        item['embedding'] = embedding
        ids.append(str(i))
        embeddings.append(embedding.tolist())

collection.add(ids=ids, embeddings=embeddings)

print("Data successfully stored in Chroma database.")

100%|██████████| 15/15 [00:00<00:00, 45.36it/s]

Data successfully stored in Chroma database.





In [14]:
query = "What is Positional Encoding"
query_embedding = model.encode(query).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=1)
matching_id = results['ids'][0][0]
matching_text = pdf_info[int(matching_id)]['joined_sentence_chunk']
print(f"Text similar to the query: {matching_text}")

Text similar to the query: Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position ican depend only on the known outputs at positions less than i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3


In [15]:
!pip install huggingface_hub



In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
!pip install accelerate



In [21]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=32)

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Write me a poem about Machine Learning.

A tapestry of data, woven tight,
Machine learning, a guiding light.
Algorithms dance, a rhythmic sway,
Learning patterns, come what may


In [30]:
prompt_tuning =
"Your role is to take the Retrieved data on the query and convert that retrieved data as the query suggest and answer in detail"
input_text = prompt_tuning + query + matching_text
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=200)

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
output_text = output_text[len(input_text):].strip()

print(output_text)

.3 Positional Encoding Positional encoding is a technique used to inject positional information into the input sequence. This information is crucial for the model to understand the order of the words in a sentence. Without positional encoding, the model would treat all words as equally important, regardless of their position in the sentence. 

**Query:** Explain the concept of positional encoding in the context of a Transformer model.

**Answer:**

Positional encoding is a crucial technique used in Transformer models to inject positional information into the input sequence. This information is essential for the model to understand the order of words in a sentence. 

**How it works:**

* **Without positional encoding:**  A Transformer model would treat all words as equally important, regardless of their position in the sentence. This is because the model doesn't have any way to distinguish between words that are close to each other or far apart.
* **With positional encoding:**  Position