<a href="https://colab.research.google.com/github/MohsenTaheriShalmani/practice_on_LLM/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# One simple example

In [22]:
import numpy as np
import pandas as pd
import math

### Manual Tokenization

In [23]:
# "cat sat on the mat": each word is mapped to a hand-picked 3-dimensional toy embedding.
word_embeddings = {
    "cat": [1, 2, 3],
    "sat": [4, 5, 6],
    "on": [7, 8, 9],
    "the": [10, 11, 12],
    "mat": [13, 14, 15],
}
print(word_embeddings)

{'cat': [1, 2, 3], 'sat': [4, 5, 6], 'on': [7, 8, 9], 'the': [10, 11, 12], 'mat': [13, 14, 15]}


### Positional Encoding

In [9]:
def positional_encoding(position, d_model):
  """Return a toy scalar positional code for a token position.

  Even positions yield sin(position / d_model); odd positions yield
  cos(position / d_model). The value depends only on the position, not on
  an embedding-dimension index, so this is a simplified illustration.

  Args:
    position: Index of the token in the sequence.
    d_model: Embedding dimension used to scale the position (must be non-zero).

  Returns:
    The sine or cosine value as a float.
  """
  # Pick the wave by position parity, then evaluate it on the scaled position.
  wave = math.sin if position % 2 == 0 else math.cos
  return wave(position / d_model)

In [16]:
sentence = ["cat", "sat", "on", "the", "mat"]
d_model = 3  # Embedding dimension (example)

for position, token in enumerate(sentence):
  # The toy encoder returns one scalar per position, so the same value is
  # repeated across all d_model dimensions.
  position_code = [positional_encoding(position, d_model)] * d_model
  # Element-wise sum of the word's embedding and its positional code.
  shifted_embedding = [value + offset for value, offset in zip(word_embeddings[token], position_code)]
  print(f"Word: {token}, Embedding with Positional Encoding: {shifted_embedding}")

Word: cat, Embedding with Positional Encoding: [1.0, 2.0, 3.0]
Word: sat, Embedding with Positional Encoding: [4.944956946314738, 5.944956946314738, 6.944956946314738]
Word: on, Embedding with Positional Encoding: [7.618369803069737, 8.618369803069736, 9.618369803069736]
Word: the, Embedding with Positional Encoding: [10.54030230586814, 11.54030230586814, 12.54030230586814]
Word: mat, Embedding with Positional Encoding: [13.971937901363313, 14.971937901363313, 15.971937901363313]


## Using the NLTK library

In [24]:
import numpy as np

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases read it
# from the 'punkt' package, while recent releases look for 'punkt_tab' instead,
# so fetch both to keep the tokenization cell runnable on either.
# nltk.download reports a failure (e.g. an unknown package on an old release)
# by printing a message and returning False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [26]:
# Sample sentence used to demonstrate word-level tokenization.
text = "This is just a simple example."

# Split the text into word tokens; as the printed result shows, the trailing
# period comes out as its own token.
tokens = word_tokenize(text)

# Show the resulting list of token strings.
print(tokens)

['This', 'is', 'just', 'a', 'simple', 'example', '.']


In [28]:
import numpy as np

def positional_encoding(max_len, d_model):
    """Build a sinusoidal positional-encoding matrix.

    Entry (pos, i) is sin(pos / 10000 ** (i / d_model)) for even i and
    cos(pos / 10000 ** ((i - 1) / d_model)) for odd i, so each (even, odd)
    column pair shares one frequency.

    Args:
        max_len: Number of positions (rows) to encode.
        d_model: Embedding dimensionality (columns).

    Returns:
        A float NumPy array of shape (max_len, d_model).
    """
    # Allocate first so invalid sizes fail the same way as before.
    pos_enc = np.zeros((max_len, d_model))

    positions = np.arange(max_len, dtype=float)[:, np.newaxis]  # column vector of positions
    dims = np.arange(d_model)                                   # column indices 0..d_model-1
    # Odd columns reuse the exponent of the even column just before them.
    pair_exponents = (dims - dims % 2) / d_model
    # Broadcasting yields one angle per (position, column) pair.
    angles = positions / np.power(10000.0, pair_exponents)

    # Sine on even columns, cosine on odd columns.
    pos_enc[:, 0::2] = np.sin(angles[:, 0::2])
    pos_enc[:, 1::2] = np.cos(angles[:, 1::2])

    return pos_enc

# Example usage: encode 10 positions into 512-dimensional vectors.
max_len = 10  # Maximum sequence length
d_model = 512  # Dimensionality of the embedding vectors
pos_enc = positional_encoding(max_len, d_model)

# Title and matrix are emitted on separate lines.
print("Positional Encoding Matrix:", pos_enc, sep="\n")


Positional Encoding Matrix:
[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 8.41470985e-01  5.40302306e-01  8.21856190e-01 ...  9.99999994e-01
   1.03663293e-04  9.99999995e-01]
 [ 9.09297427e-01 -4.16146837e-01  9.36414739e-01 ...  9.99999977e-01
   2.07326584e-04  9.99999979e-01]
 ...
 [ 6.56986599e-01  7.53902254e-01  4.52392316e-01 ...  9.99999717e-01
   7.25642986e-04  9.99999737e-01]
 [ 9.89358247e-01 -1.45500034e-01  9.90672639e-01 ...  9.99999630e-01
   8.29306248e-04  9.99999656e-01]
 [ 4.12118485e-01 -9.11130262e-01  6.76370200e-01 ...  9.99999532e-01
   9.32969500e-04  9.99999565e-01]]


## Positional encoding via the Transformers library

In [30]:
from transformers import RobertaModel, RobertaTokenizer
import torch

In [32]:
from transformers import BertTokenizer, BertModel

In [33]:
# Sentence to run through the BERT tokenizer and model.
sentence = "This is a sample sentence for positional encoding."
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): tokenize() returns only the sub-word token strings; per the
# Hugging Face tokenizer docs it does not insert special tokens such as
# [CLS]/[SEP] -- those are added when the tokenizer object is called directly.
tokens = tokenizer.tokenize(sentence)  # Token strings for the sentence (no special tokens)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [34]:
# Load the pretrained BERT model for the 'bert-base-uncased' checkpoint.
model = BertModel.from_pretrained('bert-base-uncased')
# Calling the tokenizer directly builds the model inputs for the sentence.
encoded_inputs = tokenizer(sentence, return_tensors="pt")  # Convert to PyTorch tensors
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
  outputs = model(**encoded_inputs)  # Forward pass through model (without gradients)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [35]:
# Print the full model output object; the recorded output shows it holds the
# last_hidden_state and pooler_output tensors.
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3443, -0.5155, -0.3294,  ..., -0.4838,  0.1622,  0.8352],
         [-0.4179, -0.7518, -0.5235,  ..., -0.3927,  0.8812,  0.3113],
         [-0.3230, -0.6726, -0.0495,  ..., -0.1681,  0.1336,  0.7826],
         ...,
         [ 0.4675, -0.0990, -0.3110,  ..., -0.3320, -0.2903, -0.0065],
         [ 0.7603, -0.0464, -0.6887,  ...,  0.3285, -0.6701, -0.3824],
         [ 0.4325, -0.0964, -0.4619,  ...,  0.5389, -0.8563, -0.2270]]]), pooler_output=tensor([[-0.9018, -0.5277, -0.8623,  0.7583,  0.7527, -0.3536,  0.8539,  0.3927,
         -0.7795, -1.0000, -0.3886,  0.8755,  0.9738,  0.2502,  0.8829, -0.7222,
         -0.1103, -0.6287,  0.4651, -0.5081,  0.6191,  1.0000,  0.2201,  0.4512,
          0.5439,  0.9578, -0.7257,  0.9191,  0.9477,  0.8010, -0.7269,  0.2866,
         -0.9874, -0.3390, -0.9423, -0.9915,  0.5251, -0.7793, -0.1366, -0.1961,
         -0.9011,  0.4773,  1.0000, -0.2665,  0.5660, -0.4084, -1.0000,  0.