In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import Embedding, Linear, Module, CrossEntropyLoss, BCELoss, MSELoss
import torch.optim as optim
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from util.util import BPEs

In [3]:
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", use_fast=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
encoding = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", output_hidden_states=True)

In [4]:
def load_data(path):
    with open(path, 'r') as f:
        data = f.read()
    return data
def preprocess_data(data):
    # Example preprocessing: lowercasing and stripping whitespace
    data = data.split()
    return [" ".join(data[i:i + 100]) for i in range(0, len(data), 20)]
def tokenize_data(data):
    return tokenizer(data, padding=True, truncation=True,max_length=128, return_tensors="pt")
def encoding_data(data):
    return encoding(data)

In [5]:
path = "./data/PythonCodeDataSmall_TextOnly/Python_code_data.txt"
path2 = "./data/PythonCodeDataSmall_TextOnly/Python_code_data.txt"
data = load_data(path)
new_tokenizer = BPEs(vocab_size=10240)
new_tokenizer.train([path2])






In [6]:
new_tokenizer.load("./model/BPE_model/tokenizer-bpe-5k.json")

In [19]:
new_tokenizer.tokenizer.get_vocab_size()

10240

In [7]:
test_text = """This is a test sentence for the tokenizer.
This is a test sentence for the tokenizer."""

In [8]:
test_text

'This is a test sentence for the tokenizer.\nThis is a test sentence for the tokenizer.'

In [9]:
new_tokenizer.tokenizer.get_vocab_size()

10240

In [10]:
new_tokenizer.tokenizer.encode(test_text).ids

[2504,
 209,
 112,
 1940,
 925,
 187,
 155,
 5853,
 6280,
 19,
 4,
 2504,
 209,
 112,
 1940,
 925,
 187,
 155,
 5853,
 6280,
 19]

In [11]:
new_tokenizer.decode(new_tokenizer.tokenizer.encode(test_text).ids)

'This Ġis Ġa Ġtest Ġsentence Ġfor Ġthe Ġtoken izer . Ċ This Ġis Ġa Ġtest Ġsentence Ġfor Ġthe Ġtoken izer .'

In [12]:
print(new_tokenizer.decode_clean(new_tokenizer.encode(test_text).ids))

This is a test sentence for the tokenizer.
This is a test sentence for the tokenizer.


In [13]:
# new_tokenizer.tokenizer.add_special_tokens(["/n",""])

In [14]:
new_tokenizer.tokenizer.id_to_token(5000)

'Ġstr:'

In [15]:
(new_tokenizer.tokenizer.encode(test_text).tokens)

['This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġsentence',
 'Ġfor',
 'Ġthe',
 'Ġtoken',
 'izer',
 '.',
 'Ċ',
 'This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġsentence',
 'Ġfor',
 'Ġthe',
 'Ġtoken',
 'izer',
 '.']

In [16]:
tokenizer.tokenize(test_text)

['This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġsentence',
 'Ġfor',
 'Ġthe',
 'Ġtoken',
 'izer',
 '.',
 'Ċ',
 'This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġsentence',
 'Ġfor',
 'Ġthe',
 'Ġtoken',
 'izer',
 '.']

In [17]:
test_text

'This is a test sentence for the tokenizer.\nThis is a test sentence for the tokenizer.'

In [18]:
tokenizer.tokenize(" ")

['Ġ']

In [37]:
new_tokenizer.tokenizer.encode("test test\ntest").tokens

['test', 'Ġtest', 'Ċ', 'test']

In [38]:
new_tokenizer.tokenizer.encode("test test\ntest").ids

[251, 1940, 4, 251]

In [39]:
new_tokenizer.tokenizer.encode("\n").ids

[4]

In [40]:
new_tokenizer.tokenizer.decode([4],skip_special_tokens=False)

'Ċ'

In [41]:
new_tokenizer.tokenizer.decode(new_tokenizer.tokenizer.encode("test test\ntest").ids,skip_special_tokens=False)

'test Ġtest Ċ test'

In [238]:
tokenizer(test_text, return_tensors="pt")

{'input_ids': tensor([[  101,  2023,  2003,  1037,  3231,  6251,  2005,  1996, 19204, 17629,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [239]:
# new_tokenizer.save_model("./data/PythonCodeDataSmall_TextOnly/BPE_data/bpe_modelV1_1024_5.pkl")

In [240]:
test_text

'This is a test sentence for the tokenizer.'

In [241]:
[1] + new_tokenizer.tokenizer.encode(test_text).ids + [3,0,0,0,0]

[1, 1161, 155, 68, 208, 524, 144, 140, 2396, 4115, 17, 3, 0, 0, 0, 0]

In [242]:
new_tokenizer.tokenizer.decode([1] + new_tokenizer.tokenizer.encode(test_text).ids + [3,0,0,0,0,0])

'This is a test sentence for the token izer .'

In [7]:
class data_loader(Dataset):
    def __init__(self, data):
        self.pre_data = preprocess_data(data)
        self.tokens_data = tokenize_data(self.pre_data)['input_ids'].to(dtype=torch.long)
        # self.tokens_data_new = new_tokenizer.tokenize(data)
        tt = [F.pad(torch.tensor(new_tokenizer.tokenizer.encode(dd).ids, dtype=torch.int), mode='constant', pad=(0, max(128 - len(new_tokenizer.tokenizer.encode(dd).tokens), -10000000)), value=0) for dd in self.pre_data]
        self.tokens_data_new = torch.stack(tt)
    def __len__(self):
        return len(self.tokens_data)

    def __getitem__(self, idx):
        return self.tokens_data[idx], self.tokens_data_new[idx]

In [8]:
data_loader_instance = data_loader(data)

In [21]:
dataset_loader = DataLoader(data_loader_instance, batch_size=1, shuffle=True)

In [10]:
class LearnablePositionalEmbedding(Module):
    def __init__(self, max_seq_len: int, d_model: int):
        super().__init__()
        self.pos_embedding = Embedding(max_seq_len, d_model)

    def forward(self, x):
        """
        x: (batch_size, seq_len, d_model)
        """
        batch_size, seq_len, _ = x.size()
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        return x + self.pos_embedding(positions)

In [11]:
class embedding(Module):
    def __init__(self, vocab_size, d_model, max_seq_len):
        super().__init__()
        self.word_embedding1 = Embedding(vocab_size, d_model)
        # self.layer_norm1 = torch.nn.LayerNorm(d_model)
        # self.word_embedding2 = Linear(vocab_size//2, vocab_size//4)
        # self.layer_norm2 = torch.nn.LayerNorm(vocab_size//4)
        # self.word_embedding3 = Linear(vocab_size//4, d_model)
        # self.layer_norm3 = torch.nn.LayerNorm(d_model)
        # self.tanh = torch.nn.Tanh()
        self.pos_embedding = LearnablePositionalEmbedding(max_seq_len, d_model)

    def forward(self, x):
        x = self.word_embedding1(x)
        # x = self.layer_norm1(x)
        # x = self.word_embedding2(x)
        # x = self.layer_norm2(x)
        # x = self.word_embedding3(x)
        # x = self.layer_norm3(x)
        # x = self.tanh(x)
        x = self.pos_embedding(x)
        # x = self.tanh(x)
        return x

In [12]:
encoding

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [13]:
class large_embedding(Module):
    def __init__(self, vocab_size, d_model, max_seq_len):
        super().__init__()
        self.word_embedding1 = encoding.embeddings.word_embeddings
        self.positional_embedding = encoding.embeddings.position_embeddings
        # self.layer_norm = encoding.embeddings.LayerNorm
        # self.dropout = encoding.embeddings.dropout

    def forward(self, x):
        x_e = self.word_embedding1(x)
        # x = self.layer_norm(x)
        # x = self.dropout(x)
        x_p = self.positional_embedding(torch.arange(0, x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1))
        x = x_e + x_p
        # x = self.dropout(x)
        return x

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
a = [1,2,3,4,5,6,7,8,9,10]

In [16]:
a[0:-1]

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [17]:
embedding_model = embedding(vocab_size=1024*5, d_model=384*2, max_seq_len=128)
embedding_model.train()
for layer in embedding_model.parameters():
    layer.requires_grad = True

# large_embedding_model = large_embedding(vocab_size=1024*5, d_model=384, max_seq_len=128)
# large_embedding_model.eval()
# for layer in large_embedding_model.parameters():
#     layer.requires_grad = False

encoding.to(device)
embedding_model.to(device)


embedding(
  (word_embedding1): Embedding(5120, 768)
  (pos_embedding): LearnablePositionalEmbedding(
    (pos_embedding): Embedding(128, 768)
  )
)

In [18]:
optimizer = optim.Adam(embedding_model.parameters(), lr=0.001)
criterion = MSELoss()

In [19]:
for layer in encoding.parameters():
    layer.requires_grad = False

In [20]:
epoch = 20
batch_loss = 0
for e in range(epoch):
    for batch, new_batch in dataset_loader:
        batch = batch.to(device)
        n_batch = batch.size(0)
        max_seq_len = batch.size(1)
        target = encoding(batch)
        optimizer.zero_grad()
        new_batch = new_batch.to(device)
        output = embedding_model(new_batch)
        loss = criterion(output, target.last_hidden_state)
        loss.backward()
        optimizer.step()
        # print(f"Epoch {e}, Batch Loss: {loss.item()}")
        batch_loss += loss.item()
    print(f"Epoch {e}, Loss: {batch_loss / len(dataset_loader)}")
    batch_loss = 0

/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [49]:
batch.size()

torch.Size([100, 128])

In [273]:
torch.save(embedding_model.state_dict(), "./model/Transformer/embedding_model.pth")

In [264]:
target.last_hidden_state.size()

torch.Size([88, 128, 384])

In [265]:
output.size()

torch.Size([88, 128, 384])

In [271]:
target.last_hidden_state[0][0][:10]

tensor([ 0.0287,  0.1369, -0.2417,  0.0326,  0.0839, -0.1499, -0.0455,  0.0542,
        -0.3117, -0.3579], device='cuda:0')

In [272]:
output[0][0][:10]

tensor([ 0.0639,  0.2435, -0.0936,  0.0264, -0.1986, -0.1604,  0.0187, -0.0192,
        -0.2563, -0.1595], device='cuda:0', grad_fn=<SliceBackward0>)

In [47]:
import gc
gc.collect()
torch.cuda.empty_cache()
del embedding_model
del encoding
del optimizer
del criterion
del target
del output
del batch

In [15]:
target.last_hidden_state.size()

torch.Size([100, 512, 384])

In [14]:
output.size()

torch.Size([100, 512, 384])

In [278]:
list(range(10,20)) + list(range(0,-1))

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [41]:
with open("./data/PythonCodeDataSmall_TextOnly/Python_code_data.txt", "r") as f:
    data = f.read(-1)
    data = data.split("\n# ")
    data = [data[0].strip("\n")] + [("# " + c).strip("\n") for c in data[1:] if len(c) >= 80]

In [42]:
data[0]

"# write a python program to add two numbers \nnum1 = 1.5\nnum2 = 6.3\nsum = num1 + num2\nprint(f'Sum: {sum}')"

In [43]:
len(new_tokenizer.tokenizer.encode(data[0]).ids)

33

In [44]:
new_tokenizer.tokenizer.encode(data[0]).ids

[6,
 159,
 68,
 167,
 174,
 126,
 391,
 286,
 249,
 384,
 32,
 20,
 17,
 24,
 402,
 32,
 25,
 17,
 22,
 225,
 32,
 384,
 14,
 402,
 121,
 11,
 73,
 10,
 876,
 29,
 94,
 225,
 641]

In [45]:
new_tokenizer.tokenizer.decode([1] + new_tokenizer.tokenizer.encode(data[0]).ids + [3])

"# write a python program to add two numbers num1 = 1 . 5 num2 = 6 . 3 sum = num1 + num2 print ( f ' Sum : { sum }')"

In [291]:
new_tokenizer.tokenizer.encode(data[0]).tokens

['#',
 'write',
 'a',
 'python',
 'program',
 'to',
 'add',
 'two',
 'numbers',
 'num1',
 '=',
 '1',
 '.',
 '5',
 'num2',
 '=',
 '6',
 '.',
 '3',
 'sum',
 '=',
 'num1',
 '+',
 'num2',
 'print',
 '(',
 'f',
 "'",
 'Sum',
 ':',
 '{',
 'sum',
 "}')"]

In [294]:
[[b for b in range(10)] for a in range(10)]

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]

In [2]:
128*4

512

In [45]:
128*4/8

64.0