In [71]:
# Attach Google Drive to the Colab VM so the files stored there can be reached by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [72]:
# Switch the working directory to the homework folder on the mounted Drive.
# Presumably this is where model.py, utils.py and train.py (imported in the next cell) live — TODO confirm.
%cd /content/drive/MyDrive/hw2

/content/drive/MyDrive/hw2


In [73]:
import torch
import model
import utils
import train

In [83]:
import torch

# Seed PyTorch's global random number generator with a fixed value.
# The call is left as the cell's last expression, so the returned Generator is still displayed.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7a59197b2ff0>

In [75]:
# Tokenize the single sentence used for the overfitting experiments in this notebook.
TOKENIZER_ARG = 24  # argument passed to utils.Tokenizer; presumably the fixed sequence length — TODO confirm in utils.py

text = ["I love machine learning"]
tokenizer = utils.Tokenizer(TOKENIZER_ARG)
# build_tokenization() is called on the text before tokenize(); tokenizer.vocab_size is read
# later when the model config is created (presumably it is populated here — TODO confirm).
tokenizer.build_tokenization(text)
# The training cells index the result as [0] (token ids) and [1] (loss mask).
tokenized_text = tokenizer.tokenize(text)

In [76]:
# Build a one-layer GPT whose vocabulary size matches the tokenizer.
# The config is instantiated: `model.GPTConfig` without parentheses is the class itself, so
# assigning attributes on it would change the defaults seen by every config created afterwards.
# NOTE(review): assumes GPTConfig can be constructed without arguments — TODO confirm in model.py.
one_layer_config = model.GPTConfig()
one_layer_config.vocab_size = tokenizer.vocab_size
one_layer_config.n_layer = 1
small_gpt = model.GPT(one_layer_config)

number of parameters: 7.10M


### Regular training and inference sampling

In [77]:
# Basic train loop on single vector.
# Overfits small_gpt on the one tokenized sentence. Training stops as soon as the masked mean
# cross-entropy is no longer above LOSS_TOL, or after MAX_STEPS optimizer steps, whichever is first.
MAX_STEPS = 10000  # safety cap: the loss only reaches LOSS_TOL when it rounds to 0, which is not guaranteed
LOSS_TOL = 1e-16
LOG_EVERY = 100

device = "cuda" if torch.cuda.is_available() else "cpu"
embedded, mask = torch.from_numpy(tokenized_text[0]),torch.from_numpy(tokenized_text[1])
small_gpt.train()
small_gpt.to(device)  # the model is moved to the device before the optimizer is created
optimizer = small_gpt.configure_optimizers(learning_rate=1e-5, weight_decay=0.0, betas =(0.9,0.95),device_type=device)
embedded = embedded.to(device)
mask = mask.to(device)

# NOTE(review): the targets are the input ids themselves; no one-position shift is applied in this cell.
# Presumably model.GPT or utils.Tokenizer takes care of next-token alignment — TODO confirm.
targets = embedded.view(-1)
token_weights = mask.view(-1)   # per-token weight applied to the unreduced loss
n_counted = mask.sum()          # normaliser for the mean; loop-invariant

for i in range(1, MAX_STEPS + 1):  # i counts completed optimizer steps
  optimizer.zero_grad()
  pred_logits = small_gpt(embedded)
  loss_per_token = torch.nn.functional.cross_entropy(pred_logits.flatten(end_dim=1),targets,reduction="none")
  loss = (loss_per_token * token_weights).sum() / n_counted
  loss.backward()
  optimizer.step()
  if not loss > LOSS_TOL:  # same stop test as `while loss > 1e-16` (a NaN loss also stops the loop)
    break
  if i % LOG_EVERY == 1:
    print("Step:",i," Loss:",loss.item())
else:
  print("Warning: stopped after MAX_STEPS =", MAX_STEPS, "steps without the loss reaching", LOSS_TOL)
print("Step:",i," Loss:",loss.item())

num decayed parameter tensors: 6, with 7,875,840 parameters
num non-decayed parameter tensors: 10, with 11,520 parameters
using fused AdamW: True
Step: 1  Loss: 1.7886563539505005
Step: 101  Loss: 0.0017960845725610852
Step: 201  Loss: 0.00020159268751740456
Step: 301  Loss: 2.3670527298236266e-05
Step: 401  Loss: 4.224145868647611e-06
Step: 501  Loss: 1.4564252523996402e-06
Step: 601  Loss: 7.048894872241362e-07
Step: 701  Loss: 3.783598288009671e-07
Step: 801  Loss: 2.0213745699493302e-07
Step: 901  Loss: 1.0884325263305072e-07
Step: 1001  Loss: 8.81112072192991e-08
Step: 1101  Loss: 4.6647109286368504e-08
Step: 1201  Loss: 3.109807167334111e-08
Step: 1301  Loss: 1.5549035836670555e-08
Step: 1333  Loss: 0.0


In [78]:
# Greedy decoding check: seed the first PROMPT_LEN positions with the training sentence, then
# fill the buffer one position at a time with the arg-max token predicted for that position.
# NOTE: the loop starts at position 0, so the seeded positions are themselves replaced by the
# model's predictions during the first PROMPT_LEN iterations.
PROMPT_LEN = 3
N_POSITIONS = 23

small_gpt.eval()
with torch.no_grad():
  sampling = torch.zeros_like(embedded)
  sampling[0, :PROMPT_LEN] = embedded[0, :PROMPT_LEN]
  for i in range(N_POSITIONS):
    pred_logits = small_gpt(sampling)
    # most likely token id at position i of the only sequence in the batch
    sampling[0, i] = pred_logits[0, i].argmax(dim=-1)
    decoded = tokenizer.untokenize(sampling.cpu().numpy())
    print(decoded)

['I l']
['I l']
['I l']
['I lo']
['I lov']
['I love']
['I love ']
['I love m']
['I love ma']
['I love mac']
['I love mach']
['I love machi']
['I love machin']
['I love machine']
['I love machine ']
['I love machine l']
['I love machine le']
['I love machine lea']
['I love machine lear']
['I love machine learn']
['I love machine learni']
['I love machine learnin']
['I love machine learning']


### Same, but with the first three tokens masked out of the loss

In [87]:
# Fresh, untrained one-layer GPT for the masked-loss experiment.
# The config is instantiated: `model.GPTConfig` without parentheses is the class itself, so
# assigning attributes on it would change the defaults seen by every config created afterwards.
# NOTE(review): assumes GPTConfig can be constructed without arguments — TODO confirm in model.py.
one_layer_config = model.GPTConfig()
one_layer_config.vocab_size = tokenizer.vocab_size
one_layer_config.n_layer = 1
small_gpt = model.GPT(one_layer_config)

number of parameters: 7.10M


In [88]:
# Basic train loop on single vector, with the first three positions excluded from the loss.
# Training stops as soon as the masked mean cross-entropy is no longer above LOSS_TOL, or after
# MAX_STEPS optimizer steps, whichever is first.
MAX_STEPS = 10000  # safety cap: the loss only reaches LOSS_TOL when it rounds to 0, which is not guaranteed
LOSS_TOL = 1e-16
LOG_EVERY = 100

device = "cuda" if torch.cuda.is_available() else "cpu"
embedded, mask = torch.from_numpy(tokenized_text[0]),torch.from_numpy(tokenized_text[1])
# torch.from_numpy shares memory with the numpy array, so the mask is cloned before editing
# to leave tokenized_text[1] untouched.
masked_start = mask.clone()
masked_start[0,0:3] = False  # positions 0..2 contribute nothing to the loss
small_gpt.train()
small_gpt.to(device)  # the model is moved to the device before the optimizer is created
optimizer = small_gpt.configure_optimizers(learning_rate=1e-5, weight_decay=0.0, betas =(0.9,0.95),device_type=device)
embedded = embedded.to(device)
masked_start = masked_start.to(device)

# NOTE(review): the targets are the input ids themselves; no one-position shift is applied in this cell.
# Presumably model.GPT or utils.Tokenizer takes care of next-token alignment — TODO confirm.
targets = embedded.view(-1)
token_weights = masked_start.view(-1)   # per-token weight applied to the unreduced loss
n_counted = masked_start.sum()          # normaliser for the mean; loop-invariant

for i in range(1, MAX_STEPS + 1):  # i counts completed optimizer steps
  optimizer.zero_grad()
  pred_logits = small_gpt(embedded)
  loss_per_token = torch.nn.functional.cross_entropy(pred_logits.flatten(end_dim=1),targets,reduction="none")
  loss = (loss_per_token * token_weights).sum() / n_counted
  loss.backward()
  optimizer.step()
  if not loss > LOSS_TOL:  # same stop test as `while loss > 1e-16` (a NaN loss also stops the loop)
    break
  if i % LOG_EVERY == 1:
    print("Step:",i," Loss:",loss.item())
else:
  print("Warning: stopped after MAX_STEPS =", MAX_STEPS, "steps without the loss reaching", LOSS_TOL)

print("Step:",i," Loss:",loss.item())

num decayed parameter tensors: 6, with 7,875,840 parameters
num non-decayed parameter tensors: 10, with 11,520 parameters
using fused AdamW: True
Step: 1  Loss: 1.7234783172607422
Step: 101  Loss: 0.0014911503531038761
Step: 201  Loss: 0.00017536508676130325
Step: 301  Loss: 2.0843524907832034e-05
Step: 401  Loss: 3.898136128555052e-06
Step: 501  Loss: 1.3709058066524449e-06
Step: 601  Loss: 7.152553962441743e-07
Step: 701  Loss: 3.576278118089249e-07
Step: 801  Loss: 2.2649760467174929e-07
Step: 901  Loss: 1.311302071371756e-07
Step: 1001  Loss: 7.152556946721234e-08
Step: 1101  Loss: 5.364417532405241e-08
Step: 1201  Loss: 3.576278473360617e-08
Step: 1301  Loss: 6.55651035685878e-08
Step: 1401  Loss: 6.55651035685878e-08
Step: 1484  Loss: 0.0


In [89]:
# Greedy decoding with the model trained on the masked loss: seed the first PROMPT_LEN
# positions with the training sentence, then fill the buffer one position at a time with the
# arg-max token predicted for that position.
# NOTE: the loop starts at position 0, so the seeded positions are replaced by the model's own
# predictions — and positions 0..2 were excluded from the loss in the training cell above.
PROMPT_LEN = 3
N_POSITIONS = 23

small_gpt.eval()
with torch.no_grad():
  sampling = torch.zeros_like(embedded)
  sampling[0, :PROMPT_LEN] = embedded[0, :PROMPT_LEN]
  for i in range(N_POSITIONS):
    pred_logits = small_gpt(sampling)
    # most likely token id at position i of the only sequence in the batch
    sampling[0, i] = pred_logits[0, i].argmax(dim=-1)
    decoded = tokenizer.untokenize(sampling.cpu().numpy())
    print(decoded)

['a l']
['a l']
['a o']
['a oo']
['a ooo']
['a oooe']
['a oooe ']
['a oooe m']
['a oooe ma']
['a oooe mac']
['a oooe mach']
['a oooe machi']
['a oooe machin']
['a oooe machine']
['a oooe machine ']
['a oooe machine l']
['a oooe machine le']
['a oooe machine lea']
['a oooe machine lear']
['a oooe machine learn']
['a oooe machine learni']
['a oooe machine learnin']
['a oooe machine learning']


In [82]:
# Greedy decoding that leaves the prompt alone: the first PROMPT_LEN positions are seeded from
# the training sentence and only the N_GENERATED positions after them are filled in.
# NOTE(review): the saved output of this cell carries an earlier execution count than the
# masked-model cells above it, so it may come from a different model state — re-run to confirm.
PROMPT_LEN = 3
N_GENERATED = 20

small_gpt.eval()
with torch.no_grad():
  sampling = torch.zeros_like(embedded)
  sampling[0, :PROMPT_LEN] = embedded[0, :PROMPT_LEN]
  for i in range(N_GENERATED):
    target_pos = i + PROMPT_LEN  # position being filled on this iteration
    pred_logits = small_gpt(sampling)
    sampling[0, target_pos] = pred_logits[0, target_pos].argmax(dim=-1)
    decoded = tokenizer.untokenize(sampling.cpu().numpy())
    print(decoded)

['I lo']
['I lov']
['I love']
['I love ']
['I love m']
['I love ma']
['I love mac']
['I love mach']
['I love machi']
['I love machin']
['I love machine']
['I love machine ']
['I love machine l']
['I love machine le']
['I love machine lea']
['I love machine lear']
['I love machine learn']
['I love machine learni']
['I love machine learnin']
['I love machine learning']
