In [2]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/hw2

/content/drive/MyDrive/hw2


In [7]:
import torch
import model
import utils

In [8]:
text = ["I love machine learning"]
tokenizer = utils.Tokenizer(24)
tokenizer.build_tokenization(text)
tokenized_text = tokenizer.tokenize(text)

In [13]:
one_layer_config = model.GPTConfig
one_layer_config.vocab_size = tokenizer.vocab_size
one_layer_config.n_layer = 1
small_gpt = model.GPT(one_layer_config)

number of parameters: 7.10M


### Regular inference sampling

In [12]:
# Basic train loop on single vector
device = "cuda" if torch.cuda.is_available() else "cpu"
embedded, mask = torch.from_numpy(tokenized_text[0]),torch.from_numpy(tokenized_text[1])
loss = 1 # initial value for loop condition
small_gpt.train()
small_gpt.to(device)
optimizer = small_gpt.configure_optimizers(learning_rate=1e-5, weight_decay=0.0, betas =(0.9,0.95),device_type=device)
embedded = embedded.to(device)
mask = mask.to(device)
i = 0
while loss > 1e-16:
  optimizer.zero_grad()
  if i%100 == 1:
    print(loss.item())
  pred_logits = small_gpt(embedded)
  loss_per_token = torch.nn.functional.cross_entropy(pred_logits.flatten(end_dim=1),embedded.view(-1),reduction="none")
  loss = (loss_per_token * mask.view(-1)).sum() / mask.sum()
  loss.backward()
  optimizer.step()
  i += 1


num decayed parameter tensors: 6, with 7,875,840 parameters
num non-decayed parameter tensors: 10, with 11,520 parameters
using fused AdamW: True
1.7325066328048706
0.0015796147054061294
0.00017355283489450812
2.0032135580549948e-05


KeyboardInterrupt: 

In [133]:
print(loss)

tensor(0., device='cuda:0', grad_fn=<DivBackward0>)


In [134]:
small_gpt.eval()
with torch.no_grad():
  sampling = torch.zeros_like(embedded)
  sampling[0,0:3] = embedded[0,0:3]
  for i in range(20):
    pred_logits = small_gpt(sampling)
    sampling[0,0:3+i+1] = pred_logits.argmax(dim=2)[0,0:3+i+1]
    print(tokenizer.untokenize(sampling.cpu().numpy()))

['I lo']
['I lov']
['I love']
['I love ']
['I love m']
['I love ma']
['I love mac']
['I love mach']
['I love machi']
['I love machin']
['I love machine']
['I love machine ']
['I love machine l']
['I love machine le']
['I love machine lea']
['I love machine lear']
['I love machine learn']
['I love machine learni']
['I love machine learnin']
['I love machine learning']


### Same but with first three masked

In [15]:
# Basic train loop on single vector
device = "cuda" if torch.cuda.is_available() else "cpu"
embedded, mask = torch.from_numpy(tokenized_text[0]),torch.from_numpy(tokenized_text[1])
masked_start = mask
masked_start[0,0:3] = False
loss = 1 # initial value for loop condition
small_gpt.train()
small_gpt.to(device)
optimizer = small_gpt.configure_optimizers(learning_rate=1e-5, weight_decay=0.0, betas =(0.9,0.95),device_type=device)
embedded = embedded.to(device)
masked_start = masked_start.to(device)
i = 0
while loss > 1e-16:
  optimizer.zero_grad()
  if i%100 == 1:
    print(loss.item())
  pred_logits = small_gpt(embedded)
  loss_per_token = torch.nn.functional.cross_entropy(pred_logits.flatten(end_dim=1),embedded.view(-1),reduction="none")
  loss = (loss_per_token * masked_start.view(-1)).sum() / masked_start.sum()
  loss.backward()
  optimizer.step()
  i += 1


num decayed parameter tensors: 6, with 7,875,840 parameters
num non-decayed parameter tensors: 10, with 11,520 parameters
using fused AdamW: True
1.6450914144515991
0.0013822962064296007
0.00016000140749383718
1.8662036382011138e-05
3.3855380934255663e-06
1.1682502645271597e-06
5.781648724223487e-07
3.159045718348352e-07
1.7881390590446244e-07
1.0728835064810482e-07
5.960463766996327e-08
4.172324707951702e-08
4.768370942542788e-08
5.364417532405241e-08


In [17]:
small_gpt.eval()
with torch.no_grad():
  sampling = torch.zeros_like(embedded)
  sampling[0,0:3] = embedded[0,0:3]
  print(tokenizer.untokenize(sampling.cpu().numpy()))
  for i in range(20):
    pred_logits = small_gpt(sampling)
    sampling[0,0:3+i+1] = pred_logits.argmax(dim=2)[0,0:3+i+1]
    print(tokenizer.untokenize(sampling.cpu().numpy()))

['I l']
['o oo']
['o oov']
['o oove']
['o oovee']
['o ooveem']
['o ooveeme']
['o ooveemee']
['o ooveemeee']
['o ooveemeeei']
['o ooveemeeeie']
['o ooveemeeeiee']
['o ooveemeeeieee']
['o ooveemeeeieeee']
['o ooveemeeeieeeee']
['o ooveemeeeieeeeee']
['o ooveemeeeieeeeeee']
['o ooveemeeeieeeeeeee']
['o ooveemeeeieeeeeeeee']
['o ooveemeeeieeeeeeeeee']
['o ooveemeeeieeeeeeeeeee']
