In [1]:
# Mount Google Drive inside the Colab VM so files stored there are reachable
# through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Switch the working directory to the homework folder on the mounted Drive.
# NOTE(review): presumably this is where the local `model` and `utils` modules
# imported in the next cell live — confirm.
%cd /content/drive/MyDrive/hw2

/content/drive/MyDrive/hw2


In [3]:
import torch
import model
import utils

In [4]:
import torch

# Seed PyTorch's global random number generator so repeated runs of the
# notebook draw the same random numbers.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f8a70722ff0>

In [10]:
# Single-sentence toy corpus used by the sanity-check cells below.
text = ["I love machine learning"]
# NOTE(review): 24 is presumably the tokenizer's fixed/maximum sequence length
# (the sentence itself is 23 characters) — confirm in utils.Tokenizer.
tokenizer = utils.Tokenizer(24)
# Build the tokenization from the corpus before encoding it.
tokenizer.build_tokenization(text)
# Later cells read tokenized_text[0] as the token ids and tokenized_text[1] as the mask.
tokenized_text = tokenizer.tokenize(text)

In [15]:
# Build a one-layer GPT whose vocabulary matches the tokenizer.
#
# The config must be an *instance*: assigning attributes on `model.GPTConfig`
# itself rewrites the class-level values, which silently changes every config
# created afterwards in the same kernel session.
# NOTE(review): assumes GPTConfig can be constructed with no arguments
# (e.g. a dataclass whose fields all have defaults) — confirm in model.py.
one_layer_config = model.GPTConfig()
one_layer_config.vocab_size = tokenizer.vocab_size
one_layer_config.n_layer = 1
small_gpt = model.GPT(one_layer_config)

number of parameters: 7.10M


### Regular inference sampling

In [16]:
# Basic train loop on single vector
#
# Overfits `small_gpt` on the single tokenized sentence: the loop keeps taking
# optimizer steps until the mean masked cross-entropy drops to LOSS_TOLERANCE
# or MAX_STEPS is reached, whichever comes first.
LOSS_TOLERANCE = 1e-16  # stop once the mean masked loss is at or below this
MAX_STEPS = 100_000     # safety cap: a plateau above the tolerance must not loop forever
LOG_EVERY = 100         # logging period, in optimizer steps

device = "cuda" if torch.cuda.is_available() else "cpu"

# tokenized_text[0] holds the token ids, tokenized_text[1] the matching mask.
embedded, mask = torch.from_numpy(tokenized_text[0]),torch.from_numpy(tokenized_text[1])
embedded = embedded.to(device)
mask = mask.to(device)

small_gpt.train()
small_gpt.to(device)
optimizer = small_gpt.configure_optimizers(learning_rate=1e-5, weight_decay=0.0, betas =(0.9,0.95),device_type=device)

loss = 1 # initial value for loop condition
i = 0
while loss > LOSS_TOLERANCE and i < MAX_STEPS:
  optimizer.zero_grad()
  # Printed before the forward pass, so the value shown is the loss of the previous step.
  if i%LOG_EVERY == 1:
    print("Step:",i," Loss:",loss.item())
  pred_logits = small_gpt(embedded,attn_mask=mask)
  # Per-position cross-entropy against the input ids themselves; reduction is
  # done by hand below so that the mask can weight each position.
  loss_per_token = torch.nn.functional.cross_entropy(pred_logits.flatten(end_dim=1),embedded.view(-1),reduction="none")
  # Mean over the positions the mask keeps.
  loss = (loss_per_token * mask.view(-1)).sum() / mask.sum()
  loss.backward()
  optimizer.step()
  i += 1
print("Step:",i," Loss:",loss.item())
if loss > LOSS_TOLERANCE:
  print("Stopped after MAX_STEPS steps without reaching the loss tolerance")

num decayed parameter tensors: 6, with 7,875,840 parameters
num non-decayed parameter tensors: 10, with 11,520 parameters
using fused AdamW: True
Step: 1  Loss: 1.8072611093521118
Step: 101  Loss: 0.001599593204446137
Step: 201  Loss: 0.00018330039165448397
Step: 301  Loss: 2.1379692043410614e-05
Step: 401  Loss: 3.81987319997279e-06
Step: 501  Loss: 1.2853861335315742e-06
Step: 601  Loss: 6.37510311207734e-07
Step: 701  Loss: 3.4207874932690174e-07
Step: 801  Loss: 2.0732048255922564e-07
Step: 901  Loss: 1.3994133496453287e-07
Step: 1001  Loss: 7.256217315898539e-08
Step: 1101  Loss: 5.7013131993244315e-08
Step: 1201  Loss: 4.146409438021692e-08
Step: 1301  Loss: 5.7013131993244315e-08
Step: 1379  Loss: 0.0


In [17]:
# Greedy decoding with the trained model, one position per step.
small_gpt.eval()
with torch.no_grad():
  # Token buffer: all zeros except the first three ground-truth tokens.
  generated = torch.zeros_like(embedded)
  generated[0, :3] = embedded[0, :3]
  # The walk starts at position 0, so the three seeded tokens are themselves
  # replaced by the model's predictions during the first three steps.
  for pos in range(23):
    logits = small_gpt(generated)
    greedy_tokens = logits.argmax(dim=2)
    generated[0, pos] = greedy_tokens[0, pos]
    decoded = tokenizer.untokenize(generated.cpu().numpy())
    print(decoded)

['I l']
['I l']
['I l']
['I lo']
['I lov']
['I love']
['I love ']
['I love m']
['I love ma']
['I love mac']
['I love mach']
['I love machi']
['I love machin']
['I love machine']
['I love machine ']
['I love machine l']
['I love machine le']
['I love machine lea']
['I love machine lear']
['I love machine learn']
['I love machine learni']
['I love machine learnin']
['I love machine learning']


In [18]:
# Persist the trained weights (state dict only) to the current working directory.
CHECKPOINT_PATH = "sanity_checks.pth"
trained_weights = small_gpt.state_dict()
torch.save(trained_weights, CHECKPOINT_PATH)

### Same, but with the first three tokens masked out of the loss

In [19]:
# Fresh one-layer GPT for the masked-start experiment.
#
# Create a config *instance* rather than assigning onto the `model.GPTConfig`
# class: class-level assignment mutates state shared by every config built in
# this kernel session.
# NOTE(review): assumes GPTConfig can be constructed with no arguments
# (e.g. a dataclass whose fields all have defaults) — confirm in model.py.
one_layer_config = model.GPTConfig()
one_layer_config.vocab_size = tokenizer.vocab_size
one_layer_config.n_layer = 1
small_gpt = model.GPT(one_layer_config)

number of parameters: 7.10M


In [20]:
# Basic train loop on single vector
#
# Same overfitting run as before, except that the first three positions are
# excluded from the loss (the attention mask passed to the model is unchanged).
# The loop stops when the mean masked loss reaches LOSS_TOLERANCE or after
# MAX_STEPS optimizer steps, whichever comes first.
LOSS_TOLERANCE = 1e-16  # stop once the mean masked loss is at or below this
MAX_STEPS = 100_000     # safety cap: a plateau above the tolerance must not loop forever
LOG_EVERY = 100         # logging period, in optimizer steps

device = "cuda" if torch.cuda.is_available() else "cpu"

# tokenized_text[0] holds the token ids, tokenized_text[1] the matching mask.
embedded, mask = torch.from_numpy(tokenized_text[0]),torch.from_numpy(tokenized_text[1])

# Loss mask: a copy of the tokenizer mask with positions 0-2 switched off.
masked_start = mask.clone()
masked_start[0,0:3] = False

embedded = embedded.to(device)
# `mask` is re-created on the CPU above and is passed to the model as
# attn_mask, so it has to follow the inputs and the model onto `device`.
mask = mask.to(device)
masked_start = masked_start.to(device)

small_gpt.train()
small_gpt.to(device)
optimizer = small_gpt.configure_optimizers(learning_rate=1e-5, weight_decay=0.0, betas =(0.9,0.95),device_type=device)

loss = 1 # initial value for loop condition
i = 0
while loss > LOSS_TOLERANCE and i < MAX_STEPS:
  optimizer.zero_grad()
  # Printed before the forward pass, so the value shown is the loss of the previous step.
  if i%LOG_EVERY == 1:
    print("Step:",i," Loss:",loss.item())
  pred_logits = small_gpt(embedded,attn_mask=mask)
  # Per-position cross-entropy against the input ids themselves; reduction is
  # done by hand below so that the loss mask can weight each position.
  loss_per_token = torch.nn.functional.cross_entropy(pred_logits.flatten(end_dim=1),embedded.view(-1),reduction="none")
  # Mean over the positions kept by masked_start (first three excluded).
  loss = (loss_per_token * masked_start.view(-1)).sum() / masked_start.sum()
  loss.backward()
  optimizer.step()
  i += 1

print("Step:",i," Loss:",loss.item())
if loss > LOSS_TOLERANCE:
  print("Stopped after MAX_STEPS steps without reaching the loss tolerance")

num decayed parameter tensors: 6, with 7,875,840 parameters
num non-decayed parameter tensors: 10, with 11,520 parameters
using fused AdamW: True
Step: 1  Loss: 2.060918092727661
Step: 101  Loss: 0.001617064350284636
Step: 201  Loss: 0.00019303396402392536
Step: 301  Loss: 2.2637575966655277e-05
Step: 401  Loss: 3.856413059111219e-06
Step: 501  Loss: 1.2636176052183146e-06
Step: 601  Loss: 6.139276820249506e-07
Step: 701  Loss: 3.337859197927173e-07
Step: 801  Loss: 1.5497205652081902e-07
Step: 901  Loss: 1.1324881654672936e-07
Step: 1001  Loss: 7.152556946721234e-08
Step: 1101  Loss: 5.960463766996327e-08
Step: 1201  Loss: 4.172324707951702e-08
Step: 1301  Loss: 1.192092824453539e-08
Step: 1401  Loss: 5.364417532405241e-08
Step: 1403  Loss: 0.0


In [21]:
# Greedy decoding that starts at position 0, i.e. it also re-predicts the three
# seeded positions.
small_gpt.eval()
with torch.no_grad():
  token_buffer = torch.zeros_like(embedded)
  prefix = embedded[0, 0:3]
  token_buffer[0, 0:3] = prefix
  for step in range(23):
    step_logits = small_gpt(token_buffer)
    # Keep only the arg-max token predicted at the current position.
    best_ids = step_logits.argmax(dim=2)
    token_buffer[0, step] = best_ids[0, step]
    as_numpy = token_buffer.cpu().numpy()
    print(tokenizer.untokenize(as_numpy))

['o l']
['o l']
['o o']
['o oo']
['o ooo']
['o oooo']
['o oooo ']
['o oooo m']
['o oooo mo']
['o oooo mom']
['o oooo momo']
['o oooo momom']
['o oooo momomn']
['o oooo momomnm']
['o oooo momomnmm']
['o oooo momomnmmm']
['o oooo momomnmmmm']
['o oooo momomnmmmmm']
['o oooo momomnmmmmmr']
['o oooo momomnmmmmmrm']
['o oooo momomnmmmmmrmi']
['o oooo momomnmmmmmrmim']
['o oooo momomnmmmmmrmimm']


In [22]:
# Greedy decoding that leaves the three seeded tokens untouched and only fills
# in positions 3 through 22.
small_gpt.eval()
with torch.no_grad():
  completion = torch.zeros_like(embedded)
  completion[0, :3] = embedded[0, :3]
  for position in range(3, 23):
    logits = small_gpt(completion)
    predicted_ids = logits.argmax(dim=2)
    completion[0, position] = predicted_ids[0, position]
    print(tokenizer.untokenize(completion.cpu().numpy()))

['I lo']
['I lov']
['I love']
['I love ']
['I love m']
['I love ma']
['I love mac']
['I love mach']
['I love machi']
['I love machin']
['I love machine']
['I love machine ']
['I love machine l']
['I love machine le']
['I love machine lea']
['I love machine lear']
['I love machine learn']
['I love machine learni']
['I love machine learnin']
['I love machine learning']
