# Week 8: Language modeling!

# Rasika Bhalerao

# Agenda

- Language models
- BERT for classification

In [None]:
# !pip install transformers
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification, AdamW, logging
from torch.nn import functional as F
from sklearn.metrics import f1_score

logging.set_verbosity_error()

if torch.cuda.is_available():    
  device = torch.device("cuda")
  print('Using GPU ', torch.cuda.get_device_name(0))
else:
  device = torch.device("cpu")
  print('Using CPU')


Using CPU


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

In [86]:
sentence = 'Feature normalization produces features with mean [MASK] and variance 1.'
# sentence = '[MASK] have one question [MASK] you Amy.'
# sentence = 'Maddy do you want to [MASK] your announcement?'
# sentence = 'one two three [MASK] five'


mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
token_ids = tokenizer.encode(sentence, return_tensors='pt')
output = model(token_ids)[0].squeeze(0)

for mask_idx in (token_ids==mask_id)[0].nonzero():
  hs = output[mask_idx.item()]
  log_probs = torch.nn.LogSoftmax(dim=0)(hs)
  best_guess = tokenizer.convert_ids_to_tokens(torch.argmax(log_probs).item())
  print(best_guess)

0


### BERT for classification

In [106]:
# Hyperparameter tuning
learning_rates = [1e-3, 1e-4, 1e-5]
batch_sizes = [8, 16, 32] # unused with fake data
max_epochs = 100
early_stop_epochs = 3
epsilon = 1e-5

losses = {} # dict from loss (float) to array of hyperparameter values

In [130]:
# Training set
example_batch_of_text = ['whiskers tail tail paw purr', 'meow whiskers whiskers', 'paw woof bark bark']
example_batch_of_labels = torch.tensor([1,1,0])
train_text_batches = [example_batch_of_text]
train_label_batches = [example_batch_of_labels]

# Dev set
example_batch_of_text = ['paw bark woof bark', 'meow meow paw purr', 'henlo whiskers']
example_batch_of_labels = torch.tensor([0,1,1])
dev_text_batches = [example_batch_of_text]
dev_label_batches = [example_batch_of_labels]

# Test set
example_batch_of_text = ['bark paw paw paw', 'meow whiskers purr', 'woof woof woof']
example_batch_of_labels = torch.tensor([0,1,0])
test_text_batches = [example_batch_of_text]
test_label_batches = [example_batch_of_labels]

In [110]:
# Hyperparameter tuning

for lr in learning_rates:

  # train using selected hyperparameter
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
  model.to(device)
  model.train()
  optimizer = AdamW(model.parameters(), lr=lr)

  nondecreasing_epochs = early_stop_epochs
  prev_loss = None
  for epoch in range(max_epochs):
    total_loss = 0
    for text_batch, label_batch in zip(train_text_batches, train_label_batches):
      encoding = tokenizer(text_batch, 
                          return_tensors='pt', padding=True, 
                          truncation=True, max_length=256)
      input_ids = encoding['input_ids']
      attention_mask = encoding['attention_mask']

      outputs = model(input_ids, attention_mask=attention_mask)
      loss = F.cross_entropy(outputs.logits, label_batch)

      total_loss += loss.item()
      loss.backward()
      optimizer.step()

    print(f'Train loss epoch {epoch}: {total_loss}')
    if prev_loss is not None and prev_loss - epsilon <= total_loss:
      nondecreasing_epochs -= 1
    else:
      nondecreasing_epochs = early_stop_epochs
    prev_loss = total_loss
    if nondecreasing_epochs <= 0:
      break
  
  # check dev loss with current model
  total_loss = 0
  for text_batch, label_batch in zip(dev_text_batches, dev_label_batches):
    encoding = tokenizer(text_batch, 
                        return_tensors='pt', padding=True, 
                        truncation=True, max_length=256)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask)
    loss = F.cross_entropy(outputs.logits, label_batch)

    total_loss += loss.item()
  print(f'Dev loss: {total_loss}')
  losses[total_loss] = [lr]

Train loss epoch 0: 0.675462007522583
Train loss epoch 1: 0.7045699954032898
Train loss epoch 2: 0.4964850842952728
Train loss epoch 3: 0.6509253978729248
Train loss epoch 4: 1.9392576217651367
Train loss epoch 5: 3.7950007915496826
Dev loss: 0.9086148738861084
Train loss epoch 0: 0.789372444152832
Train loss epoch 1: 0.5797554850578308
Train loss epoch 2: 0.4332538843154907
Train loss epoch 3: 0.4039294421672821
Train loss epoch 4: 0.5009769201278687
Train loss epoch 5: 0.3650988042354584
Train loss epoch 6: 0.29423579573631287
Train loss epoch 7: 0.2196759134531021
Train loss epoch 8: 0.17826694250106812
Train loss epoch 9: 0.16204044222831726
Train loss epoch 10: 0.1118406280875206
Train loss epoch 11: 0.10373073816299438
Train loss epoch 12: 0.06387980282306671
Train loss epoch 13: 0.03434162214398384
Train loss epoch 14: 0.012304903008043766
Train loss epoch 15: 0.0088256960734725
Train loss epoch 16: 0.005327882710844278
Train loss epoch 17: 0.004738897085189819
Train loss epoch 

In [111]:
# Train model with best hyperparameter

best_hyperparams = losses[min(losses)]
best_lr = best_hyperparams[0]
print(f'Best lr: {best_lr}')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=best_lr)

nondecreasing_epochs = early_stop_epochs
prev_loss = None
for epoch in range(max_epochs):
  total_loss = 0
  for text_batch, label_batch in zip(train_text_batches, train_label_batches):
    encoding = tokenizer(text_batch, 
                        return_tensors='pt', padding=True, 
                        truncation=True, max_length=256)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask)
    loss = F.cross_entropy(outputs.logits, label_batch)

    total_loss += loss.item()
    loss.backward()
    optimizer.step()

  print(f'Train loss epoch {epoch}: {total_loss}')
  if prev_loss is not None and prev_loss - epsilon <= total_loss:
    nondecreasing_epochs -= 1
  prev_loss = total_loss
  if nondecreasing_epochs <= 0:
    break


Best lr: 0.0001
Train loss epoch 0: 0.7076751589775085
Train loss epoch 1: 0.5344696640968323
Train loss epoch 2: 0.23449377715587616
Train loss epoch 3: 0.17738954722881317
Train loss epoch 4: 0.15177451074123383
Train loss epoch 5: 0.12609608471393585
Train loss epoch 6: 0.11721354722976685
Train loss epoch 7: 0.08962824940681458
Train loss epoch 8: 0.07122112065553665
Train loss epoch 9: 0.049825847148895264
Train loss epoch 10: 0.02504490502178669
Train loss epoch 11: 0.013294324278831482
Train loss epoch 12: 0.013333950191736221
Train loss epoch 13: 0.00651288777589798
Train loss epoch 14: 0.008425845764577389
Train loss epoch 15: 0.005788961425423622
Train loss epoch 16: 0.006442824844270945


In [131]:
# Get F1 score of trained model on test set

output_labels = torch.tensor([])
for text_batch, label_batch in zip(test_text_batches, test_label_batches):
  encoding = tokenizer(text_batch, 
                      return_tensors='pt', padding=True, 
                      truncation=True, max_length=256)
  input_ids = encoding['input_ids']
  attention_mask = encoding['attention_mask']

  outputs = model(input_ids, attention_mask=attention_mask)
  predicted_labels = torch.argmax(outputs.logits, dim=1)
  output_labels = torch.cat([output_labels, predicted_labels], dim=0)


correct_labels = torch.cat(test_label_batches, dim=0)
f1 = f1_score(correct_labels, output_labels)
print(f'F1 score: {f1}')

F1 score: 1.0


In [133]:
outputs.logits

tensor([[ 1.9954, -3.4220],
        [-1.9956,  2.9089],
        [ 2.3230, -3.7223]], grad_fn=<AddmmBackward>)