In [81]:
## Libraries
import wget
import torch
import xtarfile as tarfile
from fairseq.data.data_utils import collate_tokens
from fairseq.models.roberta import RobertaModel
from transformers import pipeline

import sys, time, csv
from transformers import RobertaTokenizer, RobertaForMaskedLM


# Previous codebase

In [82]:
# https://ramsrigoutham.medium.com/sized-fill-in-the-blank-or-multi-mask-filling-with-roberta-and-huggingface-transformers-58eb9e7fb0c
def get_predictions(string, tokenizer, model):
  """Predict the single most likely token for every mask token in `string`.

  Args:
    string: Text containing zero or more occurrences of the tokenizer's mask
      token.
    tokenizer: Object providing `encode(text, return_tensors='pt')`,
      `mask_token_id` and `decode(token_id)`.
    model: Masked-language model; its first output is indexed per token
      position and ranked over the last axis.

  Returns:
    A list with one decoded, whitespace-stripped prediction per mask token, in
    the order the masks appear. Empty when `string` contains no mask token.
  """
  token_ids = tokenizer.encode(string, return_tensors='pt')

  # Index the single batch row explicitly instead of squeeze(), which would
  # also collapse the sequence axis for a one-token input.
  masked_pos = (token_ids[0] == tokenizer.mask_token_id).nonzero().flatten().tolist()

  # Follow the model's own device rather than assuming CUDA, so the function
  # also works when the model was left on the CPU.
  try:
    device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    device = torch.device('cpu')
  token_ids = token_ids.to(device)

  with torch.no_grad():
    output = model(token_ids)

  # One row of scores per token position of the (single) input sequence.
  scores = output[0][0]

  predictions = []
  for mask_index in masked_pos:
    # k=1: keep only the highest-scoring candidate for this mask.
    best_token_id = torch.topk(scores[mask_index], k=1, dim=0)[1].item()
    predictions.append(tokenizer.decode(best_token_id).strip())

  return predictions

In [83]:
def get_mask_indices(string):
  """Return the positions of the whitespace-separated words containing '<mask>'.

  Positions are 0-based indices into `string.split()`. A word counts once even
  if '<mask>' is only part of it (e.g. '<mask>,').
  """
  return [
    position
    for position, word in enumerate(string.split())
    if '<mask>' in word
  ]


In [84]:
# returns true if strings match minus special characters (we may have some accuracy loss for things like well vs we'll)
def strings_match(a, b):
  """Compare `a` and `b` using only their alphabetic characters.

  Everything that is not a letter (punctuation, digits, whitespace) is ignored;
  letter case is still significant.
  """
  letters_a = list(filter(lambda ch: ch.isalpha(), a))
  letters_b = list(filter(lambda ch: ch.isalpha(), b))
  return letters_a == letters_b

In [85]:
def current_time_milli():
  """Return the current wall-clock time as whole milliseconds since the epoch."""
  seconds_since_epoch = time.time()
  return round(seconds_since_epoch * 1000)

In [80]:
def eval(tokenizer, model, masked_dataset, original_dataset, max_chars=512):
  """Measure how often the model restores the masked words of a dataset.

  NOTE: the name shadows Python's built-in `eval`; it is kept because other
  cells call it by this name.

  Args:
    tokenizer: Tokenizer forwarded to `get_predictions`.
    model: Masked-language model forwarded to `get_predictions`.
    masked_dataset: Rows whose second field is the text containing '<mask>'
      placeholders. Empty rows are skipped.
    original_dataset: Rows aligned by position with `masked_dataset`; the
      second field is the unmasked text.
    max_chars: Masked texts longer than this many characters are cut to this
      length before prediction.

  Returns:
    The fraction of masked words predicted correctly (compared with
    `strings_match`), or 0.0 when no masked word was found at all.
  """
  start_time = current_time_milli()
  dataset_size = len(masked_dataset)

  correct = 0
  total = 0
  for i, row in enumerate(masked_dataset):
    if not row:
      continue

    string = row[1]

    # For longer strings, roberta complains with this error:
    # Token indices sequence length is longer than the specified maximum sequence length
    # for this model (1891 > 512). Running this sequence through the model will result in indexing errors
    #
    # This is a less-than-ideal workaround for now: the cut is made on
    # characters, not tokens, so masks beyond the cut are simply not evaluated.
    if len(string) > max_chars:
      string = string[:max_chars]

    mask_indices = get_mask_indices(string)
    predictions = get_predictions(string, tokenizer, model)
    total += len(mask_indices)

    original_string_tokens = original_dataset[i][1].split()
    for j, mask_index in enumerate(mask_indices):
      if strings_match(predictions[j], original_string_tokens[mask_index]):
        correct += 1

    # 1-based count for the progress line; kept separate from the loop index
    # so the index used for dataset lookups is never mutated.
    processed = i + 1

    processed_msg = 'Processed message ' + str(processed) + ' out of ' + str(dataset_size)
    elapsed_seconds = (current_time_milli() - start_time) / 1000
    eta_seconds = elapsed_seconds / processed * (dataset_size - processed)
    eta_minutes = int(eta_seconds / 60)
    eta_seconds = int(eta_seconds % 60)
    eta_msg = 'Estimated time remaining: ' + str(eta_minutes) + ' minutes ' + str(eta_seconds) + ' seconds'
    sys.stdout.write('\r' + processed_msg + ' | ' + eta_msg)
    sys.stdout.flush()

  # Avoid ZeroDivisionError when the dataset is empty or contains no masks.
  accuracy = correct / total if total else 0.0
  print('\n')
  print('Model predicted ' + str(correct) + ' out of ' + str(total) + '.')
  print('Accuracy: ' + str(accuracy))

  return accuracy

In [None]:
!nvidia-smi

Thu Apr 29 19:51:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 466.11       Driver Version: 466.11       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| 43%   32C    P5    26W / 370W |   1632MiB / 10240MiB |     45%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Filling masks (example)

In [None]:
# Initialize MLM pipeline (no model is named, so the library picks its default
# fill-mask checkpoint)
mlm = pipeline('fill-mask')

# Get mask token
mask = mlm.tokenizer.mask_token

# Get result for particular masked phrase. The phrase is built from `mask` so
# it stays valid whichever tokenizer the pipeline loaded, and the number of
# candidates is requested with `top_k`, the keyword the pipeline call accepts.
results = mlm('Read the rest of this ' + mask + ' to understand things in more detail', top_k=5)

# Print result
for result in results:
  print(result)

{'sequence': 'Read the rest of this article to understand things in more detail', 'score': 0.35419148206710815, 'token': 1566, 'token_str': ' article'}
{'sequence': 'Read the rest of this post to understand things in more detail', 'score': 0.20478709042072296, 'token': 618, 'token_str': ' post'}
{'sequence': 'Read the rest of this guide to understand things in more detail', 'score': 0.07164707034826279, 'token': 4704, 'token_str': ' guide'}
{'sequence': 'Read the rest of this essay to understand things in more detail', 'score': 0.06781881302595139, 'token': 14700, 'token_str': ' essay'}
{'sequence': 'Read the rest of this blog to understand things in more detail', 'score': 0.04165174812078476, 'token': 5059, 'token_str': ' blog'}


# Testing with different models

In [None]:
# List the model entry points exposed by the 'pytorch/fairseq' hub repository.
# The call's return value is this cell's displayed output; the trailing comment
# shows an abbreviated example of it.
torch.hub.list('pytorch/fairseq')  # [..., 'transformer_lm.wmt19.en', ...]

Using cache found in C:\Users\Juan Parra/.cache\torch\hub\pytorch_fairseq_master


['bart.base',
 'bart.large',
 'bart.large.cnn',
 'bart.large.mnli',
 'bart.large.xsum',
 'bpe',
 'camembert',
 'camembert-base',
 'camembert-base-ccnet',
 'camembert-base-ccnet-4gb',
 'camembert-base-oscar-4gb',
 'camembert-base-wikipedia-4gb',
 'camembert-large',
 'camembert.v0',
 'conv.stories',
 'conv.stories.pretrained',
 'conv.wmt14.en-de',
 'conv.wmt14.en-fr',
 'conv.wmt17.en-de',
 'data.stories',
 'dynamicconv.glu.wmt14.en-fr',
 'dynamicconv.glu.wmt16.en-de',
 'dynamicconv.glu.wmt17.en-de',
 'dynamicconv.glu.wmt17.zh-en',
 'dynamicconv.no_glu.iwslt14.de-en',
 'dynamicconv.no_glu.wmt16.en-de',
 'lightconv.glu.wmt14.en-fr',
 'lightconv.glu.wmt16.en-de',
 'lightconv.glu.wmt17.en-de',
 'lightconv.glu.wmt17.zh-en',
 'lightconv.no_glu.iwslt14.de-en',
 'lightconv.no_glu.wmt16.en-de',
 'roberta.base',
 'roberta.large',
 'roberta.large.mnli',
 'roberta.large.wsc',
 'tokenizer',
 'transformer.wmt14.en-fr',
 'transformer.wmt16.en-de',
 'transformer.wmt18.en-de',
 'transformer.wmt19.de-en',

## Different pre-trained model results

In [86]:
import sys, time, csv
import torch
import logging

from transformers import RobertaTokenizer, RobertaForMaskedLM

def main():
  """Evaluate several pre-trained RoBERTa checkpoints on the masked test set.

  Each checkpoint is loaded, switched to inference mode, moved to the GPU when
  one is available, and scored with `eval`.

  Returns:
    A `(best_model, accuracy)` tuple: the name of the checkpoint with the
    highest accuracy ('' if none scored above 0.0) and that accuracy.
  """
  models = ['roberta-base', 'roberta-large', 'distilroberta-base']

  # The datasets do not depend on the checkpoint, so read them once instead of
  # once per model.
  with open('project_dataset/test_masked.csv', newline='') as file:
    masked_dataset = list(csv.reader(file))

  with open('project_dataset/test.csv', newline='') as file:
    original_dataset = list(csv.reader(file))

  # remove headers
  masked_dataset.pop(0)
  original_dataset.pop(0)

  best_model = ''
  accuracy = 0.0
  for entry in models:
    tokenizer = RobertaTokenizer.from_pretrained(entry)
    model = RobertaForMaskedLM.from_pretrained(entry)
    model.eval()

    # move to cuda if available
    if torch.cuda.is_available():
      torch.cuda.empty_cache()
      model = model.to('cuda')
      logging.info("Using cuda now")

    model_accuracy = eval(tokenizer, model, masked_dataset, original_dataset)
    if model_accuracy > accuracy:
      # Remember the checkpoint name, not the model object, so a superseded
      # model is not kept alive by this variable.
      best_model = entry
      accuracy = model_accuracy

  return best_model, accuracy

# Raise the csv module's field size limit as far as the platform allows:
# start from sys.maxsize and shrink the candidate by a factor of 10 every time
# csv rejects it with OverflowError.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
    except OverflowError:
        max_int = int(max_int/10)
        continue
    break
main()

2021-04-29 22:11:59 | INFO | root | Using cuda now


Processed message 86 out of 206961 | Estimated time remaining: 28 minutes 47 seconds

KeyboardInterrupt: ignored

In [45]:
import happytransformer
from happytransformer import HappyWordPrediction

In [78]:
# Load roberta-base through happytransformer's word-prediction wrapper and ask
# for the five highest-scoring fillers of the masked slot. The prompt marks the
# slot with "[MASK]" here, unlike the "<mask>" used in the transformers
# pipeline cell.
happy_roberta = HappyWordPrediction("ROBERTA", "roberta-base")
result = happy_roberta.predict_mask("Read the rest of this [MASK] to understand things in more detail", top_k=5)
print(result)

2021-04-29 21:38:41 | INFO | happytransformer.happy_transformer | Using model: cuda


[WordPredictionResult(token=' article', score=0.48412269353866577), WordPredictionResult(token=' post', score=0.3602624535560608), WordPredictionResult(token=' piece', score=0.023724298924207687), WordPredictionResult(token=' paper', score=0.01611217111349106), WordPredictionResult(token=' story', score=0.015522404573857784)]


In [73]:
# Same word-prediction check with bert-base-uncased, keeping only the two
# highest-scoring fillers. `result` is overwritten by every demo cell.
happy_bert = HappyWordPrediction("BERT", "bert-base-uncased")
result = happy_bert.predict_mask("To better the world I would invest in [MASK] and education.", top_k=2)
print(result)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2021-04-29 21:24:55 | INFO | happytransformer.happy_transformer | Using model: cuda


[WordPredictionResult(token='health', score=0.22784867882728577), WordPredictionResult(token='research', score=0.17031845450401306)]


In [74]:
# Same prompt as the BERT cell, answered by albert-xxlarge-v2 (top two
# fillers), so the two models' suggestions can be compared side by side.
happy_albert = HappyWordPrediction("ALBERT", "albert-xxlarge-v2")
result = happy_albert.predict_mask("To better the world I would invest in [MASK] and education.", top_k=2)
print(result)

2021-04-29 21:25:00 | INFO | happytransformer.happy_transformer | Using model: cuda


[WordPredictionResult(token='infrastructure', score=0.09300383180379868), WordPredictionResult(token='healthcare', score=0.07224401831626892)]


## Fine Tuning Roberta

In [None]:
# Model paths
# Model paths
# NOTE(review): none of these constants is referenced by the other cells
# visible in this notebook; confirm whether a later step consumes them.
MODEL_TYPE = "roberta" 
MODEL_DIR = "models/roberta" 
OUTPUT_DIR = "models/roberta/output" 
# Training / evaluation text files (relative to the notebook's working directory)
TRAIN_PATH = "data/train.txt" 
EVAL_PATH = "data/dev.txt"

In [None]:
# Shell script (kept as a Python string) that sets a few hyper-parameter
# variables and then invokes fairseq-train on RTE-bin/ with the
# sentence_prediction task, restoring weights from roberta.large/model.pt.
# Because this is a regular (non-raw) triple-quoted string, Python consumes each
# backslash-newline pair, so the fairseq-train invocation is stored as a single
# long line. The '#' comments inside the string belong to the shell script.
# NOTE(review): no visible cell executes `cmd`; it is only displayed.
cmd = """
    TOTAL_NUM_UPDATES=2036  # 10 epochs through RTE for bsz 16
    WARMUP_UPDATES=122      # 6 percent of the number of updates
    LR=2e-05                # Peak LR for polynomial LR scheduler.
    NUM_CLASSES=2
    MAX_SENTENCES=16        # Batch size.
    ROBERTA_PATH=roberta.large/model.pt

    CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \
    --restore-file $ROBERTA_PATH \
    --max-positions 512 \
    --batch-size $MAX_SENTENCES \
    --max-tokens 4400 \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 1 \
    --init-token 0 --separator-token 2 \
    --arch roberta_large \
    --criterion sentence_prediction \
    --num-classes $NUM_CLASSES \
    --dropout 0.1 --attention-dropout 0.1 \
    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
    --max-epoch 10 \
    --find-unused-parameters \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; """

In [None]:
cmd

'\n    TOTAL_NUM_UPDATES=2036  # 10 epochs through RTE for bsz 16\n    WARMUP_UPDATES=122      # 6 percent of the number of updates\n    LR=2e-05                # Peak LR for polynomial LR scheduler.\n    NUM_CLASSES=2\n    MAX_SENTENCES=16        # Batch size.\n    ROBERTA_PATH=roberta.large/model.pt\n\n    CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/     --restore-file $ROBERTA_PATH     --max-positions 512     --batch-size $MAX_SENTENCES     --max-tokens 4400     --task sentence_prediction     --reset-optimizer --reset-dataloader --reset-meters     --required-batch-size-multiple 1     --init-token 0 --separator-token 2     --arch roberta_large     --criterion sentence_prediction     --num-classes $NUM_CLASSES     --dropout 0.1 --attention-dropout 0.1     --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06     --clip-norm 0.0     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES     --fp16 --fp1

In [None]:
import sys, time, csv
import torch
import logging

from transformers import RobertaTokenizer, RobertaForMaskedLM

def main():
  """Evaluate the fine-tuned checkpoint stored under 'tmp' on the masked test set.

  The tokenizer always comes from 'roberta-base'; only the model weights are
  loaded from each entry of `models`.

  Returns:
    A `(best_model, accuracy)` tuple: the entry with the highest accuracy
    ('' if none scored above 0.0) and that accuracy.
  """
  models = ['tmp']

  # The datasets do not depend on the checkpoint, so read them once instead of
  # once per model.
  with open('project_dataset/test_masked.csv', newline='') as file:
    masked_dataset = list(csv.reader(file))

  with open('project_dataset/test.csv', newline='') as file:
    original_dataset = list(csv.reader(file))

  # remove headers
  masked_dataset.pop(0)
  original_dataset.pop(0)

  best_model = ''
  accuracy = 0.0
  for entry in models:
    # NOTE(review): `truncation` and `do_lower_case` are passed at load time
    # here; confirm RobertaTokenizer actually honors them in this position.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
    model = RobertaForMaskedLM.from_pretrained(entry)
    model.eval()

    # move to cuda if available
    if torch.cuda.is_available():
      torch.cuda.empty_cache()
      model = model.to('cuda')
      logging.info("Using cuda now")

    model_accuracy = eval(tokenizer, model, masked_dataset, original_dataset)
    if model_accuracy > accuracy:
      # Remember the checkpoint name, not the model object, so a superseded
      # model is not kept alive by this variable.
      best_model = entry
      accuracy = model_accuracy

  return best_model, accuracy

# Find the largest field size limit the csv module accepts on this platform:
# try sys.maxsize first and divide the candidate by 10 after every
# OverflowError until the call succeeds.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
    except OverflowError:
        max_int = int(max_int/10)
        continue
    break
main()

NameError: ignored

# Updated accuracy based on fine-tuning

In [None]:
from fairseq.models.roberta import RobertaModel

# Load the best fine-tuned checkpoint together with the binarized RTE data.
roberta = RobertaModel.from_pretrained(
    'checkpoints/',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='RTE-bin'
)


def label_fn(label):
    """Map a predicted class index to its label string.

    The index is shifted by `nspecial` before being looked up in the task's
    label dictionary.
    """
    return roberta.task.label_dictionary.string(
        [label + roberta.task.label_dictionary.nspecial]
    )


ncorrect, nsamples = 0, 0
# Use the GPU only when one is present so the cell also runs on CPU-only hosts.
if torch.cuda.is_available():
    roberta.cuda()
roberta.eval()
# Explicit encoding so the result does not depend on the platform default
# (assumes the GLUE TSV is UTF-8 -- TODO confirm).
with open('glue_data/RTE/dev.tsv', encoding='utf-8') as fin:
    fin.readline()  # skip the header row
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for line in fin:
            fields = line.strip().split('\t')
            # Skip blank or short lines instead of failing on a missing column.
            if len(fields) < 4:
                continue
            sent1, sent2, target = fields[1], fields[2], fields[3]
            tokens = roberta.encode(sent1, sent2)
            prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
            prediction_label = label_fn(prediction)
            ncorrect += int(prediction_label == target)
            nsamples += 1
# Report 0.0 rather than dividing by zero when no sample was read.
print('| Accuracy: ', float(ncorrect)/float(nsamples) if nsamples else 0.0)