In [1]:
# Switch the working directory to the project folder so that relative paths
# used by later cells (e.g. "ner_dataset.csv") resolve inside it.
# NOTE(review): absolute path, presumably a mounted Google Drive in Colab — confirm before running elsewhere.
%cd /content/drive/MyDrive/FYP_Edited

/content/drive/MyDrive/FYP_Edited


In [11]:
!pip3 install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.6MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 56.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 44.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=2d16e9bfefa93

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm

# Fix the torch RNG so runs are repeatable.
torch.manual_seed(2020)

# Report the runtime. The device name is queried only when CUDA is usable:
# asking for the current device on a CPU-only runtime raises instead of printing.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
print(cuda_available)
print(torch.__version__)


Tesla P100-PCIE-16GB
True
1.7.0+cu101


In [3]:
import pandas as pd
import numpy as np

# Load the token-level NER table (one row per token).
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
# Forward-fill missing cells with the value from the row above.
# DataFrame.ffill() is the direct equivalent of fillna(method="ffill"), whose
# `method` argument is deprecated in recent pandas releases.
# NOTE(review): presumably only "Sentence #" is sparse (set on the first token
# of each sentence) — confirm that no other column relies on this fill.
data = data.ffill()


In [4]:
class SentenceGetter(object):
    """Group a token-level NER table into sentences.

    Every sentence is a list of ``(word, pos, tag)`` tuples built from the
    "Word", "POS" and "Tag" columns of the rows that share one "Sentence #"
    value.

    Attributes:
        n_sent: number used to build the label ("Sentence: <n>") that
            ``get_next`` looks up next; starts at 1.
        data: the DataFrame passed to the constructor.
        empty: flag initialised to False; this class never updates it.
        grouped: Series mapping each "Sentence #" label to its tuple list.
        sentences: all tuple lists, in the order of ``grouped``.
    """

    def __init__(self, data):
        """Build the per-sentence tuple lists from ``data``.

        Args:
            data: DataFrame with "Sentence #", "Word", "POS" and "Tag" columns.
        """
        self.n_sent = 1
        self.data = data
        self.empty = False
        # groupby orders the labels as strings by default, so e.g.
        # "Sentence: 10" comes before "Sentence: 2" in `grouped`/`sentences`.
        self.grouped = self.data.groupby("Sentence #").apply(self._to_tuples)
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_tuples(group):
        """Turn the rows of one sentence into a list of (word, pos, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence by number, or None when there is none.

        Looks up the label "Sentence: <n_sent>" and advances ``n_sent`` on
        success. Only a missing label yields None; any other error is
        propagated instead of being silently swallowed.
        """
        try:
            sentence = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return sentence


In [5]:
# Group the token table into sentences once; the constructor does all the work.
getter = SentenceGetter(data)


In [6]:
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences


In [7]:
# Tag vocabulary: index 0 is the "[PAD]" entry, followed by every distinct tag.
# The distinct values are sorted because iterating a bare set of strings has
# no stable order across interpreter runs, which would silently change the
# label <-> index mapping between kernel restarts.
tags = ["[PAD]"]
tags.extend(sorted(set(data["Tag"].values)))
tag2idx = {t: i for i, t in enumerate(tags)}

# Word vocabulary: "[PAD]" and "[UNK]" take indices 0 and 1, followed by every
# distinct word in sorted order (same reproducibility reason as above).
words = ["[PAD]", "[UNK]"]
words.extend(sorted(set(data["Word"].values)))
word2idx = {t: i for i, t in enumerate(words)}


In [8]:
# Positional split of the sentence list: the first 15000 sentences are held
# out for testing, the next 5000 for validation, the remainder for training.
test_sentences = sentences[:15000]
val_sentences = sentences[15000:20000]
train_sentences = sentences[20000:]


In [13]:
import random
from transformers import pipeline


In [16]:
class TransformerAugmenter():
    """
    Use the pretrained masked language model to generate more
    labeled samples from one labeled sentence.

    A sentence is a list of ``(word, pos, tag)`` tuples. Augmentation masks
    one token position at a time, lets the fill-mask pipeline propose
    replacements, and swaps in one of them while keeping the original POS and
    tag of that position.

    NOTE(review): the tag is kept even when the replaced word was an entity,
    so the label of the new word can be wrong (the notebook's own sample
    output shows a replacement word carrying the 'B-geo' tag of the word it
    replaced).
    """

    def __init__(self, num_sample_tokens=5, model="distilroberta-base"):
        """Create the fill-mask pipeline.

        Args:
            num_sample_tokens: upper bound on how many of the pipeline's
                top-ranked predictions a replacement is drawn from.
            model: name of the masked language model handed to the pipeline.
        """
        self.num_sample_tokens = num_sample_tokens
        # The pipeline keeps its own default number of predictions
        # (presumably 5 in the release this notebook used — confirm when
        # upgrading). generate() never indexes past what is actually
        # returned, so the two settings cannot get out of sync.
        self.fill_mask = pipeline(
            "fill-mask",
            model=model
        )

    def generate(self, sentence, num_replace_tokens=3):
        """Return ``[sentence, augmented_sentence]``.

        Args:
            sentence: list of ``(word, pos, tag)`` tuples; it is not modified.
            num_replace_tokens: number of replacement rounds. Each round picks
                one random position, so the same position may be picked twice.

        Returns:
            A two-element list: the input sentence itself and a new list of
            the same length where the picked positions carry a predicted word
            with the original POS and tag.
        """
        augmented_sentence = list(sentence)
        # Nothing can be masked in an empty sentence.
        if not augmented_sentence:
            return [sentence, augmented_sentence]

        mask_token = f"{self.fill_mask.tokenizer.mask_token}"
        for _ in range(num_replace_tokens):
            # Pick a position rather than a word: masking by position cannot
            # hit a substring of another word, and only the picked occurrence
            # of a repeated word is replaced.
            position = random.randrange(len(augmented_sentence))
            masked_words = [w[0] for w in augmented_sentence]
            masked_words[position] = mask_token
            masked_text = " ".join(masked_words)

            # Sample among the best predictions, bounded by how many the
            # pipeline actually returned.
            candidates = self.fill_mask(masked_text)[:self.num_sample_tokens]
            if not candidates:
                continue
            prediction = random.choice(candidates)

            # Drop the tokenizer's word-start marker; depending on the
            # library version it shows up as "Ġ" or as leading whitespace.
            new_word = prediction["token_str"].replace("Ġ", "").strip()
            if not new_word:
                # Keep the original word rather than inserting an empty token.
                continue

            replaced = augmented_sentence[position]
            augmented_sentence[position] = (new_word, replaced[1], replaced[2])
        return [sentence, augmented_sentence]


In [17]:
# Instantiate the augmenter; its constructor builds the "fill-mask" pipeline
# for the "distilroberta-base" model.
augmenter = TransformerAugmenter()


Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Preview: augment one training sentence with 7 replacement rounds and
# display the [original, augmented] pair.
augmented_sentences = augmenter.generate(train_sentences[12], num_replace_tokens=7)
augmented_sentences


In [20]:
# Preview: augment one training sentence with 7 replacement rounds and
# display the [original, augmented] pair.
augmented_sentences = augmenter.generate(train_sentences[7], num_replace_tokens=7)
augmented_sentences


[[('Iran', 'NNP', 'B-geo'),
  ('denies', 'VBZ', 'O'),
  ('the', 'DT', 'O'),
  ('charge', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Trump', 'NNP', 'B-geo'),
  ('rejects', 'VBZ', 'O'),
  ('the', 'DT', 'O'),
  ('proposal', 'NN', 'O'),
  ('.', '.', 'O')]]

In [21]:
# Number of training sentences to augment.
n_sentences = 1000

# The augmenter draws positions and candidates with the `random` module, which
# is otherwise never seeded in this notebook. Seeding here (same value as the
# torch seed) makes this cell produce the same augmented set on every re-run.
random.seed(2020)

# generate() returns [original, augmented] for each sentence, so the list ends
# up holding both the source sentences and their augmented counterparts.
augmented_sentences = []
for sentence in tqdm(train_sentences[:n_sentences]):
    augmented_sentences.extend(augmenter.generate(sentence, num_replace_tokens=7))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [22]:
# Sanity check: generate() returns two sentences per input (the original and
# its augmented copy), so this is expected to be twice the number of inputs.
len(augmented_sentences)

2000