In [None]:
!pip install transformers datasets

In [3]:
import transformers
from datasets.load import load_from_disk
from datasets import load_dataset, Dataset, concatenate_datasets, ClassLabel, Features, Value
import random
from transformers import AutoTokenizer
import torch
import os
import nltk
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

In [10]:
from torch.utils.data import DataLoader

In [24]:
import tensorflow as tf

In [5]:
# Set the device
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [6]:
sst2 = load_dataset('glue', 'sst2')

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
knowledge_dataset_params = {"sst2": {"n_iter": 20, "p_mask":0.1, "p_pos": 0.1, "p_ng":0.25}, 
                            "MNLI": {"n_iter": 10, "p_mask":0.1, "p_pos": 0.1, "p_ng":0.25}, 
                            "QQP": {"n_iter": 10, "p_mask":0.1, "p_pos": 0.1, "p_ng":0.25},
                            "MRPC": {"n_iter": 20, "p_mask":0.1, "p_pos": 0.1, "p_ng":0.25}
                            }

In [9]:
bert_sst2 = AutoModelForSequenceClassification.from_pretrained("WillHeld/bert-base-cased-sst2").to(device)
teacher_tokenizer = AutoTokenizer.from_pretrained("WillHeld/bert-base-cased-sst2")
batch_size = 32

Downloading (…)lve/main/config.json:   0%|          | 0.00/984 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [11]:
# Create a DataLoader to generate batches
train_dataset = list(zip(sst2['train']['sentence'], sst2['train']['idx']))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# Create a list to store the logits
logit_items = []

# Loop over the batches
for batch in tqdm(train_dataloader):
    # Unpack the batch
    sentence, idx = batch
    
    # Tokenize input sentences
    encoded_batch = teacher_tokenizer(sentence, padding = True, truncation = True, return_tensors = "pt").to(device)
    
    # Do a forward pass through the model to get logits
    outputs = bert_sst2(**encoded_batch)
    
    # Extract the logits from the output and append to the list
    logits = outputs.logits.detach().cpu().numpy()
    for l in logits:
      logit_items.append(l)


labels = np.array(sst2['train']['label'])
logit_items = np.array(logit_items)
print('Accuracy')
(logit_items.argmax(axis=-1) == labels).sum()/len(labels)

100%|██████████| 2105/2105 [02:43<00:00, 12.90it/s]

Accuracy





0.9427905388350235

In [12]:
# Load word distributions from disk
word_distributions = torch.load('/content/drive/MyDrive/CS685/project/word_distributions.pt')

In [13]:
# Define a function to replace a single word with a random word of the same part-of-speech tag
def replace_word_with_random_word_of_same_pos(word, pos_tag):
    # Get the unigram word distribution for the given part-of-speech tag
    word_distribution = word_distributions[pos_tag]
    
    # Sample a new word from the distribution
    new_word_index = torch.multinomial(word_distribution, 1).item()
    new_word = teacher_tokenizer.convert_ids_to_tokens([new_word_index])[0]
    
    return new_word

def pos_guided_word_replacement(sentence, word_to_replace):
  # Tokenize the sentence
  tokenized_sentence = teacher_tokenizer.tokenize(sentence)

  pos_tag_to_replace = nltk.pos_tag([word_to_replace])[0][1]

  for i in range(len(tokenized_sentence)):
    if tokenized_sentence[i] == word_to_replace:
        return replace_word_with_random_word_of_same_pos(word_to_replace, pos_tag_to_replace)

In [14]:
cur_train = sst2['train']

In [15]:
from collections import defaultdict
pos_tag_word_map = defaultdict(lambda: set())
with torch.no_grad(), tqdm(total=len(cur_train['sentence']), desc="Computing the word distributions") as pbar:
  for s in cur_train['sentence']:
    for word, pos_tag in nltk.pos_tag(nltk.word_tokenize(s)):
      pos_tag_word_map[pos_tag].add(word)
    pbar.update(1)

pos_tag_word_map_list = defaultdict(lambda: [])
for pos_tag in pos_tag_word_map:
  pos_tag_word_map_list[pos_tag] = list(pos_tag_word_map[pos_tag])

Computing the word distributions: 100%|██████████| 67349/67349 [01:05<00:00, 1026.29it/s]


In [16]:
data = {}
for column in cur_train.column_names:
  data[column] = []

idx_counter = len(cur_train)

error_count = 0

with torch.no_grad(), tqdm(total=len(cur_train), desc="Computing the synthetic examples") as pbar:
  for idx, row in enumerate(cur_train):
    try:
      for _ in range(knowledge_dataset_params['sst2']['n_iter']):
        res = []
        for word, pos_tag in nltk.pos_tag(nltk.word_tokenize(row['sentence'])):
          X = random.uniform(0,1)
          if X < knowledge_dataset_params['sst2']['p_mask']:
            res.append('[MASK]')
          elif X < knowledge_dataset_params['sst2']['p_mask']+knowledge_dataset_params['sst2']['p_pos']:
            res.append(random.choice(pos_tag_word_map_list[pos_tag]))
          else:
            res.append(word)
        if random.uniform(0,1) < knowledge_dataset_params['sst2']['p_ng']:
          n_gram_length = random.randint(1, 5)
          start = random.randrange(max(1, len(res)-n_gram_length))
          res = res[start: start+n_gram_length]
          synthetic_sample = ' '.join(res)
          data['sentence'].append(synthetic_sample)
          data['idx'].append(idx_counter)
          idx_counter += 1
          data['label'].append(row['label'])
    except Exception as e:
      print(e)
      error_count += 1
    finally:
      pbar.update(1)

print('\nMissed Entries', str(error_count))

Computing the synthetic examples: 100%|██████████| 67349/67349 [16:53<00:00, 66.44it/s]


Missed Entries 0





In [17]:
new_train = Dataset.from_dict(data)

# schema of sst2 dataset
ds_schema = Features({
    "idx": Value("int32"),
    "sentence": Value("string"),
    "label": ClassLabel(names=["negative", "positive"])
})

new_train = new_train.cast(ds_schema)

Casting the dataset:   0%|          | 0/335739 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader

# Create a DataLoader to generate batches
train_dataset = list(zip(new_train['sentence'], new_train['idx']))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# Create a list to store the logits
logit_items = []

# Loop over the batches
for batch in tqdm(train_dataloader):
    # Unpack the batch
    sentence, idx = batch
    
    # Tokenize input sentences
    encoded_batch = teacher_tokenizer(sentence, padding = True, truncation = True, return_tensors = "pt").to(device)
    
    # Do a forward pass through the model to get logits
    outputs = bert_sst2(**encoded_batch)
    
    # Extract the logits from the output and append to the list
    logits = outputs.logits.detach().cpu().numpy()
    for l in logits:
      logit_items.append(l)


logit_items = np.array(logit_items).argmax(axis=-1)
dataset_dict = {"idx": new_train["idx"],
                "sentence": new_train["sentence"],
                "label": logit_items}
new_train_2 = Dataset.from_dict(dataset_dict)
new_train_2 = new_train_2.cast(ds_schema)

In [20]:
knowledge_dataset = concatenate_datasets([cur_train, new_train_2])

In [22]:
from torch.utils.data import DataLoader

# Create a DataLoader to generate batches
train_dataset = list(zip(knowledge_dataset['sentence'], knowledge_dataset['idx']))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# Create a list to store the logits
logit_items = []

# Loop over the batches
for batch in tqdm(train_dataloader):
    # Unpack the batch
    sentence, idx = batch
    
    # Tokenize input sentences
    encoded_batch = teacher_tokenizer(sentence, padding = True, truncation = True, return_tensors = "pt").to(device)
    
    # Do a forward pass through the model to get logits
    outputs = bert_sst2(**encoded_batch)
    
    # Extract the logits from the output and append to the list
    logits = outputs.logits.detach().cpu().numpy()
    for l in logits:
      logit_items.append(l)

labels = np.array(knowledge_dataset['label'])
logit_items = np.array(logit_items)
(logit_items.argmax(axis=-1) == labels).sum()/len(labels)

100%|██████████| 12597/12597 [07:26<00:00, 28.21it/s]


0.9904412932163696

In [None]:
def merge_true_and_teacher_logits(one_hot_labels, teacher_logits):
    return list(zip(one_hot_labels, teacher_logits))

new_dataset_dict = {"idx": knowledge_dataset["idx"],
                    "sentence": knowledge_dataset["sentence"],
                    "label": knowledge_dataset["label"],
                    "logits": logit_items,
                    "combined_logits": merge_true_and_teacher_logits(tf.one_hot(knowledge_dataset['label'], depth=2), logit_items)}

knowledge_dataset = Dataset.from_dict(new_dataset_dict)

In [30]:
knowledge_dataset = knowledge_dataset.shuffle()
sst2['train'] = knowledge_dataset
sst2.save_to_disk('/content/drive/MyDrive/CS685/project/datasets/SST2')

Saving the dataset (0/1 shards):   0%|          | 0/403088 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/872 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1821 [00:00<?, ? examples/s]