In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Report the GPU attached to this runtime (driver / CUDA version, memory in use).
!nvidia-smi

Tue May 16 17:30:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    31W /  70W |   4803MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from datasets.load import load_from_disk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import csv
from transformers import TextDataset

# Set the device: use the first CUDA GPU when one is visible, otherwise the CPU.
# `device` is assigned exactly once per branch; every later `.to(device)` call
# relies on this value.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read and
# written through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# For MRPC

# Load the MRPC dataset that was previously saved under the project folder on Drive.
org_mrpc = load_from_disk('/content/drive/MyDrive/CS685/project/datasets/MRPC')
# Keep the loaded object in `org_mrpc` and work on a copy in `mrpc`.
# NOTE(review): presumably a shallow copy of the split mapping, i.e. the splits
# themselves are shared with `org_mrpc` — confirm.
mrpc = org_mrpc.copy()


In [None]:
# Display the splits held in `mrpc` with their columns and row counts.
mrpc

{'train': Dataset({
     features: ['sentence1', 'sentence2', 'label', 'idx'],
     num_rows: 31048
 }),
 'validation': Dataset({
     features: ['sentence1', 'sentence2', 'label', 'idx'],
     num_rows: 408
 }),
 'test': Dataset({
     features: ['sentence1', 'sentence2', 'label', 'idx'],
     num_rows: 1725
 })}

In [None]:
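# Optional: shrink the splits for a quick smoke test.
# Note: slicing a Dataset (e.g. mrpc["train"][:1000]) returns a plain dict of columns,
# not a Dataset, so use .select() to keep the Dataset type.
# mrpc["train"] = mrpc["train"].select(range(min(1000, len(mrpc["train"]))))
# mrpc["validation"] = mrpc["validation"].select(range(min(1000, len(mrpc["validation"]))))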

In [None]:
# Number of examples per batch for the DataLoaders created below.
batch_size = 16

In [None]:
# Pick out the two splits that get batched.
train_data, validation_data = mrpc["train"], mrpc["validation"]

# Only the training batches are shuffled; validation keeps its stored order.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(dataset=validation_data, shuffle=False, batch_size=batch_size)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Teacher model: a BERT checkpoint already fine-tuned on MRPC, together with
# the tokenizer published under the same checkpoint name.
TEACHER_CHECKPOINT = "bert-base-cased-finetuned-mrpc"

teacher_tokenizer = AutoTokenizer.from_pretrained(TEACHER_CHECKPOINT)
bert_mrpc = AutoModelForSequenceClassification.from_pretrained(TEACHER_CHECKPOINT)
# Move the weights to the device chosen earlier (GPU when available).
bert_mrpc = bert_mrpc.to(device)

In [None]:
from torch.utils.data import DataLoader

# Create a DataLoader to generate batches.
# shuffle is left at its default (False) so the collected logits stay in the
# same order as the rows of mrpc['train'].
train_dataset = list(zip(mrpc['train']['sentence1'], mrpc['train']['sentence2'], mrpc['train']['idx']))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# Create a list to store the logits (one array of class scores per example)
logit_items = []

# The teacher is only queried here, never trained: put it in eval mode and
# disable autograd so no computation graph is built or kept for any batch.
bert_mrpc.eval()
with torch.no_grad():
    # Loop over the batches
    for batch in tqdm(train_dataloader):
        # Unpack the batch (idx is carried along but not needed for inference)
        sentence1, sentence2, idx = batch

        # Tokenize the sentence pairs and move the tensors to the model's device
        encoded_batch = teacher_tokenizer(sentence1, sentence2, padding=True, truncation=True, return_tensors="pt").to(device)

        # Do a forward pass through the model to get logits
        outputs = bert_mrpc(**encoded_batch)

        # Copy the logits to host memory and store one row per example
        logits = outputs.logits.cpu().numpy()
        logit_items.extend(logits)


  0%|          | 0/1941 [00:00<?, ?it/s]

In [None]:
# logit_items

In [None]:
from datasets import Dataset
import tensorflow as tf

def merge_true_and_teacher_logits(one_hot_labels, teacher_logits):
    """Pair each true-label encoding with the teacher output for the same example.

    Args:
        one_hot_labels: iterable with one label encoding per example.
        teacher_logits: iterable with one teacher output per example, in the
            same order as ``one_hot_labels``.

    Returns:
        A list of ``(label, logits)`` tuples. Pairing stops at the end of the
        shorter input.
    """
    return [(label, logits) for label, logits in zip(one_hot_labels, teacher_logits)]

# Build the augmented training set: the original columns, the teacher logits,
# and a column pairing each one-hot true label with its teacher logits.
train_split = mrpc['train']
one_hot_labels = tf.one_hot(train_split["label"], depth=2)

new_dataset_dict = {
    "idx": train_split["idx"],
    "sentence1": train_split["sentence1"],
    "sentence2": train_split["sentence2"],
    "label": train_split["label"],
    "logits": logit_items,
    "combined_logits": merge_true_and_teacher_logits(one_hot_labels, logit_items),
}

# Materialise as a Dataset and persist it next to the original MRPC data.
new_dataset = Dataset.from_dict(new_dataset_dict)
new_dataset.save_to_disk("/content/drive/MyDrive/CS685/project/datasets/MRPC/modified_train")

Saving the dataset (0/1 shards):   0%|          | 0/31048 [00:00<?, ? examples/s]

In [None]:
# Eyeball the first few sentence pairs of the new dataset.
for column in ('sentence1', 'sentence2'):
    print(new_dataset[column][:4])

# Compare the first 20 true labels with the class the teacher logits favour
# (class 0 only when its logit is strictly larger, otherwise class 1).
print(new_dataset['label'][:20])
a = [0 if k[0] > k[1] else 1 for k in new_dataset['logits'][:20]]

print(a)

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .']
['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .", "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .", 'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .']
[1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1]
[1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1]


In [None]:
# Display the columns and row count of the dataset that was just built.
new_dataset

Dataset({
    features: ['idx', 'sentence1', 'sentence2', 'label', 'logits', 'combined_logits'],
    num_rows: 31048
})