In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Tue May 16 16:53:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from datasets.load import load_from_disk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import csv
from transformers import TextDataset

# Set the device: use the first CUDA GPU when one is visible, otherwise the CPU.
# `device` is assigned exactly once, in whichever branch runs.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# For QQP
# Read the QQP splits previously saved on Google Drive, then keep a separate
# copy of the split mapping to work on.
qqp_path = '/content/drive/MyDrive/CS685/project/datasets/QQP'
org_qqp = load_from_disk(qqp_path)

qqp = org_qqp.copy()

In [6]:
# Display the available splits with their columns and row counts.
qqp

{'train': Dataset({
     features: ['question1', 'question2', 'label', 'idx'],
     num_rows: 363846
 }),
 'validation': Dataset({
     features: ['question1', 'question2', 'label', 'idx'],
     num_rows: 40430
 }),
 'test': Dataset({
     features: ['question1', 'question2', 'label', 'idx'],
     num_rows: 390965
 })}

In [7]:
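# Optional: uncomment to debug on a small subset. Use .select() so each split
# stays a Dataset (slicing with [:1000] would return a plain dict of columns).
# qqp["train"] = qqp["train"].select(range(1000))
# qqp["validation"] = qqp["validation"].select(range(1000))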

In [8]:
# Number of examples per batch, shared by every DataLoader built in this notebook.
batch_size = 16

In [9]:
# Short aliases for the two splits, each wrapped in a DataLoader.
# Only the training loader shuffles; validation keeps the stored order.
train_data, validation_data = qqp["train"], qqp["validation"]

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=batch_size, shuffle=False)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Teacher: a sequence-classification checkpoint from the Hub, loaded together
# with its matching tokenizer; the model is moved onto `device`.
teacher_checkpoint = "gchhablani/bert-base-cased-finetuned-qqp"
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_checkpoint)
bert_qqp = AutoModelForSequenceClassification.from_pretrained(teacher_checkpoint)
bert_qqp = bert_qqp.to(device)

In [10]:
from torch.utils.data import DataLoader

# Run the teacher over the whole training split (in stored order, no shuffling)
# and collect one logit vector per example.

# Create a DataLoader to generate batches of (question1, question2, idx)
train_dataset = list(zip(qqp['train']['question1'], qqp['train']['question2'], qqp['train']['idx']))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# Create a list to store the logits (one numpy array per training example)
logit_items = []

# Inference only: make eval mode explicit so dropout is disabled, and turn off
# autograd so no graph is built or kept in GPU memory for each batch.
bert_qqp.eval()
with torch.no_grad():
    # Loop over the batches
    for batch in tqdm(train_dataloader):
        # Unpack the batch; the default collate function groups the strings
        # of each field into a sequence, so convert them to plain lists for
        # the tokenizer.
        question1, question2, idx = batch

        # Tokenize the question pairs, padding to the longest pair in the batch
        encoded_batch = teacher_tokenizer(
            list(question1),
            list(question2),
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)

        # Do a forward pass through the model to get logits
        outputs = bert_qqp(**encoded_batch)

        # Move the logits to host memory and append one row per example
        logits = outputs.logits.detach().cpu().numpy()
        logit_items.extend(logits)


  0%|          | 0/22741 [00:00<?, ?it/s]

In [11]:
# logit_items

[array([ 5.3766227, -5.1629496], dtype=float32),
 array([-3.0006692,  2.5930023], dtype=float32),
 array([ 5.0542135, -4.8525205], dtype=float32),
 array([-2.1318386,  1.9744282], dtype=float32),
 array([ 5.4055243, -5.154775 ], dtype=float32),
 array([ 5.4652  , -5.155607], dtype=float32),
 array([ 4.5569963, -4.429636 ], dtype=float32),
 array([-1.8558526,  1.8268228], dtype=float32),
 array([ 5.5258965, -5.1948137], dtype=float32),
 array([ 4.8917   , -4.7223516], dtype=float32),
 array([ 4.024758 , -3.8817823], dtype=float32),
 array([-3.1484046,  2.8343494], dtype=float32),
 array([ 4.406285, -4.246   ], dtype=float32),
 array([-2.8248386,  2.5423899], dtype=float32),
 array([ 5.4911194, -5.1957994], dtype=float32),
 array([ 5.469461 , -5.2043734], dtype=float32),
 array([-1.8572402,  1.8228408], dtype=float32),
 array([ 4.780035, -4.59209 ], dtype=float32),
 array([-3.0774384,  2.7628193], dtype=float32),
 array([-3.3541641,  3.0362344], dtype=float32),
 array([ 4.4725776, -4.286

In [12]:
from datasets import Dataset
import tensorflow as tf

def merge_true_and_teacher_logits(one_hot_labels, teacher_logits):
    """Pair each one-hot label with the teacher logits at the same position.

    Args:
        one_hot_labels: iterable of per-example one-hot label vectors.
        teacher_logits: iterable of per-example teacher logit vectors.

    Returns:
        A list of ``(one_hot_label, teacher_logit)`` tuples. As with ``zip``,
        pairing stops at the end of the shorter input.
    """
    paired = []
    for true_label, teacher_logit in zip(one_hot_labels, teacher_logits):
        paired.append((true_label, teacher_logit))
    return paired

# Build the augmented training set: the original QQP columns plus the teacher
# logits and a (one-hot label, teacher logits) pair per example, then save it.
train_split = qqp['train']
train_labels = train_split["label"]

# Each example must have exactly one teacher logit vector; a mismatch means the
# inference loop was interrupted or run on a different split.
assert len(logit_items) == len(train_labels), (
    "teacher logits (%d) and training labels (%d) are misaligned"
    % (len(logit_items), len(train_labels))
)

# One-hot encode with PyTorch (already imported as torch / F) rather than
# importing TensorFlow for this single call. Result is float32, two classes.
# NOTE(review): assumes every training label is 0 or 1 - confirm for other splits.
one_hot_labels = F.one_hot(torch.tensor(train_labels), num_classes=2).float().numpy()

new_dataset_dict = {"idx": train_split["idx"],
                    "question1": train_split["question1"],
                    "question2": train_split["question2"],
                    "label": train_labels,
                    "logits": logit_items,
                    "combined_logits": merge_true_and_teacher_logits(one_hot_labels, logit_items)}

new_dataset = Dataset.from_dict(new_dataset_dict)
new_dataset.save_to_disk("/content/drive/MyDrive/CS685/project/datasets/QQP/modified_train")

Saving the dataset (0/1 shards):   0%|          | 0/363846 [00:00<?, ? examples/s]

In [13]:
# Sanity check on the first 20 rows: print the question pairs and gold labels,
# then the class implied by the teacher logits, for a side-by-side comparison.
# Slice the rows once instead of loading each full column and slicing it.
preview = new_dataset[:20]
print(preview['question1'])
print(preview['question2'])
print(preview['label'])

# Teacher prediction per row: class 0 only when its logit is strictly larger.
a = [0 if k[0] > k[1] else 1 for k in preview['logits']]

print(a)

['How is the life of a math student? Could you describe your own experiences?', 'How do I control my horny emotions?', 'What causes stool color to change to yellow?', 'What can one do after MBBS?', 'Where can I find a power outlet for my laptop at Melbourne Airport?', "How not to feel guilty since I am Muslim and I'm conscious we won't have sex together?", 'How is air traffic controlled?', 'What is the best self help book you have read? Why? How did it change your life?', "Can I enter University of Melbourne if I couldn't achieve the guaranteed marks in Trinity College Foundation?", 'Do you need a passport to go to Jamaica from the United States?', 'What is the district of Edgware and how does the lifestyle compare to the London Borough of Islington?', "What will be Hillary Clinton's policy towards India if she becomes president?", 'What is the responsibility of SAP ERP key user?', 'Which is the best book to study TENSOR for general relativity from basic?', 'How is being gay or lesbian

In [14]:
# Display the augmented dataset's columns and row count.
new_dataset

Dataset({
    features: ['idx', 'question1', 'question2', 'label', 'logits', 'combined_logits'],
    num_rows: 363846
})