In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26

In [2]:
# Shell out to nvidia-smi to show which GPU (if any) this runtime has attached.
!nvidia-smi

Tue May 16 21:22:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from datasets.load import load_from_disk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import csv
from transformers import TextDataset

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected so the run log records the hardware in use.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [4]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
# For MNLI

# Load the MNLI dataset that was previously saved to Google Drive.
org_mnli = load_from_disk('/content/drive/MyDrive/CS685/project/datasets/MNLI')
# Keep `org_mnli` as loaded and do all further work through `mnli`.
# NOTE(review): if the loaded object is dict-like, .copy() is presumably a shallow
# copy (the splits themselves would be shared, not duplicated) — confirm.
mnli = org_mnli.copy()


In [6]:
# mnli["train"] = mnli["train"][:1000]
# mnli["validation_matched"] = mnli["validation_matched"][:1000]

In [7]:
# Number of examples per batch; every DataLoader in this notebook reads this value.
batch_size = 16

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Training and matched-validation splits of MNLI.
train_data, validation_data = mnli["train"], mnli["validation_matched"]

# Batched iterators: training batches are shuffled, validation batches are not.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=batch_size, shuffle=False)

# Teacher: a BERT checkpoint already fine-tuned on MNLI, moved to the selected
# device, together with the tokenizer published for the same checkpoint.
bert_mnli = AutoModelForSequenceClassification.from_pretrained(
    "TehranNLP/bert-base-cased-mnli"
)
bert_mnli = bert_mnli.to(device)
teacher_tokenizer = AutoTokenizer.from_pretrained("TehranNLP/bert-base-cased-mnli")

Downloading (…)lve/main/config.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from torch.utils.data import DataLoader

# Build (premise, hypothesis, idx) triples and batch them. shuffle is left at the
# DataLoader default (off) so the collected logits stay in the same order as the
# rows of mnli['train'].
train_dataset = list(zip(mnli['train']['premise'], mnli['train']['hypothesis'], mnli['train']['idx']))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# One entry per training example: the teacher's logits as a NumPy row.
logit_items = []

# Inference only: put the teacher in eval mode and disable autograd so no
# computation graph is built or retained for any of the forward passes.
bert_mnli.eval()
with torch.no_grad():
    for batch in tqdm(train_dataloader):
        # Each batch is (premises, hypotheses, idx); idx is carried along but not
        # needed here because row order is preserved.
        sentence1, sentence2, idx = batch

        # Tokenize the sentence pairs, padding to the longest pair in the batch.
        encoded_batch = teacher_tokenizer(
            list(sentence1),
            list(sentence2),
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)

        # Forward pass through the teacher.
        outputs = bert_mnli(**encoded_batch)

        # Move the logits to host memory and store one row per example.
        logits = outputs.logits.cpu().numpy()
        logit_items.extend(logits)


  0%|          | 0/24544 [00:00<?, ?it/s]

In [None]:
# logit_items

In [None]:
from datasets import Dataset
import tensorflow as tf

def merge_true_and_teacher_logits(one_hot_labels, teacher_logits):
    """Pair each one-hot label with the teacher logits at the same position.

    Args:
        one_hot_labels: Iterable of per-example one-hot label vectors.
        teacher_logits: Iterable of per-example teacher logit vectors.

    Returns:
        A list of ``(one_hot_label, teacher_logit)`` tuples. Pairing stops at
        the end of the shorter input.
    """
    paired = []
    for true_label, teacher_row in zip(one_hot_labels, teacher_logits):
        paired.append((true_label, teacher_row))
    return paired

# Start from the original training columns, copied over unchanged.
new_dataset_dict = {
    column: mnli['train'][column]
    for column in ("idx", "premise", "hypothesis", "label")
}

# Add the teacher's raw logits, one entry per training example.
new_dataset_dict["logits"] = logit_items

# Add (one-hot gold label, teacher logits) pairs; depth=3 gives one slot per
# MNLI class.
new_dataset_dict["combined_logits"] = merge_true_and_teacher_logits(
    tf.one_hot(mnli['train']['label'], depth=3),
    logit_items,
)

# Materialise the columns as a Dataset and persist it next to the original data.
new_dataset = Dataset.from_dict(new_dataset_dict)
new_dataset.save_to_disk("/content/drive/MyDrive/CS685/project/datasets/MNLI/modified_train")

In [None]:
# Slice the rows first so only 20 examples are decoded, instead of materialising
# each full column and then discarding all but the first 20 values.
preview = new_dataset[:20]
print(preview['premise'])
print(preview['hypothesis'])
print(preview['label'])

# Teacher's predicted class for each previewed example: the index of the largest
# logit. Printed under the gold labels above for an eyeball comparison.
a = torch.tensor(preview['logits']).argmax(dim=-1).tolist()

print(a)

['Conceptually cream skimming has two basic dimensions - product and geography.', 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him', 'One of our number will carry out your instructions minutely.', 'How do you know? All this is their information again.', "yeah i tell you what though if you go price some of those tennis shoes i can see why now you know they're getting up in the hundred dollar range", "my walkman broke so i'm upset now i just have to turn the stereo up real loud", 'But a few Christian mosaics survive above the apse is the Virgin with the infant Jesus, with the Archangel Gabriel to the right (his companion Michael, to the left, has vanished save for a few feathers from his wings).', "(Read  for Slate 's take on Jackson's findings.)", 'Gays and l

In [None]:
# Bare expression: the notebook displays the dataset's summary (features, num_rows).
new_dataset

Dataset({
    features: ['idx', 'premise', 'hypothesis', 'label', 'logits', 'combined_logits'],
    num_rows: 392702
})

In [None]:
# Bare expression: display the working copy of the MNLI splits.
mnli

In [None]:
# "combined_logits" is only added to `new_dataset`; mnli["train"] is the dataset
# as loaded from disk and is never given that column in this notebook. Show a
# handful of rows rather than dumping the whole column into the output.
new_dataset[:5]["combined_logits"]

In [None]:
import random

# Reload the untouched MNLI dataset from Google Drive.
org_mnli = load_from_disk('/content/drive/MyDrive/CS685/project/datasets/MNLI')

# Get the "train" subset of the dataset
train_subset = org_mnli["train"]

# Create a new shuffled "train" subset.
# A Dataset is not a dict (it has no .keys()), so its rows are permuted with the
# library's own shuffle, which returns a new dataset and leaves `train_subset`
# as it was. The seed is fixed so the shuffled order is the same on every run.
shuffled_train = train_subset.shuffle(seed=42)

# Replace the original "train" subset with the shuffled version


In [None]:
# `org_mnli` maps split names to datasets, so it cannot be sliced directly;
# select the "train" split first, then take its first 100 rows.
org_mnli["train"][:100]

In [None]:
# Peek at the first 100 entries of `shuffled_train`.
shuffled_train[:100]