In [31]:
# Environment check 
# Return to HW0 if you run into errors in this cell 
# Do not modify this cell 
import os
assert os.environ['CONDA_DEFAULT_ENV'] == "cs375"

import sys
assert sys.version_info.major == 3 and sys.version_info.minor == 11

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding, TrainingArguments)
from datasets import Dataset, load_dataset, load_metric
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [32]:
# Load dataset
dataset = load_dataset("csv", data_files="/Users/samuelwexler/Library/CloudStorage/GoogleDrive-saw9@williams.edu/My Drive/Fall 2024/CSCI 375/Final Project!/eng_train.csv", split=None)

**SET UP DATASET OF TRAINING SET**

In [33]:
# make vectors for example and labels
examples = []
y_true = []

for example in dataset['train']:
  examples.append(example['text'])
  y_true.append([example['Anger'], example['Fear'], example['Joy'], example['Sadness'], example['Surprise']])

# make training and validation sets
examples_train, examples_test, labels_train, labels_test = train_test_split(examples, y_true, test_size=0.05, random_state=42)

train_dataset = Dataset.from_dict( {"text": examples_train, "label": labels_train} )
test_dataset = Dataset.from_dict( {"text": examples_test, "label": labels_test} )


# tokenize examples
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# make tokenized Datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

print(tokenized_train[0])

Map:   0%|          | 0/2629 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

{'text': "No noise at all, no lights, it wasn't pitch black either.", 'label': [0, 1, 0, 0, 1], 'input_ids': [101, 2053, 5005, 2012, 2035, 1010, 2053, 4597, 1010, 2009, 2347, 1005, 1056, 6510, 2304, 2593, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

**CREATE CUSTOM TRAINER BASED ON CLASS WEIGHTS**

In [34]:
# https://discuss.huggingface.co/t/how-can-i-use-class-weights-when-training/1067/6

class CustomTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):

    # get class frequencies
    train_labels_np = np.array(train_dataset['label'])
    test_labels_np = np.array(test_dataset['label'])
    class_freqs = torch.from_numpy(train_labels_np.sum(axis=0) + test_labels_np.sum(axis=0))

    # turn freqs into weights
    class_weights = class_freqs / sum(class_freqs)
    class_weights = max(class_freqs) / class_freqs 
    class_weights[4] = class_weights[4] * .8 # during testing the model was overpredicting the last emotion

    # standard loss things
    labels = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get('logits')

    loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights)
    loss = loss_fct(logits, labels)

    return (loss, outputs) if return_outputs else loss

**DEFINE HOW MUCH DATA WILL BE USED**

In [35]:
# define how much data will be used

percent_used = 0.6

examples_in_train = round(percent_used * len(tokenized_train))
examples_in_test = round(percent_used * len(tokenized_test))

mini_tokenized_train = tokenized_train.select(range(examples_in_train)) # make so works with percent_used
mini_tokenized_test = tokenized_test.select(range(examples_in_test))


**TRAINING WITH CUSTOM TRAINER**

In [17]:
# training
model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=5)
training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")
trainer = CustomTrainer(model=model, args=training_args, train_dataset=mini_tokenized_train, eval_dataset=mini_tokenized_test)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/594 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6502947807312012, 'eval_runtime': 28.6965, 'eval_samples_per_second': 2.892, 'eval_steps_per_second': 0.383, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6452380418777466, 'eval_runtime': 30.6039, 'eval_samples_per_second': 2.712, 'eval_steps_per_second': 0.359, 'epoch': 2.0}
{'loss': 0.5344, 'grad_norm': 2.934779167175293, 'learning_rate': 7.912457912457913e-06, 'epoch': 2.53}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.67934250831604, 'eval_runtime': 30.0498, 'eval_samples_per_second': 2.762, 'eval_steps_per_second': 0.366, 'epoch': 3.0}
{'train_runtime': 7099.6294, 'train_samples_per_second': 0.666, 'train_steps_per_second': 0.084, 'train_loss': 0.4980009477146547, 'epoch': 3.0}


TrainOutput(global_step=594, training_loss=0.4980009477146547, metrics={'train_runtime': 7099.6294, 'train_samples_per_second': 0.666, 'train_steps_per_second': 0.084, 'total_flos': 626736792130560.0, 'train_loss': 0.4980009477146547, 'epoch': 3.0})

**GAUGE ACCURACY OF CUSTOM TRAINER**

In [19]:
# get result of training
predictions = trainer.predict(mini_tokenized_test) # logits
probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities
print("PROBS: ", probs)
print("LABELS: ", torch.tensor(mini_tokenized_test['label'])) # trues

for thresh in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]:
  # binarize predictions
  binary_predictions = (probs >= thresh).long()
  print("THRESH = ", thresh)

  # get F1 scores
  print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='weighted'))
  print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='micro'))
  print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='macro'))
  print("==============")

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.0488, 0.8669, 0.0444, 0.5307, 0.0258],
        [0.0138, 0.7753, 0.0353, 0.4531, 0.1424],
        [0.7189, 0.3654, 0.1160, 0.5101, 0.0876],
        [0.0276, 0.6898, 0.2749, 0.0790, 0.2707],
        [0.4073, 0.9182, 0.0277, 0.9450, 0.6171],
        [0.1895, 0.8105, 0.0303, 0.7332, 0.5109],
        [0.0891, 0.9058, 0.0258, 0.8528, 0.0319],
        [0.0295, 0.5813, 0.1133, 0.9103, 0.0574],
        [0.0730, 0.9641, 0.0280, 0.5644, 0.6592],
        [0.0143, 0.8499, 0.0511, 0.0774, 0.6118],
        [0.0208, 0.9313, 0.0337, 0.4777, 0.0533],
        [0.0261, 0.9084, 0.0558, 0.5583, 0.8235],
        [0.1398, 0.9253, 0.0520, 0.2603, 0.9304],
        [0.0721, 0.8815, 0.0339, 0.2737, 0.9069],
        [0.0164, 0.6493, 0.1423, 0.4754, 0.0290],
        [0.1643, 0.9666, 0.0197, 0.8929, 0.1386],
        [0.6639, 0.8990, 0.0394, 0.1348, 0.5258],
        [0.7042, 0.7240, 0.0434, 0.7725, 0.0712],
        [0.0789, 0.2070, 0.8806, 0.0288, 0.6979],
        [0.5044, 0.6889, 0.0592, 0.9410, 0

**GAUGE PERFORMANCE BY EMOTION**

In [25]:
# results by label
binary_predictions_final = (probs >= 0.4).long()
labels = torch.tensor(mini_tokenized_test['label'])

train_labels_np = np.array(train_dataset['label'])
test_labels_np = np.array(test_dataset['label'])
class_freqs = torch.from_numpy(train_labels_np.sum(axis=0) + test_labels_np.sum(axis=0))

for col in range(0, 5):
  print("FREQ = ", class_freqs[col])
  print("COL = ", col)
  print(f1_score(y_true=labels[:, col], y_pred=binary_predictions_final[:, col], average='weighted'))
  print(f1_score(y_true=labels[:, col], y_pred=binary_predictions_final[:, col], average='micro'))
  print(f1_score(y_true=labels[:, col], y_pred=binary_predictions_final[:, col], average='macro'))
  print("==========================")


# # get F1 scores
# print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='weighted'))
# print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='micro'))
# print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='macro'))

FREQ =  tensor(333)
COL =  0
0.8546863510379545
0.8313253012048193
0.659037558685446
FREQ =  tensor(1611)
COL =  1
0.61547338780409
0.6385542168674698
0.6024904214559387
FREQ =  tensor(674)
COL =  2
0.8021806981857711
0.7951807228915663
0.6785144679881522
FREQ =  tensor(878)
COL =  3
0.7506925955727887
0.7469879518072289
0.7395786642761093
FREQ =  tensor(839)
COL =  4
0.8554216867469879
0.8554216867469879
0.8433962264150944


**======   EXPERIMENTS =========**

**TRY WITHOUT CLASS WEIGHTS**

In [43]:
model2 = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=5)
training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")
trainer2 = Trainer(model=model2, args=training_args, train_dataset=mini_tokenized_train, eval_dataset=mini_tokenized_test)
trainer2.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/594 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.45320937037467957, 'eval_runtime': 25.8625, 'eval_samples_per_second': 3.209, 'eval_steps_per_second': 0.425, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.42231285572052, 'eval_runtime': 39.8177, 'eval_samples_per_second': 2.084, 'eval_steps_per_second': 0.276, 'epoch': 2.0}
{'loss': 0.3781, 'grad_norm': 1.4667855501174927, 'learning_rate': 7.912457912457913e-06, 'epoch': 2.53}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4267077147960663, 'eval_runtime': 27.386, 'eval_samples_per_second': 3.031, 'eval_steps_per_second': 0.402, 'epoch': 3.0}
{'train_runtime': 8278.425, 'train_samples_per_second': 0.571, 'train_steps_per_second': 0.072, 'train_loss': 0.35377985700613723, 'epoch': 3.0}


TrainOutput(global_step=594, training_loss=0.35377985700613723, metrics={'train_runtime': 8278.425, 'train_samples_per_second': 0.571, 'train_steps_per_second': 0.072, 'total_flos': 626736792130560.0, 'train_loss': 0.35377985700613723, 'epoch': 3.0})

In [45]:
# get result of training
predictions = trainer2.predict(mini_tokenized_test) # logits
probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities

probs_array = probs.numpy()
# formatted_probs = np.array([[f"{value:.6f}" for value in row] for row in probs_array])

print("PROBS: ", probs)
# print("LABELS: ", torch.tensor(mini_tokenized_test['label'])) # trues

for thresh in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]:
  # binarize predictions
  binary_predictions = (probs >= thresh).long()
  print("THRESH = ", thresh)

  # get F1 scores
  print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='weighted'))
  print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='micro'))
  print(f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='macro'))
  print("==============")

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.0487, 0.9035, 0.0242, 0.2133, 0.0202],
        [0.0144, 0.8830, 0.0176, 0.4544, 0.0798],
        [0.3316, 0.3466, 0.0453, 0.4799, 0.0614],
        [0.0251, 0.4848, 0.1703, 0.0197, 0.1419],
        [0.1587, 0.9338, 0.0324, 0.9317, 0.5442],
        [0.3070, 0.3450, 0.0401, 0.5527, 0.1540],
        [0.1252, 0.9405, 0.0125, 0.4739, 0.0308],
        [0.0279, 0.5685, 0.0815, 0.8959, 0.0453],
        [0.0458, 0.9769, 0.0211, 0.2852, 0.8038],
        [0.0111, 0.8882, 0.0287, 0.0818, 0.5965],
        [0.0178, 0.9450, 0.0288, 0.2186, 0.0349],
        [0.0379, 0.9391, 0.0489, 0.3095, 0.9410],
        [0.1028, 0.9582, 0.0328, 0.1984, 0.9232],
        [0.0632, 0.9594, 0.0165, 0.1298, 0.8750],
        [0.0209, 0.8848, 0.0340, 0.3872, 0.0178],
        [0.2694, 0.9816, 0.0146, 0.7161, 0.2165],
        [0.3501, 0.8962, 0.0262, 0.0586, 0.4778],
        [0.0440, 0.9078, 0.0180, 0.2685, 0.0500],
        [0.0317, 0.1833, 0.6096, 0.0250, 0.7442],
        [0.1406, 0.5629, 0.0496, 0.7311, 0

**GRID SEARCH TO FIND BEST PARAMETER SET**

**==TESTING WITH LR = 2*10^-4**

In [48]:
# learning_rates = [2e-4, 2e-5, 2e-6]
learning_rates = [2e-4]
batch_sizes = [8, 16, 32]
experiments = {}

trial_num = 0
for curr_learning_rate in learning_rates:
  for curr_batch_size in batch_sizes:
    experiments[trial_num] = {}
    experiments[trial_num]["model"] = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=5)
    experiments[trial_num]["training_args"] = TrainingArguments(output_dir="test_trainer", 
                                                                eval_strategy="epoch", 
                                                                per_device_train_batch_size=curr_batch_size, 
                                                                learning_rate=curr_learning_rate)
    experiments[trial_num]["trainer"] = Trainer(model=experiments[trial_num]["model"], 
                                                args=experiments[trial_num]["training_args"], 
                                                train_dataset=mini_tokenized_train, 
                                                eval_dataset=mini_tokenized_test)
    experiments[trial_num]["trainer"].train()
    
    trial_num = trial_num + 1

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/594 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.538555920124054, 'eval_runtime': 39.3841, 'eval_samples_per_second': 2.107, 'eval_steps_per_second': 0.279, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.523768424987793, 'eval_runtime': 48.1267, 'eval_samples_per_second': 1.725, 'eval_steps_per_second': 0.229, 'epoch': 2.0}
{'loss': 0.4235, 'grad_norm': 1.8284848928451538, 'learning_rate': 3.164983164983165e-05, 'epoch': 2.53}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5231296420097351, 'eval_runtime': 41.34, 'eval_samples_per_second': 2.008, 'eval_steps_per_second': 0.266, 'epoch': 3.0}
{'train_runtime': 9253.6355, 'train_samples_per_second': 0.511, 'train_steps_per_second': 0.064, 'train_loss': 0.39413174394806627, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/297 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4815734624862671, 'eval_runtime': 41.5491, 'eval_samples_per_second': 1.998, 'eval_steps_per_second': 0.265, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4870759844779968, 'eval_runtime': 45.9219, 'eval_samples_per_second': 1.807, 'eval_steps_per_second': 0.24, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5103946328163147, 'eval_runtime': 31.6212, 'eval_samples_per_second': 2.625, 'eval_steps_per_second': 0.348, 'epoch': 3.0}
{'train_runtime': 10495.9691, 'train_samples_per_second': 0.451, 'train_steps_per_second': 0.028, 'train_loss': 0.3412875294283986, 'epoch': 3.0}


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4829866290092468, 'eval_runtime': 43.9648, 'eval_samples_per_second': 1.888, 'eval_steps_per_second': 0.25, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.45964980125427246, 'eval_runtime': 37.2359, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.295, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4627303183078766, 'eval_runtime': 37.097, 'eval_samples_per_second': 2.237, 'eval_steps_per_second': 0.297, 'epoch': 3.0}
{'train_runtime': 9950.8428, 'train_samples_per_second': 0.475, 'train_steps_per_second': 0.015, 'train_loss': 0.3673821258544922, 'epoch': 3.0}


In [50]:
# get result of training

best_model_num = 0
best_micro_f1 = 0
corresponding_thresh = 0

for curr_experiment in range(0, len(experiments)):
  predictions = experiments[curr_experiment]["trainer"].predict(mini_tokenized_test) # logits
  probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities
  # probs_array = probs.numpy()
  # formatted_probs = np.array([[f"{value:.3f}" for value in row] for row in probs_array])

  print("PROBS: ", probs)
  # print("LABELS: ", torch.tensor(mini_tokenized_test['label'])) # trues
  
  best_thresh = 0
  best_f1_thresh = 0
  for thresh in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]:
    # binarize predictions
    binary_predictions = (probs >= thresh).long()
    # print("THRESH = ", thresh)

    # get F1 scores
    curr_f1 = f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='micro')
    print("EXP # = ", curr_experiment, " THRESH = ", thresh, " MICRO F1 = ", curr_f1)
    if curr_f1 > best_f1_thresh:
      best_thresh = thresh
      best_f1_thresh = curr_f1
  
  if best_f1_thresh > best_micro_f1:
    best_model_num = curr_experiment
    best_micro_f1 = best_f1_thresh
    corresponding_thresh = best_thresh

print(best_model_num)
print(best_micro_f1)
print(corresponding_thresh)




  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.0290, 0.9552, 0.0164, 0.6467, 0.0086],
        [0.0048, 0.5757, 0.0440, 0.0557, 0.0255],
        [0.1203, 0.1113, 0.1465, 0.5364, 0.0511],
        [0.0753, 0.9262, 0.0412, 0.0423, 0.9568],
        [0.3931, 0.8804, 0.0147, 0.9289, 0.4590],
        [0.4277, 0.4675, 0.0316, 0.6477, 0.5681],
        [0.1069, 0.9403, 0.0163, 0.8428, 0.0154],
        [0.0066, 0.0361, 0.5789, 0.2649, 0.0150],
        [0.0151, 0.9803, 0.0151, 0.0969, 0.1503],
        [0.0175, 0.4081, 0.0436, 0.0713, 0.4843],
        [0.0325, 0.9770, 0.0113, 0.7049, 0.0170],
        [0.0288, 0.6539, 0.2292, 0.0484, 0.9837],
        [0.0515, 0.9824, 0.0099, 0.2814, 0.8505],
        [0.0387, 0.9192, 0.0217, 0.0529, 0.9333],
        [0.0686, 0.6385, 0.0859, 0.6613, 0.0086],
        [0.0300, 0.9784, 0.0127, 0.4444, 0.0153],
        [0.3601, 0.6085, 0.0491, 0.0395, 0.8228],
        [0.2992, 0.7233, 0.0354, 0.8424, 0.0100],
        [0.0235, 0.2970, 0.4979, 0.0165, 0.8768],
        [0.0456, 0.9105, 0.0178, 0.7211, 0

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.0315, 0.9676, 0.0193, 0.1842, 0.0079],
        [0.0042, 0.7059, 0.0359, 0.6215, 0.0102],
        [0.2936, 0.0878, 0.0909, 0.2360, 0.0548],
        [0.7071, 0.7094, 0.0327, 0.4308, 0.0354],
        [0.1110, 0.8741, 0.0117, 0.9616, 0.6214],
        [0.0876, 0.5194, 0.0551, 0.8354, 0.1109],
        [0.1060, 0.9785, 0.0082, 0.8963, 0.0223],
        [0.0067, 0.0948, 0.3709, 0.7692, 0.0183],
        [0.0130, 0.9794, 0.0276, 0.0742, 0.0596],
        [0.0037, 0.3288, 0.0478, 0.0184, 0.6340],
        [0.0097, 0.9712, 0.0182, 0.1627, 0.0105],
        [0.0180, 0.8601, 0.0397, 0.1773, 0.9862],
        [0.0451, 0.9875, 0.0132, 0.3122, 0.3820],
        [0.0084, 0.9003, 0.0582, 0.0134, 0.9578],
        [0.0177, 0.9626, 0.0176, 0.2192, 0.0067],
        [0.1341, 0.9905, 0.0080, 0.8221, 0.0602],
        [0.2739, 0.9481, 0.0153, 0.0340, 0.2788],
        [0.0203, 0.8598, 0.0227, 0.1518, 0.0079],
        [0.0203, 0.1190, 0.8127, 0.0124, 0.8964],
        [0.0494, 0.7223, 0.0384, 0.9319, 0

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.0717, 0.9421, 0.0219, 0.2301, 0.0155],
        [0.0141, 0.7350, 0.0173, 0.7029, 0.0500],
        [0.2187, 0.2591, 0.0877, 0.0960, 0.0668],
        [0.0470, 0.6432, 0.0942, 0.0485, 0.1017],
        [0.1881, 0.9128, 0.0192, 0.9503, 0.5102],
        [0.2976, 0.5494, 0.0252, 0.7744, 0.1596],
        [0.1086, 0.9729, 0.0092, 0.5887, 0.0372],
        [0.0167, 0.2141, 0.2151, 0.8250, 0.0327],
        [0.0222, 0.9812, 0.0182, 0.1359, 0.5217],
        [0.0068, 0.6533, 0.0598, 0.0201, 0.8728],
        [0.0157, 0.9652, 0.0162, 0.3212, 0.0241],
        [0.0231, 0.9570, 0.0170, 0.3423, 0.9275],
        [0.0639, 0.9851, 0.0096, 0.5094, 0.6571],
        [0.0188, 0.9478, 0.0187, 0.0527, 0.8836],
        [0.0304, 0.9686, 0.0100, 0.6703, 0.0378],
        [0.1720, 0.9847, 0.0107, 0.6227, 0.0787],
        [0.1308, 0.9142, 0.0294, 0.0261, 0.4832],
        [0.1459, 0.8688, 0.0125, 0.4098, 0.0337],
        [0.0294, 0.2046, 0.7135, 0.0102, 0.5749],
        [0.1144, 0.5247, 0.0512, 0.7820, 0

**==GRID SEARCH WITH LR = 2 * 10^-5**

In [51]:
# learning_rates = [2e-4, 2e-5, 2e-6]
learning_rates = [2e-5]
batch_sizes = [8, 16, 32]

trial_num = 3
for curr_learning_rate in learning_rates:
  for curr_batch_size in batch_sizes:
    experiments[trial_num] = {}
    experiments[trial_num]["model"] = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=5)
    experiments[trial_num]["training_args"] = TrainingArguments(output_dir="test_trainer", 
                                                                eval_strategy="epoch", 
                                                                per_device_train_batch_size=curr_batch_size, 
                                                                learning_rate=curr_learning_rate)
    experiments[trial_num]["trainer"] = Trainer(model=experiments[trial_num]["model"], 
                                                args=experiments[trial_num]["training_args"], 
                                                train_dataset=mini_tokenized_train, 
                                                eval_dataset=mini_tokenized_test)
    experiments[trial_num]["trainer"].train()
    
    trial_num = trial_num + 1

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/594 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4791913330554962, 'eval_runtime': 27.9807, 'eval_samples_per_second': 2.966, 'eval_steps_per_second': 0.393, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4399697780609131, 'eval_runtime': 25.9688, 'eval_samples_per_second': 3.196, 'eval_steps_per_second': 0.424, 'epoch': 2.0}
{'loss': 0.4444, 'grad_norm': 1.8974618911743164, 'learning_rate': 3.1649831649831652e-06, 'epoch': 2.53}


  0%|          | 0/11 [00:00<?, ?it/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.43773651123046875, 'eval_runtime': 29.7281, 'eval_samples_per_second': 2.792, 'eval_steps_per_second': 0.37, 'epoch': 3.0}
{'train_runtime': 7400.5657, 'train_samples_per_second': 0.639, 'train_steps_per_second': 0.08, 'train_loss': 0.4274437676375161, 'epoch': 3.0}


  0%|          | 0/297 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5045245289802551, 'eval_runtime': 35.2446, 'eval_samples_per_second': 2.355, 'eval_steps_per_second': 0.312, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.46253591775894165, 'eval_runtime': 28.6222, 'eval_samples_per_second': 2.9, 'eval_steps_per_second': 0.384, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.45426255464553833, 'eval_runtime': 27.0435, 'eval_samples_per_second': 3.069, 'eval_steps_per_second': 0.407, 'epoch': 3.0}
{'train_runtime': 8756.3593, 'train_samples_per_second': 0.54, 'train_steps_per_second': 0.034, 'train_loss': 0.4681395456847117, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5429951548576355, 'eval_runtime': 36.8855, 'eval_samples_per_second': 2.25, 'eval_steps_per_second': 0.298, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.499714732170105, 'eval_runtime': 29.0318, 'eval_samples_per_second': 2.859, 'eval_steps_per_second': 0.379, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.48618993163108826, 'eval_runtime': 29.0591, 'eval_samples_per_second': 2.856, 'eval_steps_per_second': 0.379, 'epoch': 3.0}
{'train_runtime': 7825.3774, 'train_samples_per_second': 0.605, 'train_steps_per_second': 0.019, 'train_loss': 0.5212098693847657, 'epoch': 3.0}


In [53]:
# evaluate performance
best_model_num = 0
best_micro_f1 = 0
corresponding_thresh = 0

for curr_experiment in [3, 4, 5]:
  predictions = experiments[curr_experiment]["trainer"].predict(mini_tokenized_test) # logits
  probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities
  # probs_array = probs.numpy()
  # formatted_probs = np.array([[f"{value:.3f}" for value in row] for row in probs_array])

  print("PROBS: ", probs)
  # print("LABELS: ", torch.tensor(mini_tokenized_test['label'])) # trues
  
  best_thresh = 0
  best_f1_thresh = 0
  for thresh in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]:
    # binarize predictions
    binary_predictions = (probs >= thresh).long()
    # print("THRESH = ", thresh)

    # get F1 scores
    curr_f1 = f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='micro')
    print("EXP # = ", curr_experiment, " THRESH = ", thresh, " MICRO F1 = ", curr_f1)
    if curr_f1 > best_f1_thresh:
      best_thresh = thresh
      best_f1_thresh = curr_f1
  
  if best_f1_thresh > best_micro_f1:
    best_model_num = curr_experiment
    best_micro_f1 = best_f1_thresh
    corresponding_thresh = best_thresh

print(best_model_num)
print(best_micro_f1)
print(corresponding_thresh)


  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.1434, 0.8233, 0.0766, 0.6052, 0.0796],
        [0.0552, 0.8195, 0.0595, 0.3662, 0.1998],
        [0.1605, 0.4852, 0.1217, 0.3269, 0.1244],
        [0.0636, 0.4181, 0.2154, 0.1473, 0.2981],
        [0.2363, 0.8905, 0.0671, 0.7903, 0.3087],
        [0.1251, 0.6978, 0.0627, 0.4021, 0.3901],
        [0.1242, 0.8195, 0.0624, 0.5033, 0.0793],
        [0.0836, 0.6713, 0.1260, 0.6820, 0.0656],
        [0.2079, 0.9079, 0.0611, 0.4078, 0.6759],
        [0.0489, 0.8432, 0.0620, 0.2079, 0.3983],
        [0.0708, 0.8544, 0.0824, 0.4543, 0.0712],
        [0.0873, 0.8693, 0.0894, 0.2474, 0.7711],
        [0.1879, 0.7886, 0.1009, 0.2412, 0.7248],
        [0.1607, 0.8834, 0.0599, 0.2415, 0.7860],
        [0.0960, 0.8706, 0.0739, 0.6072, 0.0949],
        [0.3577, 0.9087, 0.0641, 0.7125, 0.2383],
        [0.1594, 0.8533, 0.0616, 0.1537, 0.6558],
        [0.1227, 0.8690, 0.0469, 0.5012, 0.1372],
        [0.0827, 0.2230, 0.5471, 0.0662, 0.5536],
        [0.2730, 0.7415, 0.1116, 0.7790, 0

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.2039, 0.7671, 0.1180, 0.5209, 0.1394],
        [0.0974, 0.8024, 0.0873, 0.3069, 0.3292],
        [0.1050, 0.4825, 0.1635, 0.2422, 0.2217],
        [0.1030, 0.5253, 0.1967, 0.2652, 0.1938],
        [0.2851, 0.8547, 0.0838, 0.6644, 0.4305],
        [0.1852, 0.6774, 0.0987, 0.2816, 0.5254],
        [0.1367, 0.7999, 0.0868, 0.4148, 0.2011],
        [0.0736, 0.4859, 0.2365, 0.3497, 0.1650],
        [0.2281, 0.8577, 0.1059, 0.3911, 0.6547],
        [0.0558, 0.7444, 0.1352, 0.1671, 0.3979],
        [0.0921, 0.7808, 0.1274, 0.3983, 0.1505],
        [0.0770, 0.8208, 0.1180, 0.2591, 0.4792],
        [0.2253, 0.8101, 0.0917, 0.3463, 0.6289],
        [0.1764, 0.8471, 0.1084, 0.2587, 0.7064],
        [0.1250, 0.8086, 0.0983, 0.5395, 0.2120],
        [0.2698, 0.8698, 0.0837, 0.6393, 0.2489],
        [0.1634, 0.8431, 0.0740, 0.2645, 0.5388],
        [0.0971, 0.7841, 0.1045, 0.3647, 0.1896],
        [0.1187, 0.2801, 0.5570, 0.1094, 0.4340],
        [0.3292, 0.7465, 0.1202, 0.7239, 0

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.2067, 0.7522, 0.1527, 0.4525, 0.2373],
        [0.1396, 0.7499, 0.1307, 0.3477, 0.3419],
        [0.1104, 0.5245, 0.2116, 0.2377, 0.3083],
        [0.1734, 0.7178, 0.1428, 0.3874, 0.2989],
        [0.2739, 0.7813, 0.1382, 0.5510, 0.4676],
        [0.2013, 0.6790, 0.1423, 0.3350, 0.4894],
        [0.1554, 0.7661, 0.1248, 0.3932, 0.2863],
        [0.0999, 0.6064, 0.2000, 0.3034, 0.2756],
        [0.2593, 0.7786, 0.1628, 0.4612, 0.5173],
        [0.1069, 0.7202, 0.1483, 0.2789, 0.3541],
        [0.1624, 0.7949, 0.1412, 0.4504, 0.2544],
        [0.1153, 0.7527, 0.1570, 0.3639, 0.3350],
        [0.2318, 0.7543, 0.1299, 0.4099, 0.4992],
        [0.2141, 0.7785, 0.1515, 0.3841, 0.5325],
        [0.1560, 0.7728, 0.1359, 0.4626, 0.3293],
        [0.2281, 0.8010, 0.1287, 0.5122, 0.3228],
        [0.1796, 0.7873, 0.1181, 0.3711, 0.4248],
        [0.1335, 0.7502, 0.1486, 0.3762, 0.2532],
        [0.1539, 0.3494, 0.4985, 0.1861, 0.3888],
        [0.2516, 0.7611, 0.1382, 0.5393, 0

**==GRID SEARCH WITH LEARNING_RATE = 2*10^-6**

In [54]:
# learning_rates = [2e-4, 2e-5, 2e-6]
learning_rates = [2e-6]
batch_sizes = [8, 16, 32]

trial_num = 6
for curr_learning_rate in learning_rates:
  for curr_batch_size in batch_sizes:
    experiments[trial_num] = {}
    experiments[trial_num]["model"] = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=5)
    experiments[trial_num]["training_args"] = TrainingArguments(output_dir="test_trainer", 
                                                                eval_strategy="epoch", 
                                                                per_device_train_batch_size=curr_batch_size, 
                                                                learning_rate=curr_learning_rate)
    experiments[trial_num]["trainer"] = Trainer(model=experiments[trial_num]["model"], 
                                                args=experiments[trial_num]["training_args"], 
                                                train_dataset=mini_tokenized_train, 
                                                eval_dataset=mini_tokenized_test)
    experiments[trial_num]["trainer"].train()
    
    trial_num = trial_num + 1

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/594 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5868320465087891, 'eval_runtime': 26.6023, 'eval_samples_per_second': 3.12, 'eval_steps_per_second': 0.413, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5630336999893188, 'eval_runtime': 24.1094, 'eval_samples_per_second': 3.443, 'eval_steps_per_second': 0.456, 'epoch': 2.0}
{'loss': 0.595, 'grad_norm': 1.053056240081787, 'learning_rate': 3.164983164983165e-07, 'epoch': 2.53}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5594221353530884, 'eval_runtime': 33.4727, 'eval_samples_per_second': 2.48, 'eval_steps_per_second': 0.329, 'epoch': 3.0}
{'train_runtime': 7801.099, 'train_samples_per_second': 0.606, 'train_steps_per_second': 0.076, 'train_loss': 0.5889468787093757, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/297 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6307984590530396, 'eval_runtime': 44.7968, 'eval_samples_per_second': 1.853, 'eval_steps_per_second': 0.246, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5936164259910583, 'eval_runtime': 31.1337, 'eval_samples_per_second': 2.666, 'eval_steps_per_second': 0.353, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.5865357518196106, 'eval_runtime': 31.8827, 'eval_samples_per_second': 2.603, 'eval_steps_per_second': 0.345, 'epoch': 3.0}
{'train_runtime': 9296.0052, 'train_samples_per_second': 0.509, 'train_steps_per_second': 0.032, 'train_loss': 0.6210316359394729, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6702682375907898, 'eval_runtime': 32.0827, 'eval_samples_per_second': 2.587, 'eval_steps_per_second': 0.343, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6390492916107178, 'eval_runtime': 30.4813, 'eval_samples_per_second': 2.723, 'eval_steps_per_second': 0.361, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.6288503408432007, 'eval_runtime': 27.7189, 'eval_samples_per_second': 2.994, 'eval_steps_per_second': 0.397, 'epoch': 3.0}
{'train_runtime': 8123.404, 'train_samples_per_second': 0.582, 'train_steps_per_second': 0.018, 'train_loss': 0.6576084391276041, 'epoch': 3.0}


In [55]:
# evaluate performance
best_model_num = 0
best_micro_f1 = 0
corresponding_thresh = 0

for curr_experiment in [6, 7, 8]:
  predictions = experiments[curr_experiment]["trainer"].predict(mini_tokenized_test) # logits
  probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities
  # probs_array = probs.numpy()
  # formatted_probs = np.array([[f"{value:.3f}" for value in row] for row in probs_array])

  print("PROBS: ", probs)
  # print("LABELS: ", torch.tensor(mini_tokenized_test['label'])) # trues
  
  best_thresh = 0
  best_f1_thresh = 0
  for thresh in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]:
    # binarize predictions
    binary_predictions = (probs >= thresh).long()
    # print("THRESH = ", thresh)

    # get F1 scores
    curr_f1 = f1_score(y_true=mini_tokenized_test['label'], y_pred=binary_predictions, average='micro')
    print("EXP # = ", curr_experiment, " THRESH = ", thresh, " MICRO F1 = ", curr_f1)
    if curr_f1 > best_f1_thresh:
      best_thresh = thresh
      best_f1_thresh = curr_f1
  
  if best_f1_thresh > best_micro_f1:
    best_model_num = curr_experiment
    best_micro_f1 = best_f1_thresh
    corresponding_thresh = best_thresh

print(best_model_num)
print(best_micro_f1)
print(corresponding_thresh)

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.1803, 0.6061, 0.2533, 0.3577, 0.3242],
        [0.1638, 0.5714, 0.2529, 0.3321, 0.3181],
        [0.1768, 0.5429, 0.2897, 0.3304, 0.3204],
        [0.1598, 0.5870, 0.2587, 0.3449, 0.3086],
        [0.1863, 0.5520, 0.2737, 0.3716, 0.3303],
        [0.1822, 0.5301, 0.2715, 0.3468, 0.3456],
        [0.1775, 0.5589, 0.2765, 0.3490, 0.3152],
        [0.1658, 0.5848, 0.2802, 0.3317, 0.3056],
        [0.1928, 0.5753, 0.2656, 0.3507, 0.3417],
        [0.1629, 0.5997, 0.2423, 0.3450, 0.2905],
        [0.1751, 0.6060, 0.2831, 0.3433, 0.2844],
        [0.1612, 0.5941, 0.2719, 0.3470, 0.3228],
        [0.1903, 0.5474, 0.2805, 0.3749, 0.3266],
        [0.2114, 0.5200, 0.2915, 0.3283, 0.3716],
        [0.2295, 0.5357, 0.3121, 0.3657, 0.3480],
        [0.1687, 0.5898, 0.2777, 0.3541, 0.3156],
        [0.1701, 0.5799, 0.2616, 0.3461, 0.3072],
        [0.1657, 0.5905, 0.2529, 0.3296, 0.3005],
        [0.2001, 0.5308, 0.3227, 0.3586, 0.3328],
        [0.1755, 0.5750, 0.2646, 0.3533, 0

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.2855, 0.6082, 0.3372, 0.3545, 0.3671],
        [0.2735, 0.6117, 0.3294, 0.3554, 0.3773],
        [0.2949, 0.5666, 0.3461, 0.3465, 0.3957],
        [0.2763, 0.5851, 0.3260, 0.3448, 0.3758],
        [0.2999, 0.5836, 0.3448, 0.3865, 0.4057],
        [0.2955, 0.5706, 0.3515, 0.3379, 0.4256],
        [0.2713, 0.5903, 0.3352, 0.3410, 0.3905],
        [0.2730, 0.5864, 0.3340, 0.3544, 0.3892],
        [0.2835, 0.5776, 0.3525, 0.3710, 0.4057],
        [0.2729, 0.5982, 0.3313, 0.3533, 0.3774],
        [0.2857, 0.5845, 0.3482, 0.3701, 0.3967],
        [0.2727, 0.6049, 0.3272, 0.3639, 0.3896],
        [0.2980, 0.5779, 0.3468, 0.3588, 0.4278],
        [0.2890, 0.5936, 0.3607, 0.3723, 0.4027],
        [0.3044, 0.5738, 0.3663, 0.3784, 0.4201],
        [0.2784, 0.5905, 0.3393, 0.3662, 0.3876],
        [0.2676, 0.5878, 0.3249, 0.3554, 0.3816],
        [0.2660, 0.6024, 0.3259, 0.3518, 0.3839],
        [0.2950, 0.5664, 0.3758, 0.3489, 0.4083],
        [0.2769, 0.5999, 0.3297, 0.3526, 0

  0%|          | 0/11 [00:00<?, ?it/s]

PROBS:  tensor([[0.4053, 0.5936, 0.3996, 0.3829, 0.4327],
        [0.3952, 0.5907, 0.3998, 0.3921, 0.4331],
        [0.4151, 0.5498, 0.4205, 0.4005, 0.4521],
        [0.3984, 0.5645, 0.4109, 0.3892, 0.4376],
        [0.4105, 0.5648, 0.4199, 0.4202, 0.4483],
        [0.3998, 0.5538, 0.4302, 0.3941, 0.4719],
        [0.3920, 0.5684, 0.4150, 0.3990, 0.4634],
        [0.3998, 0.5674, 0.4029, 0.3867, 0.4490],
        [0.3899, 0.5618, 0.4290, 0.4072, 0.4457],
        [0.3992, 0.5683, 0.4103, 0.3967, 0.4381],
        [0.4011, 0.5520, 0.4263, 0.4128, 0.4587],
        [0.3921, 0.5860, 0.3938, 0.3964, 0.4493],
        [0.4086, 0.5567, 0.4197, 0.4145, 0.4781],
        [0.4001, 0.5682, 0.4374, 0.4148, 0.4455],
        [0.4166, 0.5523, 0.4338, 0.4336, 0.4658],
        [0.3859, 0.5707, 0.4085, 0.4125, 0.4476],
        [0.3823, 0.5756, 0.3857, 0.4001, 0.4402],
        [0.3877, 0.5657, 0.4141, 0.4067, 0.4552],
        [0.4039, 0.5500, 0.4428, 0.3957, 0.4637],
        [0.3907, 0.5748, 0.4037, 0.3966, 0