**SETTING UP BACKGROUND VARIABLES AND THE DATASETS!**

In [17]:

import os

import sys
assert sys.version_info.major == 3 and sys.version_info.minor == 11

import numpy as np
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding, TrainingArguments)
from datasets import Dataset, load_dataset, load_metric
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
# from sklearn.model_selection import StratifiedKFold  ###KFolds are not utilized in final model, but they were explored and can be explored in the future.
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Load all needed datasets

dataset = load_dataset('csv', data_files=r'train\track_a\eng.csv')

dev_dataset = load_dataset('csv', data_files=r'dev\track_a\eng_a.csv')

helinski_dataset_raw = load_dataset('csv', data_files=r'xed_fixed.csv')

# class_weights = torch.tensor([1.8, 0.7, 1.2, 0.9, 0.9])

# loss_fn = CrossEntropyLoss(weight=class_weights)

#List of classeese and give an "id" for each class (0 - 4)

classes = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

In [18]:
from transformers import AutoTokenizer
 
model_path = 'microsoft/deberta-v3-small'

tokenizer = AutoTokenizer.from_pretrained(model_path, clean_up_tokenization_spaces=True)



**READ DATA FROM DATASET AND TOKENIZE IT!**

In [19]:

# Load an array for each pair of text/label for each dataset.

examples = []
y_true = []
dev_examples = []
dev_y_true = []
h_examples = []
h_y_true = []
num_folds = 5

for example in dataset['train']:
  examples.append(example['text'])
  y_true.append([float(example['Anger']), float(example['Fear']), float(example['Joy']), float(example['Sadness']), float(example['Surprise'])])

for example in dev_dataset['train']: #Dev set does not have y-labels.
  dev_examples.append(example['text'])
  # dev_y_true.append([float(example['Anger']), float(example['Fear']), float(example['Joy']), float(example['Sadness']), float(example['Surprise'])])

for example in helinski_dataset_raw['train']:
  h_examples.append(example['Sentence'])
  h_y_true.append([float(example['Anger']), float(example['Fear']), float(example['Joy']), float(example['Sadness']), float(example['Surprise'])])

# make training and validation sets through the training dataset
examples_train, examples_test, labels_train, labels_test = train_test_split(examples, y_true, test_size=0.05, random_state=42) #95% of the dataset is training, 5% for eval

# print(examples_train)
# print(examples_test)

#Tokenize each dataset

train_dataset = Dataset.from_dict( {"text": examples_train, "label": labels_train} )
test_dataset = Dataset.from_dict( {"text": examples_test, "label": labels_test} )
helinski_dataset = Dataset.from_dict( {"text": h_examples, "label": h_y_true} )

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

dev_dataset = Dataset.from_dict( {"text": dev_examples} )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_traintest = test_dataset.map(tokenize_function, batched=True)

tokenized_dev = dev_dataset.map(tokenize_function, batched=True)
tokenized_helinski = helinski_dataset.map(tokenize_function, batched=True)

# Miniature version of dataset to test smaller chunks of data
percent_used = 0.6

examples_in_train = round(percent_used * len(tokenized_train))
examples_in_traintest = round(percent_used * len(tokenized_traintest))

mini_tokenized_train = tokenized_train.select(range(examples_in_train)) # make so works with percent_used
mini_tokenized_traintest = tokenized_traintest.select(range(examples_in_traintest))


Map:   0%|          | 0/2629 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Map:   0%|          | 0/12243 [00:00<?, ? examples/s]

**SET UP VARIABLES AND FUNCTIONS FOR RESULTS LOG**

In [20]:
import evaluate
import numpy as np

from transformers import DataCollatorWithPadding

# Code to print the results of the test.  

# training!
model = AutoModelForSequenceClassification.from_pretrained(model_path, problem_type="multi_label_classification", num_labels=5)

training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


#references=labels.astype(int).reshape(-1))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**SET UP MODEL AND TRAIN IT**

In [21]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(

   model_path, num_labels=len(classes),
           id2label=id2class, label2id=class2id,
                       problem_type = "multi_label_classification")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# for i in range(num_folds):

#Best model for Deberta: batch_size = 8, learning_rate = 2e-5

training_args = TrainingArguments(
   output_dir="my_awesome_model",
   per_device_train_batch_size = 8, 
   learning_rate= 2e-5,
   # per_device_eval_batch_size=3,
   num_train_epochs=3,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=mini_tokenized_train,
   eval_dataset=mini_tokenized_traintest,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()




  0%|          | 0/594 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.4998610019683838, 'eval_accuracy': 0.7566265060240964, 'eval_f1': 0.5511111111111111, 'eval_precision': 0.6526315789473685, 'eval_recall': 0.47692307692307695, 'eval_runtime': 1.894, 'eval_samples_per_second': 43.822, 'eval_steps_per_second': 5.808, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.468421995639801, 'eval_accuracy': 0.7855421686746988, 'eval_f1': 0.611353711790393, 'eval_precision': 0.7070707070707071, 'eval_recall': 0.5384615384615384, 'eval_runtime': 2.2325, 'eval_samples_per_second': 37.177, 'eval_steps_per_second': 4.927, 'epoch': 2.0}
{'loss': 0.4892, 'grad_norm': 1.6179118156433105, 'learning_rate': 3.1649831649831652e-06, 'epoch': 2.53}


  0%|          | 0/11 [00:00<?, ?it/s]

{'eval_loss': 0.45839911699295044, 'eval_accuracy': 0.7855421686746988, 'eval_f1': 0.6147186147186147, 'eval_precision': 0.7029702970297029, 'eval_recall': 0.5461538461538461, 'eval_runtime': 2.1552, 'eval_samples_per_second': 38.512, 'eval_steps_per_second': 5.104, 'epoch': 3.0}
{'train_runtime': 940.4943, 'train_samples_per_second': 5.03, 'train_steps_per_second': 0.632, 'train_loss': 0.4732129389188105, 'epoch': 3.0}


TrainOutput(global_step=594, training_loss=0.4732129389188105, metrics={'train_runtime': 940.4943, 'train_samples_per_second': 5.03, 'train_steps_per_second': 0.632, 'total_flos': 51171127674744.0, 'train_loss': 0.4732129389188105, 'epoch': 3.0})

**TEST THE MODEL ADN RUN PRINT RESULTS**

In [31]:
TESTING_SET = tokenized_traintest #adjust testing set as needed.

predictions = trainer.predict(TESTING_SET)
# print(predictions)

# Precision and recall (macro-average across labels)
# precision_macro = precision_score(predictions['label_ids'].tolist(), y_true, average='macro')
# recall_macro = recall_score(predictions['label_ids'].tolist(), y_true, average='macro')

# # Precision and recall (micro-average across all samples and labels)
# precision_micro = precision_score(predictions, y_true, average='micro')
# recall_micro = recall_score(predictions, y_true, average='micro')

probs = sigmoid(torch.from_numpy(predictions.predictions))
# print("PROBS: ", probs)
# print("LABELS: ", torch.tensor(tokenized_test['label'])) # trues
print(f"RESULTS LOG:")
nums = np.round(np.linspace(.3, .7, 40), 2)
final_nums = np.append(nums, 0.5)

best_thresh = 0
best_f1 = 0
for curr_thresh in final_nums:
# binarize predictions
   binary_predictions = (probs >= curr_thresh).long()
   # print("PREDS: ", binary_predictions)
   curr_f1 = f1_score(y_true=TESTING_SET['label'], y_pred=binary_predictions, average='weighted')
   if curr_f1 > best_f1:
      best_f1 = curr_f1
      best_thresh = curr_thresh
# print("THRESH = ", curr_thresh, " F1 = ", curr_f1)
print(f"F1: {best_f1}")
print(best_thresh)

for thresh in [0.35, 0.4, 0.45, 0.5]:
# binarize predictions
   binary_predictions = (probs >= thresh).long()
   print("THRESH = ", thresh)
   # print("PREDS: ", binary_predictions)
   print(f1_score(y_true=TESTING_SET['label'], y_pred=binary_predictions, average='weighted'))
   print(f1_score(y_true=TESTING_SET['label'], y_pred=binary_predictions, average='micro'))
   print(f1_score(y_true=TESTING_SET['label'], y_pred=binary_predictions, average='macro'))
   print("==============")

  0%|          | 0/18 [00:00<?, ?it/s]

RESULTS LOG:
F1: 0.6753795196029225
0.3
THRESH =  0.35
0.6389524831050389
0.6547085201793722
0.5568304325475507
THRESH =  0.4
0.6427443178885135
0.6650943396226415
0.5462273076843134
THRESH =  0.45
0.6243502185791249
0.6551724137931034
0.5035026195837675
THRESH =  0.5
0.6130127818877478
0.6478149100257069
0.4924848500119688


In [24]:
import pandas as pd

# Code to generate a .csv for the dev set to submit to SemEval competition.

predictions = trainer.predict(tokenized_dev) # logits
probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities
print("PROBS: ", probs)
binary_predictions = (probs >= 0.35).long()
print(binary_predictions)

# Convert tensors to integers
binary_predictions_list = binary_predictions.tolist()


ids = []
for i in range(0, len(binary_predictions_list)):
  ids.append("text" + str(i))

data = {
    "id": ids,
    "Anger": [pred[0] for pred in binary_predictions_list],
    "Fear": [pred[1] for pred in binary_predictions_list],
    "Joy": [pred[2] for pred in binary_predictions_list],
    "Sadness": [pred[3] for pred in binary_predictions_list],
    "Surprise": [pred[4] for pred in binary_predictions_list],
}

df = pd.DataFrame(data)
df.to_csv("pred_eng_a.csv", index=False) # drop the index column

  0%|          | 0/15 [00:00<?, ?it/s]

PROBS:  tensor([[0.0944, 0.8349, 0.0475, 0.2699, 0.4449],
        [0.0632, 0.9215, 0.0295, 0.3606, 0.2794],
        [0.0336, 0.3242, 0.2733, 0.0616, 0.3499],
        [0.0336, 0.3818, 0.3117, 0.2862, 0.0584],
        [0.0304, 0.1225, 0.5641, 0.0790, 0.1430],
        [0.0609, 0.8911, 0.0586, 0.6105, 0.0725],
        [0.4586, 0.8376, 0.0749, 0.3554, 0.7016],
        [0.0315, 0.4097, 0.2843, 0.3060, 0.0656],
        [0.0200, 0.4959, 0.2000, 0.1125, 0.0816],
        [0.0614, 0.9077, 0.0498, 0.5617, 0.0798],
        [0.0398, 0.1279, 0.6201, 0.0616, 0.2791],
        [0.0858, 0.9210, 0.0448, 0.6360, 0.0831],
        [0.2529, 0.7311, 0.0925, 0.2191, 0.7777],
        [0.2992, 0.8973, 0.0487, 0.3567, 0.6585],
        [0.0538, 0.0686, 0.7802, 0.1050, 0.1612],
        [0.0282, 0.7867, 0.0585, 0.1977, 0.1638],
        [0.3646, 0.8330, 0.0630, 0.5522, 0.4754],
        [0.0306, 0.7037, 0.0946, 0.0866, 0.2853],
        [0.1918, 0.8861, 0.0480, 0.2545, 0.6423],
        [0.0458, 0.8345, 0.0584, 0.3750, 0

In [25]:
# DISCONTINUIED GRID SEARCH CODE

# possible_learn_rate = [2e-4, 2e-5, 2e-6]
# possible_batch_size = [8, 16, 32]


# for learn_r in possible_learn_rate:
#    for batch_s in possible_batch_size:
#       training_args = TrainingArguments(
#          output_dir="my_awesome_model",
#          per_device_train_batch_size = batch_s,
#          learning_rate= learn_r,
#          # per_device_train_batch_size=3,
#          # per_device_eval_batch_size=3,
#          num_train_epochs=3,
#          weight_decay=0.01,
#          evaluation_strategy="epoch",
#          save_strategy="epoch",
#          load_best_model_at_end=True,
#       )

#       trainer = Trainer(

#          model=model,
#          args=training_args,
#          train_dataset=tokenized_train,
#          eval_dataset=tokenized_test      ,
#          tokenizer=tokenizer,
#          data_collator=data_collator,
#          compute_metrics=compute_metrics,
#          # mini_batch_sizee = b_s
#       )

#       trainer.train()
#       predictions = trainer.predict(tokenized_test)
#       probs = sigmoid(torch.from_numpy(predictions.predictions))
#       # print("PROBS: ", probs)
#       # print("LABELS: ", torch.tensor(tokenized_test['label'])) # trues
#       print(f"LOG: Learn: {learn_r} Batch: {batch_s}")
#       for thresh in [0.35, 0.4, 0.45, 0.5]:
#       # binarize predictions
#          binary_predictions = (probs >= thresh).long()
#          print("THRESH = ", thresh)
#          # print("PREDS: ", binary_predictions)

#          print(f1_score(y_true=tokenized_test['label'], y_pred=binary_predictions, average='weighted'))
#          print(f1_score(y_true=tokenized_test['label'], y_pred=binary_predictions, average='micro'))
#          print(f1_score(y_true=tokenized_test['label'], y_pred=binary_predictions, average='macro'))
#          print("==============")
#       nums = np.round(np.linspace(.3, .7, 40), 2)
#       final_nums = np.append(nums, 0.5)

#       best_thresh = 0
#       best_f1 = 0
#       for curr_thresh in final_nums:
#       # binarize predictions
#          binary_predictions = (probs >= curr_thresh).long()
         
#          # print("PREDS: ", binary_predictions)

#          curr_f1 = f1_score(y_true=tokenized_test['label'], y_pred=binary_predictions, average='weighted')
#          if curr_f1 > best_f1:
#             best_f1 = curr_f1
#             best_thresh = curr_thresh
#       # print("THRESH = ", curr_thresh, " F1 = ", curr_f1)
#       print(f"F1: {best_f1}")
#       print(best_thresh)