**SET UP TRAINING DATA**

In [16]:
import os
assert os.environ['CONDA_DEFAULT_ENV'] == "cs375"

import sys
assert sys.version_info.major == 3 and sys.version_info.minor == 11

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding, TrainingArguments)
from datasets import Dataset, load_dataset, load_metric
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import pandas as pd

# Load dataset
dataset = load_dataset("csv", data_files="/Users/samuelwexler/Library/CloudStorage/GoogleDrive-saw9@williams.edu/My Drive/Fall 2024/CSCI 375/Final Project!/eng_train.csv", split=None)

# make vectors for example and labels
examples = []
y_true = []

for example in dataset['train']:
  examples.append(example['text'])
  y_true.append([example['Anger'], example['Fear'], example['Joy'], example['Sadness'], example['Surprise']])


dataset = Dataset.from_dict( {"text": examples, "label": y_true} )

# tokenize examples
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# make tokenized Datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)




Map:   0%|          | 0/2768 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2768
})


**TRAIN WINNING MODEL CONFIGURATION**

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", num_labels=5)
training_args = TrainingArguments(output_dir="test_trainer", 
                                  per_device_train_batch_size=8, 
                                  learning_rate=2e-5)
trainer = Trainer(model=model, 
                  args=training_args, 
                  train_dataset=tokenized_dataset)
trainer.train()
trainer.save_model("/Users/samuelwexler/Documents/CS375/final-project")


# eventually np.savetxt('my_array.txt', arr, delimiter=',')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1038 [00:00<?, ?it/s]

{'loss': 0.4641, 'grad_norm': 2.4478960037231445, 'learning_rate': 1.0366088631984585e-05, 'epoch': 1.45}
{'loss': 0.3179, 'grad_norm': 3.5843586921691895, 'learning_rate': 7.321772639691716e-07, 'epoch': 2.89}
{'train_runtime': 17670.181, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.059, 'train_loss': 0.3874825062319952, 'epoch': 3.0}


**INGEST DEV DATA**

In [30]:
dev_set = pd.read_csv("/Users/samuelwexler/Library/CloudStorage/GoogleDrive-saw9@williams.edu/My Drive/Fall 2024/CSCI 375/Final Project!/public_data/dev/track_a/eng_a.csv")

print(dev_set)
print(dev_set.shape)

dev_examples = []

for i in range(len(dev_set)):
    dev_examples.append(dev_set.iloc[i]['text'])

dev_dataset = Dataset.from_dict( {"text": dev_examples} )
tokenized_dev = dev_dataset.map(tokenize_function, batched=True)

print(tokenized_dev[10])

                        id                                               text  \
0    eng_dev_track_a_00001            My mouth fell open `` No, no, no... I..   
1    eng_dev_track_a_00002  You can barely make out your daughter's pale f...   
2    eng_dev_track_a_00003  But after blinking my eyes for a few times lep...   
3    eng_dev_track_a_00004  Slowly rising to my feet I came to the conclus...   
4    eng_dev_track_a_00005  I noticed this months after moving in and doin...   
..                     ...                                                ...   
111  eng_dev_track_a_00112                       "ARcH stop your progression.   
112  eng_dev_track_a_00113        This 'star', starts to move across the sky.   
113  eng_dev_track_a_00114                                  and my feet hurt.   
114  eng_dev_track_a_00115        so i cried my eyes out and did the drawing.   
115  eng_dev_track_a_00116                              They were coal black.   

     Anger  Fear  Joy  Sadn

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

{'text': 'I would go over to his house where his Grandmother and mom and dad lived to play SSX Tricky and eat Pokemon fruit rollups.', 'input_ids': [101, 1045, 2052, 2175, 2058, 2000, 2010, 2160, 2073, 2010, 7133, 1998, 3566, 1998, 3611, 2973, 2000, 2377, 7020, 2595, 24026, 1998, 4521, 20421, 5909, 4897, 22264, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

**PREDICT ENGLISH DEV SET**

In [54]:
predictions = trainer.predict(tokenized_dev) # logits
probs = torch.sigmoid(torch.from_numpy(predictions.predictions)) # percentage probabilities
print("PROBS: ", probs)
binary_predictions = (probs >= 0.35).long()
print(binary_predictions)

# Convert tensors to integers
binary_predictions_list = binary_predictions.tolist()


ids = []
for i in range(0, len(binary_predictions_list)):
  ids.append(dev_set['id'][i])

data = {
    "id": ids,
    "Anger": [pred[0] for pred in binary_predictions_list],
    "Fear": [pred[1] for pred in binary_predictions_list],
    "Joy": [pred[2] for pred in binary_predictions_list],
    "Sadness": [pred[3] for pred in binary_predictions_list],
    "Surprise": [pred[4] for pred in binary_predictions_list],
}

df = pd.DataFrame(data)
df.to_csv("pred_eng_a.csv", index=False) # drop the index column


  0%|          | 0/15 [00:00<?, ?it/s]

PROBS:  tensor([[0.1959, 0.9371, 0.0266, 0.2787, 0.6651],
        [0.0711, 0.9479, 0.0222, 0.5588, 0.5997],
        [0.0202, 0.1544, 0.5521, 0.0419, 0.1116],
        [0.0272, 0.5158, 0.1499, 0.5890, 0.0312],
        [0.0276, 0.1228, 0.6902, 0.0289, 0.3776],
        [0.0578, 0.4569, 0.0769, 0.2821, 0.0215],
        [0.8338, 0.5886, 0.1299, 0.4505, 0.2976],
        [0.0377, 0.4901, 0.0704, 0.3530, 0.0598],
        [0.0226, 0.1588, 0.6369, 0.0378, 0.0701],
        [0.0685, 0.9365, 0.0323, 0.6514, 0.0443],
        [0.0406, 0.0887, 0.8627, 0.0356, 0.2080],
        [0.0888, 0.8934, 0.0406, 0.8014, 0.0328],
        [0.4077, 0.4699, 0.0632, 0.6350, 0.1636],
        [0.4182, 0.9228, 0.0638, 0.2436, 0.8500],
        [0.0327, 0.0722, 0.8159, 0.1231, 0.1041],
        [0.0163, 0.5397, 0.0930, 0.2893, 0.1580],
        [0.7857, 0.7473, 0.0673, 0.6199, 0.2477],
        [0.0127, 0.4719, 0.2078, 0.0344, 0.2820],
        [0.2595, 0.9371, 0.0447, 0.1402, 0.6726],
        [0.7496, 0.7949, 0.0614, 0.2952, 0

**INGEST (TRANSLATED) AMHARIC DATA**

In [57]:
amh_source = load_dataset("csv", data_files="/Users/samuelwexler/Library/CloudStorage/GoogleDrive-saw9@williams.edu/My Drive/Fall 2024/CSCI 375/Final Project!/amh_translated_ordered.csv", split=None)

# make vectors for example and labels
amh_examples = []
amh_y_true = []

for example in amh_source['train']:
  amh_examples.append(example['text'])
  amh_y_true.append([example['Anger'], example['Fear'], example['Joy'], example['Sadness'], example['Surprise']])


amh_dataset = Dataset.from_dict( {"text": amh_examples, "label": amh_y_true} )

# make tokenized Datasets
amh_tokenized_dataset = amh_dataset.map(tokenize_function, batched=True)
print(amh_tokenized_dataset)


Map:   0%|          | 0/3549 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3549
})


**TEST ON (TRANSLATED) AMHARIC SET**

In [None]:
amh_predictions = trainer.predict(amh_tokenized_dataset) # logits
amh_probs = torch.sigmoid(torch.from_numpy(amh_predictions.predictions)) # percentage probabilities
amh_binary_predictions = (amh_probs >= 0.35).long()

print(f1_score(y_true=amh_y_true, y_pred=amh_binary_predictions, average='weighted'))
print(f1_score(y_true=amh_y_true, y_pred=amh_binary_predictions, average='micro'))
print(f1_score(y_true=amh_y_true, y_pred=amh_binary_predictions, average='macro'))

  0%|          | 0/444 [00:00<?, ?it/s]

0.4970749049665876
0.3663958575127221
0.359234257804785


**INGEST HELSINKI SET**

In [68]:
hel_source = load_dataset("csv", data_files="/Users/samuelwexler/Library/CloudStorage/GoogleDrive-saw9@williams.edu/My Drive/Fall 2024/CSCI 375/Final Project!/xed_fixed.csv", split=None)

# make vectors for example and labels
hel_examples = []
hel_y_true = []

for example in hel_source['train']:
  hel_examples.append(example['Sentence'])
  hel_y_true.append([example['Anger'], example['Fear'], example['Joy'], example['Sadness'], example['Surprise']])


hel_dataset = Dataset.from_dict( {"text": hel_examples, "label": hel_y_true} )

# make tokenized Datasets
hel_tokenized_dataset = hel_dataset.map(tokenize_function, batched=True)

# print(tokenized_dataset[0])
print(hel_tokenized_dataset)

Map:   0%|          | 0/12243 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 12243
})


**PREDICT ON HELSINKI SET**

In [69]:
hel_predictions = trainer.predict(hel_tokenized_dataset) # logits
hel_probs = torch.sigmoid(torch.from_numpy(hel_predictions.predictions)) # percentage probabilities
hel_binary_predictions = (hel_probs >= 0.35).long()

print(f1_score(y_true=hel_y_true, y_pred=hel_binary_predictions, average='weighted'))
print(f1_score(y_true=hel_y_true, y_pred=hel_binary_predictions, average='micro'))
print(f1_score(y_true=hel_y_true, y_pred=hel_binary_predictions, average='macro'))

  0%|          | 0/1531 [00:00<?, ?it/s]

0.5178960298916008
0.5009111797797322
0.5123829961601433
