# Essential import/installs
--------------------------------

In [None]:
!pip install datasets
!pip install tqdm boto3 requests regex sentencepiece sacremoses
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install -U accelerate
!pip install -U transformers

In [121]:
from huggingface_hub import notebook_login
from collections import defaultdict
import re
import numpy as np
import torch
from transformers import AutoConfig, DataCollatorWithPadding, Trainer, TrainingArguments, BertTokenizer, BertConfig,BertForSequenceClassification, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [122]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data preprocessing
------------------------

In [123]:
# Initialize the stop words set
words_to_keep = {"but","as","until","while","against","not","only","should"}
words_to_remove = {"oh", "yeah"}
stop_words = set(stopwords.words('english'))
stop_words.difference_update(words_to_keep)     # Remove the words that we want to keep
stop_words.update(words_to_remove)              # Add the extra words that we want to remove

lemmatizer = nltk.stem.WordNetLemmatizer()      # Create a lemmatizer to simplify the words

def remove_special_characters(text):
    # Use regex to remove all non alphabetical characters
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

def remove_stopwords(text, words_to_remove=None):
    # Tokenize the input text
    words = word_tokenize(text)

    if words_to_remove != None:
      stop_words.update(words_to_remove)        # Include teh words we want to remove

    # Remove stop words from the tokenized words
    filtered_words = [lemmatizer.lemmatize(remove_special_characters(word.lower())) for word in words if word.lower() not in stop_words]

    # Reconstruct the text without stop words
    filtered_text = ' '.join(filtered_words)

    return filtered_text


In [124]:
# Create a set with all negations in dataset to avoid misclassification of negative sentences
negations = {"t",
             "dont",
             "don",
             "wasnt",
             "wasn",
             "havent",
             "haven",
             "hasnt",
             "hasn",
             "werent",
             "weren",
             "doesnt",
             "doesn",
             "didnt",
             "didn",
             "hadnt",
             "hadn",
             "shouldnt",
             "shouldn",
             "wouldnt",
             "wouldn",
             "arent",
             "aren",
             "isnt",
             "isn",
             "couldnt",
             "couldn"
             }

# Loading dataset
dataset = load_dataset("dair-ai/emotion")

# Transforming all of it to Pandas
train_dataframe = dataset['train'].to_pandas()
val_dataframe = dataset['validation'].to_pandas()
test_dataframe = dataset['test'].to_pandas()

# Remove all the stop_words and non alphabetic characters
train_dataframe['text'] = train_dataframe['text'].apply(remove_stopwords, words_to_remove=negations)
val_dataframe['text'] = val_dataframe['text'].apply(remove_stopwords, words_to_remove=negations)
test_dataframe['text'] = test_dataframe['text'].apply(remove_stopwords, words_to_remove=negations)

# Create the dataset, to train and test the model
model_data = DatasetDict({
    'train': Dataset.from_pandas(pd.concat([train_dataframe['text'], train_dataframe['label']],axis=1)),
    'validation': Dataset.from_pandas(pd.concat([val_dataframe['text'], val_dataframe['label']],axis=1)),
    'test': Dataset.from_pandas(pd.concat([test_dataframe['text'], test_dataframe['label']],axis=1))
})





In [126]:
# Name of the model to import
model_ckpt = "bert-base-uncased"

label2id = {"sadness": 0,
            "joy": 1,
            "love": 2,
            "anger": 3,
            "fear": 4,
            "surprise": 5}
id2label = {"0": "sadness",
            "1": "joy",
            "2": "love",
            "3": "anger",
            "4": "fear",
            "5": "surprise"}

unique_labels = ['sadness', 'joy','love', 'anger', 'fear', 'surprise']
num_labels = len(unique_labels)

In [127]:
config = BertConfig.from_pretrained(model_ckpt,
                                    label2id=label2id,
                                    id2label=id2label)

### Setting up the Model's Environment
------------------------------------

In [131]:
tokenizer = BertTokenizer.from_pretrained(model_ckpt)

def tokenize_function(example):
    return tokenizer(example['text'],truncation = True)

# We tokenize the input
tokenized_dataset = model_data.map(tokenize_function, batched = True, batch_size = None)

data_collator = DataCollatorWithPadding(tokenizer = tokenizer, padding=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [132]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average = 'weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1}

# Creating, Training and Testing the model
-------------------------------------

In [135]:
model = (BertForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device))

In [136]:
# Setting the Right Hyperparameters
epochs = 8
batch_size = 128
lr = 5e-5

# Initializing the Trainging Arguments
logging_steps = len(tokenized_dataset['train']) // batch_size
training_args = TrainingArguments(output_dir = './output',
                                overwrite_output_dir = True,
                                num_train_epochs = epochs,
                                learning_rate = lr,
                                optim = "adamw_torch",
                                weight_decay = 0,
                                evaluation_strategy = 'steps',
                                per_device_train_batch_size = batch_size,
                                per_device_eval_batch_size = batch_size,
                                disable_tqdm = False,
                                logging_strategy = 'steps',
                                logging_steps = 125,
                                log_level = 'error',
                                report_to = 'none',
                                push_to_hub = False,
                                save_strategy = 'steps',
                                eval_steps = 125,
                                save_total_limit = 5,
                                metric_for_best_model = 'accuracy',
                                load_best_model_at_end = True
                                )

# Initializing the trainer
trainer = Trainer(model = model,
                tokenizer = tokenizer,
                data_collator = data_collator,
                args = training_args,
                compute_metrics = compute_metrics,
                train_dataset = tokenized_dataset["train"],
                eval_dataset = tokenized_dataset["validation"],
                callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]  # Exits in case of accuracy lowering
                )

In [137]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
125,0.7086,0.213343,0.926,0.926457
250,0.1629,0.13407,0.94,0.940514
375,0.1118,0.141388,0.938,0.938149
500,0.0837,0.12344,0.9455,0.944914
625,0.0641,0.150956,0.9395,0.939842


TrainOutput(global_step=625, training_loss=0.22619726409912108, metrics={'train_runtime': 451.0333, 'train_samples_per_second': 283.793, 'train_steps_per_second': 2.217, 'total_flos': 1432033347548160.0, 'train_loss': 0.22619726409912108, 'epoch': 5.0})

In [138]:
train_predictions = trainer.predict(tokenized_dataset['train'])
test_predictions = trainer.predict(tokenized_dataset['test'])
validation_predictions = trainer.predict(tokenized_dataset['validation'])

# Compute metrics on the predictions for each dataset
train_metrics = compute_metrics(train_predictions)
test_metrics = compute_metrics(test_predictions)
validation_metrics = compute_metrics(validation_predictions)

# Print the metrics for all three datasets
print("Metrics on Train Data:")
print(train_metrics)
print("Metrics on Test Data:")
print(test_metrics)
print("Metrics on Validation Data:")
print(validation_metrics)

Metrics on Train Data:
{'accuracy': 0.97125, 'f1': 0.9707613472552109}
Metrics on Test Data:
{'accuracy': 0.933, 'f1': 0.9315542072364552}
Metrics on Validation Data:
{'accuracy': 0.9455, 'f1': 0.9449137204306589}


# Saving the model
---------------------

In [None]:
notebook_login()

In [141]:
trainer.push_to_hub("RicoCHEH/Bert_test")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/RicoCHEH/output/tree/main/'