## Emotion Classification with a BOW Naive Bayes Model

Some parts of this are adapted from the Week 5 lab on Blackboard (marked as such in the code).

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import random

In [2]:
lemmatize = False
remove_stopwords = False
use_nltk_tokenizer = False

## Load data

In [None]:
from datasets import load_dataset
train_data = load_dataset("dair-ai/emotion", split="train")
test_data = load_dataset("dair-ai/emotion", split="test")
# This is a HuggingFace dataset
print(train_data.column_names)

## Select BOW features

In [None]:
# Part of this code is adapted from the lab
stop_words = list(stopwords.words('english'))
all_words = []
for example in train_data:
    if use_nltk_tokenizer:
        all_words.extend(word_tokenize(example["text"]))
    else:
        all_words.extend(example["text"].split())

if lemmatize:
    lemmatizer = WordNetLemmatizer()
    freqdist = nltk.FreqDist(lemmatizer.lemmatize(w.lower()) for w in all_words)
else:
    freqdist = nltk.FreqDist(w.lower() for w in all_words)


top_words = set(list(freqdist)[:2000])

if remove_stopwords:
    stop_words = list(stopwords.words('english'))
    top_words = list(top_words.difference(stop_words))

print("Using {} top words as features".format(len(top_words)))

## Prepare features

In [35]:
# This code is taken from the lab
def document_features(document, top_words):
    if use_nltk_tokenizer:
        document = word_tokenize(document)
    else:
        document = document.split()
    if lemmatize:
        document = [lemmatizer.lemmatize(w) for w in document]
    doc_words = set(document)
    features = {}
    for word in top_words:
        features['contains({})'.format(word)] = (word in doc_words)
    return features

In [36]:
# Part of this code is adapted from the lab
train_documents = [(example["text"], example["label"]) for example in train_data]
random.shuffle(train_documents)
test_documents = [(example["text"], example["label"]) for example in test_data]

In [37]:
# Part of this code is adapted from the lab
train_set = [(document_features(document=d, top_words=top_words), label) for d, label in train_documents]
test_set = [(document_features(document=d, top_words=top_words), label) for d, label in test_documents]

In [38]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [39]:
# Part of this code is adapted from the lab
test_doc = test_documents[42]
decision = classifier.classify(document_features(document=test_doc[0],
top_words=top_words))
print("ID: %s, LABEL: %s, MODEL DECISION: %s" % (test_doc[0],
test_doc[1], decision))

ID: i feel reassured that i am dealing with my diet in the right way and that all is good, LABEL: 1, MODEL DECISION: 1


In [40]:
classifier.show_most_informative_features(5)

Most Informative Features
        contains(amazed) = True                5 : 0      =    361.1 : 1.0
       contains(curious) = True                5 : 0      =    290.5 : 1.0
       contains(shocked) = True                5 : 0      =    247.1 : 1.0
         contains(dazed) = True                5 : 0      =    176.5 : 1.0
        contains(caring) = True                2 : 1      =    168.5 : 1.0


In [5]:
label2int = {
  "sadness": 0,
  "joy": 1,
  "love": 2,
  "anger": 3,
  "fear": 4,
  "surprise": 5
}
emotions = [ "sadness", "joy", "love", "anger", "fear", "surprise"]


In [41]:
from sklearn.metrics import classification_report

all_preds = []
all_true = []
for item in test_documents:
    y_pred = classifier.classify(document_features(document=item[0],
        top_words=top_words))
    all_preds.append(y_pred)
    all_true.append(item[1])
print(classification_report(all_true, all_preds, target_names=label2int.keys(), digits=len(emotions)))


              precision    recall  f1-score   support

     sadness   0.881739  0.872633  0.877163       581
         joy   0.868966  0.906475  0.887324       695
        love   0.639241  0.635220  0.637224       159
       anger   0.848485  0.814545  0.831169       275
        fear   0.791855  0.781250  0.786517       224
    surprise   0.561404  0.484848  0.520325        66

    accuracy                       0.834500      2000
   macro avg   0.765281  0.749162  0.756620      2000
weighted avg   0.832811  0.834500  0.833366      2000



In [66]:
def informative_per_class(c1:int, c2:int):
    features_d = {}
    for (label, fname), probdist in classifier._feature_probdist.items():
        if "(sound)" in fname:
            print(label, probdist.prob(True))
        if label == c1:
            if fname not in features_d:
                features_d[fname] = {c1:None, c2:None}
            features_d[fname][c1] = probdist.prob(True)
        elif label == c2:
            if fname not in features_d:
                features_d[fname] = {c1:None, c2:None}
            features_d[fname][c2] = probdist.prob(True)
    features = [(fname, class_d[c1]/class_d[c2]) for fname, class_d in features_d.items()]
    features.sort(key=lambda x:x[1], reverse=True)
    print(features[:5])
    print(features[-5:])
    return features

def get_features_for_tokens(tokens, include_absent = True):
    odds = {f: 1 for f in range(len(emotions))}
    for token in tokens:
        for (label, fname), probdist in classifier._feature_probdist.items():
            if f"({token})" in fname:
                odds[label] *= probdist.prob(True)
            else:
                if include_absent:
                    odds[label] *= probdist.prob(False)
    return odds      

#informative_per_class(1, 2)
print(get_features_for_tokens(["great"], include_absent = False))

{0: 0.006106706663809728, 1: 0.007738206227857543, 2: 0.008045977011494253, 3: 0.004861111111111111, 4: 0.00696594427244582, 5: 0.004363001745200698}


In [43]:
with open("bow_misclassified.txt", "w") as f:
    for i, item in enumerate(test_documents):
        y_pred = classifier.classify(document_features(document=item[0],
            top_words=top_words))
        if y_pred != item[1]:
            f.write(f"{test_data[i]["text"]}, pred: {y_pred}, true: {item[1]}\n")
    f.close()

In [67]:
import re
data = load_dataset("go_emotions", "simplified")
go_documents = []
# surprise: 26, sadness 25, joy 17, love 18, anger 2, fear 14
mapping_dict = {25:0, 17:1, 18:2, 2:3, 14:4, 26:5}
for ex in data['train']:
    labels = set(ex['labels']).intersection(mapping_dict.keys())
    if labels:
        text = re.sub(r'[^\w\s]', '', ex['text'].lower()) # remove punctuation, make lowercase
        go_documents.append((text, mapping_dict[list(labels)[0]]))
print(f"Dataset loaded: {len(go_documents)} entries")

all_preds = []
all_true = []
for item in go_documents:
    y_pred = classifier.classify(document_features(document=item[0],
        top_words=top_words))
    all_preds.append(y_pred)
    all_true.append(item[1])
print(classification_report(all_true, all_preds, target_names=label2int.keys(), digits=len(emotions)))

Downloading readme: 100%|██████████| 9.40k/9.40k [00:00<00:00, 9.48MB/s]
Downloading data: 100%|██████████| 2.77M/2.77M [00:01<00:00, 1.87MB/s]
Downloading data: 100%|██████████| 350k/350k [00:00<00:00, 461kB/s]
Downloading data: 100%|██████████| 347k/347k [00:00<00:00, 476kB/s]
Generating train split: 100%|██████████| 43410/43410 [00:00<00:00, 419632.43 examples/s]
Generating validation split: 100%|██████████| 5426/5426 [00:00<00:00, 493148.14 examples/s]
Generating test split: 100%|██████████| 5427/5427 [00:00<00:00, 493452.88 examples/s]


Dataset loaded: 7894 entries
              precision    recall  f1-score   support

     sadness   0.266883  0.682504  0.383718      1326
         joy   0.353927  0.704656  0.471190      1439
        love   0.614880  0.140010  0.228084      2007
       anger   0.484286  0.220273  0.302814      1539
        fear   0.447761  0.214669  0.290206       559
    surprise   0.751174  0.156250  0.258690      1024

    accuracy                       0.357107      7894
   macro avg   0.486485  0.353060  0.322450      7894
weighted avg   0.489241  0.357107  0.321481      7894



In [6]:
tr = load_dataset("dair-ai/emotion", split="train")['label']
val = load_dataset("dair-ai/emotion", split="validation")['label']
te = load_dataset("dair-ai/emotion", split="test")['label']

for split in [tr, val, te]:
    classes = {n:0 for n in range(len(emotions))}
    for instance in split:
        classes[instance] += 1
    print(classes)

Using the latest cached version of the module from C:\Users\ursus\.cache\huggingface\modules\datasets_modules\datasets\dair-ai--emotion\cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd (last modified on Fri Mar 29 16:07:52 2024) since it couldn't be found locally at dair-ai/emotion, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\ursus\.cache\huggingface\modules\datasets_modules\datasets\dair-ai--emotion\cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd (last modified on Fri Mar 29 16:07:52 2024) since it couldn't be found locally at dair-ai/emotion, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\ursus\.cache\huggingface\modules\datasets_modules\datasets\dair-ai--emotion\cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd (last modified on Fri Mar 29 16:07:52 2024) since it couldn't be found locally at dair-ai/emotion, or remotely on the Hugging

{0: 4666, 1: 5362, 2: 1304, 3: 2159, 4: 1937, 5: 572}
{0: 550, 1: 704, 2: 178, 3: 275, 4: 212, 5: 81}
{0: 581, 1: 695, 2: 159, 3: 275, 4: 224, 5: 66}


In [11]:
rnn = open("misclass_rnn.txt", "r")
bow = open("bow_misclassified.txt", "r")

counter = 0
rnn_sents = set()
for line in rnn:
    rnn_sents.add(line.split(" //")[0])
for line in bow:
    if line.split(", pred")[0] in rnn_sents:
        counter += 1
print(counter)
#print(len(rnn.readlines()))
#print(len(bow.readlines()))

142
0
0
