In [1]:
!pip -q install datasets

In [2]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
import nltk
from nltk.corpus import stopwords

In [3]:
# Load the "simplified" configuration of the GoEmotions dataset.
go_emotions = load_dataset("go_emotions", name="simplified")

# Emotion names taken from the label feature of the test split; the evaluation
# cell further down uses them to name the per-label metric rows.
label_feature = go_emotions["test"].features["labels"].feature
labels = label_feature.names

# The English stop-word corpus is read later via `stopwords.words('english')`.
nltk.download("stopwords")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/350k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Materialise the train/test splits as DataFrames; the `id` column is not used
# for modelling, so it is dropped.
test_data = go_emotions['test'].to_pandas().drop(columns='id')
train_data = go_emotions['train'].to_pandas().drop(columns='id')

# Encode each example's list of label ids as a multi-hot row.
# `classes` is pinned to every label id, in order, so that column i of the
# encoded matrices always corresponds to labels[i]. Without it the binarizer
# derives its columns from whichever ids happen to occur in the training split,
# and the per-label report would be misaligned if any id were absent.
mlb = MultiLabelBinarizer(classes=list(range(len(labels))))
train_labels = mlb.fit_transform(train_data['labels'])
test_labels = mlb.transform(test_data['labels'])

# Feature extraction: TF-IDF over the raw text, ignoring NLTK's English stop words.
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'))

# One independent logistic-regression classifier is fitted per label column.
classifier = MultiOutputClassifier(LogisticRegression())

# Bundle vectorizer and classifier so that fit/predict accept raw text.
model = Pipeline([
    ('vectorizer', tfidf),
    ('classifier', classifier)
])

# Training the model
model.fit(train_data['text'], train_labels)

# Model evaluation.
# For multi-label targets `accuracy_score` is subset (exact-match) accuracy:
# an example only counts as correct when every label column is predicted correctly.
predictions = model.predict(test_data['text'])
print("Accuracy:", metrics.accuracy_score(test_labels, predictions))

Accuracy: 0.279344020637553


In [5]:
def calc_label_metrics(label, y_targets, y_preds):
    """Score one label's predictions and return the results as a single row.

    Args:
        label: Value stored under the ``"label"`` key of the returned row.
        y_targets: Ground-truth values for this label; must provide ``.sum()``.
        y_preds: Predicted values for this label, passed to each scorer
            together with ``y_targets``.

    Returns:
        dict with the keys ``label``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``mcc`` and ``support`` (in that order), where ``support`` is
        ``y_targets.sum()``.
    """
    row = {"label": label}
    row["accuracy"] = metrics.accuracy_score(y_targets, y_preds)

    # These three scorers all receive zero_division=0.
    zero_safe_scorers = (
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
    )
    for key, scorer in zero_safe_scorers:
        row[key] = scorer(y_targets, y_preds, zero_division=0)

    row["mcc"] = metrics.matthews_corrcoef(y_targets, y_preds)
    row["support"] = y_targets.sum()
    return row


# NOTE(review): the four names below are never read in this cell; they look like
# leftovers from an earlier evaluation loop. They are kept so that any later cell
# relying on them still runs — confirm and delete if unused.
y_probas_all = []
y_targets_all = []
test_loss = 0
correct = 0

# Column i of `test_labels` / `predictions` is reported under labels[i], so both
# matrices must have exactly one column per label name. Fail loudly here rather
# than attribute metrics to the wrong label or index out of bounds in the loop.
num_labels = len(labels)
assert num_labels > 0, "no labels to evaluate"
assert test_labels.shape[1] == predictions.shape[1] == num_labels, (
    f"expected {num_labels} label columns, got "
    f"targets={test_labels.shape[1]}, predictions={predictions.shape[1]}"
)

# Running totals used for the macro averages (unweighted mean over labels).
sum_precision = 0
sum_recall = 0
sum_f1 = 0
sum_mcc = 0

# One metrics row per label, in label order.
results = []
for label_index, label in enumerate(labels):
    y_targets, y_preds = test_labels[:, label_index], predictions[:, label_index]
    label_metrics = calc_label_metrics(label, y_targets, y_preds)
    results.append(label_metrics)

    # Sum up metrics for macro-average
    sum_precision += label_metrics["precision"]
    sum_recall += label_metrics["recall"]
    sum_f1 += label_metrics["f1"]
    sum_mcc += label_metrics["mcc"]

# Calculate macro-average metrics
macro_avg_precision = sum_precision / num_labels
macro_avg_recall = sum_recall / num_labels
macro_avg_f1 = sum_f1 / num_labels
macro_avg_mcc = sum_mcc / num_labels

# Append macro-average metrics to results as a final summary row.
macro_avg_results = {
    "label": "macro_avg",
    "accuracy": None,  # Macro-average accuracy is not typically used
    "precision": macro_avg_precision,
    "recall": macro_avg_recall,
    "f1": macro_avg_f1,
    "mcc": macro_avg_mcc,
    "support": None,  # Support doesn't make sense for macro-average
}
results.append(macro_avg_results)

# Index the table by label name; the now-redundant "label" column is dropped
# only for display, so `per_label_results` itself still carries it.
per_label_results = pd.DataFrame(results, index=[row["label"] for row in results])
display(per_label_results.drop(columns=["label"]).round(3))


Unnamed: 0,accuracy,precision,recall,f1,mcc,support
admiration,0.928,0.742,0.343,0.469,0.474,504.0
amusement,0.969,0.8,0.485,0.604,0.609,264.0
anger,0.965,0.622,0.116,0.196,0.259,198.0
annoyance,0.942,0.769,0.031,0.06,0.148,320.0
approval,0.938,0.711,0.077,0.139,0.221,351.0
caring,0.976,0.714,0.074,0.134,0.225,135.0
confusion,0.971,0.333,0.013,0.025,0.061,153.0
curiosity,0.949,0.875,0.025,0.048,0.142,284.0
desire,0.986,0.688,0.133,0.222,0.298,83.0
disappointment,0.973,1.0,0.013,0.026,0.113,151.0
