In [1]:
import datasets
import numpy as np
import evaluate
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# 下載 nltk 資源（只需一次）
nltk.download('punkt')
nltk.download("punkt_tab")
nltk.download('stopwords')

In [2]:
#Load the dataset
dataset = datasets.load_from_disk("super-emotion")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

texts_train  = train_dataset["text"]
labels_train = train_dataset["labels_str"]

texts_test = test_dataset["text"]
labels_test = test_dataset["labels_str"]

### Data Preprocessing(torch)

In [3]:
# NLTK prerocessing
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]  # 移除標點、數字
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)
#RAM會爆炸!!
texts_cleaned_train = [preprocess_text(t) for t in texts_train [:50000]]
texts_cleaned_test = [preprocess_text(t) for t in texts_test [:50000]]
labels_subset_train = labels_train[:50000]
labels_subset_test = labels_test[:50000]

In [4]:
# TF-IDF 向量化
vectorizer = TfidfVectorizer(max_features=10000)
# X = vectorizer.fit_transform(texts_cleaned_train).toarray().astype(np.float32)
X_train = vectorizer.fit_transform(texts_cleaned_train).toarray().astype(np.float32)
X_test = vectorizer.transform(texts_cleaned_test).toarray().astype(np.float32)

In [5]:
# Label multi-hot encoding
mlb = MultiLabelBinarizer()
# Y = mlb.fit_transform(labels_subset_train)   # shape = (n_samples, n_emotions)
y_train = mlb.fit_transform(labels_subset_train)
y_test = mlb.transform(labels_subset_test)

# 分割訓練集與測試集
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [6]:
#建立模型
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.model=nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.model(x)

In [7]:
#設定類別的權重，因為資料集不平衡
class_counts = torch.tensor([ 5459,  4018, 12158,  2699,  3710,  9814,  2142],dtype=torch.float32)
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum()*7
#設定參數
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = y_train.shape[1]
#初始化模型
model = MLP(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)
#訓練模型
epochs = 75
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    y_train_class = y_train.argmax(dim=1)
    loss = criterion(outputs, y_train_class)
    loss.backward()
    optimizer.step()
    with torch.no_grad():
        preds = torch.argmax(outputs, dim=1)
        acc = accuracy_score(y_train_class.numpy(), preds.numpy())
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Acc: {acc:.4f}")

Epoch [1/75], Loss: 1.9463, Acc: 0.2447
Epoch [2/75], Loss: 1.9439, Acc: 0.2447
Epoch [3/75], Loss: 1.9414, Acc: 0.2447
Epoch [4/75], Loss: 1.9385, Acc: 0.2447
Epoch [5/75], Loss: 1.9352, Acc: 0.2448
Epoch [6/75], Loss: 1.9315, Acc: 0.2463
Epoch [7/75], Loss: 1.9275, Acc: 0.2522
Epoch [8/75], Loss: 1.9232, Acc: 0.2698
Epoch [9/75], Loss: 1.9187, Acc: 0.3038
Epoch [10/75], Loss: 1.9141, Acc: 0.3502
Epoch [11/75], Loss: 1.9092, Acc: 0.4129
Epoch [12/75], Loss: 1.9041, Acc: 0.4808
Epoch [13/75], Loss: 1.8988, Acc: 0.5468
Epoch [14/75], Loss: 1.8935, Acc: 0.5983
Epoch [15/75], Loss: 1.8878, Acc: 0.6414
Epoch [16/75], Loss: 1.8820, Acc: 0.6734
Epoch [17/75], Loss: 1.8759, Acc: 0.6974
Epoch [18/75], Loss: 1.8698, Acc: 0.7133
Epoch [19/75], Loss: 1.8633, Acc: 0.7273
Epoch [20/75], Loss: 1.8567, Acc: 0.7345
Epoch [21/75], Loss: 1.8498, Acc: 0.7409
Epoch [22/75], Loss: 1.8428, Acc: 0.7478
Epoch [23/75], Loss: 1.8356, Acc: 0.7507
Epoch [24/75], Loss: 1.8279, Acc: 0.7529
Epoch [25/75], Loss: 1.82

In [8]:
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    y_test_class = y_test.argmax(dim=1)
    test_loss = criterion(outputs, y_test_class)
    preds = torch.argmax(outputs, dim=1)
    test_acc = accuracy_score(y_test_class.numpy(), preds.numpy())
    print(f"Test Loss: {test_loss.item():.4f}, Test Acc: {test_acc:.4f}")

Test Loss: 1.4131, Test Acc: 0.6878


### Data preprocessing(sklearning)

In [9]:
# NLTK prerocessing
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]  # 移除標點、數字
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

texts_cleaned_train = [preprocess_text(t) for t in texts_train]
texts_cleaned_test = [preprocess_text(t) for t in texts_test]

In [10]:
# TF-IDF 向量化
vectorizer = TfidfVectorizer(max_features=10000)
# X = vectorizer.fit_transform(texts_cleaned_train)
X_train = vectorizer.fit_transform(texts_cleaned_train)
X_test = vectorizer.transform(texts_cleaned_test)

In [11]:
# Label multi-hot encoding
mlb = MultiLabelBinarizer()
# Y = mlb.fit_transform(labels_train)   
y_train = mlb.fit_transform(labels_train) # shape = (n_samples, n_emotions)
y_test = mlb.transform(labels_test)

# 分割訓練集與測試集
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [12]:
sklearn_model=LogisticRegression(max_iter=1000)
clf= OneVsRestClassifier(sklearn_model)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.fit(X_train, y_train)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

       Anger       0.90      0.65      0.76      8705
        Fear       0.86      0.65      0.74      6474
         Joy       0.90      0.78      0.84     18484
        Love       0.78      0.52      0.62      6157
     Neutral       0.60      0.24      0.34      3925
     Sadness       0.92      0.82      0.87     14853
    Surprise       0.74      0.40      0.52      2660

   micro avg       0.88      0.68      0.77     61258
   macro avg       0.82      0.58      0.67     61258
weighted avg       0.86      0.68      0.75     61258
 samples avg       0.69      0.70      0.69     61258



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
