In [None]:
import datasets
import numpy as np
import evaluate
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report,accuracy_score

In [2]:
# Download the NLTK resources used below (only needs to be done once).
# A loop keeps the resource list in one place and avoids echoing the return
# value of the final download call as the cell's output.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
# Load the dataset saved on disk and select its "train" split.
dataset = datasets.load_from_disk("super-emotion")
train_dataset = dataset["train"]

# Pull out the two columns used by the rest of the notebook.
texts, labels = (train_dataset[column] for column in ("text", "labels_str"))

### Data Preprocessing (PyTorch)

In [None]:
# NLTK preprocessing
# English stop-word list from NLTK, stored as a set; preprocess_text checks
# every token against it.
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    """Normalise one document for TF-IDF vectorisation.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Only purely alphabetic tokens that are not in ``stop_words`` are kept,
    and the survivors are joined back together with single spaces.

    Args:
        text: The raw document string.

    Returns:
        The cleaned document as a single space-separated string.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # isalpha() drops punctuation and numbers; the set lookup drops stop words.
        if token.isalpha() and token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)
# Only a prefix of the corpus is used in this section (original note: RAM
# blows up otherwise). The cap is named once so the text and label slices
# cannot drift apart.
MAX_SAMPLES = 50000
texts_cleaned = [preprocess_text(t) for t in texts[:MAX_SAMPLES]]
labels_subset = labels[:MAX_SAMPLES]

In [5]:
# TF-IDF vectorisation (vocabulary capped via max_features=10000).
vectorizer = TfidfVectorizer(max_features=10000)
# Cast to float32 while the matrix is still sparse and only then densify.
# The values are the same as densifying first and casting afterwards, but the
# full-size float64 dense intermediate is never allocated.
X = vectorizer.fit_transform(texts_cleaned).astype(np.float32).toarray()

In [6]:
# Multi-hot encode the labels
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels_subset)   # shape = (n_samples, n_emotions)

# Split into training and test sets: 20% held out, fixed random_state for a
# repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Convert the splits to float32 tensors. torch.as_tensor reuses the NumPy
# buffer when the dtype already matches (the feature matrix is float32), so the
# large feature arrays are not copied again; arrays of another dtype are
# converted as before.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.float32)

In [7]:
# Build the model
class MLP(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    Layer stack: ``Linear(input_dim, hidden_dim) -> ReLU -> Dropout ->
    Linear(hidden_dim, output_dim)``. The forward pass returns the output of
    the last linear layer unchanged (no softmax or sigmoid is applied here).

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units.
        dropout: Dropout probability applied after the ReLU. Defaults to 0.3,
            the value that used to be hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()
        # The attribute name and layer order are kept so state_dict keys
        # ("model.0.*", "model.3.*") stay the same.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.model(x)

In [28]:
# Hyper-parameters
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = y_train.shape[1]
epochs = 75

# CrossEntropyLoss takes one class index per sample, so each multi-hot label
# row is collapsed to the index of its maximum. The targets do not change
# between epochs, so this is computed once instead of inside the loop.
y_train_class = y_train.argmax(dim=1)

# Class weights, because the dataset is imbalanced: inverse class frequency,
# rescaled so the weights sum to the number of classes. The counts are derived
# from the training targets rather than hard-coded, so they stay correct when
# the subset size, the split or the label set changes.
class_counts = torch.bincount(y_train_class, minlength=output_dim).to(torch.float32)
class_counts = class_counts.clamp(min=1.0)  # avoid division by zero for a class with no samples
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum() * output_dim

# Initialise the model, loss and optimiser. The seed makes weight
# initialisation and dropout repeatable across runs.
torch.manual_seed(42)
model = MLP(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model: one full-batch gradient step per epoch.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train_class)
    loss.backward()
    optimizer.step()
    with torch.no_grad():
        # Accuracy of the same forward pass that produced the loss
        # (the model is in training mode, so dropout is active).
        preds = torch.argmax(outputs, dim=1)
        acc = accuracy_score(y_train_class.numpy(), preds.numpy())
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Acc: {acc:.4f}")

Epoch [1/75], Loss: 1.9475, Acc: 0.1781
Epoch [2/75], Loss: 1.9452, Acc: 0.2415
Epoch [3/75], Loss: 1.9426, Acc: 0.2735
Epoch [4/75], Loss: 1.9397, Acc: 0.2898
Epoch [5/75], Loss: 1.9365, Acc: 0.2991
Epoch [6/75], Loss: 1.9328, Acc: 0.3049
Epoch [7/75], Loss: 1.9289, Acc: 0.3083
Epoch [8/75], Loss: 1.9247, Acc: 0.3158
Epoch [9/75], Loss: 1.9202, Acc: 0.3298
Epoch [10/75], Loss: 1.9154, Acc: 0.3486
Epoch [11/75], Loss: 1.9105, Acc: 0.3725
Epoch [12/75], Loss: 1.9053, Acc: 0.3996
Epoch [13/75], Loss: 1.9000, Acc: 0.4228
Epoch [14/75], Loss: 1.8943, Acc: 0.4469
Epoch [15/75], Loss: 1.8885, Acc: 0.4783
Epoch [16/75], Loss: 1.8825, Acc: 0.5105
Epoch [17/75], Loss: 1.8764, Acc: 0.5429
Epoch [18/75], Loss: 1.8700, Acc: 0.5763
Epoch [19/75], Loss: 1.8633, Acc: 0.6038
Epoch [20/75], Loss: 1.8567, Acc: 0.6281
Epoch [21/75], Loss: 1.8494, Acc: 0.6506
Epoch [22/75], Loss: 1.8423, Acc: 0.6663
Epoch [23/75], Loss: 1.8347, Acc: 0.6812
Epoch [24/75], Loss: 1.8274, Acc: 0.6917
Epoch [25/75], Loss: 1.81

In [31]:
# Evaluate on the held-out split. eval() switches the model (and its dropout
# layer) to evaluation mode; no gradients are needed.
model.eval()
with torch.no_grad():
    # Same multi-hot -> class-index collapse as used for training.
    y_test_class = y_test.argmax(dim=1)
    outputs = model(X_test)
    preds = outputs.argmax(dim=1)
    test_loss = criterion(outputs, y_test_class)
    test_acc = accuracy_score(y_test_class.numpy(), preds.numpy())
print(f"Test Loss: {test_loss.item():.4f}, Test Acc: {test_acc:.4f}")

Test Loss: 1.4136, Test Acc: 0.6563


### Data Preprocessing (scikit-learn)

In [32]:
# NLTK preprocessing
# English stop-word list from NLTK, stored as a set.
# NOTE(review): this assignment and the preprocess_text definition that follows
# repeat the ones in the torch preprocessing section; consider defining them once.
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    """Normalise one document for TF-IDF vectorisation.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Only purely alphabetic tokens that are not in ``stop_words`` are kept,
    and the survivors are joined back together with single spaces.

    Args:
        text: The raw document string.

    Returns:
        The cleaned document as a single space-separated string.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # isalpha() drops punctuation and numbers; the set lookup drops stop words.
        if token.isalpha() and token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Clean every document in the corpus (no sample cap in this section).
texts_cleaned = list(map(preprocess_text, texts))

In [33]:
# TF-IDF vectorisation (vocabulary capped via max_features=10000).
# The result of fit_transform is used as returned: this section applies no
# dense conversion or dtype cast to it.
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(texts_cleaned)

In [34]:
# Multi-hot encode the labels
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels)   # shape = (n_samples, n_emotions)

# Split into training and test sets: 20% held out, random_state fixed at 42.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the torch
# section also assigns.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# One-vs-rest logistic regression on the multi-hot label matrix.
sklearn_model = LogisticRegression(max_iter=1000)
clf = OneVsRestClassifier(sklearn_model)
# Fit once. The second fit that used to follow predict() retrained the same
# model on the same data and did not affect y_pred.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# zero_division=0 reports undefined precision/recall as 0, the same value the
# default setting uses, without emitting the undefined-metric warning.
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

              precision    recall  f1-score   support

       Anger       0.90      0.72      0.80     11991
        Fear       0.88      0.67      0.76      9821
         Joy       0.91      0.82      0.86     27407
        Love       0.79      0.54      0.64      9132
     Neutral       0.61      0.18      0.28      5100
     Sadness       0.92      0.86      0.89     22308
    Surprise       0.76      0.41      0.53      3885

   micro avg       0.89      0.71      0.79     89644
   macro avg       0.82      0.60      0.68     89644
weighted avg       0.87      0.71      0.78     89644
 samples avg       0.71      0.73      0.72     89644



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
