In [None]:
import datasets
import evaluate
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Read the "super-emotion" dataset saved on disk and select its training split.
dataset = datasets.load_from_disk("super-emotion")
train_dataset = dataset["train"]

# Pull out the text column and the string-label column of the training split.
texts, labels = train_dataset["text"], train_dataset["labels_str"]

### Data Preprocessing

In [None]:
# NLTK preprocessing: English stop words, stored as a set for the membership
# tests done per token in preprocess_text.
# NOTE(review): needs the NLTK "stopwords" corpus available locally — confirm
# it has been downloaded in this environment.
stop_words = {*stopwords.words("english")}
def preprocess_text(text):
    """Normalize one document into a space-joined string of filtered tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphabetic (punctuation, numbers) or that appear in the
    module-level ``stop_words`` collection are dropped.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    kept_tokens = (
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    )
    return " ".join(kept_tokens)

# Number of leading samples to keep. A single constant guarantees that the
# cleaned texts and their labels are always truncated to the same length.
N_SAMPLES = 50000

texts_cleaned = [preprocess_text(t) for t in texts[:N_SAMPLES]]
labels_subset = labels[:N_SAMPLES]

In [None]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000)
# Cast to float32 while the matrix is still sparse, then densify. This yields
# the same values as densifying first, but avoids materializing a full-size
# dense float64 intermediate.
# NOTE(review): the vectorizer is fit on every sample before the train/test
# split performed later, so its vocabulary/IDF statistics include held-out
# rows — consider fitting on the training portion only.
X = vectorizer.fit_transform(texts_cleaned).astype(np.float32).toarray()

In [None]:
# Multi-hot encode the labels
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels_subset)   # shape = (n_samples, n_emotions)

# Split into train and test sets (80/20, fixed seed), converting each of the
# four resulting arrays to a float32 tensor as it is unpacked.
X_train, X_test, y_train, y_test = (
    torch.tensor(part, dtype=torch.float32)
    for part in train_test_split(X, Y, test_size=0.2, random_state=42)
)

In [None]:
# Build the model
class MLP(nn.Module):
    """Single-hidden-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    The forward pass returns the raw outputs of the final linear layer (no
    activation is applied to them here).

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units.
        dropout: Dropout probability applied after the hidden activation.
            Defaults to 0.3.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()
        # Layer order and the attribute name `model` are kept stable so that
        # state-dict keys (model.0.*, model.3.*) do not change.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.model(x)

In [None]:
# Hyperparameters
input_dim = X_train.shape[1]
hidden_dim = 128
# One output per label column of the 2-D target matrix. (Calling set() on the
# 2-D array would fail: its rows are ndarrays, which are unhashable.)
output_dim = y_train.shape[1]

# Initialize the model
torch.manual_seed(42)  # make weight initialization and dropout reproducible
model = MLP(input_dim, hidden_dim, output_dim)
# y_train holds multi-hot float vectors, so each output is treated as an
# independent binary decision: sigmoid + binary cross-entropy on raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model. Each epoch is one full-batch step over all of X_train.
epochs = 10
model.train()  # the mode never changes inside the loop, so set it once
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")