In [1]:
import os

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import dataset
all_df = pd.read_csv('emotion_dataset_raw.csv')
labels_str = all_df["Emotion"].unique().tolist()

print(labels_str)

['neutral', 'joy', 'sadness', 'fear', 'surprise', 'anger', 'shame', 'disgust']


In [3]:
# 큰 데이터셋이라 일부 데이터로 진행
all_df = all_df.loc[:1000]

In [4]:
all_df.head()

Unnamed: 0,Emotion,Text
0,neutral,Why ?
1,joy,Sage Act upgrade on my to do list for tommorow.
2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...
3,joy,Such an eye ! The true hazel eye-and so brill...
4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...


In [5]:
# Import tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [6]:
# Make dataset
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, labels_str):
        super(TextDataset, self).__init__()
        raw_text = df["Text"].tolist()
        raw_labels = df["Emotion"].tolist()

        self.texts = []
        for idx, rt in enumerate(raw_text):
            self.texts.append(tokenizer.encode(rt))

        self.max_length = max(len(text) for text in self.texts)

        self.labels = []
        for idx, rl in enumerate(raw_labels):
            self.labels.append(labels_str.index(rl))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx] # list
        text += [0] * (self.max_length - len(text))

        label = self.labels[idx] # int
        return torch.tensor(text), torch.tensor(label)

2024-05-23 18:41:19.341949: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-23 18:41:19.346144: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-23 18:41:19.407924: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
from sklearn.model_selection import train_test_split

# Define dataset
train_df, test_df = train_test_split(all_df, test_size=0.2)
train_dataset = TextDataset(train_df, tokenizer, labels_str)
test_dataset = TextDataset(test_df, tokenizer, labels_str)

# Define dataloader
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [8]:
print(all_df.loc[0])
print()
print(next(iter(train_dataset)))

Emotion    neutral
Text        Why ? 
Name: 0, dtype: object

(tensor([  101,  1045,  2572,  5305,  6675,  2000,  2085,  4553,  2008,  1996,
         2231,  2038,  9353, 15549,  2229, 11788,  2000,  1996,  2060, 14925,
         4177,  2006,  2023,  3277,  2012,  1996,  5928,  6465,  1998,  2008,
         1010,  2005,  1996,  8739,  1997,  4942,  5332,  9032, 15780,  1010,
         2023, 16449,  2038,  2042,  3333,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0]), tensor(7))


In [9]:
# Define RNN model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        _, h_n = self.rnn(x)
        h_n = h_n.squeeze(0)
        out = self.fc(h_n)
        return out

In [10]:
# Make model
vocab_size = tokenizer.vocab_size
embedding_dim = 512
hidden_dim = 512
output_dim = len(labels_str)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model.to(device)

RNNModel(
  (embedding): Embedding(30522, 512)
  (rnn): RNN(512, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=8, bias=True)
)

In [11]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.01)

In [12]:
# Train
num_epochs = 10

for epoch in range(num_epochs):
    for texts, labels in train_dataloader:
        optimizer.zero_grad()

        texts, labels = texts.to(device), labels.to(device)

        outputs = model(texts)

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 3.1680
Epoch [2/10], Loss: 2.1777
Epoch [3/10], Loss: 2.1125
Epoch [4/10], Loss: 2.0320
Epoch [5/10], Loss: 2.0621
Epoch [6/10], Loss: 2.2926
Epoch [7/10], Loss: 1.8713
Epoch [8/10], Loss: 1.6275
Epoch [9/10], Loss: 1.6534
Epoch [10/10], Loss: 1.7998


In [13]:
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for texts, labels in test_dataloader:
        texts, labels = texts.to(device), labels.to(device)

        outputs = model(texts)

        _, predicted = torch.max(outputs.data, 1)
        
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

accuracy = total_correct / total_samples
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 33.33%
