In [24]:
import pandas as pd
import sentencepiece as spm
import torch
from torch.nn.utils.rnn import pad_sequence
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

PATH = 'C:\Emotion Classification\data\\full_dataset\goemotions_1.csv'
TOKENIZER = 'C:\Emotion Classification\\traning\\vocab\\tokenizer.model'

sp = spm.SentencePieceProcessor()
sp.load(TOKENIZER)

batchsize = 100

  PATH = 'C:\Emotion Classification\data\\full_dataset\goemotions_1.csv'
  TOKENIZER = 'C:\Emotion Classification\\traning\\vocab\\tokenizer.model'


In [25]:
df = pd.read_csv(PATH, sep=',')

In [26]:
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [27]:
df = df.drop(columns=['id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id'])
df['example_very_unclear'] = df['example_very_unclear'].astype(int)

In [28]:
split_index = int(len(df) * 0.85)
split_index

59500

In [29]:
def pipeline_text(data):
    data = data.copy()
    
    data["text"] = data["text"].apply(lambda x: sp.encode(x, out_type=int))
    y = data.drop(columns=['text'])
    
    list_of_lists = data['text'].tolist()
    tensor_list = [torch.tensor(seq, dtype=torch.float32) for seq in list_of_lists]
    X = pad_sequence(tensor_list, batch_first=True, padding_value=0)
    y = torch.tensor(y.values, dtype=torch.float32)
    
    X_train = X[:split_index]
    X_test = X[split_index:]
    
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    
    return X_train, y_train, X_test, y_test
    
X_train, y_train, X_test, y_test = pipeline_text(df)

In [30]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, type(X_train), type(y_train), type(X_test), type(y_test)


(torch.Size([59500, 142]),
 torch.Size([59500, 29]),
 torch.Size([10500, 142]),
 torch.Size([10500, 29]),
 torch.Tensor,
 torch.Tensor,
 torch.Tensor,
 torch.Tensor)

In [31]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=batchsize, shuffle=True, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False, num_workers=2)

In [32]:
sequence_len = X_train.shape[1]
input_len = X_train.shape[1]
hidden_size = 128
num_layers = 2
num_classes = y_train.shape[1]
num_epochs = 5
learning_rate = 0.01

In [33]:
class LSTM(nn.Module):
    def __init__(self, input_len, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_len, hidden_size, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, num_classes) 
    
    def forward(self, X):
        hidden_size = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
        cell_state = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
        out, _ = self.lstm(X, (hidden_size, cell_state))
        out = self.output_layer(out[:, -1, :])
        return out
    

In [34]:
model = LSTM(input_len, hidden_size, num_layers, num_classes)

In [35]:
model

LSTM(
  (lstm): LSTM(142, 128, num_layers=2, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=29, bias=True)
)

In [36]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [37]:
def train(num_epochs, model, train_dataloader, loss_func):
    total_step = len(train_dataloader)
    
    for epoch in range(num_epochs):
        for batch, (text_r, lables) in enumerate(train_dataloader):
            text_r = text_r.reshape(-1, 1, 142)

            outputs = model(text_r)
            loss = loss_func(outputs, lables)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (batch + 1) % 100 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{batch + 1}/{total_step}], Loss: {loss.item():.4f}, Accuracy: {torch.sum(torch.argmax(outputs, dim=1) == torch.argmax(lables, dim=1)).item() / len(lables):.4f}')

In [38]:
train(num_epochs=num_epochs, model=model, train_dataloader=train_dataloader, loss_func=loss_func)

Epoch [1/5], Step [100/595], Loss: 4.2592, Accuracy: 0.2300
Epoch [1/5], Step [200/595], Loss: 4.0384, Accuracy: 0.3000
Epoch [1/5], Step [300/595], Loss: 3.7694, Accuracy: 0.3100
Epoch [1/5], Step [400/595], Loss: 3.7778, Accuracy: 0.2300
Epoch [1/5], Step [500/595], Loss: 3.7132, Accuracy: 0.3600
Epoch [2/5], Step [100/595], Loss: 3.5872, Accuracy: 0.3000
Epoch [2/5], Step [200/595], Loss: 3.8953, Accuracy: 0.2000
Epoch [2/5], Step [300/595], Loss: 4.0493, Accuracy: 0.2300
Epoch [2/5], Step [400/595], Loss: 3.3810, Accuracy: 0.3500
Epoch [2/5], Step [500/595], Loss: 3.5647, Accuracy: 0.2800
Epoch [3/5], Step [100/595], Loss: 3.7468, Accuracy: 0.1900
Epoch [3/5], Step [200/595], Loss: 3.5907, Accuracy: 0.2100
Epoch [3/5], Step [300/595], Loss: 3.6490, Accuracy: 0.2900
Epoch [3/5], Step [400/595], Loss: 3.4735, Accuracy: 0.2300
Epoch [3/5], Step [500/595], Loss: 3.5154, Accuracy: 0.2300
Epoch [4/5], Step [100/595], Loss: 3.6151, Accuracy: 0.2200
Epoch [4/5], Step [200/595], Loss: 3.379