In [14]:
!pip3 install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp313-cp313-win_amd64.whl.metadata (10 kB)
Downloading sentencepiece-0.2.1-cp313-cp313-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 1.1/1.1 MB 9.2 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [551]:
import torch
from tqdm import tqdm
import copy
from transformers import pipeline
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import sentencepiece as spm
from tokenizers import Tokenizer
from sklearn.preprocessing import MinMaxScaler
import torch.nn.functional as F

P_df = pd.read_csv("C:\\Emotion Classification\\treeModelsData\\positive_emotions.csv")
N_df = pd.read_csv("C:\\Emotion Classification\\treeModelsData\\negative_emotions.csv")

TOKENIZER = "C:\Emotion Classification\\traning\\vocab\\tokenizer0.2.json"

tokenizer = Tokenizer.from_file(TOKENIZER)

batchsize = 60

  TOKENIZER = "C:\Emotion Classification\\traning\\vocab\\tokenizer0.2.json"


In [552]:
P_df.shape[0] + N_df.shape[0], P_df.shape[1]

(43332, 14)

In [553]:
P_df.dtypes

text          object
admiration     int64
amusement      int64
approval       int64
caring         int64
curiosity      int64
desire         int64
excitement     int64
gratitude      int64
joy            int64
love           int64
optimism       int64
pride          int64
relief         int64
dtype: object

In [554]:
P_df['text'] = P_df['text'].str.lower()
N_df['text'] = N_df['text'].str.lower()

In [555]:
P_df.sample(1)

Unnamed: 0,text,admiration,amusement,approval,caring,curiosity,desire,excitement,gratitude,joy,love,optimism,pride,relief
2864,ohhh. [name] is my absolute favorite person ev...,1,0,0,0,0,0,0,0,1,0,0,0,0


In [556]:
split_index = int(P_df.shape[0] * 0.8)
split_index
scaler = MinMaxScaler()

In [557]:
vocab_size = tokenizer.get_vocab_size()
embedding_dim = 128
vocab_size, embedding_dim

(30000, 128)

In [558]:
def pipeline_text(data, split_ratio=0.8):
    data = data.copy()
    
    data["text"] = data["text"].apply(lambda x: (tokenizer.encode(x)).ids)
    
    # Мітки
    y = data.drop(columns=['text'])
    
    from sklearn.utils import shuffle
    data = shuffle(data, random_state=42)
    
    y_main = y.values.argmax(axis=1)
    data['main_label'] = y_main
    # sort
    data = data.sort_values('main_label').reset_index(drop=True)
    data = data.drop(columns=['main_label'])
    
    # тензори
    list_of_lists = data['text'].tolist()
    tensor_list = [torch.tensor(seq, dtype=torch.long) for seq in list_of_lists]
    X = pad_sequence(tensor_list, batch_first=True, padding_value=0)
    y = torch.tensor(data.drop(columns=['text']).values, dtype=torch.float32)
    
    # Split
    split_index = int(len(data) * split_ratio)
    X_train = X[:split_index]
    X_test = X[split_index:]
    
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, y_train, X_test, y_test

    
X_train, y_train, X_test, y_test = pipeline_text(N_df)

In [559]:
emotion_counts = N_df.drop(columns=['text']).sum()
print(emotion_counts)

anger             2363
annoyance         3885
confusion         1873
disappointment    2313
disapproval       3278
disgust           1438
embarrassment      690
fear               871
grief              187
nervousness        495
remorse            668
sadness           1772
dtype: int64


In [560]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, type(X_train), type(y_train), type(X_test), type(y_test)

(torch.Size([13512, 41]),
 torch.Size([13512, 12]),
 torch.Size([3378, 41]),
 torch.Size([3378, 12]),
 torch.Tensor,
 torch.Tensor,
 torch.Tensor,
 torch.Tensor)

In [561]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=batchsize, shuffle=True, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False, num_workers=2)

In [562]:
sequence_len = X_train.shape[1]
input_len = X_train.shape[1]
hidden_size = 256
num_layers = 1
num_classes = y_train.shape[1]
num_epochs = 10
learning_rate = 0.01

In [563]:
# class LSTM(nn.Module):
#     def __init__(self, input_len, hidden_size, num_layers, num_classes):
#         super(LSTM, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.lstm = nn.LSTM(input_len, hidden_size, num_layers, batch_first=True)
#         self.output_layer = nn.Linear(hidden_size, num_classes) 
    
#     def forward(self, X):
#         hidden_size = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
#         cell_state = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
#         out, _ = self.lstm(X, (hidden_size, cell_state))
#         out = self.output_layer(out[:, -1, :])
#         return out
    
    
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes, dropout=0.3):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_size, 
            num_layers=num_layers,
            batch_first=True, 
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=False
        )
        self.dropout1 = nn.Dropout(dropout)
        
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.bn1 = nn.BatchNorm1d(hidden_size // 2)
        self.dropout2 = nn.Dropout(dropout)
        
        self.fc2 = nn.Linear(hidden_size // 2, hidden_size // 4)
        self.bn2 = nn.BatchNorm1d(hidden_size // 4)
        self.dropout3 = nn.Dropout(dropout * 0.5)
        

        self.output_layer = nn.Linear(hidden_size // 4, num_classes)
    
    def forward(self, x):

        embedded = self.embedding(x)
        

        lstm_out, _ = self.lstm(embedded)
        

        out = lstm_out.mean(dim=1)
        

        out = self.dropout1(out)

        out = self.fc1(out)
        out = self.bn1(out)
        out = F.relu(out)
        out = self.dropout2(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        out = F.relu(out)
        out = self.dropout3(out)

        out = self.output_layer(out)
        
        return out

In [564]:
# model = LSTMModel(input_len, hidden_size, num_layers, num_classes)
model = LSTMModel(vocab_size, hidden_size, hidden_size, num_layers, num_classes)

In [565]:
loss_func = nn.BCEWithLogitsLoss()
# optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# loss_func = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

In [566]:
def train(num_epochs, model, train_dataloader, loss_func):
    total_step = len(train_dataloader)
    
    for epoch in range(num_epochs):
        for batch, (text_r, lables) in enumerate(train_dataloader):

            outputs = model(text_r)
            loss = loss_func(outputs, lables)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (batch + 1) % 100 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{batch + 1}/{total_step}], Loss: {loss.item():.4f}, Accuracy: {torch.sum(torch.argmax(outputs, dim=1) == torch.argmax(lables, dim=1)).item() / len(lables):.4f}')

In [567]:
train(num_epochs=num_epochs, model=model, train_dataloader=train_dataloader, loss_func=loss_func)

Epoch [1/10], Step [100/226], Loss: 0.3232, Accuracy: 0.1667
Epoch [1/10], Step [200/226], Loss: 0.2998, Accuracy: 0.2000
Epoch [2/10], Step [100/226], Loss: 0.2786, Accuracy: 0.2667
Epoch [2/10], Step [200/226], Loss: 0.2730, Accuracy: 0.3667
Epoch [3/10], Step [100/226], Loss: 0.3057, Accuracy: 0.3333
Epoch [3/10], Step [200/226], Loss: 0.3172, Accuracy: 0.2333
Epoch [4/10], Step [100/226], Loss: 0.2261, Accuracy: 0.5000
Epoch [4/10], Step [200/226], Loss: 0.2582, Accuracy: 0.3667
Epoch [5/10], Step [100/226], Loss: 0.2509, Accuracy: 0.4500
Epoch [5/10], Step [200/226], Loss: 0.2371, Accuracy: 0.3667
Epoch [6/10], Step [100/226], Loss: 0.2232, Accuracy: 0.4333
Epoch [6/10], Step [200/226], Loss: 0.2214, Accuracy: 0.4000
Epoch [7/10], Step [100/226], Loss: 0.2071, Accuracy: 0.5000
Epoch [7/10], Step [200/226], Loss: 0.2002, Accuracy: 0.5167
Epoch [8/10], Step [100/226], Loss: 0.2230, Accuracy: 0.4833
Epoch [8/10], Step [200/226], Loss: 0.2213, Accuracy: 0.4500
Epoch [9/10], Step [100/

Epoch [1/5], Step [100/353], Loss: 0.2903, Accuracy: 0.2667
Epoch [1/5], Step [200/353], Loss: 0.2692, Accuracy: 0.3167
Epoch [1/5], Step [300/353], Loss: 0.2249, Accuracy: 0.4000
Epoch [2/5], Step [100/353], Loss: 0.2186, Accuracy: 0.4500
Epoch [2/5], Step [200/353], Loss: 0.2328, Accuracy: 0.4667
Epoch [2/5], Step [300/353], Loss: 0.2460, Accuracy: 0.4167
Epoch [3/5], Step [100/353], Loss: 0.2194, Accuracy: 0.4333
Epoch [3/5], Step [200/353], Loss: 0.1846, Accuracy: 0.5500
Epoch [3/5], Step [300/353], Loss: 0.2377, Accuracy: 0.4667
Epoch [4/5], Step [100/353], Loss: 0.1986, Accuracy: 0.6167
Epoch [4/5], Step [200/353], Loss: 0.2355, Accuracy: 0.5333
Epoch [4/5], Step [300/353], Loss: 0.2356, Accuracy: 0.3500
Epoch [5/5], Step [100/353], Loss: 0.2068, Accuracy: 0.4667
Epoch [5/5], Step [200/353], Loss: 0.1771, Accuracy: 0.6333
Epoch [5/5], Step [300/353], Loss: 0.2150, Accuracy: 0.5667

In [568]:
text = "ahaha"
d = {'text': [text]}
test_df = pd.DataFrame(data=d)
test_df

Unnamed: 0,text
0,ahaha


In [569]:

def prepare_text_for_inference(df=test_df, tokenizer=tokenizer):

    df = df.copy()
    tokenized = df['text'].apply(lambda x: tokenizer.encode(x).ids)
    tensor_list = [torch.tensor(seq, dtype=torch.long) for seq in tokenized.tolist()]
    X = pad_sequence(tensor_list, batch_first=True, padding_value=0)
    
    return X

In [570]:
X_new = prepare_text_for_inference()

In [571]:
X_new

tensor([[10687]])

In [572]:
model.eval()
with torch.no_grad():
    outputs = model(X_new)
    probs = torch.sigmoid(outputs)
    preds = (probs > 0.5).int()




In [573]:
preds_list = preds.tolist()

for i in range(len(preds_list[0])):
    if preds_list[0][i] == 1:
        result = preds_list[0].index(preds_list[0][i]) + 1

In [574]:
P_df.columns[result]

'admiration'

In [575]:
P_df.columns

Index(['text', 'admiration', 'amusement', 'approval', 'caring', 'curiosity',
       'desire', 'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride',
       'relief'],
      dtype='object')

In [576]:
torch.save(model.state_dict(), 'C:\Emotion Classification\\traning\\train0.3\models\\Nmodel.pth')

  torch.save(model.state_dict(), 'C:\Emotion Classification\\traning\\train0.3\models\\Nmodel.pth')
