In [1]:
import re 

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import keras
from keras.metrics import Precision, Recall
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam


In [2]:
# Load the test, train and validation splits. Every file is read the same way:
# no header row, ";" as the separator, columns named "Comment" and "Emotion".
test_data, train_data, validation_data = (
    pd.read_csv(path, header=None, sep=";", names=["Comment", "Emotion"], encoding="utf-8")
    for path in ("./input/test.txt", "./input/train.txt", "./input/val.txt")
)

In [3]:
# Report the (rows, columns) shape of each split.
for label, frame in zip(
    ("Train : ", "Test : ", "Validation : "),
    (train_data, test_data, validation_data),
):
    print(label, frame.shape)

Train :  (16000, 2)
Test :  (2000, 2)
Validation :  (2000, 2)


In [4]:
# Distinct emotion labels of the training split, in order of first appearance.
print("Class names:", train_data["Emotion"].drop_duplicates().tolist())

Class names: ['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']


In [5]:
# Preview the first five rows of the raw training split.
train_data.head(n=5)

Unnamed: 0,Comment,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [6]:
# Convert the string "Emotion" labels to integer class ids with LabelEncoder.
# The encoder is fitted on the training split only and then reused for the
# test and validation splits, so all three share one label -> id mapping.
# Calling fit_transform on every split would re-learn the mapping each time
# and silently assign different ids to the same emotion whenever a split
# lacks one of the classes.
lb = LabelEncoder()
train_data["Emotion"] = lb.fit_transform(train_data["Emotion"])
# transform() raises ValueError for a label that was not seen during fitting,
# which surfaces a label mismatch between splits instead of hiding it.
test_data["Emotion"] = lb.transform(test_data["Emotion"])
validation_data["Emotion"] = lb.transform(validation_data["Emotion"])

In [7]:
# Preview the first five rows again, now with "Emotion" encoded as integers.
train_data.head(n=5)

Unnamed: 0,Comment,Emotion
0,i didnt feel humiliated,4
1,i can go from feeling so hopeless to so damned...,4
2,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,0


In [8]:
# Distinct encoded class ids of the training split, in order of first appearance.
print("Class names:", train_data["Emotion"].drop_duplicates().tolist())

Class names: [4, 0, 3, 5, 1, 2]


In [9]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# NOTE: this rebinds the name `stopwords` (imported earlier as the NLTK corpus
# reader) to a plain set of English stop words, which is why the corpus is
# reached through its fully qualified path here.
stopwords = {*nltk.corpus.stopwords.words('english')}

vocab_size = 10000  # upper bound for word ids: used as `n` when encoding words and as the Embedding input_dim
len_sentence = 150  # fixed number of tokens per encoded sentence (pad_sequences maxlen / Embedding input_length)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Text preprocessing
def text_prepare(data, column):
    """Clean, stem and integer-encode the texts stored in ``data[column]``.

    Every text has its non-letter characters replaced by spaces, is
    lower-cased and split on whitespace; words found in the module-level
    ``stopwords`` set are dropped and the remaining words are stemmed with
    ``PorterStemmer``. Each cleaned text is then turned into a sequence of
    integer word ids by hashing, and all sequences are brought to the fixed
    length ``len_sentence``.

    Word ids are computed with the md5-based hashing of ``hashing_trick``.
    ``one_hot`` would use Python's builtin ``hash``, which is salted per
    process for strings (unless PYTHONHASHSEED is fixed), so the same word
    would get a different id after a kernel restart and a saved model could
    not be reused. Because ids come from hashing into ``vocab_size`` buckets,
    distinct words may share an id.

    Args:
        data: table indexable by ``column`` and exposing ``.shape``
            (a pandas DataFrame in this notebook).
        column: name of the column holding the raw text strings.

    Returns:
        The array produced by ``pad_sequences``: one row per text and
        ``len_sentence`` columns, padded at the front.
    """
    print(data.shape)
    stemmer = PorterStemmer()
    corpus = []

    for text in data[column]:
        text = re.sub("[^a-zA-Z]", " ", text)  # replace everything except letters with a space

        text = text.lower()
        text = text.split()

        # drop stop words, then reduce each remaining word to its stem
        text = [stemmer.stem(word) for word in text if word not in stopwords]
        text = " ".join(text)

        corpus.append(text)

    # encode every word of every cleaned text as an integer id (stable md5 hashing)
    one_hot_word = [
        hashing_trick(sentence, n=vocab_size, hash_function="md5") for sentence in corpus
    ]
    # pad (at the front) or cut every sequence to exactly len_sentence ids
    embedded_doc = pad_sequences(sequences=one_hot_word,
                                 maxlen=len_sentence,
                                 padding="pre")
    print(data.shape)
    return embedded_doc

In [11]:
# Encode the "Comment" column of each split (train, validation, test - in that order).
x_train, x_validate, x_test = (
    text_prepare(frame, "Comment")
    for frame in (train_data, validation_data, test_data)
)

(16000, 2)
(16000, 2)
(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)


In [12]:
# Target column of each split.
y_train, y_validate, y_test = (
    frame["Emotion"] for frame in (train_data, validation_data, test_data)
)

In [13]:
# Turn the integer class ids into one-hot rows for use as network targets:
# each row is one sample, each column one class.
# The encoder is fitted on the training labels only and reused for the test
# and validation labels, so the column <-> class layout is identical for all
# three arrays. Re-fitting per split would yield fewer or shifted columns
# whenever a split lacks one of the classes.
enc = OneHotEncoder()
y_train = enc.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()

# transform() keeps the column layout learned from the training labels.
y_test = enc.transform(np.array(y_test).reshape(-1, 1)).toarray()
y_validate = enc.transform(np.array(y_validate).reshape(-1, 1)).toarray()

In [14]:
# Training hyperparameters, collected in one place.
epochs_ = 5
batch_size_ = 32
loss_ = "categorical_crossentropy"
metrics_ = [Precision(), Recall(), "accuracy"]
# Nadam with an explicit learning rate (used instead of the plain "Adam" preset).
optimizer_ = keras.optimizers.Nadam(learning_rate=0.002)

In [15]:
# Recurrent neural network: word embedding -> LSTM -> dense classification head,
# with dropout after each stage.
model = Sequential([
    # word-embedding layer: maps each word id to a 150-dimensional vector
    Embedding(input_dim=vocab_size, output_dim=150, input_length=len_sentence),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation="sigmoid"),
    Dropout(0.2),
    # one softmax output per emotion class
    Dense(6, activation="softmax"),
])

model.compile(loss=loss_, optimizer=optimizer_, metrics=metrics_)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 150)          1500000   
                                                                 
 dropout (Dropout)           (None, 150, 150)          0         
                                                                 
 lstm (LSTM)                 (None, 128)               142848    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 6)                 3

In [16]:
# Train the model and evaluate on the validation split after every epoch.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size_,
    epochs=epochs_,
    validation_data=(x_validate, y_validate),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Prepare new texts with the same preprocessing that was applied to the training data
new_texts = ["I feel happy today", "This is a sad day"]
x_new = text_prepare(pd.DataFrame({"Comment": new_texts}), "Comment")

# Predict class probabilities with the trained model
predictions = model.predict(x_new)

# Pick the most probable class id for every text
predicted_labels = np.argmax(predictions, axis=1)

# Map class ids back to emotion names. The mapping is derived from the fitted
# LabelEncoder instead of being typed in by hand, so it cannot drift from the
# encoding actually used for the labels.
# NOTE(review): assumes `lb` still holds the string class names, i.e. the
# label-encoding cell was run once on the raw data - confirm after re-runs.
emotion_mapping = dict(enumerate(lb.classes_))
predicted_emotions = [emotion_mapping[label] for label in predicted_labels]

# Print the results
for text, emotion in zip(new_texts, predicted_emotions):
    print(f"Text: {text}, Predicted Emotion: {emotion}")


(2, 1)
(2, 1)
Text: I feel happy today, Predicted Emotion: joy
Text: This is a sad day, Predicted Emotion: sadness
