In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping

In [2]:
train = pd.read_csv('simplified_emotions.csv')
test = pd.read_csv('test.csv', delimiter='\t')

In [3]:
X_train = train['Sentences']
y_train = train['Emotions']

X_test = test['sentence']

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
vocab_size = len(word_index)

sequences = tokenizer.texts_to_sequences(X_train)

max_length = max([len(seq) for seq in sequences])
X_train = pad_sequences(sequences, maxlen=max_length)

In [5]:
vocab_size

85171

In [6]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

In [7]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1 , output_dim=32))
model.add(LSTM(units=64, dropout=0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [10]:
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5
[1m12187/12187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m602s[0m 49ms/step - accuracy: 0.9405 - loss: 0.1283 - val_accuracy: 0.6904 - val_loss: 0.6792
Epoch 2/5
[1m12187/12187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m593s[0m 49ms/step - accuracy: 0.9961 - loss: 0.0108 - val_accuracy: 0.7059 - val_loss: 0.6483
Epoch 3/5
[1m12187/12187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m586s[0m 48ms/step - accuracy: 0.9970 - loss: 0.0074 - val_accuracy: 0.6428 - val_loss: 1.2530
Epoch 4/5
[1m12187/12187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m575s[0m 47ms/step - accuracy: 0.9975 - loss: 0.0062 - val_accuracy: 0.6710 - val_loss: 0.9688
Epoch 5/5
[1m12187/12187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 48ms/step - accuracy: 0.9979 - loss: 0.0051 - val_accuracy: 0.6705 - val_loss: 0.8486


In [11]:
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')
prediction = model.predict(X_test)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [12]:
prediction_num = [1 if pred > 0.5 else 0 for pred in prediction]
prediction = encoder.inverse_transform(prediction_num)

In [13]:
submission = pd.DataFrame({'id': test['id'], 'emotion': prediction})

In [14]:
submission['emotion'].unique

<bound method Series.unique of 0       other
1       other
2       other
3       other
4       other
        ...  
1431    other
1432    other
1433    other
1434    other
1435    other
Name: emotion, Length: 1436, dtype: object>

In [15]:
submission.to_csv('submissions/submission_simple_rnn.csv', index=False)