#### Import required libraries

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from gensim.models import KeyedVectors
import gensim.downloader as api
import warnings
warnings.filterwarnings('ignore')

#### Data collection

In [2]:
X_train = pd.read_csv('../data/processed/X_train.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_test = pd.read_csv('../data/processed/y_test.csv')

In [3]:
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (100499, 7)
y_train: (100499, 6)
X_test: (25125, 7)
y_test: (25125, 6)


#### Data cleaning

In [4]:
X_train['clean_text'] = X_train['clean_text'].apply(lambda x: re.sub('[^a-z]', ' ', x))
X_test['clean_text'] = X_test['clean_text'].apply(lambda x: re.sub('[^a-z]', ' ', x))

In [5]:
train_labels = y_train.values
test_labels = y_test.values

#### Tokenize and apply padding

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['clean_text'])

train_text = tokenizer.texts_to_sequences(X_train['clean_text'])
test_text = tokenizer.texts_to_sequences(X_test['clean_text'])

In [7]:
max_length = 500

train_text = pad_sequences(train_text, max_length)
test_text = pad_sequences(test_text, max_length)

#### Load pre-trained fasttext model

In [55]:
#embedding_model = api.load('fasttext-wiki-news-subwords-300')
#embedding_model.save('../src/models/fasttext-wiki-news-subwords-300.model')

In [9]:
loaded_fasttext_model = KeyedVectors.load('../src/models/fasttext-wiki-news-subwords-300.model')

#### Create embedding matrix

In [51]:
embedding_dim = loaded_fasttext_model.vector_size
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

In [52]:
for word, i in tokenizer.word_index.items():
    if word in loaded_fasttext_model:
        embedding_matrix[i] = loaded_fasttext_model[word]
    else:
        embedding_matrix[i] = np.zeros(embedding_dim)

#### Build LSTM model

In [59]:
lstm = Sequential()
lstm.add(Embedding(input_dim=vocab_size, 
                   output_dim=embedding_dim,
                   input_length=max_length,
                   weights=[embedding_matrix],
                   trainable=False))
lstm.add(LSTM(128))
lstm.add(Dense(128, activation='relu'))
lstm.add(Dense(6, activation='sigmoid'))

In [60]:
lstm.compile(optimizer=Adam(learning_rate=0.001, amsgrad=True),
             loss=BinaryCrossentropy(),
             metrics=['accuracy'])

In [61]:
lstm.fit(train_text, train_labels, batch_size=32, epochs=10, validation_data=(test_text, test_labels))

Epoch 1/10
[1m3141/3141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 138ms/step - accuracy: 0.9376 - loss: 0.0983 - val_accuracy: 0.9953 - val_loss: 0.0579
Epoch 2/10
[1m3141/3141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 140ms/step - accuracy: 0.9940 - loss: 0.0533 - val_accuracy: 0.9953 - val_loss: 0.0502
Epoch 3/10
[1m3141/3141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 140ms/step - accuracy: 0.9899 - loss: 0.0476 - val_accuracy: 0.9936 - val_loss: 0.0467
Epoch 4/10
[1m3141/3141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 141ms/step - accuracy: 0.9890 - loss: 0.0444 - val_accuracy: 0.9928 - val_loss: 0.0461
Epoch 5/10
[1m3141/3141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 140ms/step - accuracy: 0.9785 - loss: 0.0427 - val_accuracy: 0.9942 - val_loss: 0.0460
Epoch 6/10
[1m3141/3141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 141ms/step - accuracy: 0.9736 - loss: 0.0407 - val_accuracy: 0.9936 - val_loss:

<keras.src.callbacks.history.History at 0x1822fc89490>

In [62]:
lstm_pred = lstm.predict(test_text)
lstm_pred_binary = (lstm_pred > 0.5).astype(int)

[1m786/786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 63ms/step


In [64]:
print('Accuracy:', accuracy_score(test_labels, lstm_pred_binary))
print('F1 score:', f1_score(test_labels, lstm_pred_binary, average='weighted'))
print(classification_report(test_labels, lstm_pred_binary))

Accuracy: 0.9160995024875622
F1 score: 0.7549960977268603
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      2702
           1       0.52      0.16      0.24       272
           2       0.84      0.82      0.83      1473
           3       0.57      0.37      0.45        91
           4       0.78      0.68      0.73      1409
           5       0.63      0.44      0.52       240

   micro avg       0.83      0.71      0.76      6187
   macro avg       0.70      0.54      0.60      6187
weighted avg       0.81      0.71      0.75      6187
 samples avg       0.07      0.07      0.07      6187



In [65]:
lstm.save('../src/models/lstm.h5')

