# LSTM + SMOTE


## Imports

In [2]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time
import spacy
import ast
import joblib


from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from ast import literal_eval



from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from gensim.models import Word2Vec

from sklearn.ensemble import GradientBoostingClassifier

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



## Data Extract

In [2]:
# Load the dataset from a CSV file given by a relative path.
df = pd.read_csv('data/undersampled_data_60_40.csv')


In [5]:
# Display the column labels of the loaded frame.
df.columns

Index(['comment_text', 'toxic', 'stopwords_punct_lemma', 'vector_spacy',
       'pos_tags', 'pos_tags_str'],
      dtype='object')

In [6]:
# Keep only the rows whose 'stopwords_punct_lemma' value is present.
# The name `df` is rebound to the filtered frame, which holds the same rows
# that an in-place drop would leave behind.
df = df.dropna(subset=['stopwords_punct_lemma'])


## LSTM

In [6]:
# Raw comment text is the model input; the 'toxic' column is the target.
X, y = df['comment_text'].values, df['toxic'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Vocabulary cap handed to the tokenizer, and the fixed sequence length.
max_words = 10000
max_len = 100

# Fit the vocabulary on the training texts only, then encode the train and
# test texts (in that order) as sequences of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [7]:
# Serialize the fitted tokenizer to disk with the highest pickle protocol available.
tokenizer_file_path = 'data/tokenizer.pkl'
with open(tokenizer_file_path, mode='wb') as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out, protocol=pickle.HIGHEST_PROTOCOL)


In [8]:
# Bring every encoded comment to a length of max_len (train first, then test).
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

# Embedding -> LSTM -> single sigmoid unit, assembled in one constructor call.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(units=64),
    Dense(units=1, activation='sigmoid'),
])

In [9]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy reported as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model for 5 epochs with a batch size of 32.
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# validation metrics are computed on the same samples that the evaluation cell
# scores afterwards.
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x45bf0e610>

In [76]:
# Save the model architecture as JSON.
# The JSON string and the weights go to two separate files.
model_json = model.to_json()
with open('model5.json', 'w') as json_file:
    json_file.write(model_json)

# Save the model weights to an HDF5 (.h5) file.
model.save_weights('model_weights5.h5')

In [23]:
# NOTE(review): disabled code kept as a bare string literal, so it is never executed.
# If enabled, it would pickle the whole model object to 'lstm_model.pkl'.
'''
with open('lstm_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    '''

In [77]:
# Run inference once and reuse the sigmoid outputs for both the hard labels
# and the ROC-AUC score; the original cell called model.predict twice on the
# full test set for the same values.
y_prob = model.predict(X_test_padded)

# A comment is labelled 1 when its predicted score exceeds 0.5.
y_pred = (y_prob > 0.5).astype(int)

# Threshold-dependent metrics are computed from the hard labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC-AUC is computed from the raw scores rather than the thresholded labels.
roc_auc = roc_auc_score(y_test, y_prob)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.8389152936937936
Precision: 0.8017638036809815
Recall: 0.7948097722796227
F1 Score: 0.7982716435004424
AUC-ROC: 0.9027370638487501
Confusion Matrix:
[[37541  5687]
 [ 5938 23001]]


## FastText

In [30]:
# Load the second dataset (the one with a 'vector_fast_text' column) from a relative CSV path.
ft_df = pd.read_csv('data/undersampled_data_60_40_ft.csv')


In [11]:
# Display the column labels of ft_df.
ft_df.columns

Index(['Unnamed: 0', 'comment_text', 'toxic', 'stopwords_punct_lemma',
       'toxic_label_ft', 'toxic_label_comment_text', 'vector_fast_text'],
      dtype='object')

In [50]:
# Preview the first rows of the 'new_ft' column.
# NOTE(review): 'new_ft' is not among the columns read from the CSV; it is only
# added by the later `ft_df['new_ft'] = ...` cell. On a fresh kernel this cell
# raises KeyError unless that cell has been run first.
ft_df['new_ft'].head()

0    [-0.0577833019, 0.0458838157, -0.0487854704, -...
1    [-0.0385174714, 0.0294841994, -0.0353648514, -...
2    [0.08621803, -0.06944817, 0.08360571, 0.003052...
3    [-0.02172438, 0.01810819, -0.02264511, -0.0008...
4    [-0.04083619, 0.03226621, -0.03952266, -0.0020...
Name: new_ft, dtype: object

In [33]:
# Parse the string column: strip the surrounding brackets, split on whitespace
# into one column per token, and cast every token to float.
corpus_vect = (
    ft_df['vector_fast_text']
    .str.strip('[]')
    .str.split(expand=True)
    .astype('float')
)

In [34]:
# Store each parsed row back on the frame as a plain Python list (one list per row).
ft_df['new_ft'] = corpus_vect.values.tolist()


### RFC + fast_text

In [37]:
# Stack the per-row vectors into one feature array; 'toxic' is the target.
X = np.array(ft_df['new_ft'].tolist())
y = ft_df['toxic']

# Splitting the data into train and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Collapse everything after the first axis so each sample is one flat row
# (leaves the data unchanged when X is already 2-D).
X_train_flatten = X_train.reshape(X_train.shape[0], -1)
X_test_flatten = X_test.reshape(X_test.shape[0], -1)

# Seed the forest with the same value as the split: an unseeded forest is
# built differently on every run, so the reported scores would not be
# reproducible.
rfc = RandomForestClassifier(random_state=42)

# Fit the model
rfc.fit(X_train_flatten, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test_flatten)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9696090811950986


In [38]:
# Metrics derived from the hard label predictions of the random forest.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC-AUC is scored on the predicted probability in column 1 of predict_proba,
# not on the hard labels.
auc_roc = roc_auc_score(
    y_test,
    rfc.predict_proba(X_test_flatten)[:, 1],
)

# Counts of true vs. predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)

# Report everything in one place, in the same order as before.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {auc_roc}")
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 0.9696090811950986
Precision: 0.9666179337231969
Recall: 0.957551724137931
F1 Score: 0.9620634700665189
AUC-ROC: 0.9888729088406262
Confusion Matrix:
[[42102   959]
 [ 1231 27769]]
