# Modelling: LSTM, Gradient Boosting, FastText + RFC, FastText + LSTM


## Imports

In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time
import spacy
import ast
import joblib


from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from ast import literal_eval



from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from gensim.models import Word2Vec

from sklearn.ensemble import GradientBoostingClassifier

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



In [2]:
# Load the spaCy pipeline 'en_core_web_sm' and keep it in `nlp`
nlp = spacy.load('en_core_web_sm')

In [3]:
# Initialize tqdm's pandas integration, which is useful for showing a progress bar
# when applying operations on a pandas DataFrame
tqdm.pandas()

## Data Extract

In [None]:
# Load the dataset used for modelling.
#
# Column description:
# -------------------
# 'raw'                  raw original comment, no cleaning or preprocessing whatsoever
# 'clean'                'raw' with regex cleaning (HTML anchor tags, URLs, newlines etc.)
# 'clean_pp'             'clean' with spaCy preprocessing (tokenization, punctuation
#                        removal, make lower case)
# 'clean_pp_lemma'       'clean_pp' with spaCy lemmatization
# 'clean_pp_lemma_stop'  'clean_pp_lemma' with stop words removed
# 'toxic'                target/label
df = pd.read_csv('data/undersampled_data_60_40_FINAL.csv')

In [22]:
# Display the loaded frame (the rendered output elides the middle rows)
df

Unnamed: 0,raw,clean,clean_pp,clean_pp_lemma,clean_pp_lemma_stop,toxic
0,"Well, what are the chances he will turn out to...","Well, what are the chances he will turn out to...",well what are the chances he will turn out to ...,well what be the chance he will turn out to ha...,chance turn active proponent slavery,0
1,The moment of critical mass is approaching whe...,The moment of critical mass is approaching whe...,the moment of critical mass is approaching whe...,the moment of critical mass be approach when t...,moment critical mass approach deed gupta co li...,0
2,"""Hey listen to me,"" he said. ""I'm not going to...","""Hey listen to me,"" he said. ""I'm not going to...",hey listen to me he said i 'm not going to put...,hey listen to i he say i be not go to put up w...,hey listen say go crap prove reporter say uh a...,1
3,We are already owed $488 M plus interest($2Bil...,We are already owed $ M plus interest($ Billio...,we are already owed $ m plus interest($ billio...,we be already owe $ m plus interest($ billion ...,owe $ m plus interest($ billion audits state c...,0
4,There is a reason there are no teeth to the la...,There is a reason there are no teeth to the la...,there is a reason there are no teeth to the la...,there be a reason there be no tooth to the law...,reason tooth law unlawful law way force free e...,0
...,...,...,...,...,...,...
360296,Do you still beat your wife? Simple question.,Do you still beat your wife? Simple question.,do you still beat your wife simple question,do you still beat your wife simple question,beat wife simple question,0
360297,The fascist dictator continues the insanity ag...,The fascist dictator continues the insanity ag...,the fascist dictator continues the insanity ag...,the fascist dictator continue the insanity aga...,fascist dictator continue insanity human civil...,1
360298,Sean Hannity is a lightweight foolish commenta...,Sean Hannity is a lightweight foolish commenta...,sean hannity is a lightweight foolish commenta...,sean hannity be a lightweight foolish commenta...,sean hannity lightweight foolish commentator f...,0
360299,There are a number of countries which make it ...,There are a number of countries which make it ...,there are a number of countries which make it ...,there be a number of country which make it imp...,number country impossible national citizenship...,0


In [5]:
# List the column names of the loaded frame
df.columns

Index(['raw', 'clean', 'clean_pp', 'clean_pp_lemma', 'clean_pp_lemma_stop',
       'toxic'],
      dtype='object')

In [7]:
# Drop the rows whose 'clean_pp_lemma' value is NaN (df is modified in place)
df.dropna(subset=['clean_pp_lemma'], inplace=True)


## Function to Calculate Evaluation Metrics

In [8]:
# Initialize the (empty) dataframe that will include the model evaluation results
results_df = pd.DataFrame()

def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    """Fit `model` on the training split, score it on the test split and return the metrics.

    Parameters
    ----------
    model
        Estimator exposing `fit(X, y)`, `predict(X)` and `predict_proba(X)`.
        Column 1 of `predict_proba` is used as the score for AUC-ROC.
    X_train, y_train
        Training features and labels passed to `model.fit`.
    X_test, y_test
        Held-out features and labels used for every reported metric.
    model_name : str
        Label stored under 'Name'; falls back to the model's class name when empty.
    parameters, comments
        Free-form values copied unchanged into the result.

    Returns
    -------
    dict
        Keys: 'Name', 'Parameters', 'F1-Score', 'AUC-ROC', 'Precision', 'Recall',
        'Accuracy', 'Confusion Matrix' (string form of the matrix),
        'Training Time' (formatted duration of the `fit` call) and 'Comments'.
    """
    # Time the fit call only, so that 'Training Time' is not inflated by inference.
    # perf_counter is monotonic, unlike time.time, so clock adjustments cannot skew it.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    duration = time.perf_counter() - start_time
    minutes, seconds = divmod(duration, 60)
    duration_format = f"{int(minutes)} minutes and {round(seconds, 2)} seconds"

    # Hard labels for the threshold-based metrics, scores for the ranking metric
    predictions = model.predict(X_test)
    predicted_probs = model.predict_proba(X_test)[:, 1]

    # Calculate metrics: AUC-ROC uses the probabilities, all others the predicted labels
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_probs)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    # Stored as a string so the matrix fits into a single results cell
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## LSTM

In [18]:
# Features: the 'clean_pp_lemma' texts; target: the 'toxic' label
X = df['clean_pp_lemma'].values
y = df['toxic'].values 

# Split the data into train and test sets (30% held out for testing, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Tokenize and convert text to sequences
max_words = 10000  # Set the maximum number of words to consider
max_len = 100  # Set the maximum length of each sequence
tokenizer = Tokenizer(num_words=max_words)
# The tokenizer is fitted on the training texts only; the test texts are merely transformed
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [19]:
# Save the fitted tokenizer to a file (pickled with the highest available protocol)
tokenizer_file_path = 'data/tokenizer_andre_lstm.pkl'
with open(tokenizer_file_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [20]:
# Bring the train and test sequences to the same fixed length (max_len)
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
)

# LSTM classifier: embedding -> LSTM -> single sigmoid output unit
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(units=64),
    Dense(units=1, activation='sigmoid'),
])

In [21]:
# Compile with binary cross-entropy loss and the Adam optimizer; track accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is passed as validation data, so the per-epoch validation
# metrics are computed on the same samples that the final evaluation uses.
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1080ca690>

In [76]:
# Save the model architecture as JSON
model_json = model.to_json()
with open('model5_andre_lstm.json', 'w') as json_file:
    json_file.write(model_json)

# Save the model weights
# NOTE(review): Keras 3 only accepts weight file names ending in '.weights.h5' -- confirm
# that the installed Keras version accepts this file name.
model.save_weights('model_weights5.h5')

In [23]:
# Disabled code kept as an inert string literal: pickling the model object to 'lstm_model.pkl'
'''
with open('lstm_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
    '''

In [16]:
# Generate predictions: run inference over the test set once and reuse the
# probabilities for both the thresholded labels and the AUC-ROC score
y_prob = model.predict(X_test_padded)
y_pred = (y_prob > 0.5).astype(int)  # 0.5 decision threshold on the sigmoid output

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)  # ranking metric: needs the scores, not the labels

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.8663991223806813
Precision: 0.844780711168348
Recall: 0.8172128594663806
F1 Score: 0.8307681483175318
AUC-ROC: 0.9333377455640817
Confusion Matrix:
[[38777  4339]
 [ 5282 23615]]
