In [48]:
import pandas as pd
import os
import re
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, mean_absolute_error, mean_squared_error, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
csv_path = './structured_posts.csv'
structured_df = pd.read_csv(csv_path)
structured_df.head()

Unnamed: 0,post_id,post_description
0,1,@user nice new signage. Are you not concerned ...
1,2,A woman who you fucked multiple times saying y...
2,3,@user @user real talk do you have eyes or were...
3,4,your girlfriend lookin at me like a groupie in...
4,5,Hysterical woman like @user


In [98]:
lexicon_path = './hatelexicons'
hate_words = set()

for filename in os.listdir(lexicon_path):
    file_path = os.path.join(lexicon_path, filename)
    
    if filename.startswith('.'):
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        words = {line.strip().lower() for line in f if line.strip()}
        hate_words.update(words)

print(f"Loaded {len(hate_words)} unique hate terms from {len(os.listdir(lexicon_path))} files.")
hate_df = pd.DataFrame(list(hate_words), columns=["hate_word"])
print(hate_df.head())

Loaded 2612 unique hate terms from 28 files.
       hate_word
0        nussija
1    ejaculation
2      anilingus
3      orospudan
4  splooge moose


In [51]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF" 
        u"\U0001F680-\U0001F6FF" 
        u"\U0001F1E0-\U0001F1FF"  
        u"\U00002500-\U00002BEF" 
        u"\U00002702-\U000027B0" 
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def preprocess_text(text):
    text = text.lower()                          
    text = remove_emojis(text)                   
    text = re.sub(r'[^a-z\s]', '', text)        
    text = re.sub(r'\s+', ' ', text).strip()    
    return text

structured_df['cleaned_post'] = structured_df['post_description'].apply(preprocess_text)
structured_df[['post_description', 'cleaned_post']].head()


Unnamed: 0,post_description,cleaned_post
0,@user nice new signage. Are you not concerned ...,user nice new signage are you not concerned by...
1,A woman who you fucked multiple times saying y...,a woman who you fucked multiple times saying y...
2,@user @user real talk do you have eyes or were...,user user real talk do you have eyes or were t...
3,your girlfriend lookin at me like a groupie in...,your girlfriend lookin at me like a groupie in...
4,Hysterical woman like @user,hysterical woman like user


In [52]:
def label_hate_speech(text, hate_words):
    tokens = text.split()
    return int(any(token in hate_words for token in tokens))

structured_df['label'] = structured_df['cleaned_post'].apply(
    lambda x: label_hate_speech(x, hate_words)
)
print(structured_df['label'].value_counts())

label
0    6120
1    2873
Name: count, dtype: int64


In [53]:
vectorizer = TfidfVectorizer(max_features=1000) #max_features=1000
X = vectorizer.fit_transform(structured_df['cleaned_post'])
y = structured_df['label']

In [54]:
vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
vectorized_df.head()

Unnamed: 0,able,about,abuse,accept,access,account,accused,across,act,action,...,yesallmen,yet,yo,you,youll,young,your,youre,yourself,youve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.378662,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.362155,0.261531,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.14751,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.302445,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#Naive Bayes

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, structured_df['label'], test_size=0.2, random_state=42)
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
y_pred_prob = nb_model.predict_proba(X_test)[:, 1] 

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc_nb = roc_auc_score(y_test, y_pred_prob)  
precision_nb = precision_score(y_test, y_pred)
accuracy_nb = accuracy_score(y_test, y_pred)
mae_nb = mean_absolute_error(y_test, y_pred_prob)
mse_nb = mean_squared_error(y_test, y_pred_prob)
rmse_nb = np.sqrt(mse_nb)
epsilon = 1e-10  
mape_nb = np.mean(np.abs((y_test - y_pred_prob) / (y_test + epsilon))) * 100

print("\nAdditional Metrics:")
print(f"AUC/ROC: {roc_nb:.4f}")
print(f"Precision: {precision_nb:.4f}")
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"MAE: {mae_nb:.4f}")
print(f"MSE: {mse_nb:.4f}")
print(f"RMSE: {rmse_nb:.4f}")
print(f"MAPE: {mape_nb:.2f}%")

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      1217
           1       0.87      0.90      0.89       582

    accuracy                           0.93      1799
   macro avg       0.91      0.92      0.92      1799
weighted avg       0.93      0.93      0.93      1799


Confusion Matrix:
[[1141   76]
 [  58  524]]

Additional Metrics:
AUC/ROC: 0.9720
Precision: 0.8733
Accuracy: 0.9255
MAE: 0.1808
MSE: 0.0678
RMSE: 0.2605
MAPE: 115501987223.76%


In [57]:
pip install keras tensorflow transformers datasets torch scikit-learn

Note: you may need to restart the kernel to use updated packages.


# BiLSTM

In [59]:
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(structured_df['cleaned_post'])
sequences = tokenizer.texts_to_sequences(structured_df['cleaned_post'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, structured_df['label'], test_size=0.2, random_state=42)

In [60]:
embedding_index = {}
with open('./glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((MAX_NUM_WORDS, embedding_dim))

for word, i in word_index.items():
    if i < MAX_NUM_WORDS:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [61]:

model = Sequential([
    Embedding(MAX_NUM_WORDS, embedding_dim, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=6, batch_size=32, validation_split=0.1) 


Epoch 1/6




[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.7594 - loss: 0.4762 - val_accuracy: 0.9250 - val_loss: 0.2015
Epoch 2/6
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step - accuracy: 0.9330 - loss: 0.1887 - val_accuracy: 0.9375 - val_loss: 0.1782
Epoch 3/6
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step - accuracy: 0.9574 - loss: 0.1294 - val_accuracy: 0.9639 - val_loss: 0.1020
Epoch 4/6
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 34ms/step - accuracy: 0.9639 - loss: 0.1114 - val_accuracy: 0.9625 - val_loss: 0.1290
Epoch 5/6
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.9763 - loss: 0.0730 - val_accuracy: 0.9806 - val_loss: 0.0756
Epoch 6/6
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.9838 - loss: 0.0538 - val_accuracy: 0.9778 - val_loss: 0.0736


<keras.src.callbacks.history.History at 0x30174c830>

In [62]:
y_pred_prob = model.predict(X_test).ravel()
y_pred = (y_pred_prob > 0.5).astype(int)

roc = roc_auc_score(y_test, y_pred_prob)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred_prob)
mse = mean_squared_error(y_test, y_pred_prob)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y_test - y_pred_prob) / y_test)) * 100

print(f"AUC/ROC: {roc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.2f}%")

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
AUC/ROC: 0.9936
Precision: 0.9500
Accuracy: 0.9767
MAE: 0.0367
MSE: 0.0203
RMSE: 0.1424
MAPE: inf%


## SVM 

In [64]:
base_model = LinearSVC(random_state=42)
svm_model = CalibratedClassifierCV(base_model, cv=5)
svm_model.fit(X_train, y_train)

y_pred_prob_svm = svm_model.predict_proba(X_test)[:, 1]
y_pred_svm = (y_pred_prob_svm > 0.5).astype(int)

roc_svm = roc_auc_score(y_test, y_pred_prob_svm)
precision_svm = precision_score(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
mae_svm = mean_absolute_error(y_test, y_pred_prob_svm)
mse_svm = mean_squared_error(y_test, y_pred_prob_svm)
rmse_svm = np.sqrt(mse_svm)
epsilon = 1e-10
mape_svm = np.mean(np.abs((y_test - y_pred_prob_svm) / (y_test + epsilon))) * 100

print("SVM Metrics")
metrics_svm = {
    "AUC/ROC": roc_svm,
    "Precision": precision_svm,
    "Accuracy": accuracy_svm,
    "MAE": mae_svm,
    "MSE": mse_svm,
    "RMSE": rmse_svm,
    "MAPE": mape_svm
}
for metric, value in metrics_svm.items():
    print(f"{metric}: {value:.2f}%" if metric == "MAPE" else f"{metric}: {value:.4f}")



SVM Metrics
AUC/ROC: 0.6333
Precision: 0.2000
Accuracy: 0.6748
MAE: 0.4250
MSE: 0.2120
RMSE: 0.4605
MAPE: 210213086228.88%




## CNN

In [66]:
cnn_model = Sequential([
    Embedding(input_dim=MAX_NUM_WORDS, 
              output_dim=embedding_dim, 
              weights=[embedding_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, 
              trainable=False),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)
y_pred_prob_cnn = cnn_model.predict(X_test).ravel()
y_pred_cnn = (y_pred_prob_cnn > 0.5).astype(int)

roc_cnn = roc_auc_score(y_test, y_pred_prob_cnn)
precision_cnn = precision_score(y_test, y_pred_cnn)
accuracy_cnn = accuracy_score(y_test, y_pred_cnn)
mae_cnn = mean_absolute_error(y_test, y_pred_prob_cnn)
mse_cnn = mean_squared_error(y_test, y_pred_prob_cnn)
rmse_cnn = np.sqrt(mse_cnn)
mape_cnn = np.mean(np.abs((y_test - y_pred_prob_cnn) / (y_test + epsilon))) * 100

print("CNN Metrics")
print("AUC/ROC: {:.4f}".format(roc_cnn))
print("Precision: {:.4f}".format(precision_cnn))
print("Accuracy: {:.4f}".format(accuracy_cnn))
print("MAE: {:.4f}".format(mae_cnn))
print("MSE: {:.4f}".format(mse_cnn))
print("RMSE: {:.4f}".format(rmse_cnn))
print("MAPE: {:.2f}%".format(mape_cnn))


Epoch 1/5




[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8027 - loss: 0.4258 - val_accuracy: 0.9389 - val_loss: 0.1577
Epoch 2/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9559 - loss: 0.1352 - val_accuracy: 0.9556 - val_loss: 0.1316
Epoch 3/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9823 - loss: 0.0624 - val_accuracy: 0.9639 - val_loss: 0.1001
Epoch 4/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9886 - loss: 0.0383 - val_accuracy: 0.9667 - val_loss: 0.1109
Epoch 5/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9970 - loss: 0.0145 - val_accuracy: 0.9681 - val_loss: 0.1168
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
CNN Metrics
AUC/ROC: 0.9938
Precision: 0.9615
Accuracy: 0.9700
MAE: 0.0352
MSE: 0.0232
RMSE: 0.1522
MAPE: 16832950862.10%


## Randomforest

In [68]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_prob_rf = rf_model.predict_proba(X_test)[:, 1]
y_pred_rf = (y_pred_prob_rf > 0.5).astype(int)

roc_rf = roc_auc_score(y_test, y_pred_prob_rf)
precision_rf = precision_score(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_prob_rf)
mse_rf = mean_squared_error(y_test, y_pred_prob_rf)
rmse_rf = np.sqrt(mse_rf)
epsilon = 1e-10  
mape_rf = np.mean(np.abs((y_test - y_pred_prob_rf) / (y_test + epsilon))) * 100

print("Random Forest Metrics ")
print("AUC/ROC: {:.4f}".format(roc_rf))
print("Precision: {:.4f}".format(precision_rf))
print("Accuracy: {:.4f}".format(accuracy_rf))
print("MAE: {:.4f}".format(mae_rf))
print("MSE: {:.4f}".format(mse_rf))
print("RMSE: {:.4f}".format(rmse_rf))
print("MAPE: {:.2f}%".format(mape_rf))


Random Forest Metrics 
AUC/ROC: 0.7226
Precision: 0.5892
Accuracy: 0.6948
MAE: 0.3932
MSE: 0.1923
RMSE: 0.4385
MAPE: 199760978340.63%


In [69]:
epsilon = 1e-10
metrics_summary = {
    "Naive Bayes": {
        "AUC/ROC": roc_nb,
        "Precision": precision_nb,
        "Accuracy": accuracy_nb,
        "MAE": mae_nb,
        "MSE": mse_nb,
        "RMSE": rmse_nb,
        "MAPE": mape_nb
    },
    "BiLSTM": {
        "AUC/ROC": roc,
        "Precision": precision,
        "Accuracy": accuracy,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "MAPE": mape
    },
    "SVM": metrics_svm,
     "CNN": {
        "AUC/ROC": roc_cnn,
        "Precision": precision_cnn,
        "Accuracy": accuracy_cnn,
        "MAE": mae_cnn,
        "MSE": mse_cnn,
        "RMSE": rmse_cnn,
        "MAPE": mape_cnn
    },
     "Random Forest": {
        "AUC/ROC": roc_rf,
        "Precision": precision_rf,
        "Accuracy": accuracy_rf,
        "MAE": mae_rf,
        "MSE": mse_rf,
        "RMSE": rmse_rf,
        "MAPE": mape_rf
    },
}
summary_df = pd.DataFrame(metrics_summary).T 

summary_df["MAPE"] = summary_df["MAPE"].apply(lambda x: f"{x:.2f}%")
summary_df = summary_df.round(4)

print("Final Model Evaluation Summary:\n")
print(summary_df)

Final Model Evaluation Summary:

               AUC/ROC  Precision  Accuracy     MAE     MSE    RMSE  \
Naive Bayes     0.9720     0.8733    0.9255  0.1808  0.0678  0.2605   
BiLSTM          0.9936     0.9500    0.9767  0.0367  0.0203  0.1424   
SVM             0.6333     0.2000    0.6748  0.4250  0.2120  0.4605   
CNN             0.9938     0.9615    0.9700  0.0352  0.0232  0.1522   
Random Forest   0.7226     0.5892    0.6948  0.3932  0.1923  0.4385   

                           MAPE  
Naive Bayes    115501987223.76%  
BiLSTM                     inf%  
SVM            210213086228.88%  
CNN             16832950862.10%  
Random Forest  199760978340.63%  


In [70]:
import pickle

with open('naive_bayes_model.pkl', 'wb') as f:
    pickle.dump(nb_model, f)

with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

with open('cnn_model.pkl', 'wb') as f:
    pickle.dump(cnn_model, f)

with open('bilstm_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)


In [71]:
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
 

In [72]:
with open("vectorizer.pkl", "wb") as f:
 pickle.dump(vectorizer, f)


In [73]:
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
