In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau


from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [2]:
# Ensure reproducibility
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
# Seeding only NumPy and TensorFlow leaves Python's `random` module unseeded,
# so anything that draws from it would still differ between runs.
tf.keras.utils.set_random_seed(42)

In [3]:
# Load pre-trained embeddings
def load_glove_embeddings(glove_file, embedding_dim=100):
    """Read a GloVe-style text file into a ``{token: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_file: Path of the UTF-8 encoded embeddings file.
        embedding_dim: Expected vector length. Only used as a fallback when
            the token itself contains whitespace, to decide where the token
            ends and the vector begins.

    Returns:
        dict mapping each token (str) to a ``float32`` NumPy array.

    Raises:
        ValueError: If a line cannot be parsed as a token followed by floats,
            even after applying the ``embedding_dim`` fallback.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank / whitespace-only line (e.g. trailing newline at EOF).
                continue
            try:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # The token contains whitespace (e.g. ". . ."), so part of it
                # ended up in the numeric fields. Take the vector from the
                # right-hand side and treat everything before it as the token.
                if len(values) <= embedding_dim:
                    raise
                word = ' '.join(values[:-embedding_dim])
                coefs = np.asarray(values[-embedding_dim:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, embeddings_index, embedding_dim=100):
    """Build a dense lookup table of embedding vectors indexed by token id.

    Args:
        word_index: Mapping from token to integer id.
        embeddings_index: Mapping from token to its embedding vector.
        embedding_dim: Number of columns in the resulting matrix.

    Returns:
        NumPy array of shape ``(len(word_index) + 1, embedding_dim)``. Rows
        for tokens missing from ``embeddings_index`` (and row 0, unless some
        token maps to id 0) are left as zeros.
    """
    row_count = len(word_index) + 1
    matrix = np.zeros((row_count, embedding_dim))
    for token, row in word_index.items():
        # Ids that do not fit in the matrix are ignored.
        if row >= row_count:
            continue
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# Preprocess data
def preprocess_data(texts, labels, max_len=50, num_words=10000):
    """Tokenize ``texts`` and pad the resulting id sequences to ``max_len``.

    NOTE(review): a later cell defines ``preprocess_data(df, ...)`` with a
    different signature; once that cell has run, this definition is shadowed.

    Args:
        texts: Texts to fit the tokenizer on and convert to id sequences.
        labels: Returned unchanged alongside the padded sequences.
        max_len: Length passed to ``pad_sequences`` as ``maxlen``.
        num_words: Passed to the Keras ``Tokenizer`` as ``num_words``.

    Returns:
        Tuple ``(X, labels, tokenizer)`` where ``X`` is the padded sequence
        array and ``tokenizer`` is the fitted tokenizer.
    """
    fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
    fitted_tokenizer.fit_on_texts(texts)
    padded = pad_sequences(fitted_tokenizer.texts_to_sequences(texts), maxlen=max_len)
    return padded, labels, fitted_tokenizer

# Encode labels
def encode_labels(labels):
    """Convert raw class labels into a one-hot matrix.

    Args:
        labels: Class labels accepted by ``LabelEncoder.fit_transform``.

    Returns:
        Tuple ``(y_encoded, label_encoder)``: the one-hot encoded labels and
        the fitted ``LabelEncoder`` used to produce the integer class ids.
    """
    encoder = LabelEncoder()
    class_ids = encoder.fit_transform(labels)
    # One-hot encode the integer ids for use with categorical cross-entropy.
    one_hot = tf.keras.utils.to_categorical(class_ids)
    return one_hot, encoder

In [4]:
# Split data into train, validation, and test sets
def split_data(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """Split ``X``/``y`` into stratified train, validation and test subsets.

    The data is first split into a training part and a hold-out part of size
    ``test_size + val_size``; the hold-out part is then divided so that the
    test set is ``test_size`` and the validation set ``val_size`` of the
    original data. Both splits are stratified on the labels so the class
    proportions stay comparable across all three subsets.

    Args:
        X: Feature array.
        y: Labels, also used as the stratification target.
        test_size: Fraction of the full data reserved for testing.
        val_size: Fraction of the full data reserved for validation.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: Raised by ``train_test_split`` when a class has too few
            samples to be stratified in either split.
    """
    holdout_size = test_size + val_size
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=holdout_size,
        random_state=random_state,
        stratify=y,
    )
    # Stratify the hold-out split as well; previously only the first split
    # preserved class proportions.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=(test_size / holdout_size),
        random_state=random_state,
        stratify=y_temp,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Apply SMOTE
def apply_smote(X_train, y_train):
    """Oversample the training data with SMOTE (fixed ``random_state=42``).

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Tuple ``(X_res, y_res)`` with the resampled features and labels.
    """
    oversampler = SMOTE(random_state=42)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)
    return X_balanced, y_balanced

In [5]:
# Model building functions
def build_lstm_model(vocab_size, embedding_dim=100, num_classes=5, max_sequence_length=100):
    """Build and compile the baseline LSTM classifier.

    Architecture: trainable Embedding -> LSTM(64) -> Dropout(0.5) ->
    softmax Dense layer with ``num_classes`` units.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        num_classes: Number of output classes.
        max_sequence_length: Length of each input sequence.

    Returns:
        Compiled ``Sequential`` model (Adam, learning rate 0.001,
        categorical cross-entropy, accuracy metric).
    """
    layer_stack = [
        Input(shape=(max_sequence_length,)),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        LSTM(64),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    classifier = Sequential(layer_stack)
    classifier.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

def build_lstm_with_glove_model(vocab_size, embedding_matrix, embedding_dim=100, num_classes=5, max_sequence_length=100):
    """Build and compile an LSTM classifier on top of frozen pre-trained embeddings.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    LSTM(64) -> Dropout(0.5) -> softmax Dense layer with ``num_classes`` units.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)`` used
            as the (non-trainable) embedding weights.
        embedding_dim: ``output_dim`` of the embedding layer.
        num_classes: Number of output classes.
        max_sequence_length: Length of each input sequence.

    Returns:
        Compiled ``Sequential`` model (Adam, learning rate 0.0001,
        categorical cross-entropy, accuracy metric).

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a clear message instead of a weight-shape error (or
    # silent broadcasting) inside Keras.
    matrix_shape = tuple(np.shape(embedding_matrix))
    if matrix_shape != (vocab_size, embedding_dim):
        raise ValueError(
            f"embedding_matrix has shape {matrix_shape}, "
            f"expected {(vocab_size, embedding_dim)}"
        )

    model = Sequential()
    model.add(Input(shape=(max_sequence_length,)))
    # Use a Constant initializer rather than the legacy `weights=[...]`
    # constructor argument, which is not accepted by every Keras version.
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_bidirectional_lstm_with_glove_model(vocab_size, embedding_matrix, embedding_dim=100, num_classes=5, max_sequence_length=100):
    """Build and compile a bidirectional LSTM classifier on frozen pre-trained embeddings.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    Bidirectional(LSTM(64)) -> Dropout(0.5) -> softmax Dense layer with
    ``num_classes`` units.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)`` used
            as the (non-trainable) embedding weights.
        embedding_dim: ``output_dim`` of the embedding layer.
        num_classes: Number of output classes.
        max_sequence_length: Length of each input sequence.

    Returns:
        Compiled ``Sequential`` model (Adam, learning rate 0.001,
        categorical cross-entropy, accuracy metric).

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a clear message instead of a weight-shape error (or
    # silent broadcasting) inside Keras.
    matrix_shape = tuple(np.shape(embedding_matrix))
    if matrix_shape != (vocab_size, embedding_dim):
        raise ValueError(
            f"embedding_matrix has shape {matrix_shape}, "
            f"expected {(vocab_size, embedding_dim)}"
        )

    model = Sequential()
    model.add(Input(shape=(max_sequence_length,)))
    # Use a Constant initializer rather than the legacy `weights=[...]`
    # constructor argument, which is not accepted by every Keras version.
    model.add(Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def preprocess_data(df, max_sequence_length=100):
    """Turn the cleaned DataFrame into padded token sequences and one-hot labels.

    Reads the ``text_cleaned`` and ``cyberbullying_type`` columns of ``df``.

    Args:
        df: DataFrame containing ``text_cleaned`` and ``cyberbullying_type``.
        max_sequence_length: Length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Tuple ``(X, y, label_encoder, tokenizer)``: padded id sequences,
        one-hot encoded labels, the fitted ``LabelEncoder`` and the fitted
        Keras ``Tokenizer``.
    """
    # Texts that were emptied by cleaning come back from CSV as NaN (a float),
    # which the tokenizer cannot process; treat them as empty strings so the
    # rows stay aligned with their labels.
    X = df['text_cleaned'].fillna('').astype(str).values
    y = df['cyberbullying_type'].values
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    y = to_categorical(y)  # Convert labels to one-hot encoding

    # Tokenization and Padding
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=max_sequence_length)

    return X, y, label_encoder, tokenizer

def train_and_evaluate_model(X, y, model_builder, save_path=None, test_size=0.2, val_size=0.2, oversample=False):
    """Train a model with early stopping and report its test-set performance.

    The data is split (stratified on the class index) into test and training
    parts, and the training part is split again into train and validation
    sets. The model is trained for up to 10 epochs with early stopping on
    ``val_loss`` and then evaluated on the test set; the test scores and a
    classification report are printed.

    Args:
        X: Feature array.
        y: One-hot encoded labels (the class index is taken with ``argmax``).
        model_builder: Zero-argument callable returning a compiled model.
        save_path: Optional path; when given, the trained model is saved
            there (parent directories are created if needed).
        test_size: Fraction of the full data held out for testing.
        val_size: Fraction of the remaining training data used for validation.
        oversample: If True, apply SMOTE to the training split only.

    Returns:
        Tuple ``(model, history)``: the trained model and the object returned
        by ``model.fit``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=np.argmax(y, axis=1))
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=42, stratify=np.argmax(y_train, axis=1))

    if oversample:
        # NOTE(review): SMOTE interpolates between feature vectors; X is
        # presumably padded token ids here, so synthetic rows may not map to
        # real token sequences - confirm this is intended.
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)

    model = model_builder()

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=2
    )

    # return_dict gives the real metric names; `model.metrics_names` can
    # report the placeholder 'compile_metrics' instead of 'accuracy'.
    scores = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    metric_name, metric_value = [(name, value) for name, value in scores.items() if name != 'loss'][0]
    print(f"Test Score: loss of {scores['loss']}; {metric_name} of {metric_value*100}%")

    y_pred = model.predict(X_test).argmax(axis=1)
    y_true = y_test.argmax(axis=1)
    print('\nClassification Report:\n', classification_report(y_true, y_pred))

    if save_path is not None:
        # Local import keeps this block self-contained.
        from pathlib import Path
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        model.save(save_path)

    return model, history

# Example usage with placeholders for data and GloVe embeddings
if __name__ == "__main__":
    # Load your data
    df = pd.read_csv('D:/Cyberbullying classification/data/cleaned_data.csv')  # Update with your data path
    X, y, label_encoder, tokenizer = preprocess_data(df)

    # NOTE: this is a random placeholder, not GloVe. Until it is replaced by a
    # matrix built with load_glove_embeddings() + create_embedding_matrix(),
    # the "with GloVe" experiments below run on frozen random embeddings.
    embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 100)  # Replace with actual GloVe embedding matrix
    vocab_size = len(embedding_matrix)

    # Derive model dimensions from the data instead of relying on the
    # builders' hard-coded defaults (5 classes, sequence length 100).
    num_classes = y.shape[1]
    max_sequence_length = X.shape[1]
    embedding_dim = embedding_matrix.shape[1]

    experiments = [
        {
            "name": "LSTM Model",
            "builder": lambda: build_lstm_model(
                vocab_size=vocab_size,
                embedding_dim=embedding_dim,
                num_classes=num_classes,
                max_sequence_length=max_sequence_length,
            ),
        },
        {
            "name": "LSTM with GloVe",
            "builder": lambda: build_lstm_with_glove_model(
                vocab_size=vocab_size,
                embedding_matrix=embedding_matrix,
                embedding_dim=embedding_dim,
                num_classes=num_classes,
                max_sequence_length=max_sequence_length,
            ),
        },
        {
            "name": "Bidirectional LSTM with GloVe",
            "builder": lambda: build_bidirectional_lstm_with_glove_model(
                vocab_size=vocab_size,
                embedding_matrix=embedding_matrix,
                embedding_dim=embedding_dim,
                num_classes=num_classes,
                max_sequence_length=max_sequence_length,
            ),
        },
    ]

    for experiment in experiments:
        print(f"\nRunning experiment: {experiment['name']}")
        save_path = f"models/{experiment['name'].replace(' ', '_')}.keras"
        # Pass save_path through; it was previously computed but never used.
        train_and_evaluate_model(X, y, experiment["builder"], save_path=save_path, oversample=False)



Running experiment: LSTM Model
Epoch 1/10
745/745 - 32s - 43ms/step - accuracy: 0.8720 - loss: 0.3748 - val_accuracy: 0.9359 - val_loss: 0.1921
Epoch 2/10
745/745 - 30s - 41ms/step - accuracy: 0.9560 - loss: 0.1371 - val_accuracy: 0.9347 - val_loss: 0.2079
Epoch 3/10
745/745 - 31s - 42ms/step - accuracy: 0.9752 - loss: 0.0797 - val_accuracy: 0.9260 - val_loss: 0.2636
Epoch 4/10
745/745 - 32s - 42ms/step - accuracy: 0.9846 - loss: 0.0495 - val_accuracy: 0.9261 - val_loss: 0.2963
Test Score: loss of 0.20902451872825623; compile_metrics of 92.6826000213623%
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1582
           1       0.98      0.99      0.98      1558
           2       0.96      0.83      0.89      1489
           3       0.75      0.89      0.82      1237
           4       0.96      0.94      0.95      1582

    accuracy

## Observations:

The results report the following performance metrics for each of the three models:

**Baseline LSTM:**

-> Accuracy: 82.39%

The model performed reasonably well, with strong precision and recall for classes like age and ethnicity, but lower performance on classes like not_cyberbullying and other_cyberbullying.

**LSTM with Pre-trained Embedding:**

-> Accuracy: 82.61%

The addition of pre-trained embeddings slightly improved the model’s accuracy. The model showed improvement in the classification of other_cyberbullying and not_cyberbullying.

**Bidirectional LSTM with Pre-trained Embedding:**

-> Accuracy: 83.22%

The Bidirectional LSTM with pre-trained embeddings showed the highest accuracy. It improved recall for gender and other_cyberbullying categories, indicating better performance in capturing context from both directions in the sequence.

**Baseline LSTM Model:**

Accuracy: 82.31%
The model's performance appears to be decent, with high precision and recall for the categories "age," "ethnicity," and "religion."
However, the model struggles with "not_cyberbullying" and "other_cyberbullying" categories, both showing relatively lower precision and recall, suggesting a need for improvement in distinguishing these categories.

**LSTM with Pre-trained Embedding:**

Accuracy: 83.28%
Incorporating pre-trained embeddings slightly improved the model's performance. The overall accuracy and the F1-scores for some categories (like "other_cyberbullying") improved.
However, there is still some room for improvement in handling the "not_cyberbullying" and "other_cyberbullying" categories.

**Bidirectional LSTM with Pre-trained Embedding:**

Accuracy: 83.84%
The Bidirectional LSTM model with pre-trained embeddings performed the best among the three models.
This model showed an increase in F1-scores for the "gender," "other_cyberbullying," and "not_cyberbullying" categories, which were previously problematic.
This indicates that the bidirectional nature of the LSTM helps in better capturing context from both directions in the text, leading to better classification.

The generic nature of the "other_cyberbullying" and "not_cyberbullying" categories could contribute to lower performance. These classes might include a wide range of different text patterns, making it harder for the model to learn consistent features.