In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# Load GloVe embeddings
def load_glove_embeddings(glove_path, word_index, embedding_dim=100):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).
        word_index: Mapping from word to its integer row index.
        embedding_dim: Number of components expected per vector.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Rows of words that
        do not appear in the file stay all-zero.

    Raises:
        ValueError: If an in-vocabulary word has a vector whose length is
            not ``embedding_dim`` or whose components are not numeric.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if len(values) < 2:
                # Blank or token-only line: nothing to load.
                continue
            word = values[0]
            if word not in word_index:
                # Skip before parsing floats; most file entries are
                # out-of-vocabulary and never reach the matrix.
                continue
            vector = values[1:]
            if len(vector) != embedding_dim:
                # Fail loudly: NumPy would silently broadcast a length-1
                # vector across the whole row.
                raise ValueError(
                    f'Line {line_number} of {glove_path!r} has '
                    f'{len(vector)} values for word {word!r}, '
                    f'expected {embedding_dim}'
                )
            embedding_matrix[word_index[word]] = np.asarray(vector, dtype='float32')
    return embedding_matrix

# Build Bi-LSTM with GloVe model
def build_bidirectional_lstm_with_glove_model(vocab_size, embedding_matrix, embedding_dim=100, num_classes=5, max_sequence_length=100):
    """Assemble and compile the Bi-LSTM classifier on top of fixed embeddings.

    Args:
        vocab_size: Number of rows in the embedding table (``input_dim``).
        embedding_matrix: Pre-computed weights loaded into the embedding
            layer; the layer is created with ``trainable=False``.
        embedding_dim: Width of each embedding vector (``output_dim``).
        num_classes: Number of units in the softmax output layer.
        max_sequence_length: Length of each input sequence.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    frozen_embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )
    layer_stack = [
        Input(shape=(max_sequence_length,)),
        frozen_embedding,
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    classifier = Sequential(layer_stack)
    classifier.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Preprocess data
def preprocess_data(df, max_sequence_length=100):
    """Turn the raw dataframe into padded token sequences and one-hot labels.

    Args:
        df: DataFrame with a ``text_cleaned`` column (input text) and a
            ``cyberbullying_type`` column (class label).
        max_sequence_length: Length every token sequence is padded or
            truncated to.

    Returns:
        A tuple ``(X, y, label_encoder, tokenizer)``: the padded integer
        sequences, the one-hot encoded labels, the fitted ``LabelEncoder``
        and the fitted ``Tokenizer``.
    """
    # Texts that were emptied by cleaning come back from CSV as NaN (a
    # float), which the tokenizer cannot process; treat them as empty strings.
    X = df['text_cleaned'].fillna('').astype(str).values
    y = df['cyberbullying_type'].values
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    y = to_categorical(y)  # Convert labels to one-hot encoding

    # Tokenization and Padding
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=max_sequence_length)

    return X, y, label_encoder, tokenizer

# Train and evaluate model
def train_and_evaluate_model(X, y, model_builder, save_path=None, test_size=0.2, val_size=0.2, oversample=False, class_names=None):
    """Split the data, train a model, report test metrics and optionally save.

    The data is split into train/test and then train/validation, both
    stratified on the class index and seeded with ``random_state=42``.
    Training runs for up to 10 epochs with early stopping on ``val_loss``.

    Args:
        X: Feature array.
        y: One-hot encoded label array.
        model_builder: Zero-argument callable returning a compiled model.
        save_path: If truthy, where the trained model is saved.
        test_size: Fraction of the data held out for testing.
        val_size: Fraction of the remaining training data used for validation.
        oversample: Accepted for compatibility; it is not used by this
            function.
        class_names: Optional class labels for the classification report.
            When omitted, the classes of a module-level ``label_encoder``
            are used if one exists; otherwise the report shows class indices.

    Returns:
        None. Results are printed.
    """
    import os  # local import: only needed to prepare the save directory

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=np.argmax(y, axis=1))
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=42, stratify=np.argmax(y_train, axis=1))

    model = model_builder()

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=2
    )

    # return_dict pairs every value with its own name, so the printed labels
    # do not depend on how the installed Keras populates metrics_names.
    results = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    (loss_name, loss_value), (metric_name, metric_value) = list(results.items())[:2]
    print(f'Test Score: {loss_name} of {loss_value}; {metric_name} of {metric_value*100}%')

    y_pred = model.predict(X_test).argmax(axis=1)
    y_true = y_test.argmax(axis=1)

    if class_names is None:
        # Backward compatibility: earlier callers relied on a module-level
        # label_encoder instead of passing the names in.
        encoder = globals().get('label_encoder')
        if encoder is not None:
            class_names = encoder.classes_

    # Display classification report with class names
    print('\nClassification Report:\n', classification_report(y_true, y_pred, target_names=class_names))

    if save_path:
        # Create the target directory first so a finished training run is
        # not lost to a missing folder.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        model.save(save_path)
        print(f'Model saved to {save_path}')

# Example usage
if __name__ == "__main__":
    # Load your data
    df = pd.read_csv('D:/Cyberbullying classification/data/cleaned_data.csv')
    X, y, label_encoder, tokenizer = preprocess_data(df)

    # Load GloVe embeddings
    # Raw string: the backslashes in this Windows path are literal, not
    # escape sequences (\C, \d and \g are invalid escapes in a normal string).
    glove_path = r'D:\Cyberbullying classification\data\glove.6B.100d.txt'
    embedding_matrix = load_glove_embeddings(glove_path, tokenizer.word_index, embedding_dim=100)
    # One row per tokenizer index plus the reserved row 0.
    vocab_size = len(embedding_matrix)

    print("\nRunning Bi-LSTM with GloVe")
    save_path = "models/Bidirectional_LSTM_with_GloVe.keras"
    train_and_evaluate_model(X, y, lambda: build_bidirectional_lstm_with_glove_model(vocab_size=vocab_size, embedding_matrix=embedding_matrix), save_path=save_path, oversample=False)



Running Bi-LSTM with GloVe
Epoch 1/10
373/373 - 14s - 37ms/step - accuracy: 0.8299 - loss: 0.4978 - val_accuracy: 0.8974 - val_loss: 0.2906
Epoch 2/10
373/373 - 11s - 29ms/step - accuracy: 0.9017 - loss: 0.2865 - val_accuracy: 0.9052 - val_loss: 0.2556
Epoch 3/10
373/373 - 11s - 30ms/step - accuracy: 0.9136 - loss: 0.2512 - val_accuracy: 0.9201 - val_loss: 0.2244
Epoch 4/10
373/373 - 11s - 29ms/step - accuracy: 0.9206 - loss: 0.2303 - val_accuracy: 0.9233 - val_loss: 0.2163
Epoch 5/10
373/373 - 11s - 29ms/step - accuracy: 0.9238 - loss: 0.2183 - val_accuracy: 0.9213 - val_loss: 0.2175
Epoch 6/10
373/373 - 11s - 29ms/step - accuracy: 0.9276 - loss: 0.2046 - val_accuracy: 0.9198 - val_loss: 0.2148
Epoch 7/10
373/373 - 11s - 30ms/step - accuracy: 0.9324 - loss: 0.1900 - val_accuracy: 0.9241 - val_loss: 0.2054
Epoch 8/10
373/373 - 11s - 30ms/step - accuracy: 0.9347 - loss: 0.1840 - val_accuracy: 0.9236 - val_loss: 0.2151
Epoch 9/10
373/373 - 12s - 31ms/step - accuracy: 0.9404 - loss: 0.17

# Observation

* Accuracy: 92.52%
* F1-Score: Varies across categories but remains strong, particularly for the "age," "ethnicity," and "religion" classes.


The classification report shows good precision and recall across most categories. The "not_cyberbullying" class has a slightly lower F1-score, which is likely due to the generic nature of those tweets.