In [13]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, Dropout
from keras.callbacks import EarlyStopping

from keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
from keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# Ensure reproducibility.
# set_random_seed seeds Python's `random`, NumPy and the TensorFlow backend in
# a single call; seeding only NumPy and TensorFlow would leave Python's
# `random` module unseeded.
tf.keras.utils.set_random_seed(42)

In [15]:
# Load the dataset
def load_data(file_path):
    """Loads tweet texts and their labels from a CSV file.

    Args:
        file_path (str): The path to the CSV file. The file must contain the
            columns ``text_cleaned`` and ``cyberbullying_type``.

    Returns:
        X (np.ndarray): Values of the ``text_cleaned`` column, in file order.
            Missing texts are returned as empty strings.
        y (np.ndarray): Values of the ``cyberbullying_type`` column, in file
            order.

    Raises:
        KeyError: If either required column is missing from the file.
    """
    text_column = 'text_cleaned'
    label_column = 'cyberbullying_type'

    df = pd.read_csv(file_path)

    # Fail early with a message that names the file and the missing columns.
    missing = [col for col in (text_column, label_column) if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # An empty text field is read back from CSV as NaN (a float); replace it
    # with '' so every element of X is a string and the row count is unchanged.
    X = df[text_column].fillna('').to_numpy()
    y = df[label_column].to_numpy()

    return X, y

# Preprocess data (convert labels to numerical format)
def preprocess_labels(y):
    """Maps each target label to an integer id.

    Args:
        y (np.array or pd.Series): Raw target labels.

    Returns:
        np.array: Integer-encoded labels produced by a freshly fitted
        ``LabelEncoder``. The encoder itself is not returned.
    """
    return LabelEncoder().fit_transform(y)


file_path = '../data/cleaned_data.csv'

# Read the texts and labels, then integer-encode the labels
X, y = load_data(file_path)
y = preprocess_labels(y)

# Quick sanity check of what was loaded
print(f"Loaded {len(X)} samples.")
print(f"Example text: {X[0]}")
print(f"Example label: {y[0]}")

# Fit the tokenizer on the texts and turn each text into a sequence of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)

# Pad every sequence to the length of the longest one
max_sequence_length = max(map(len, X_tokenized))
X_padded = pad_sequences(X_tokenized, maxlen=max_sequence_length)

# Report the shape of the model input
print(f"Shape of padded input data: {X_padded.shape}")

# From here on, X refers to the padded sequences used for model training
X = X_padded


Loaded 37237 samples.
Example text: word katandandre food crapilicious
Example label: 3
Shape of padded input data: (37237, 43)


In [16]:
def build_rnn_model(vocab_size, embedding_dim=128, rnn_units=128, dropout_rate=0.4, output_classes=6, activation='softmax', stacked_rnn_layers=False, learning_rate=0.0003):
    """Builds and compiles a SimpleRNN text classifier.

    The network is Embedding -> SimpleRNN (one or two layers) -> Dropout ->
    Dense. It is compiled with Adam and sparse categorical cross-entropy, so
    targets are expected to be integer class ids rather than one-hot vectors.

    Args:
        vocab_size (int): Size of the vocabulary (input dimension of the embedding).
        embedding_dim (int): Dimensionality of embedding vectors.
        rnn_units (int): Number of units in each SimpleRNN layer.
        dropout_rate (float): Dropout rate applied after the recurrent layer(s).
        output_classes (int): Number of output classes.
        activation (str): Activation function for the output layer.
        stacked_rnn_layers (bool): If True, two SimpleRNN layers are stacked;
            otherwise a single SimpleRNN layer is used.
        learning_rate (float): Learning rate of the Adam optimizer.

    Returns:
        model (keras.Model): Compiled RNN model.
    """
    model = Sequential()
    # NOTE: mask_zero keeps its default here, so padded positions are not masked.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

    if stacked_rnn_layers:
        # The lower layer must emit the full sequence so that the layer on top
        # of it still receives one input per timestep.
        model.add(SimpleRNN(rnn_units, return_sequences=True))
    model.add(SimpleRNN(rnn_units, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(output_classes, activation=activation))

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model



In [17]:
# Create the 'models' directory if it doesn't exist.
# exist_ok=True makes the cell safe to re-run and avoids the separate
# check-then-create step.
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

In [18]:
def train_and_evaluate_model(X, y, model_builder, save_path=None, test_size=0.2, val_size=0.2, oversample=False, epochs=10, batch_size=32, patience=3):
    """Trains and evaluates a model on a stratified train-validation-test split and optionally saves it.

    Args:
        X (np.array): Padded input data.
        y (np.array): Encoded target labels.
        model_builder (function): Zero-argument callable that returns a compiled model.
        save_path (str): File path to save the trained model to. If None, the model is not saved.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the training data to include in the validation split.
        oversample (bool): Whether to apply SMOTE to the training split.
        epochs (int): Maximum number of training epochs.
        batch_size (int): Mini-batch size used during training.
        patience (int): Number of epochs without ``val_loss`` improvement before training stops early.

    Returns:
        None
    """
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    # Further split the training data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=42, stratify=y_train)

    # Optionally oversample the training data only; validation and test stay untouched
    if oversample:
        # NOTE(review): SMOTE creates synthetic rows by interpolating between
        # neighbouring samples. When X holds token-id sequences, the
        # interpolated rows are unlikely to correspond to real text; random
        # oversampling or class weights may be a better fit. Confirm before
        # relying on this experiment's results.
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)

    # Build and compile the model
    model = model_builder()

    # Early stopping to prevent overfitting; the best weights seen are restored
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    # Train the model
    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=2
    )

    # Evaluate on the test set. return_dict=True keys each value by its metric
    # name; model.metrics_names can report a generic 'compile_metrics' entry
    # instead of 'accuracy'.
    scores = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    formatted_scores = [
        f'{name} of {value * 100}%' if name.endswith('accuracy') else f'{name} of {value}'
        for name, value in scores.items()
    ]
    print('Test Score: ' + '; '.join(formatted_scores))

    # Predicted class = index of the highest output score; verbose=0 keeps the
    # progress bar out of the notebook output.
    y_pred = model.predict(X_test, verbose=0).argmax(axis=1)
    # Concatenate rather than pass two print arguments: the separator space
    # would shift the report's header line by one column.
    print('\nClassification Report:\n' + classification_report(y_test, y_pred))

    # Save the model if save_path is provided
    if save_path:
        save_dir = os.path.dirname(save_path)
        if save_dir:
            # Make sure the target directory exists before saving
            os.makedirs(save_dir, exist_ok=True)
        model.save(save_path)
        print(f'Model saved to {save_path}')


# Vocabulary size shared by all experiments: one entry per word known to the
# tokenizer, plus one because the tokenizer never assigns index 0 to a word.
vocab_size = len(tokenizer.word_index) + 1

# Configurations for experiments
experiments = [
    {"name": "Baseline RNN", "builder": lambda: build_rnn_model(vocab_size=vocab_size)},
    {"name": "RNN with SMOTE", "builder": lambda: build_rnn_model(vocab_size=vocab_size), "oversample": True},
    {"name": "RNN with stacked RNN Layer", "builder": lambda: build_rnn_model(vocab_size=vocab_size, stacked_rnn_layers=True)}
]

# Execute experiments with model saving
for experiment in experiments:
    print(f"\nRunning experiment: {experiment['name']}")
    # Save under models_dir (created earlier) instead of repeating the literal path
    save_path = os.path.join(models_dir, f"{experiment['name'].replace(' ', '_')}.keras")
    train_and_evaluate_model(X, y, experiment["builder"], save_path=save_path, oversample=experiment.get("oversample", False))



Running experiment: Baseline RNN
Epoch 1/10
745/745 - 39s - 52ms/step - accuracy: 0.7677 - loss: 0.6293 - val_accuracy: 0.9164 - val_loss: 0.2325
Epoch 2/10
745/745 - 30s - 41ms/step - accuracy: 0.9355 - loss: 0.1934 - val_accuracy: 0.9317 - val_loss: 0.2028
Epoch 3/10
745/745 - 29s - 39ms/step - accuracy: 0.9629 - loss: 0.1146 - val_accuracy: 0.9278 - val_loss: 0.2230
Epoch 4/10
745/745 - 28s - 38ms/step - accuracy: 0.9772 - loss: 0.0765 - val_accuracy: 0.9228 - val_loss: 0.2368
Epoch 5/10
745/745 - 31s - 41ms/step - accuracy: 0.9818 - loss: 0.0609 - val_accuracy: 0.9258 - val_loss: 0.2482
Test Score: loss of 0.2237604707479477; compile_metrics of 92.18581914901733%
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1582
           1       0.98      0.98      0.98      1558
           2       0.90      0.87      0.89      1489
      

## Observations
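* **SMOTE impact:** SMOTE improved the balance between classes but did not significantly improve overall accuracy, and it appears to have slightly worsened performance on some classes. Note that SMOTE was applied to the padded token-ID sequences, so the synthetic samples are interpolated ID vectors rather than real tweets, which may explain the limited benefit.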

* **Stacked RNN layers:** Stacking a second SimpleRNN layer improved accuracy slightly over the baseline but did not drastically change the results. It did, however, improve performance on class 4.
