# Deep Learning Models

This document contain The deep learning models that were created and tested for this project.

### Imports

In [2]:
import warnings
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Convolutional Neural Network

This contained the up-sampleing work done for this model along with the creation of the model.

In [None]:
from sklearn.utils import resample

# Load the dataset
df = pd.read_csv('labeled_comments.csv')
minority_class = df[df['Label'] == 'Cyberbullying']
majority_class = df[df['Label'] == 'Not Cyberbullying']
# Upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine the upsampled minority class with the majority class
balanced_data = pd.concat([majority_class, minority_upsampled])
# Ensure comments are strings and handle NaN values
balanced_data['Comment'] = balanced_data['Comment'].astype(str).fillna('')
# Tokenization and Preprocessing
max_words = 10000  # The number of words to consider as features
max_len = 100      # The maximum length of each sequence (for padding)

# Using Keras Tokenizer to vectorize the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(balanced_data['Comment'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(balanced_data['Comment'])

# Pad the sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_len)

# Encode the labels (if they are not binary, you can adjust this for multiclass classification)
balanced_data['Label'] = balanced_data['Label'].astype(str)  # Ensuring labels are string-type if needed
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(balanced_data['Label'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN Model Architecture
model = Sequential()

# Embedding layer: Converts words to dense vectors of fixed size
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))

# Convolutional layer with a kernel size of 5 and 64 filters
model.add(Conv1D(64, 5, activation='relu'))

# Pooling layer: Reduces the dimensionality
model.add(GlobalMaxPooling1D())

# Dense fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout to reduce overfitting
model.add(Dropout(0.5))

# Output layer: Binary classification (you can adjust for more classes)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on validation data
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Make predictions on the validation set
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(confusion_matrix(y_val, y_pred))
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')



Epoch 1/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.7130 - loss: 0.5349 - val_accuracy: 0.9048 - val_loss: 0.2370
Epoch 2/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.9444 - loss: 0.1623 - val_accuracy: 0.9408 - val_loss: 0.1686
Epoch 3/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 21ms/step - accuracy: 0.9801 - loss: 0.0654 - val_accuracy: 0.9442 - val_loss: 0.2118
Epoch 4/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 25ms/step - accuracy: 0.9917 - loss: 0.0270 - val_accuracy: 0.9441 - val_loss: 0.2308
Epoch 5/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 24ms/step - accuracy: 0.9952 - loss: 0.0162 - val_accuracy: 0.9459 - val_loss: 0.2730
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9481 - loss: 0.2642
Validation Accuracy: 0.9458813667297363
[1m223/223[0m [32m━━━━━━━━━━━━━━━



Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      3520
           1       0.97      0.92      0.95      3594

    accuracy                           0.95      7114
   macro avg       0.95      0.95      0.95      7114
weighted avg       0.95      0.95      0.95      7114



In [None]:
# Save the model
model.save('cnn_text_classification.h5')

### Bi-directional Long-Short Term Memory

This contained the up-sampleing work done for this model along with the creation of the model.

In [4]:
from sklearn.utils import resample
# Load the dataset
df = pd.read_csv('labeled_comments.csv')
minority_class = df[df['Label'] == 'Cyberbullying']
majority_class = df[df['Label'] == 'Not Cyberbullying']
# Upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine the upsampled minority class with the majority class
balanced_data = pd.concat([majority_class, minority_upsampled])
# Ensure comments are strings and handle NaN values
balanced_data['Comment'] = balanced_data['Comment'].astype(str).fillna('')
# Tokenization and Preprocessing
max_words = 10000  # The number of words to consider as features
max_len = 100      # The maximum length of each sequence (for padding)

# Using Keras Tokenizer to vectorize the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(balanced_data['Comment'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(balanced_data['Comment'])

# Pad the sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_len)

# Encode the labels (if they are not binary, you can adjust this for multiclass classification)
balanced_data['Label'] = balanced_data['Label'].astype(str)  # Ensuring labels are string-type if needed
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(balanced_data['Label'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# BiLSTM Model Architecture
model = Sequential()

# Embedding layer: Converts words to dense vectors of fixed size
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))

# layer with a kernel 64 filters
model.add(Bidirectional(LSTM(64, return_sequences=True)))

model.add(Bidirectional(LSTM(32)))

# Dense fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout to reduce overfitting
model.add(Dropout(0.5))

# Output layer: Binary classification (you can adjust for more classes)
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on validation data
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Make predictions on the validation set
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(confusion_matrix(y_val, y_pred))
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')



Epoch 1/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 278ms/step - accuracy: 0.7338 - loss: 0.5128 - val_accuracy: 0.8940 - val_loss: 0.2644
Epoch 2/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 226ms/step - accuracy: 0.9333 - loss: 0.1872 - val_accuracy: 0.9265 - val_loss: 0.2172
Epoch 3/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 223ms/step - accuracy: 0.9642 - loss: 0.0986 - val_accuracy: 0.9322 - val_loss: 0.2232
Epoch 4/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 225ms/step - accuracy: 0.9819 - loss: 0.0557 - val_accuracy: 0.9450 - val_loss: 0.2304
Epoch 5/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 224ms/step - accuracy: 0.9905 - loss: 0.0326 - val_accuracy: 0.9460 - val_loss: 0.2283
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 56ms/step - accuracy: 0.9483 - loss: 0.2339
Validation Accuracy: 0.9460219144821167
[1m223/223[0m [32m━━━

In [None]:
# Save the model
model.save('BiLSTM_text_classification.h5')

### Bi-Directional Grated Recurrent Units

This contained the up-sampleing work done for this model along with the creation of the model.

In [None]:
from sklearn.utils import resample
# Load the dataset
df = pd.read_csv('labeled_comments.csv')
minority_class = df[df['Label'] == 'Cyberbullying']
majority_class = df[df['Label'] == 'Not Cyberbullying']
# Upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine the upsampled minority class with the majority class
balanced_data = pd.concat([majority_class, minority_upsampled])
# Ensure comments are strings and handle NaN values
balanced_data['Comment'] = balanced_data['Comment'].astype(str).fillna('')

# Tokenization and Preprocessing
max_words = 10000  # The number of words to consider as features
max_len = 100      # The maximum length of each sequence (for padding)

# Using Keras Tokenizer to vectorize the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(balanced_data['Comment'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(balanced_data['Comment'])

# Pad the sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_len)

# Encode the labels (if they are not binary, you can adjust this for multiclass classification)
balanced_data['Label'] = balanced_data['Label'].astype(str)  # Ensuring labels are string-type if needed
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(balanced_data['Label'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# BiGRU Model Architecture
model = Sequential()

# Embedding layer: Converts words to dense vectors of fixed size
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))

# layer with a kernel 64 filters
model.add(Bidirectional(GRU(64, return_sequences=True)))

model.add(Bidirectional(GRU(32)))

# Dense fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout to reduce overfitting
model.add(Dropout(0.5))

# Output layer: Binary classification (you can adjust for more classes)
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on validation data
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Make predictions on the validation set
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(confusion_matrix(y_val, y_pred))
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')

Epoch 1/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 263ms/step - accuracy: 0.7410 - loss: 0.5067 - val_accuracy: 0.9040 - val_loss: 0.2481
Epoch 2/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 255ms/step - accuracy: 0.9405 - loss: 0.1694 - val_accuracy: 0.9240 - val_loss: 0.2197
Epoch 3/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 257ms/step - accuracy: 0.9758 - loss: 0.0744 - val_accuracy: 0.9370 - val_loss: 0.2033
Epoch 4/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 263ms/step - accuracy: 0.9880 - loss: 0.0385 - val_accuracy: 0.9425 - val_loss: 0.2370
Epoch 5/5
[1m890/890[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 258ms/step - accuracy: 0.9933 - loss: 0.0218 - val_accuracy: 0.9355 - val_loss: 0.3234
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 55ms/step - accuracy: 0.9365 - loss: 0.3182
Validation Accuracy: 0.9354793429374695
[1m223/223[0m [32m━━━

In [None]:
# Save the model
model.save('BiGRU_text_classification.h5')

### Using the BERT Dataset

The following section contains the CNN model utilizing the Predicted Labels produced from the BERT model.

In [None]:
df = pd.read_csv('bert_prediction.csv')
df.head()

Unnamed: 0,Comment,Label,Predicted_Label
0,they never told me I couldnt sing what in the ...,0,Not Cyberbullying
1,THEY DIDNT SAY I COULDNT SIIINGGGGG kat from v...,0,Not Cyberbullying
2,Youre very brave for having the comment sectio...,0,Not Cyberbullying
3,i didnt know what her response would be but i ...,0,Not Cyberbullying
4,This is so insulting to the victims,1,Cyberbullying


In [None]:
from sklearn.utils import resample
# Load the dataset
minority_class = df[df['Predicted_Label'] == 'Cyberbullying']
majority_class = df[df['Predicted_Label'] == 'Not Cyberbullying']
# Upsample the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine the upsampled minority class with the majority class
balanced_data = pd.concat([majority_class, minority_upsampled])
# Ensure comments are strings and handle NaN values
balanced_data['Comment'] = balanced_data['Comment'].astype(str).fillna('')
# Tokenization and Preprocessing
max_words = 10000  # The number of words to consider as features
max_len = 100      # The maximum length of each sequence (for padding)

# Using Keras Tokenizer to vectorize the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(balanced_data['Comment'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(balanced_data['Comment'])

# Pad the sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_len)

# Encode the labels (if they are not binary, you can adjust this for multiclass classification)
balanced_data['Predicted_Label'] = balanced_data['Predicted_Label'].astype(str)  # Ensuring labels are string-type if needed
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(balanced_data['Predicted_Label'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN Model Architecture
model = Sequential()

# Embedding layer: Converts words to dense vectors of fixed size
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))

# Convolutional layer with a kernel size of 5 and 64 filters
model.add(Conv1D(64, 5, activation='relu'))

# Pooling layer: Reduces the dimensionality
model.add(GlobalMaxPooling1D())

# Dense fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout to reduce overfitting
model.add(Dropout(0.5))

# Output layer: Binary classification (you can adjust for more classes)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on validation data
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Make predictions on the validation set
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(confusion_matrix(y_val, y_pred))
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')

Epoch 1/5




[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - accuracy: 0.7246 - loss: 0.5217 - val_accuracy: 0.9168 - val_loss: 0.2238
Epoch 2/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.9466 - loss: 0.1534 - val_accuracy: 0.9397 - val_loss: 0.1711
Epoch 3/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 22ms/step - accuracy: 0.9816 - loss: 0.0587 - val_accuracy: 0.9475 - val_loss: 0.1579
Epoch 4/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - accuracy: 0.9941 - loss: 0.0222 - val_accuracy: 0.9564 - val_loss: 0.1813
Epoch 5/5
[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.9975 - loss: 0.0102 - val_accuracy: 0.9524 - val_loss: 0.2220
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9513 - loss: 0.2215
Validation Accuracy: 0.9524078369140625
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[[3408  119]
 [ 218 3336]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      3527
           1       0.97      0.94      0.95      3554

    accuracy                           0.95      7081
   macro avg       0.95      0.95      0.95      7081
weighted avg       0.95      0.95      0.95      7081

