<a href="https://colab.research.google.com/github/JDubWeuu/cyberbullying-classification-ml/blob/main/cyberbullying_classification_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
nltk.download(['stopwords', 'wordnet'])
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import Word
import string
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing the neural net data

In [None]:
def _clean_tweet(text, stopword_set):
    """Return one tweet normalised token by token (helper for ``pre_process``)."""
    kept = []
    for token in text.split():
        token = token.lower()
        # Purely numeric tokens and stop words carry no signal; drop them
        # before paying for lemmatisation.
        if token.isdigit() or token in stopword_set:
            continue
        token = Word(token).lemmatize()
        # Collapse every user mention into a single placeholder token.
        if token.startswith('@'):
            token = '@USERNAME'
        kept.append(token)
    joined = ' '.join(kept)
    # Strip punctuation last, keeping '#' so hashtags stay distinguishable.
    # Note that this also removes the '@' of the mention placeholder.
    return ''.join(ch for ch in joined if ch == '#' or ch not in string.punctuation)


def pre_process(dataframe, stopwords):
    """Clean the ``tweet_text`` column of ``dataframe`` and return the frame.

    Each tweet is lower-cased, stripped of purely numeric tokens and stop
    words, lemmatised, has user mentions replaced by a placeholder, and
    finally has all punctuation except ``#`` removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a string column named ``tweet_text``. The column is
        overwritten in place.
    stopwords : iterable of str
        Lower-case words to remove from every tweet.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``tweet_text`` cleaned.
    """
    # Operate on the frame that was passed in rather than on a global named
    # ``df``, so the function works for any caller.
    stopword_set = set(stopwords)  # O(1) membership instead of scanning a list
    dataframe['tweet_text'] = dataframe['tweet_text'].apply(
        lambda text: _clean_tweet(text, stopword_set)
    )

    return dataframe

In [None]:
# Read the raw tweets, then clean the text using NLTK's English stop-word list.
df = pd.read_csv('cyberbullying_tweets.csv')
stop_words = stopwords.words('english')
df = pre_process(df, stop_words)

# Tokenizer and padded sequence

In [None]:
# Turn the cleaned tweets into integer sequences. ``num_words=10000`` caps the
# vocabulary the tokenizer emits; tweets are split on single spaces.
tweet_texts = df['tweet_text'].values
tokenizer = Tokenizer(num_words=10000, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# pad_sequences is called with its defaults, producing one 2-D array.
sequences = tokenizer.texts_to_sequences(tweet_texts)
padded_sequences = pad_sequences(sequences)

# Replace the string class names with integer ids and keep them as the target.
encoder = LabelEncoder()
encoded_types = encoder.fit_transform(df['cyberbullying_type'])
df['cyberbullying_type'] = encoded_types
labels = df['cyberbullying_type']

In [None]:
# Hold out 20% of the padded sequences (and their labels) as a test set;
# random_state is fixed so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# SVM implementation

In [None]:
# Baseline: RBF-kernel SVM trained on standardised token-id sequences.
#
# The scaled features are stored under their own names. Overwriting
# X_train / X_test here would hand standardised floats (including negative
# values) to the LSTM's Embedding layer further down, which expects the
# original integer token ids.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
# Reuse the statistics fitted on the training split; never refit on test data.
X_test_scaled = sc.transform(X_test)

svm_classifier = SVC(kernel='rbf', C=1.0, gamma='auto', probability=False)
svm_classifier.fit(X_train_scaled, Y_train)

y_pred = svm_classifier.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# LSTM implementation

In [None]:
# Sequence classifier, assembled layer by layer:
# embedding -> spatial dropout -> LSTM -> dense hidden layer -> 6-way softmax.
model = tf.keras.Sequential()
# Embedding sizes (10000, 128) with the input length taken from the padded data.
model.add(Embedding(10000, 128, input_length=padded_sequences.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='LeakyReLU'))
# One output unit per class; softmax pairs with the sparse categorical loss.
model.add(Dense(6, activation="softmax"))

In [None]:
# Number of training passes. After every pass the model is evaluated on the
# full training and test sets so both curves can be plotted per epoch.
epochs = 10

train_accuracy_over_time = []
train_loss_over_time = []

test_accuracy_over_time = []
test_loss_over_time = []

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

for _ in range(epochs):
    # Train for exactly one epoch per iteration so metrics can be recorded in between.
    model.fit(X_train, Y_train, validation_split=0.1, epochs=1, batch_size=32, verbose=1)

    train_loss, train_acc = model.evaluate(X_train, Y_train, verbose=0)
    train_accuracy_over_time.append(train_acc)
    train_loss_over_time.append(train_loss)

    test_loss, test_acc = model.evaluate(X_test, Y_test, verbose=0)
    test_accuracy_over_time.append(test_acc)
    test_loss_over_time.append(test_loss)

# One x position per recorded epoch (1..epochs). Using the leftover loop index
# would give one point too few and make the plot calls fail.
epoch_axis = range(1, epochs + 1)

fig, (acc_ax, loss_ax) = plt.subplots(1, 2)

acc_ax.plot(epoch_axis, train_accuracy_over_time, marker='o', label='Training Accuracy')
acc_ax.plot(epoch_axis, test_accuracy_over_time, marker='o', label='Test Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Accuracy Over Time')
acc_ax.legend()

# These curves are losses, so the legend says so.
loss_ax.plot(epoch_axis, train_loss_over_time, marker='o', label='Training Loss')
loss_ax.plot(epoch_axis, test_loss_over_time, marker='o', label='Test Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss Over Time')
loss_ax.legend()

fig.tight_layout()
plt.show()

# Final held-out performance after the last epoch.
test_loss, test_acc = model.evaluate(X_test, Y_test)
print(f'Loss: {test_loss}\nAccuracy: {test_acc}')