In [None]:
# Install Kaggle API
!pip install -q kaggle

# Upload kaggle.json file
from google.colab import files
files.upload()  # Upload kaggle.json here

# Set up Kaggle API credentials
import os
os.makedirs('/root/.kaggle', exist_ok=True)
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d andrewmvd/cyberbullying-classification
!unzip cyberbullying-classification.zip


Saving kaggle.json to kaggle (1).json
Dataset URL: https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading cyberbullying-classification.zip to /content
  0% 0.00/2.82M [00:00<?, ?B/s]
100% 2.82M/2.82M [00:00<00:00, 557MB/s]
Archive:  cyberbullying-classification.zip
  inflating: cyberbullying_tweets.csv  


In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Load the CSV file extracted from the Kaggle archive
df = pd.read_csv('cyberbullying_tweets.csv')

# Check basic info
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
print(df['cyberbullying_type'].value_counts())  # Check balance of classes

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB
None
cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64


In [None]:
# Clean text
def clean_text(text):
    """Normalise a raw tweet into lower-case, letters-only text.

    Steps, in order: lower-case the text, drop every token that starts with
    ``http`` (URLs), drop every character that is neither an ASCII letter nor
    whitespace, then collapse whitespace runs to a single space and trim.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the float
            ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned string; ``''`` when the input is not a string or nothing
        survives the cleaning.
    """
    # Missing values have no .lower(); return an empty text instead of raising.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace, trim ends
    return text

# Run every raw tweet through clean_text and keep the result in its own column,
# leaving the original 'tweet_text' column untouched.
raw_tweets = df['tweet_text']
df['cleaned_text'] = raw_tweets.apply(clean_text)

# Encode labels: map each class name to an integer id. The fitted encoder is
# kept in `le` so ids can be translated back to names through le.classes_.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df['cyberbullying_type'])
df['label_enc'] = le.transform(df['cyberbullying_type'])

In [None]:
# Maximum number of tokens kept per tweet; encode_texts truncates to this length.
MAX_LEN = 128

# Tokenizer for the 'bert-base-uncased' checkpoint (its files are downloaded
# the first time this runs).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts):
    """Tokenize a collection of strings into fixed-width BERT inputs.

    Every sequence is truncated to at most ``MAX_LEN`` tokens and padded up to
    exactly ``MAX_LEN``, so the returned arrays always have ``MAX_LEN`` columns
    regardless of how long the longest input happens to be. This keeps the
    width consistent with the ``(None, MAX_LEN)`` shape the model is built with.

    Args:
        texts: A pandas Series / NumPy array of strings (anything exposing
            ``.tolist()``) or any other iterable of strings.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of NumPy arrays, one row per
        input text.
    """
    # The tokenizer expects a plain Python list of strings.
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    encodings = tokenizer(
        text_list,
        truncation=True,
        # 'max_length' pads to MAX_LEN; padding=True would only pad to the
        # longest sequence in this call, giving a data-dependent width.
        padding='max_length',
        max_length=MAX_LEN,
    )
    return np.array(encodings['input_ids']), np.array(encodings['attention_mask'])

# Tokenize all cleaned tweets once, up front.
input_ids, attention_masks = encode_texts(df['cleaned_text'])

# Split the dataset: 20% is held out for validation. The three inputs are
# split together so ids, masks and labels stay row-aligned, and the fixed
# random_state makes the partition repeatable.
(
    X_train_ids, X_val_ids,
    X_train_mask, X_val_mask,
    y_train, y_val,
) = train_test_split(
    input_ids,
    attention_masks,
    df['label_enc'],
    test_size=0.2,
    random_state=42,
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Load BERT model
# from_pt=True makes transformers initialise the TF model from the PyTorch
# checkpoint weights.
BERT_CHECKPOINT = 'bert-base-uncased'
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT, from_pt=True)

# Define the custom model
class BERT_LSTM_Model(tf.keras.Model):
    """BERT encoder followed by two stacked LSTMs and a softmax classifier.

    The per-token output of BERT is fed through an LSTM that returns the full
    sequence, then through a second LSTM that returns only its final output,
    then through dropout and a dense softmax layer.

    Args:
        bert_model: The encoder; called with the token ids and the attention
            mask, and expected to return its sequence output at index 0.
        num_classes: Number of output classes (units of the softmax layer).
        max_len: Sequence length the model is intended for. Only stored on the
            instance; the layers themselves do not depend on it.
    """

    def __init__(self, bert_model, num_classes, max_len):
        super().__init__()
        self.bert = bert_model
        self.lstm1 = tf.keras.layers.LSTM(64, return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(32)
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.dense = tf.keras.layers.Dense(num_classes, activation='softmax')
        self.max_len = max_len

    def call(self, inputs, training=None):
        """Run a forward pass.

        Args:
            inputs: A pair ``(input_ids, attention_mask)``.
            training: Training-mode flag supplied by Keras; forwarded to the
                encoder, the LSTMs and the dropout layer so they all run in
                the same mode.

        Returns:
            The softmax output of the dense layer, one row per input sequence.
        """
        input_ids, attention_mask = inputs
        bert_output = self.bert(
            input_ids, attention_mask=attention_mask, training=training
        )[0]  # Get the sequence output

        # Give the LSTMs the same padding mask BERT receives. Without it the
        # recurrent layers also step over padded positions, so the final state
        # read by the classifier is influenced by padding rather than ending at
        # the last real token.
        step_mask = tf.cast(attention_mask, tf.bool)
        x = self.lstm1(bert_output, mask=step_mask, training=training)
        x = self.lstm2(x, mask=step_mask, training=training)
        x = self.dropout(x, training=training)
        return self.dense(x)

# One output unit per class known to the label encoder.
num_classes = len(le.classes_)
model = BERT_LSTM_Model(bert_model=bert_model, num_classes=num_classes, max_len=MAX_LEN)

# Compile the model: Adam optimizer, and the sparse categorical loss because
# the targets are integer class ids (not one-hot vectors).
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Both inputs (token ids and attention mask) share the same (batch, MAX_LEN) shape.
token_shape = (None, MAX_LEN)
model.build(input_shape=[token_shape, token_shape])
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [None]:
# Training schedule.
EPOCHS = 3
BATCH_SIZE = 16

# The model takes [token ids, attention mask] as its input pair.
train_inputs = [X_train_ids, X_train_mask]
val_inputs = [X_val_ids, X_val_mask]

# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)


Epoch 1/3
[1m   8/2385[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:51:13[0m 9s/step - accuracy: 0.1688 - loss: 1.8083

In [None]:
# Evaluate the model
val_loss, val_acc = model.evaluate([X_val_ids, X_val_mask], y_val)
print(f"Validation Accuracy: {val_acc*100:.2f}%")

# Integer ids of all classes, in the order the encoder assigned them.
# Passing them explicitly keeps the report rows and the matrix axes aligned
# with le.classes_ even if a class is absent from the validation split.
class_ids = np.arange(len(le.classes_))

# Classification report
y_pred = np.argmax(model.predict([X_val_ids, X_val_mask]), axis=1)
print(classification_report(y_val, y_pred, labels=class_ids, target_names=le.classes_))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_val, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
# Label every row/column with its class name so the figure can be read on its own.
ax.set_xticks(class_ids)
ax.set_xticklabels(le.classes_, rotation=45, ha='right')
ax.set_yticks(class_ids)
ax.set_yticklabels(le.classes_)
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()
