1.Import Required Libraries

In [3]:
# Training/training_notebook.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

2.Load Preprocessed Data

In [4]:
# Load Preprocessed Data
data = pd.read_csv('../Data/preprocessed_data.csv')

3.Drop rows

In [9]:
# Drop rows where tweet_text is NaN
data = data.dropna(subset=['tweet_text'])

X = data['tweet_text']
y = data['cyberbullying_type']  # use the correct label column

4.Vectorize text and Split Data

In [12]:
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

5.Initiate Models

In [10]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

6.Training the models

In [11]:
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

Logistic Regression Accuracy: 0.7775
SVM Accuracy: 0.7802
Random Forest Accuracy: 0.7720
AdaBoost Accuracy: 0.5632
Gradient Boosting Accuracy: 0.7720


7.Display Results

In [5]:
# Display Results
print("\nAll Model Results:", results)


All Model Results: {'Logistic Regression': 0.7774725274725275, 'SVM': 0.7802197802197802, 'Random Forest': 0.7747252747252747, 'AdaBoost': 0.5631868131868132, 'Gradient Boosting': 0.7747252747252747}
