In [None]:
# Step 1: Import Libraries
import os
import re
import string
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split


In [None]:
# Step 2: Load Dataset
# The dataset directory can be overridden with the CYBERBULLYING_DATA_DIR environment
# variable; when it is unset, the original local directory is used.
DATA_DIR = Path(os.environ.get("CYBERBULLYING_DATA_DIR", r"E:\Cyberbullying\dataset"))
vidgen_df = pd.read_csv(DATA_DIR / "Dynamically Generated Hate Dataset v0.2.2.csv")


In [None]:
# Keep only the columns the model needs. The label column must be selected here because
# it is encoded just below; .copy() makes the subset an independent frame so the
# assignment does not trigger SettingWithCopyWarning.
vidgen_df = vidgen_df[['text', 'label']].copy()

label_map = {'nothate': 0, 'hate': 1}
# Values that are not keys of label_map (e.g. already-encoded 0/1 when the cell is re-run)
# pass through unchanged instead of silently turning into NaN.
vidgen_df['label'] = vidgen_df['label'].map(lambda value: label_map.get(value, value))

# Fail fast if anything other than the two expected classes is present.
unexpected_labels = set(vidgen_df['label'].unique()) - set(label_map.values())
if unexpected_labels:
    raise ValueError(f"Unexpected label values after mapping: {unexpected_labels}")

print(vidgen_df['label'].value_counts())  # Check class balance


In [None]:
# Step 3: Preprocess Text
def clean_text(text):
    """Normalise a raw post before vectorisation.

    Steps, in order: lowercase, strip URLs (tokens starting with ``http`` or
    ``www``), strip ``@mentions`` and ``#hashtags``, then drop every character
    that is not a lowercase ASCII letter or whitespace. Whitespace left behind
    by removed tokens is kept as-is.

    Parameters
    ----------
    text : str or any scalar
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) are treated
        as empty text; other non-string scalars are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN and have no .lower(); map them to empty text.
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    # "http\S+" already covers https links, so a separate https alternative is unnecessary.
    text = re.sub(r"http\S+|www\S+", '', text)   # remove links
    text = re.sub(r'@\w+|#\w+', '', text)        # remove mentions/hashtags
    text = re.sub(r'[^a-z\s]', '', text)         # remove numbers, punctuation
    return text

# Store a cleaned version of every post next to the raw text.
vidgen_df['clean_tweet'] = vidgen_df['text'].map(clean_text)


In [None]:
# Step 4: TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
# Keep the scipy sparse matrix returned by fit_transform: scikit-learn's splitter and
# LogisticRegression accept sparse input directly, whereas .toarray() would materialise
# a dense (n_documents x up-to-5000) float array in memory.
# NOTE(review): the vectorizer is fit on the full corpus here, i.e. before any
# train/test split, so its vocabulary and IDF weights are computed from all documents.
X = tfidf.fit_transform(vidgen_df['clean_tweet'])
y = vidgen_df['label'].values


In [None]:
# Step 5: Train-Test Split
# Split the cleaned text itself and fit TF-IDF on the training portion only, so the
# vocabulary and IDF weights are never influenced by held-out documents. The
# full-corpus matrix X built in Step 4 is deliberately not used for modelling.
text_train, text_test, y_train, y_test = train_test_split(
    vidgen_df['clean_tweet'], y, test_size=0.2, random_state=42, stratify=y
)
X_train = tfidf.fit_transform(text_train)  # learn vocabulary/IDF from training text only
X_test = tfidf.transform(text_test)        # reuse that vocabulary for the test text


In [None]:
# Step 6: Train Logistic Regression
# fit() hands back the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model  # echo the fitted estimator as the cell output


In [None]:
# Step 7: Evaluation
# Predict on the held-out split, then print the per-class report.
y_pred = model.predict(X_test)
vidgen_report = classification_report(y_test, y_pred)
print("Classification Report of vidgen_df:\n", vidgen_report)

In [None]:
# Headline metrics on the held-out split. accuracy_score, precision_score, recall_score
# and f1_score are imported in the notebook's top import cell rather than mid-notebook.
# precision/recall/F1 are called with their defaults, so they score the positive class (label 1).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")


In [None]:


# Confusion Matrix
# Drawn on an explicit Axes; rows are the true classes, columns the predicted ones.
class_names = ['Non-bullying', 'Bullying']
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix of vidgen_df')
plt.show()


In [None]:
# The dataset directory can be overridden with the CYBERBULLYING_DATA_DIR environment
# variable; when it is unset, the original local directory is used.
davidson_csv = Path(os.environ.get("CYBERBULLYING_DATA_DIR", r"E:\Cyberbullying\dataset")) / "davidson.csv"
davidson_df = pd.read_csv(davidson_csv)


In [None]:
# Preview only the first rows instead of echoing the whole frame.
davidson_df.head()

In [None]:
# Store a cleaned version of every tweet next to the raw text.
davidson_df['clean_tweet'] = davidson_df['tweet'].map(clean_text)



In [None]:
# Convert to binary: class 2 -> 0 (non-bullying); classes 0 and 1 -> 1 (bullying).
# Guarded on the presence of 'class' so the cell can be re-run after the column has
# already been dropped; the drop returns a new frame instead of mutating in place.
if 'class' in davidson_df.columns:
    davidson_df['label'] = (davidson_df['class'] != 2).astype('int64')
    davidson_df = davidson_df.drop(columns=['class'])

print(davidson_df['label'].value_counts())  # Check class balance

In [None]:
# Step 4: TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
# Keep the scipy sparse matrix returned by fit_transform: scikit-learn's splitter and
# LogisticRegression accept sparse input directly, whereas .toarray() would materialise
# a dense (n_documents x up-to-5000) float array in memory.
# NOTE(review): the vectorizer is fit on the full corpus here, i.e. before any
# train/test split, so its vocabulary and IDF weights are computed from all documents.
X = tfidf.fit_transform(davidson_df['clean_tweet'])
y = davidson_df['label'].values


In [None]:
# Step 5: Train-Test Split
# Split the cleaned text itself and fit TF-IDF on the training portion only, so the
# vocabulary and IDF weights are never influenced by held-out documents. The
# full-corpus matrix X built in Step 4 is deliberately not used for modelling.
text_train, text_test, y_train, y_test = train_test_split(
    davidson_df['clean_tweet'], y, test_size=0.2, random_state=42, stratify=y
)
X_train = tfidf.fit_transform(text_train)  # learn vocabulary/IDF from training text only
X_test = tfidf.transform(text_test)        # reuse that vocabulary for the test text


In [None]:
# Step 6: Train Logistic Regression
# fit() hands back the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model  # echo the fitted estimator as the cell output


In [None]:
# Step 7: Evaluation
# Predict on the held-out split, then print the per-class report.
y_pred = model.predict(X_test)
# The header names the dataset this model was actually trained on (davidson_df).
print("Classification Report of davidson_df:\n", classification_report(y_test, y_pred))

In [None]:
# Headline metrics on the held-out split. accuracy_score, precision_score, recall_score
# and f1_score are imported in the notebook's top import cell rather than mid-notebook.
# precision/recall/F1 are called with their defaults, so they score the positive class (label 1).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")


In [None]:


# Confusion Matrix
# Drawn on an explicit Axes; rows are the true classes, columns the predicted ones.
class_names = ['Non-bullying', 'Bullying']
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# The title names the dataset these predictions belong to (davidson_df).
ax.set_title('Confusion Matrix of davidson_df')
plt.show()
