In [2]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


CUDA available: True
GPU name: NVIDIA GeForce RTX 3060 Laptop GPU


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load your dataset
df = pd.read_csv("dataset.csv")

# Binary label: -1 (bully) => 1, else => 0
df['binary_label'] = df['label'].apply(lambda x: 1 if x == -1 else 0)

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['headline'].tolist(),
    df['binary_label'].tolist(),
    test_size=0.1,
    random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


define and train bert

In [4]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Check for GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Load dataset
df = pd.read_csv("dataset.csv")  # Make sure this path is correct

# Binary label mapping: -1 â†’ bully (1), others â†’ not bully (0)
df['binary_label'] = df['label'].apply(lambda x: 1 if x == -1 else 0)

# Train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['headline'].tolist(),
    df['binary_label'].tolist(),
    test_size=0.1,
    random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Dataset class
class BullyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap data into datasets
train_dataset = BullyDataset(train_encodings, train_labels)
val_dataset = BullyDataset(val_encodings, val_labels)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

# Training config
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train model
trainer.train()

# Save model and tokenizer
model.save_pretrained("./bert-bully-model")
tokenizer.save_pretrained("./bert-bully-model")

print("âœ… Model training and saving complete.")



Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1549,0.156849
2,0.1124,0.193965
3,0.0582,0.193743


âœ… Model training and saving complete.


In [5]:
model.save_pretrained("./bert-bully-model")
tokenizer.save_pretrained("./bert-bully-model")


('./bert-bully-model\\tokenizer_config.json',
 './bert-bully-model\\special_tokens_map.json',
 './bert-bully-model\\vocab.txt',
 './bert-bully-model\\added_tokens.json')

In [7]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load model and tokenizer from the saved directory
model = BertForSequenceClassification.from_pretrained("./bert-bully-model")
tokenizer = BertTokenizer.from_pretrained("./bert-bully-model")

# Move model to the same device (GPU/CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# Example test sentence
text = "This is a test headline to check for bullying content."

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

# Move inputs to the same device (GPU/CPU)
inputs = {key: val.to(device) for key, val in inputs.items()}


In [16]:
# Example test sentence
text = "nancy"

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

# Move inputs to the same device (GPU/CPU)
inputs = {key: val.to(device) for key, val in inputs.items()}


In [17]:
# Get model prediction
with torch.no_grad():  # No need to calculate gradients during inference
    outputs = model(**inputs)

# Get the predicted class (0 for not bully, 1 for bully)
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()

# Output the prediction
print("Predicted class:", predicted_class)


Predicted class: 0


In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
from sklearn.model_selection import train_test_split

# Load model and tokenizer
model_path = "./bert-bully-model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Use GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Load and prepare the dataset
df = pd.read_csv("dataset.csv")
df['binary_label'] = df['label'].apply(lambda x: 1 if x == -1 else 0)

_, val_texts, _, val_labels = train_test_split(
    df['headline'].tolist(),
    df['binary_label'].tolist(),
    test_size=0.1,
    random_state=42
)

# Tokenize validation texts
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_dataset = torch.utils.data.TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    torch.tensor(val_labels)
)

# Predict
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in torch.utils.data.DataLoader(val_dataset, batch_size=32):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Evaluation
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"âœ… Accuracy: {accuracy:.4f}")
print(f"âœ… Precision: {precision:.4f}")
print(f"âœ… Recall: {recall:.4f}")
print(f"âœ… F1-score: {f1:.4f}\n")

print("ðŸ“Š Full Classification Report:")
print(classification_report(all_labels, all_preds, target_names=["Not Bully", "Bully"]))


âœ… Accuracy: 0.9565
âœ… Precision: 0.9532
âœ… Recall: 0.9790
âœ… F1-score: 0.9660

ðŸ“Š Full Classification Report:
              precision    recall  f1-score   support

   Not Bully       0.96      0.92      0.94       670
       Bully       0.95      0.98      0.97      1145

    accuracy                           0.96      1815
   macro avg       0.96      0.95      0.95      1815
weighted avg       0.96      0.96      0.96      1815

