In [21]:
import gc

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

# --- Configuration -----------------------------------------------------------
# Single source of truth for the input file; it is read twice below
# (once with pandas, once with the `datasets` library).
TRAIN_CSV = "../data/train.csv"
TEXT_COLUMN = "comment_text"

# The six toxicity-related target columns (multi-label classification).
# This must be a live definition, not a comment: `format_labels` in a later
# cell reads `label_columns` from module scope.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# --- Load data ---------------------------------------------------------------
data = pd.read_csv(TRAIN_CSV)

# Plain Python list of comments and a 2-D array with one column per label.
texts = data[TEXT_COLUMN].tolist()
labels = data[label_columns].values
assert labels.shape == (len(texts), len(label_columns)), "texts/labels are misaligned"

# Same CSV loaded through the `datasets` library for the `.map`-based pipeline.
raw_dataset = load_dataset("csv", data_files={"train": TRAIN_CSV})["train"]
tokenizer = AutoTokenizer.from_pretrained("roberta-base")



In [22]:
def tokenize_function(example):
    """Run the module-level ``tokenizer`` over ``example["comment_text"]``.

    Truncation is enabled and padding is set to ``"max_length"`` with
    ``max_length=128``. The tokenizer's return value is passed back unchanged.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["comment_text"], **encode_options)


In [23]:
# Apply `tokenize_function` to `raw_dataset` in batched mode.
dataset = raw_dataset.map(
    tokenize_function,
    batched=True,
)


In [24]:
def format_labels(example):
    """Attach a ``"labels"`` entry holding the example's targets as floats.

    The values are read from ``example`` in the order given by the
    module-level ``label_columns`` list, which must be defined before this
    cell runs. The same ``example`` mapping is mutated and returned.
    """
    label_vector = []
    for column in label_columns:
        label_vector.append(float(example[column]))
    example["labels"] = label_vector
    return example


# Row-by-row pass (no batching) that adds the "labels" entry to each example.
dataset = dataset.map(format_labels)

In [None]:
class ToxicCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per ``__getitem__`` call.

    Args:
        texts: Indexable sequence of comments; each item is passed through
            ``str`` before tokenization.
        labels: Indexable sequence of per-comment label vectors (a 2-D NumPy
            array or a list of lists); each row is converted to float32.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``.
            It must return a mapping whose values are tensors with a leading
            batch axis of size 1.
        max_length: Sequence length handed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded comment at ``idx`` plus its float ``"labels"`` tensor."""
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Remove only the leading batch axis. A bare ``squeeze()`` would also
        # collapse the sequence axis whenever it has size 1 (max_length == 1),
        # yielding 0-d tensors that cannot be batched with other items.
        item = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
        # Floats are required by BCEWithLogitsLoss. ``np.asarray`` accepts both
        # NumPy rows and plain Python lists; ``torch.tensor`` copies, so the
        # returned tensor never aliases ``self.labels``.
        label = np.asarray(self.labels[idx], dtype=np.float32)
        item["labels"] = torch.tensor(label, dtype=torch.float)
        return item

def compute_metrics(eval_pred):
    """Turn evaluation logits into micro-F1 and accuracy.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``labels`` must support
            ``.astype``; ``logits`` must be accepted by ``torch.tensor``.

    Returns:
        dict with keys ``"f1"`` (micro-averaged) and ``"accuracy"``.

    NOTE(review): if predictions and labels are 2-D multi-label indicator
    arrays, scikit-learn documents ``accuracy_score`` as subset (exact-match)
    accuracy. Confirm that is the metric intended here.
    """
    raw_logits, gold = eval_pred
    # Sigmoid gives per-label probabilities; a label is predicted at >= 0.5.
    probabilities = torch.sigmoid(torch.tensor(raw_logits))
    predicted = (probabilities >= 0.5).int().numpy()
    gold = gold.astype(int)
    return {
        "f1": f1_score(gold, predicted, average="micro"),
        "accuracy": accuracy_score(gold, predicted),
    }

# Set up 5-fold cross validation.
# Each fold fine-tunes a freshly loaded roberta-base on four fifths of the
# data and is evaluated on the held-out fifth.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_metrics = []

for fold, (train_index, val_index) in enumerate(kf.split(texts), start=1):
    print(f"\n======== Fold {fold} ========")

    # Release the previous fold's model and Trainer *before* building new
    # ones. Otherwise both models (and the old optimizer state) are alive at
    # the same time, which can exhaust GPU memory over several folds. The last
    # fold's `model` / `trainer` are deliberately left bound after the loop.
    model = trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Split the texts and labels
    train_texts = [texts[i] for i in train_index]
    train_labels = labels[train_index]
    val_texts = [texts[i] for i in val_index]
    val_labels = labels[val_index]

    # Create Dataset objects for the current fold
    train_dataset = ToxicCommentsDataset(train_texts, train_labels, tokenizer)
    val_dataset = ToxicCommentsDataset(val_texts, val_labels, tokenizer)

    # Load a fresh model for each fold; one output per label column
    # (6 for the toxicity labels used in this notebook).
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base", num_labels=labels.shape[1]
    )
    # Set the problem type to multi-label classification so that the loss is computed appropriately.
    model.config.problem_type = "multi_label_classification"

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold}",
        num_train_epochs=2,  # You can adjust the number of epochs
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"./logs_fold_{fold}",
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        seed=42
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and evaluate
    trainer.train()
    eval_result = trainer.evaluate()
    print(f"Fold {fold} evaluation results:", eval_result)
    all_metrics.append(eval_result)

print("\nCross Validation Metrics:")
for i, metrics in enumerate(all_metrics, start=1):
    print(f"Fold {i}: {metrics}")

# Aggregate the per-fold results so the CV outcome can be read at a glance.
cv_metrics = pd.DataFrame(
    all_metrics,
    index=pd.RangeIndex(1, len(all_metrics) + 1, name="fold"),
)
# Only summarize headline metrics that are actually present in the results.
headline_columns = [
    column
    for column in ("eval_loss", "eval_f1", "eval_accuracy")
    if column in cv_metrics.columns
]
cv_summary = cv_metrics[headline_columns].agg(["mean", "std"])
cv_summary




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
