## Imports

In [32]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import ast

## Process data
1. **SemEval-2021 Task 5: Toxic Spans Detection**, to detect the offensive part of the message
2. **Jigsaw**, to detect toxic messages

In [33]:
# Helper function for the SemEval-2021 Task 5: Toxic Spans Detection dataset
def extract_text_and_spans(dataset_split):
    """Collect post texts and their toxic character spans from a split.

    Args:
        dataset_split: Iterable of mapping-like samples. Each sample must
            provide ``"text_of_post"``; ``"position"`` holds the toxic
            character offsets, either as a string literal such as
            ``"[3, 4, 5]"`` or as an already-parsed list/tuple/set.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the list of texts and ``y`` is a
        parallel list; each entry of ``y`` is a list of ``[start, end)``
        spans (end exclusive) built from runs of consecutive offsets.
        Samples whose positions are missing or unparseable get ``[]``.
    """
    X = []  # texts
    y = []  # spans

    for sample in dataset_split:
        X.append(sample["text_of_post"])
        # A missing annotation is treated as "no toxic span" rather than an
        # error, matching the best-effort behaviour callers rely on.
        try:
            raw_positions = sample["position"]
        except KeyError:
            raw_positions = None
        y.append(_positions_to_spans(_parse_positions(raw_positions)))

    return X, y


def _parse_positions(raw):
    """Return the offsets in *raw* as a sorted, duplicate-free list."""
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Exactly the failures literal_eval is documented to raise on
            # malformed input; anything else should surface to the caller.
            return []
    if not isinstance(raw, (list, tuple, set, frozenset)):
        return []
    # De-duplicate: a repeated offset would otherwise look like a gap and
    # produce overlapping spans.
    return sorted(set(raw))


def _positions_to_spans(positions):
    """Merge sorted, unique offsets into ``[start, end)`` spans."""
    spans = []
    for pos in positions:
        if spans and pos == spans[-1][1]:
            # pos directly follows the current run: extend its exclusive end.
            spans[-1][1] = pos + 1
        else:
            spans.append([pos, pos + 1])
    return spans

In [34]:
# SemEval-2021 Task 5 (Toxic Spans Detection): load the dataset and turn
# each split into parallel lists of texts and [start, end) toxic spans.
dataset = load_dataset("heegyu/toxic-spans")
train, test = dataset["train"], dataset["test"]
X_train_span, y_train_span = extract_text_and_spans(train)
X_test_span, y_test_span = extract_text_and_spans(test)

# Jigsaw toxic-comment data, reduced to a single binary "toxic" label.
# -Train / Val-
subcategories = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_data = pd.read_csv("jigsaw-toxic-comment-data/train.csv")
# Row-wise max over "toxic" and every subcategory: a comment flagged in any
# of these columns ends up with toxic == 1.
train_data["toxic"] = train_data[["toxic", *subcategories]].max(axis=1)
X_train_toxic, y_train_toxic = train_data["comment_text"], train_data["toxic"]
# Hold out 15% as validation, stratified on the label, with a fixed seed.
X_train_toxic, X_val_toxic, y_train_toxic, y_val_toxic = train_test_split(
    X_train_toxic,
    y_train_toxic,
    test_size=0.15,
    stratify=y_train_toxic,
    random_state=2025,
)

# -Test-
test_text = pd.read_csv("jigsaw-toxic-comment-data/test.csv")
test_labels = pd.read_csv("jigsaw-toxic-comment-data/test_labels.csv")
# Drop rows whose "toxic" label is -1; the same mask is applied to both
# frames, which relies on them sharing the same row index.
mask = test_labels["toxic"].ne(-1)
test_text = test_text.loc[mask].reset_index(drop=True)
test_labels = test_labels.loc[mask].reset_index(drop=True)
test_labels["toxic"] = test_labels[["toxic", *subcategories]].max(axis=1)
X_test_toxic, y_test_toxic = test_text["comment_text"], test_labels["toxic"]

