In [None]:
!pip install datasets
!pip install transformers

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the 'train' split of the SetFit/toxic_conversations dataset.
dataset = load_dataset(path='SetFit/toxic_conversations', split='train')

# Show how many examples were loaded.
print(dataset.num_rows)

In [None]:
import pandas as pd

# GPT-2 tokenizer, used by add_token_count() to measure text length in tokens.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')

def add_token_count(example):
    """Attach the token count of ``example["text"]`` to the example.

    Args:
        example: A dict-like dataset row with a ``"text"`` entry.

    Returns:
        The same ``example`` object with ``"token_count"`` set to the number
        of tokens produced by the module-level ``tokenizer``. A ``None`` text
        is counted as an empty string.
    """
    text = example["text"]
    # Mirror clean_text(): treat a null text as empty rather than handing None
    # to the tokenizer, which would abort the whole map() pass.
    text = str(text) if text is not None else ''
    # Requests truncation at 1024 tokens.
    # NOTE(review): confirm the loaded tokenizer honours these kwargs in
    # tokenize(); if it does, token_count is capped at 1024.
    tokens = tokenizer.tokenize(text, truncation=True, max_length=1024)
    example["token_count"] = len(tokens)
    return example

# Add a "token_count" column by running add_token_count on every example.
dataset = dataset.map(function=add_token_count)

In [None]:
import pandas as pd

# Convert to a pandas DataFrame so the quantile helpers are available.
df = dataset.to_pandas()
token_counts = df["token_count"]

# First and third quartiles of the token counts, and their spread.
q1 = token_counts.quantile(0.25)
q3 = token_counts.quantile(0.75)
iqr = q3 - q1

# Outlier fences sit 1.5 * IQR beyond each quartile.
whisker = 1.5 * iqr
lower_bound = q1 - whisker
upper_bound = q3 + whisker

print(f"Lower bound: {lower_bound}, Upper bound: {upper_bound}")

In [None]:
def _within_token_bounds(row):
    """Return whether the row's token_count lies in [lower_bound, upper_bound]."""
    count = row['token_count']
    return lower_bound <= count and count <= upper_bound


# Keep only the examples whose token count falls inside the IQR fences.
filtered_dataset = dataset.filter(_within_token_bounds)

print(filtered_dataset.num_rows)

In [None]:
import re

def clean_text(example):
    """Normalise ``example['text']`` to lowercase ASCII letters and basic punctuation.

    Steps, in order: remove ``@mentions``; remove ``http(s)://`` URLs; replace
    every character other than ASCII letters and ``. ! ? '`` with a space;
    collapse runs of spaces; lowercase; strip leading/trailing whitespace.

    Args:
        example: A dict-like row with a ``'text'`` entry. ``None`` is treated
            as an empty string; any other value is passed through ``str()``.

    Returns:
        The same ``example`` object with ``'text'`` replaced by the cleaned
        string.
    """
    text = example['text']
    text = str(text) if text is not None else ''

    text = re.sub(r"@[A-Za-z0-9_]+", '', text)
    # Consume the URL up to the next whitespace. A narrower character class
    # stops at '?', '=', '-', '_', '&', '#', '%' and leaves query-string and
    # path fragments behind as junk words.
    text = re.sub(r"https?://\S+", '', text)
    # Everything outside the allowed set becomes a space. This includes \r, \n,
    # tabs and digits, so no separate newline pass is needed afterwards.
    text = re.sub(r"[^A-Za-z.!?']", ' ', text)
    text = re.sub(r" +", ' ', text)

    example['text'] = text.lower().strip()
    return example

def _is_substantive(row):
    """Return True for rows worth keeping after cleaning.

    A row is kept when its text is non-empty, more than 70% of its characters
    are ASCII letters, and it contains more than 10 whitespace-separated words.
    """
    text = row['text']
    if len(text) == 0:
        # Also protects the ratio below from dividing by zero.
        return False
    letter_ratio = len(re.findall(r'[a-zA-Z]', text)) / len(text)
    return letter_ratio > 0.7 and len(text.split()) > 10


filtered_dataset = filtered_dataset.map(clean_text)

# Single filtering pass: both quality criteria are checked together instead of
# walking the dataset twice. No extra whitespace-normalisation pass is needed,
# because clean_text() already returns stripped text with single spaces
# between words.
filtered_dataset = filtered_dataset.filter(_is_substantive)

print(len(filtered_dataset))

In [None]:
import matplotlib.pyplot as plt

# Histogram of the token counts in df, with the IQR fences overlaid.
fig, ax = plt.subplots()
ax.hist(df["token_count"], bins=50, color='skyblue')

# One dashed vertical line per fence: (x position, line colour, legend label).
for position, colour, label in (
    (lower_bound, 'red', 'Lower Bound'),
    (upper_bound, 'green', 'Upper Bound'),
):
    ax.axvline(position, color=colour, linestyle='dashed', linewidth=2, label=label)

ax.set_title("Token Distribution")
ax.set_xlabel("Token Quantity")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()

In [None]:
# Write the cleaned, filtered examples to a CSV file in the working directory.
filtered_dataset.to_csv(path_or_buf='gpt2-toxic-conversations.csv', index=False)