<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/802%20code/paradetox-split-replicating-evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets scipy torch tqdm

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine
from datasets import load_dataset

# 'train' split of the s-nlp/paradetox corpus; the new test set is drawn from it later.
paradetox_dataset = load_dataset("s-nlp/paradetox")['train']
# Previously published 1-token split whose train/validation sentences serve as references.
paradetox_1token_dataset = load_dataset("HamdanXI/paradetox-1Token-Split")

# Flatten the train and validation splits into one list, keeping the
# toxic-then-neutral order for every row.
train_val_1token_comments = [
    row[column]
    for split_name in ('train', 'validation')
    for row in paradetox_1token_dataset[split_name]
    for column in ('en_toxic_comment', 'en_neutral_comment')
]

def preprocess_text(text):
    """Normalize ``text`` for embedding by converting it to lowercase.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

# Tokenizer and encoder are loaded from the same checkpoint so their vocabularies match.
bert_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint)

Downloading readme:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/678 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/440k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/95.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/96.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3784 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/811 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/811 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
def max_token_length(input_1, input_2, input_3, tokenizer):
    """Return the largest token count over every text in the three collections.

    Args:
        input_1: Iterable of texts to measure.
        input_2: Iterable of texts to measure.
        input_3: Iterable of texts to measure.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence of token ids.

    Returns:
        int: Length of the longest encoded text, or 0 when all three
        collections are empty.
    """
    token_counts = (
        len(tokenizer.encode(item))
        for texts in (input_1, input_2, input_3)
        for item in texts
    )
    # default=0 keeps an empty collection from raising ValueError inside max().
    return max(token_counts, default=0)

# Longest tokenized length across the corpus' toxic comments and both columns
# of the 1-token train split.
highest_length = max_token_length(
    paradetox_dataset['en_toxic_comment'],
    paradetox_1token_dataset['train']['en_toxic_comment'],
    paradetox_1token_dataset['train']['en_neutral_comment'],
    tokenizer,
)

In [4]:
from tqdm import tqdm

# Sentence Embeddings
def get_embedding(text):
    """Embed ``text`` as the mean of the model's final hidden states.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input handed to the tokenizer; truncated to at most 512 tokens.

    Returns:
        numpy.ndarray: ``last_hidden_state`` averaged over dim 1, with all
        size-1 dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: disable autograd so no computation graph is built per call.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

# Selection parameters for the held-out test set.
MAX_COSINE_DISTANCE = 0.2  # a distance below this means 80% similarity or more
TEST_SET_SIZE = 671        # stop once this many unique sentences are collected

# Compute embeddings for the combined train and validation sets
train_val_embeddings = [get_embedding(preprocess_text(text)) for text in tqdm(train_val_1token_comments, desc="Computing Train/Val Embeddings")]

# Filter out similar sentences
test_set = []
for row in tqdm(paradetox_dataset, desc="Filtering Test Set"):
    embedding = get_embedding(preprocess_text(row['en_toxic_comment']))
    # any() stops at the first reference embedding that is too close.
    is_near_duplicate = any(
        cosine(embedding, ref_embedding) < MAX_COSINE_DISTANCE
        for ref_embedding in train_val_embeddings
    )
    if not is_near_duplicate:
        test_set.append(row)
    if len(test_set) >= TEST_SET_SIZE:
        break

# Test Set
print(f"Collected {len(test_set)} unique test sentences.")

Computing Train/Val Embeddings: 100%|██████████| 9190/9190 [08:44<00:00, 17.53it/s]
Filtering Test Set:   8%|▊         | 1522/19744 [09:41<1:56:07,  2.62it/s]

Collected 671 unique test sentences.





In [5]:
from datasets import Dataset, DatasetDict

# Create a new Dataset for the test set
test_dataset = Dataset.from_dict({
    'en_toxic_comment': [row['en_toxic_comment'] for row in test_set],
    'en_neutral_comment': [row['en_neutral_comment'] for row in test_set],
})

# Filter the original dataset to remove the test set rows.
# A set of (toxic, neutral) pairs gives constant-time membership checks,
# instead of rescanning the whole test set for every corpus row.
test_pairs = {(row['en_toxic_comment'], row['en_neutral_comment']) for row in test_set}
indices_to_keep = [
    i
    for i, row in enumerate(paradetox_dataset)
    if (row['en_toxic_comment'], row['en_neutral_comment']) not in test_pairs
]

filtered_dataset = paradetox_dataset.select(indices_to_keep)

# Create a new DatasetDict
new_paradetox_dataset = DatasetDict({
    'train': filtered_dataset,
    'test': test_dataset
})

In [6]:
# Display the DatasetDict summary (features and row counts per split) as a sanity check.
new_paradetox_dataset

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 19073
    })
    test: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 671
    })
})

In [7]:
# Install Git LFS on the VM — presumably for the Hub upload further down; TODO confirm it is still required.
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


In [8]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub through the interactive login widget,
# so no token has to be written into the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# Upload the train/test DatasetDict to the Hub repo "paradetox-split"
# (presumably under the logged-in account's namespace — verify).
new_paradetox_dataset.push_to_hub("paradetox-split")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]