<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/paradox_masked_comment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

# Load the "HamdanXI/paradetox_with_editOps" dataset by its repository id.
# Later cells read its "train" split via dataset["train"].
dataset = load_dataset("HamdanXI/paradetox_with_editOps")

In [3]:
from datasets import Dataset, DatasetDict

# `dataset` is the object loaded in the previous cell.
# preprocess_data (defined below) returns only the masked comments, one string per sample.

def preprocess_data(samples):
    """Return one masked comment string per sample.

    Each sample is read through its 'en_toxic_comment' key (a string that is
    split on whitespace) and its 'edit_ops' key. The first three fields of
    every edit operation are taken as (type, text, index); the index is
    converted with int() and addresses the whitespace-separated words of the
    toxic comment.

    Handling per operation type:
      - "replace": as many '[MASK]' tokens as `text` has words are written
        over the slice that starts at the index.
      - "delete": same as above, using '[DEL]' tokens.
      - "insert": a single '[INS]' token is inserted at the index.
      - anything else: the words are left untouched.

    Args:
        samples: iterable of mapping-like samples providing the two keys above.

    Returns:
        list[str]: for each sample, its words re-joined with single spaces.
    """
    def _mask_words(comment, operations):
        tokens = comment.split()
        # Highest index first, so inserting or resizing a slice cannot shift
        # the positions used by the operations that are still to be applied.
        ordered = sorted(operations, key=lambda op: int(op[2]), reverse=True)
        for entry in ordered:
            kind, span, position = entry[:3]
            position = int(position)

            if kind == "insert":
                tokens.insert(position, '[INS]')
                continue

            if kind == "replace":
                placeholder = '[MASK]'
            elif kind == "delete":
                placeholder = '[DEL]'
            else:
                continue  # unrecognised operation types are ignored

            width = len(span.split())
            tokens[position:position + width] = [placeholder] * width
        return ' '.join(tokens)

    return [
        _mask_words(sample['en_toxic_comment'], sample['edit_ops'])
        for sample in samples
    ]

# Apply the preprocess function to generate masked comments for the train split.
# The result is a plain Python list with one masked string per training sample.
# NOTE(review): no later cell visible here reads `masked_comments` — confirm it is still needed.
masked_comments = preprocess_data(dataset["train"])

In [4]:
# Attach a `masked_comment` field to the dataset. With batched=False, map
# hands over one example at a time, so it is wrapped in a single-item list for
# preprocess_data and the lone result is unwrapped again.
dataset_new = dataset.map(
    lambda example: {'masked_comment': preprocess_data([example])[0]},
    batched=False,
)

# Sanity check: the first training row should now carry the new field.
print(dataset_new["train"][0])

{'en_toxic_comment': 'he had steel balls too !', 'en_neutral_comment': 'he was brave too!', 'edit_ops': [['replace', 'had steel balls too !', '1', 'was brave too!']], 'masked_comment': 'he [MASK] [MASK] [MASK] [MASK] [MASK]'}


In [5]:
# Show the summary of the augmented dataset (splits, features, row counts).
dataset_new

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'edit_ops', 'masked_comment'],
        num_rows: 19744
    })
})

In [6]:
# Inspect the second training example together with its masked comment.
dataset_new["train"][1]

{'en_toxic_comment': 'dude should have been taken to api , he would be right at home with all the other knuckleheads there',
 'en_neutral_comment': 'It would have been good if he went to api. He would fit in.',
 'edit_ops': [['replace', 'dude should', '0', 'It would'],
  ['replace', 'taken', '4', 'good if he went'],
  ['replace', 'api , he', '6', 'api. He'],
  ['replace',
   'be right at home with all the other knuckleheads there',
   '10',
   'fit in.']],
 'masked_comment': '[MASK] [MASK] have been [MASK] to [MASK] [MASK] [MASK] would [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]'}

In [7]:
# Optional: uncomment to authenticate with the Hugging Face Hub before pushing.
# !huggingface-cli login

In [8]:
# Optional: uncomment to upload the augmented dataset to the Hugging Face Hub (requires login).
# dataset_new.push_to_hub("paradetox_editOps_preprocess")