<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/paradetox-preprocess-BEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook
# (presumably required by the push_to_hub calls further down -- confirm).
notebook_login()

In [None]:
from datasets import load_dataset
import re

# Load the "s-nlp/paradetox" dataset; every later stage derives from this object.
dataset = load_dataset("s-nlp/paradetox")

In [3]:
# Display the splits, column names and row counts of the raw dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 19744
    })
})

In [11]:
def to_lower_case(example):
    """Lower-case both comment fields of a single dataset row.

    The row is updated in place and the very same object is handed back.

    Args:
        example: Mapping holding string values under 'en_toxic_comment'
            and 'en_neutral_comment'.

    Returns:
        The same mapping, with both fields converted to lower case.
    """
    for column in ('en_toxic_comment', 'en_neutral_comment'):
        example[column] = example[column].lower()
    return example

# Run the lower-casing over the raw dataset and keep the result under its own name.
lower_cased_dataset = dataset.map(to_lower_case)

In [12]:
# Spot-check one training row after lower-casing.
lower_cased_dataset['train'][1]

{'en_toxic_comment': 'dude should have been taken to api , he would be right at home with all the other knuckleheads there',
 'en_neutral_comment': 'it would have been good if he went to api. he would fit in.'}

In [13]:
# Upload the lower-cased stage to the Hub under the name "paradetox-preprocess".
# NOTE(review): the later stages in this notebook push to the same name.
lower_cased_dataset.push_to_hub("paradetox-preprocess")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

In [14]:
def remove_apostrophes(example):
    """Strip every ASCII apostrophe (') from both comment fields of a row.

    The row is updated in place and the very same object is handed back.

    Args:
        example: Mapping holding string values under 'en_toxic_comment'
            and 'en_neutral_comment'.

    Returns:
        The same mapping, with all "'" characters removed from both fields.
    """
    for column in ('en_toxic_comment', 'en_neutral_comment'):
        # Splitting on the apostrophe and gluing the pieces back together
        # drops every occurrence of it.
        pieces = example[column].split("'")
        example[column] = "".join(pieces)
    return example

# Strip apostrophes from the lower-cased data and keep the result under its own name.
remove_apostrophes_dataset = lower_cased_dataset.map(remove_apostrophes)

Map:   0%|          | 0/19744 [00:00<?, ? examples/s]

In [15]:
# Spot-check one training row after apostrophe removal.
remove_apostrophes_dataset['train'][2]

{'en_toxic_comment': 'im not gonna sell the fucking picture , i just want to contribute to the fucking article .',
 'en_neutral_comment': 'im not gonna sell the picture, i just want to contribute to the article.'}

In [8]:
# Upload the apostrophe-free stage to the Hub under the name "paradetox-preprocess".
remove_apostrophes_dataset.push_to_hub("paradetox-preprocess")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

In [17]:
# Whitespace sitting directly in front of a ? . ! or , that is itself followed
# by whitespace or the end of the string.  The punctuation is only looked at
# (lookahead), never consumed, so the whitespace after it stays available for
# the next match -- this is what lets runs such as "? !" be fixed in one pass.
_SPACE_BEFORE_PUNCTUATION = re.compile(r'\s+(?=[?.!,](?:\s|$))')


def correct_punctuation(text):
    """Remove the whitespace that tokenised text leaves before punctuation.

    Only ? . ! and , are handled, and only when the mark is followed by
    whitespace or ends the string, so sequences like " .b" or " ..." are left
    alone.  All whitespace before the mark is removed, not just one character.

    Args:
        text: The string to clean up.

    Returns:
        A new string, e.g. "wtf ? ! ok ." becomes "wtf?! ok.".
    """
    return _SPACE_BEFORE_PUNCTUATION.sub('', text)

def correct_dataset(dataset):
    """Apply `correct_punctuation` to both comment columns of every split.

    The result is assembled in a fresh container instead of being assigned
    back into `dataset`, so the object passed in still holds its original
    splits afterwards and the calling cell can be re-run safely.

    Args:
        dataset: Mapping of split name to an object exposing `map`
            (for example a `DatasetDict`).

    Returns:
        A new mapping of the same type as `dataset`, with one corrected
        entry per split.
    """
    def fix_row(row):
        # Only the two comment columns are returned; `map` merges them back
        # into the row.
        return {
            'en_toxic_comment': correct_punctuation(row['en_toxic_comment']),
            'en_neutral_comment': correct_punctuation(row['en_neutral_comment']),
        }

    corrected_splits = {split: data.map(fix_row) for split, data in dataset.items()}
    # Rebuild with the caller's own container type (dict, DatasetDict, ...).
    return type(dataset)(corrected_splits)

# Fix the spacing before punctuation in the apostrophe-free data.
correct_punctuation_dataset = correct_dataset(remove_apostrophes_dataset)

Map:   0%|          | 0/19744 [00:00<?, ? examples/s]

In [18]:
# Spot-check one training row after the punctuation fix.
correct_punctuation_dataset['train'][2]

{'en_toxic_comment': 'im not gonna sell the fucking picture, i just want to contribute to the fucking article.',
 'en_neutral_comment': 'im not gonna sell the picture, i just want to contribute to the article.'}

In [19]:
# Upload the punctuation-corrected stage to the Hub under the name "paradetox-preprocess".
correct_punctuation_dataset.push_to_hub("paradetox-preprocess")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/508 [00:00<?, ?B/s]

In [20]:
def add_full_stop(sentence):
    """Make sure a sentence ends with terminal punctuation.

    Trailing whitespace is dropped first so it cannot hide an existing
    terminator.  A sentence already ending in '.', '!' or '?' is returned
    without an extra full stop (questions no longer become "...?."), and
    empty or whitespace-only input yields an empty string rather than a
    lone '.'.

    Args:
        sentence: The text to terminate.

    Returns:
        The right-stripped sentence, with '.' appended when it did not
        already end in '.', '!' or '?'.
    """
    trimmed = sentence.rstrip()
    if not trimmed or trimmed.endswith(('.', '!', '?')):
        return trimmed
    return trimmed + '.'

def add_full_stops_to_dataset(dataset_dict):
    """Apply `add_full_stop` to both comment columns of every split.

    Every split present in `dataset_dict` is processed (not only 'train'),
    and the result is assembled in a fresh container so the object passed in
    keeps its original splits.

    Args:
        dataset_dict: Mapping of split name to an object exposing `map`
            (for example a `DatasetDict`).

    Returns:
        A new mapping of the same type as `dataset_dict`, with one processed
        entry per split.
    """
    def add_full_stops(example):
        # Only the two comment columns are returned; `map` merges them back
        # into the row.
        return {
            'en_toxic_comment': add_full_stop(example['en_toxic_comment']),
            'en_neutral_comment': add_full_stop(example['en_neutral_comment']),
        }

    terminated_splits = {
        split: data.map(add_full_stops) for split, data in dataset_dict.items()
    }
    # Rebuild with the caller's own container type (dict, DatasetDict, ...).
    return type(dataset_dict)(terminated_splits)

# Add missing full stops to the punctuation-corrected data.
add_full_stops_dataset = add_full_stops_to_dataset(correct_punctuation_dataset)

Map:   0%|          | 0/19744 [00:00<?, ? examples/s]

In [22]:
# Spot-check the first three training rows after adding full stops.
add_full_stops_dataset['train'][:3]

{'en_toxic_comment': ['he had steel balls too!',
  'dude should have been taken to api, he would be right at home with all the other knuckleheads there.',
  'im not gonna sell the fucking picture, i just want to contribute to the fucking article.'],
 'en_neutral_comment': ['he was brave too!',
  'it would have been good if he went to api. he would fit in.',
  'im not gonna sell the picture, i just want to contribute to the article.']}

In [23]:
# Upload the last stage shown here to the Hub under the name "paradetox-preprocess".
add_full_stops_dataset.push_to_hub("paradetox-preprocess")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/508 [00:00<?, ?B/s]