<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/%20bert-base-paradetox-editOps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]

In [None]:
from datasets import load_dataset

dataset = load_dataset("HamdanXI/paradetox_with_editOps")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'edit_ops'],
        num_rows: 19744
    })
})

In [32]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def find_max_lengths(toxic_comments, neutral_comments, edit_operations):
    max_len_toxic = max(len(tokenizer.encode(comment)) for comment in toxic_comments)
    max_len_neutral = max(len(tokenizer.encode(comment)) for comment in neutral_comments)
    max_len_ops = max(len(tokenizer.encode(" ".join(ops))) for ops in edit_operations)
    return max_len_toxic, max_len_neutral, max_len_ops

In [21]:
toxic_comments = [item['en_toxic_comment'] for item in dataset['train']]
neutral_comment = [item['en_neutral_comment'] for item in dataset['train']]
edit_operations = []

In [29]:
for item in dataset['train']:
    ops_as_string = ' '.join([' '.join(op) for op in item['edit_ops']])
    edit_operations.append(ops_as_string)

In [33]:
max_len_toxic, max_len_neutral, max_len_ops = find_max_lengths(toxic_comments, neutral_comment, edit_operations)

print(f"Maximum length for toxic comments: {max_len_toxic}")
print(f"Maximum length for neutral comments: {max_len_neutral}")
print(f"Maximum length for edit operations: {max_len_ops}")

Maximum length for toxic comments: 35
Maximum length for neutral comments: 35
Maximum length for edit operations: 197


In [34]:
def preprocess_data(samples):
    processed_comments = []
    labels = []

    for sample in samples:
        toxic_comment = sample['en_toxic_comment']
        edit_ops = sample['edit_ops']

        words = toxic_comment.split()

        for operation in sorted(edit_ops, key=lambda op: int(op[2]), reverse=True):
            op_type, text, index = operation[:3]
            index = int(index)

            if op_type == "replace":
                words[index:index+len(text.split())] = ['[MASK]']
            elif op_type == "delete":
                del words[index:index+len(text.split())]
            elif op_type == "insert":
                words.insert(index, '[INSERT]')

        masked_comment = ' '.join(words)

        encoded_comment = tokenizer.encode_plus(
            masked_comment,
            max_length=35,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        label_ids = tokenizer.encode_plus(
            sample['en_neutral_comment'],
            max_length=197,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )['input_ids']

        processed_comments.append(encoded_comment)
        labels.append(label_ids)

    return processed_comments, labels

In [52]:
processed_comments, labels = preprocess_data(dataset["train"])

In [51]:
from transformers import BertForMaskedLM

model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:
import torch
from torch.utils.data import DataLoader, TensorDataset

input_ids = torch.cat([c['input_ids'] for c in processed_comments], dim=0)
attention_mask = torch.cat([c['attention_mask'] for c in processed_comments], dim=0)
labels_prepared = torch.cat(labels, dim=0).squeeze()

labels_prepared = labels_prepared[:, :input_ids.size(1)]

dataset_tensor = TensorDataset(input_ids, attention_mask, labels_prepared)
loader = DataLoader(dataset_tensor, batch_size=8, shuffle=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [54]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(3):
    for batch in loader:
        b_input_ids, b_attention_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)
        b_labels = b_labels.to(device)

        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item()}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loss: 16.883323669433594
Loss: 16.37207794189453
Loss: 16.718679428100586
Loss: 15.485368728637695
Loss: 15.072178840637207
Loss: 14.309438705444336
Loss: 17.332473754882812
Loss: 16.541135787963867
Loss: 16.062772750854492
Loss: 15.112016677856445
Loss: 15.812726974487305
Loss: 15.689651489257812
Loss: 14.979300498962402
Loss: 16.433916091918945
Loss: 16.496143341064453
Loss: 15.119895935058594
Loss: 14.374653816223145
Loss: 16.82137680053711
Loss: 15.846787452697754
Loss: 14.982580184936523
Loss: 15.371192932128906
Loss: 15.510007858276367
Loss: 15.766119956970215
Loss: 15.708847045898438
Loss: 16.58591079711914
Loss: 16.034439086914062
Loss: 15.165935516357422
Loss: 15.82550048828125
Loss: 14.828428268432617
Loss: 14.717405319213867
Loss: 15.000475883483887
Loss: 15.998579025268555
Loss: 17.111412048339844
Loss: 16.880207061767578
Loss: 17.50817108154297
Loss: 16.646677017211914
Loss: 16.43191909790039
Loss: 15.0450019

In [55]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [56]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [57]:
model.save_pretrained('bert-base-paradetox-editOps')
tokenizer.save_pretrained('bert-base-paradetox-editOps')

model.push_to_hub('bert-base-paradetox-editOps')
tokenizer.push_to_hub('bert-base-paradetox-editOps')

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HamdanXI/bert-base-paradetox-editOps/commit/87fd610ec10174710531f1c0d036f2d269637211', commit_message='Upload tokenizer', commit_description='', oid='87fd610ec10174710531f1c0d036f2d269637211', pr_url=None, pr_revision=None, pr_num=None)