In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch
from torch.utils.data import Dataset

# TRAINING

## PREPROCESSING

In [None]:
# Load the raw CADD records, keep only comments that fit into BERT's input
# window, and collapse the six level columns into one binary label.

# BERT's position embeddings cover 512 positions, special tokens included.
MAX_SEQ_LEN = 512
LEVEL_COLS = ["L1", "L2", "L3", "L4", "L5", "L6"]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", local_files_only = True)
df = (pd.read_json('CADD.json')).transpose()

# tokenize() returns the wordpieces only; [CLS] and [SEP] are added later at
# encoding time, so they have to be subtracted from the budget here. Comparing
# against 512 directly would let through texts that encode to 513-514 ids.
max_wordpieces = MAX_SEQ_LEN - tokenizer.num_special_tokens_to_add(pair = False)
n_wordpieces = df["comment"].apply(lambda comment: len(tokenizer.tokenize(comment)))
# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the unfiltered one.
df = df[n_wordpieces <= max_wordpieces].copy()

# Label is 0 only when every level column holds the string "0", otherwise 1.
# int64 is spelled out so the labels become torch long tensors on every platform.
df["label"] = (df[LEVEL_COLS] != "0").any(axis = 1).astype("int64")

df = df.rename(columns = {"comment": "text"})
df = df.drop(columns = ["title", "body"] + LEVEL_COLS)

In [None]:
# Draw a balanced subset (same number of rows per class) and split each class
# 80/20 into train and eval parts.

SEED = 42            # fixed so the subset and the split are identical on every run
N_PER_CLASS = 5000   # rows drawn from each class
TRAIN_FRAC = 0.8     # share of each class subset that goes to training


def sample_and_split(frame, label, n = N_PER_CLASS, train_frac = TRAIN_FRAC, seed = SEED):
    """Sample `n` rows with the given label and split them into train/eval.

    Args:
        frame: DataFrame with a "label" column.
        label: Label value selecting the class.
        n: Number of rows to draw; pandas raises ValueError if the class has
            fewer rows than this.
        train_frac: Fraction of the drawn rows assigned to the train part.
        seed: random_state passed to both sampling steps.

    Returns:
        (subset, train, held_out): the drawn rows, the train part, and the
        rows of the subset whose index is not in the train part.
    """
    subset = frame[frame["label"] == label].sample(n = n, random_state = seed)
    train = subset.sample(n = int(train_frac * len(subset)), random_state = seed)
    held_out = subset[~subset.index.isin(train.index)]
    return subset, train, held_out


df_abusive0, df_abusive0_train, df_abusive0_eval = sample_and_split(df, 0)
df_abusive1, df_abusive1_train, df_abusive1_eval = sample_and_split(df, 1)

In [None]:
def _shuffled(frames):
    """Stack the frames, then return all rows in random order with a fresh 0..n-1 index."""
    stacked = pd.concat(frames)
    return stacked.sample(frac = 1).reset_index(drop = True)


# Merge the per-class parts into one train frame and one eval frame.
df_train = _shuffled([df_abusive0_train, df_abusive1_train])
df_eval = _shuffled([df_abusive0_eval, df_abusive1_eval])

# Write both frames to disk via torch.save.
for frame, path in ((df_train, 'df_train.pt'), (df_eval, 'df_eval.pt')):
    torch.save(frame, path)

In [None]:
# Rebuild the tokenizer and read the two frames back from disk.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", local_files_only = True)

# NOTE(review): torch.load unpickles the file, so only load files you created
# yourself. Recent PyTorch releases reportedly default to weights_only=True,
# which may refuse a pickled DataFrame - confirm against the installed version.
df_train, df_eval = (torch.load(path) for path in ('df_train.pt', 'df_eval.pt'))

class CADD(Dataset):
    """Torch dataset that encodes the "text" column of a frame and pairs it with "label".

    Args:
        df: Frame with a "text" column and a "label" column. Rows are read in
            positional order, so the index does not need to be 0..n-1.
        text_tokenizer: Callable used to encode each text. It must accept the
            ``truncation`` and ``max_length`` keywords and return a mapping with
            "input_ids" and "attention_mask". When omitted, the notebook-level
            ``tokenizer`` is used.
        max_length: Upper bound on the encoded length (special tokens included);
            longer texts are truncated instead of overflowing the model.

    Attributes are plain Python lists aligned by position:
    ``text`` (raw strings), ``labels`` and ``tokens`` (one encoding per row).
    """

    def __init__(self, df, text_tokenizer = None, max_length = 512):
        if text_tokenizer is None:
            # Fall back to the tokenizer defined at notebook level.
            text_tokenizer = tokenizer
        self.text = []
        self.labels = []
        self.tokens = []
        # zip over the columns walks the rows by position; label-based
        # df["text"][i] would raise KeyError on a frame with a non-default index.
        for text, label in zip(df["text"], df["label"]):
            self.text.append(text)
            self.labels.append(label)
            self.tokens.append(text_tokenizer(text, truncation = True, max_length = max_length))

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return one example as a dict of tensors: input_ids, attention_mask, labels.

        The tensors are unpadded, so batching needs a padding collator.
        """
        encoded = self.tokens[item]
        return {
            "input_ids": torch.tensor(encoded["input_ids"]),
            "attention_mask": torch.tensor(encoded["attention_mask"]),
            "labels": torch.tensor(self.labels[item]),
        }

# Wrap the train frame first, then the eval frame, in the CADD dataset.
dataset_train, dataset_eval = (CADD(frame) for frame in (df_train, df_eval))

## FINE-TUNING

In [None]:
# Load the pretrained tokenizer and classification model from the local cache.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", local_files_only = True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", local_files_only = True)

# All tunable training settings in one place.
training_config = {
    "output_dir": "model",
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 1,
    "num_train_epochs": 1,
    "learning_rate": 0.0001,
    "weight_decay": 0.1,
    "save_strategy": "no",             # no intermediate checkpoints
    "logging_strategy": "epoch",
    "evaluation_strategy": "epoch",
}
args = TrainingArguments(**training_config)

# Hand the model, tokenizer, settings and both datasets to the Trainer.
trainer = Trainer(
    model = model,
    tokenizer = tokenizer,
    args = args,
    train_dataset = dataset_train,
    eval_dataset = dataset_eval,
)

In [None]:
# Run fine-tuning with the settings held by `trainer`.
trainer.train()
# Persist the fine-tuned model. Called without a path, this presumably writes to
# the Trainer's output_dir ("model"), which the inference cell loads from.
trainer.save_model()

# INFERENCE

In [2]:
# Load the tokenizer and classifier from the local "model" directory.
MODEL_DIR = "model"
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR, local_files_only = True)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only = True)

In [24]:
# Encode one sentence, run the classifier, and turn the logits into probabilities.
sentence = "He's an engineer, he solves practical problems."
inputs = tokenizer.encode(sentence, return_tensors = "pt")

outputs = model(inputs)
logits = outputs.logits
outputs_normalized = torch.nn.functional.softmax(logits, dim = -1)

# Last expression of the cell: displays the per-class probabilities.
outputs_normalized

tensor([[0.9817, 0.0183]], grad_fn=<SoftmaxBackward0>)

In [25]:
# Second example: same pipeline applied to another sentence.
sentence = "He's a serial killer, he kills women and children for mere pleasure."
inputs = tokenizer.encode(sentence, return_tensors = "pt")

outputs = model(inputs)
logits = outputs.logits
outputs_normalized = torch.nn.functional.softmax(logits, dim = -1)

# Last expression of the cell: displays the per-class probabilities.
outputs_normalized

tensor([[0.0911, 0.9089]], grad_fn=<SoftmaxBackward0>)