# Preparing the environment

In [None]:
# Pin transformers to 2.11.0, the version this notebook installs and runs against.
!pip install transformers==2.11.0

In [None]:
# Report the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
import numpy as np
import math
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
import torch
from transformers import *
import pandas as pd
import re

In [None]:
from tqdm.auto import trange, tqdm
from sklearn.metrics import f1_score, accuracy_score, classification_report,precision_score

In [None]:
# Move into the working directory: the CSV files below are read, and the model file is written, with relative paths.
# Replace the placeholder with your own path.
cd "Your_Current_Directory" #/content/drive/My Drive/Colab_Notebooks/toxicity/wiki-lda-share/' 

# Preparing the sub-category of the Wiki dataset for training


In [None]:
# Partition of the ids 0-19 (presumably LDA topic ids, see the 'wiki_topic'
# column in the disabled code below) into three categories keyed 1, 2 and 3.
topic_categories={1:[0,1],
                  2:[2,7,8,9,12,14,16],
                  3:[3,4,5,6,10,11,13,15,17,18,19]}

# Wiki comments; the rest of the notebook reads the 'comment' and 'toxicity' columns.
comments_org = pd.read_csv('wiki_lda_topics_lda_probabilities.csv')
# Disabled sub-sampling by topic category.
# NOTE(review): the first line indexes topic_categories[0], but the dict above
# only has keys 1-3 -- fix the key before re-enabling.
#comments_1 = comments_org[comments_org['wiki_topic'].isin(topic_categories[0])]
#comments_2 = comments_org[comments_org['wiki_topic'].isin(topic_categories[1]+topic_categories[3])][comments_org['toxicity']==0 ].sample(random_state = 100, n = 20000)
#comments_3 = comments_org[comments_org['wiki_topic'].isin(topic_categories[1]+topic_categories[3])][comments_org['toxicity']==1 ]
#comments =  pd.concat([comments_1,comments_2, comments_3])
# With the sub-sampling disabled, the full dataframe is used for training.
comments = comments_org
# Cell output: number of rows with toxicity == 0 and with toxicity == 1.
len(comments[comments['toxicity']==0]), len(comments[comments['toxicity']==1])

# Functions for data processing, training, and evaluation

In [None]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
pretrained_weights = 'bert-base-uncased'
# Shared tokenizer; it is also bound as the default `tokenizer` argument of
# get_dataloader_from_dataframe, so this cell must run before that definition.
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

In [None]:
def load_dataset(features):
  """Pack per-example features into a TensorDataset of long tensors.

  Each element of `features` must expose `input_ids`, `attention_mask`,
  `token_type_ids` and `label`. The returned dataset yields tuples in that
  order: (input_ids, attention_mask, token_type_ids, label).
  """
  def as_long_tensor(field):
    # One tensor per field, gathered across all examples in their given order.
    return torch.tensor([getattr(feature, field) for feature in features], dtype=torch.long)

  input_ids = as_long_tensor('input_ids')
  attention_mask = as_long_tensor('attention_mask')
  token_type_ids = as_long_tensor('token_type_ids')
  labels = as_long_tensor('label')
  return TensorDataset(input_ids, attention_mask, token_type_ids, labels)

In [None]:
def convert_examples_to_features(examples, tokenizer, max_length=128):
  """Tokenize examples in one batch and wrap each encoding in an InputFeatures.

  Args:
    examples: Sequence of objects exposing `text_a`, `text_b` and `label`.
      It is iterated more than once, so it must not be a one-shot iterator.
    tokenizer: Tokenizer providing `batch_encode_plus`.
    max_length: Length every sequence is padded/truncated to. Defaults to
      128, the value this function previously hard-coded.

  Returns:
    A list with one InputFeatures per example, in input order, built from
    every key of the batch encoding plus the example's label.
  """
  batch_encoding = tokenizer.batch_encode_plus(
         [(example.text_a, example.text_b) for example in examples],
         max_length=max_length, pad_to_max_length=True, truncation=True,
    )

  features = []
  for i, example in enumerate(examples):
      # Slice row i out of every encoded field (input_ids, attention_mask, ...).
      inputs = {k: batch_encoding[k][i] for k in batch_encoding}
      features.append(InputFeatures(**inputs, label=example.label))

  return features

In [None]:
def get_dataloader_from_dataframe(dataframe, mode, batch_size = 16, tokenizer = tokenizer  ):
  """Turn a dataframe with 'comment' and 'toxicity' columns into a DataLoader.

  Every row becomes an InputExample (guid = row index, text_a = comment,
  label = int(toxicity)); the examples are tokenized and packed into a
  tensor dataset. Batches are shuffled only when `mode` is 'train'; any
  other value keeps dataframe order.
  """
  examples = [
      InputExample(guid = index, text_a = record['comment'], label=int(record['toxicity']))
      for index, record in dataframe.iterrows()
  ]
  dataset = load_dataset(convert_examples_to_features(examples, tokenizer))
  return DataLoader(dataset, batch_size=batch_size, shuffle=(mode == 'train'))

In [None]:
class ToxicityClassifier:
  """Fine-tunes a pretrained `bert-base-uncased` sequence classifier."""

  def __init__(self, train_dataloader = None, num_labels = 2):
    """Load the pretrained model and move it to the GPU when one is available.

    Args:
      train_dataloader: Default dataloader used by `train`. Batches are
        indexed as (input_ids, attention_mask, token_type_ids, labels).
      num_labels: Number of output classes of the classification head.
    """
    self.train_dataloader = train_dataloader
    self.num_labels = num_labels
    # Always a torch.device, whichever backend is picked.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Forward num_labels so the head size follows the constructor argument
    # instead of silently staying at the library default.
    self.model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=self.num_labels)
    # Number of batches whose gradients are summed before one optimizer step.
    self.gradient_accumulation_steps = 1
    self.learning_rate = 2e-5
    self.model.to(self.device)
    # Not read or written anywhere else in this class.
    self.precisions = None

  def train(self, train_dataloader=None,num_train_epochs = 2 ):
    """Fine-tune the model with AdamW and a linear warmup/decay schedule.

    Args:
      train_dataloader: Dataloader to train on; falls back to the one given
        to the constructor when omitted.
      num_train_epochs: Number of passes over the dataloader.

    Returns:
      Tuple `(global_step, mean_loss)`: the number of optimizer steps taken
      and the mean per-batch loss of the last epoch (0.0 if no epoch ran).

    Raises:
      ValueError: If no dataloader was given here or to the constructor.
    """
    if train_dataloader is None:
      train_dataloader = self.train_dataloader
    if train_dataloader is None:
      raise ValueError(
          "No training dataloader: pass one to train() or to the constructor.")

    # Prepare optimizer and schedule (linear warmup and decay)
    param_optimizer = list(self.model.named_parameters())
    # Parameters whose name contains one of these get no weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    # Total number of optimizer steps over the whole run.
    t_total = len(train_dataloader) // self.gradient_accumulation_steps * num_train_epochs

    optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=1e-8)
    # Warm up over the first 6% of the optimizer steps.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=math.ceil(t_total * 0.06), num_training_steps=t_total)

    global_step = 0
    epoch_loss = 0.0  # keeps the return value defined when no epoch runs

    self.model.zero_grad()
    self.model.train()
    for epoch_num in trange(int(num_train_epochs)):
        print('epoch #', epoch_num)
        tr_loss = 0
        nb_tr_steps = 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(self.device) for t in batch)

            inputs = {
                "input_ids":      batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels":         batch[3]
            }
            # The first element of the model output is used as the loss.
            loss = self.model(**inputs)[0]

            if self.gradient_accumulation_steps > 1:
                loss = loss / self.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            nb_tr_steps += 1
            if (step + 1) % self.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                self.model.zero_grad()
                global_step += 1

        epoch_loss = tr_loss / nb_tr_steps
        print('train_loss = ',epoch_loss )

    return global_step, epoch_loss

    

In [None]:
def test_and_eval(model,test_dataloader ):
   
    model.eval()
    if torch.cuda.is_available():
          device = torch.device("cuda")
    else:
        device = "cpu" 
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids":      batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels":         batch[3]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds

    preds = np.argmax(preds, axis=1)
    #print('\n\n', classification_report(out_label_ids, preds))
    return preds

# Train the classifier and save the model

In [None]:

# Tokenize the whole `comments` dataframe and wrap it in a shuffled training dataloader.
train_dataloader = get_dataloader_from_dataframe(comments, mode='train', batch_size = 16, tokenizer = tokenizer )

In [None]:
# Loads the pretrained BERT weights and moves the model to the GPU when one is available.
wiki_toxic = ToxicityClassifier(train_dataloader = train_dataloader)

In [None]:
# Fine-tune on the dataloader given above with the default of 2 epochs.
wiki_toxic.train() 

In [None]:
# If the model is wrapped in an object exposing `.module`, save the inner
# model; only the state dict is written, not the full model object.
model_to_save = wiki_toxic.model.module if hasattr(wiki_toxic.model, 'module') else wiki_toxic.model  
output_model_file = "wiki_bert_2_epoch.bin"
torch.save(model_to_save.state_dict(), output_model_file)

# Test on the Waseem dataset

In [None]:
# Waseem data; this notebook reads its 'comment', 'toxicity' and 'Annotation' columns.
waseem_df = pd.read_csv('waseem_wiki_lda_topics_lda_probabilities.csv')

In [None]:
# Batch size for evaluation; reused for the Founta dataloader further down.
test_batch_size = 128
# mode='eval' keeps dataframe order (no shuffling), so predictions line up with rows.
waseem_dataloader = get_dataloader_from_dataframe(waseem_df, mode='eval', batch_size = test_batch_size, tokenizer = tokenizer  )

In [None]:
# One predicted class index per row, from the Wiki-trained model.
waseem_preds = test_and_eval(wiki_toxic.model,test_dataloader = waseem_dataloader)

In [None]:
# Store the predictions next to the dataset's own 'toxicity' column.
waseem_df['wiki_toxicity'] = waseem_preds

In [None]:
# Accuracy of 'wiki_toxicity' against 'toxicity' on the rows annotated 'sexism'.
# The trailing comment is a disabled extra filter by topic category.
sexism= waseem_df[waseem_df['Annotation']=='sexism']#[waseem_df['wiki_topic'].isin(topic_categories[1])]
accuracy_score(sexism['toxicity'].tolist(), sexism['wiki_toxicity'].tolist())

In [None]:
# Same measurement on the rows annotated 'racism'.
racism= waseem_df[waseem_df['Annotation']=='racism']#[waseem_df['wiki_topic'].isin(topic_categories[1])]
accuracy_score(racism['toxicity'].tolist(), racism['wiki_toxicity'].tolist())

# Test on the Founta dataset

In [None]:
# Founta data; this notebook reads its 'comment', 'toxicity' and 'label' columns.
Founta_df = pd.read_csv('Founta_wiki_lda_topics_lda_probabilities.csv')

In [None]:
# Uses test_batch_size from the Waseem section; mode='eval' keeps dataframe order.
Founta_dataloader = get_dataloader_from_dataframe(Founta_df, mode='eval', batch_size = test_batch_size, tokenizer = tokenizer  )

In [None]:
# One predicted class index per row, from the Wiki-trained model.
Founta_preds = test_and_eval(wiki_toxic.model,test_dataloader = Founta_dataloader)

In [None]:
# Store the predictions next to the dataset's own 'toxicity' column.
Founta_df['wiki_toxicity'] = Founta_preds

In [None]:
# Accuracy of 'wiki_toxicity' against 'toxicity' on the rows labelled 'abusive'.
# NOTE(review): the disabled trailing filters in this section still reference
# waseem_df (copied from the Waseem cells) -- presumably they should use
# Founta_df; confirm before re-enabling.
abusive= Founta_df[Founta_df['label']=='abusive']#[waseem_df['wiki_topic'].isin(topic_categories[1])]
accuracy_score(abusive['toxicity'].tolist(), abusive['wiki_toxicity'].tolist())

In [None]:
# Same measurement on the rows labelled 'hateful'.
hateful= Founta_df[Founta_df['label']=='hateful']#[waseem_df['wiki_topic'].isin(topic_categories[1])]
accuracy_score(hateful['toxicity'].tolist(), hateful['wiki_toxicity'].tolist())

In [None]:
# Same measurement on the rows labelled 'normal'.
normal= Founta_df[Founta_df['label']=='normal']#[waseem_df['wiki_topic'].isin(topic_categories[1])]
accuracy_score(normal['toxicity'].tolist(), normal['wiki_toxicity'].tolist())