<a href="https://colab.research.google.com/github/GiovanniSorice/Hate_Speech_Detection/blob/main/AlBerto_with_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AlBERTo Hate Speech Classifier 

In [1]:
!pip install ekphrasis
!pip install transformers
import datetime
import sys
import random
import time
import warnings
warnings.filterwarnings("ignore")

#for code working
import tensorflow as tf
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np
import pandas as pd 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset,DataLoader, RandomSampler, SequentialSampler
import torch
from torch import tensor
import torch.nn as nn
from google.colab import drive
from sklearn.metrics import f1_score

#Prepare and import BERT modules
import subprocess
subprocess.call(["git", "clone", "https://github.com/google-research/bert","bert_repo"])

if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']

import modeling
import tokenization



In [2]:
# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


## Load training dataset 

In [4]:
# directory name 
input_test_dir = "/content/drive/My Drive/HLT/dataset_test_evalita/"
model_path = '/content/drive/MyDrive/HLT/alberto/'
input_dir_not_clean = '/content/drive/My Drive/HLT/dataset_training/' 
# Spec
pd.set_option("display.max_colwidth", None)

In [5]:
raw_tsv_file = open(input_dir_not_clean+"haspeede2_dev_taskAB.tsv")
raw_dataset = pd.read_csv(raw_tsv_file,sep='\t')
raw_dataset.rename(columns={"text ": "text"}, inplace=True)

# Inizialize parameters

In [6]:
OUTPUT_DIR = model_path + 'output'
#SET THE PARAMETERS
MAX_SEQ_LENGTH = 64
LEARNING_RATE = 2e-5
label_list = [0, 1]

#SET THE PARAMETERS FOR TRAINING 
BATCH_SIZE = 32
WARMUP_PROPORTION = 0.1
# Model configs

# Preprocessing sentences

In [7]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag"},
    fix_html=True,  # fix HTML tokens

    unpack_hashtags=True,  # perform word segmentation on hashtags

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

class AlBERTo_Preprocessing(object):
    def __init__(self, do_lower_case=True, **kwargs):
        self.do_lower_case = do_lower_case

    def preprocess(self, text):
        if self.do_lower_case:
            text = text.lower()
        text = str(" ".join(text_processor.pre_process_doc(text)))
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)
        return text

Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...


In [8]:
AlBERTo_Preprocess = AlBERTo_Preprocessing(do_lower_case=True)
def preprocess(text):
    return AlBERTo_Preprocess.preprocess(text)

In [9]:
raw_dataset['text_preprocessed'] = raw_dataset['text'].apply(preprocess)

In [10]:
X_train_kfold_values = list(raw_dataset['text_preprocessed'].values)

# Tokenizer 

In [11]:
tokenizer = BertTokenizer.from_pretrained('m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0')

In [12]:
encoding_dataset_kfold = tokenizer(X_train_kfold_values, padding=True, truncation=True, max_length=MAX_SEQ_LENGTH)

In [13]:
input_ids_dataset_kfold = torch.tensor(encoding_dataset_kfold['input_ids'])
attention_mask_dataset_kfold  = torch.tensor(encoding_dataset_kfold['attention_mask'])

In [14]:
# Convert other data types to torch.Tensor
dataset_kfold_labels = torch.tensor(raw_dataset['hs'].values)

# Define the model and support functions

## BertClassifier

In [15]:
# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, Hidden_1, Hidden_2, D_out = 768, 256, 64, 2

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, Hidden_1),
            nn.ReLU(),
            nn.Dropout(0.7),
            nn.Linear(Hidden_1, Hidden_2),
            nn.ReLU(),
            nn.Dropout(0.7),
            nn.Linear(Hidden_2, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

    def freeze(self, freeze_bert=False):
        # Freeze or unfreeze the BERT model
        for param in self.bert.parameters():
            param.requires_grad = not freeze_bert


In [16]:
def initialize_model(len_data, epochs=4, num_warmup_steps = 0):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=LEARNING_RATE,  
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len_data * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

## Train and evaluate function

In [17]:
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def f1_score_func(preds, labels):
    preds_copy = torch.tensor(preds)
    preds_flat = np.argmax(preds_copy.cpu(), axis=1).flatten()
    labels_flat = labels.cpu().flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def set_seed(seed_value=128, random_seed = False):
    """Set seed for reproducibility.
    """
    if not random_seed:
      random.seed(seed_value)
      np.random.seed(seed_value)
      torch.manual_seed(seed_value)
      torch.cuda.manual_seed_all(seed_value)
    else:
      random.seed()
      np.random.seed()
      torch.manual_seed(random.getrandbits(32))
      torch.cuda.manual_seed_all(random.getrandbits(32))


def train(model, train_dataloader, optimizer, scheduler, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'F1 Train':^9} | {'Val Loss':^10} | {'Val Acc':^9} | {'F1 Val':^9} | {'Elapsed':^9}")
        print("-"*95)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts, f1_value_train_batch, f1_value_train_tot  = 0, 0, 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            #b_labels = b_labels *1.0
            #b_labels = b_labels.unsqueeze(1)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            f1_value_train_batch+= f1_score_func(logits, b_labels) 
            f1_value_train_tot+= f1_score_func(logits, b_labels) 

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {f1_value_train_batch / batch_counts:^9.2f} | {'-':^10} | {'-':^9} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts, f1_value_train_batch = 0, 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        avg_f1_value = f1_value_train_tot / len(train_dataloader)

        print("-"*95)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy, f1_value_validation = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {avg_f1_value:^9.2f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {f1_value_validation:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*95)
        print("\n")
    
    print("Training complete!")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    b_input_ids = torch.tensor([], dtype=torch.long)
    b_attn_mask = torch.tensor([], dtype=torch.long)
    b_labels = torch.tensor([],dtype=torch.long)

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids_curr, b_attn_mask_curr, b_labels_curr = tuple(t for t in batch)

        # Concat input and labels
        b_input_ids = torch.cat((b_input_ids, b_input_ids_curr), 0)
        b_attn_mask = torch.cat((b_attn_mask, b_attn_mask_curr), 0)
        b_labels = torch.cat((b_labels, b_labels_curr), 0)

    b_input_ids = b_input_ids.to(device)
    b_attn_mask = b_attn_mask.to(device)

    # Compute logits
    with torch.no_grad():
        logits = model(b_input_ids, b_attn_mask)

    del b_input_ids
    del b_attn_mask
    torch.cuda.empty_cache()
    
    # Compute loss
    b_labels = b_labels.to(device)
    loss = loss_fn(logits, b_labels)
    val_loss = loss.item()

    # Get the predictions
    preds = torch.argmax(logits, dim=1).flatten()

    # Calculate the accuracy rate
    val_accuracy = (preds == b_labels).cpu().numpy().mean() * 100
    f1_value = f1_score_func(logits, b_labels) * 100

    # Compute the average accuracy and loss over the validation set.
    return val_loss, val_accuracy, f1_value

def evaluate_for_kfold(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    preds = []
    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_value = []
    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds+=(torch.argmax(logits, dim=1).flatten().tolist())
    
    return preds

## k-fold cross-validation function

In [18]:
def train_test_model_with_kfold(hparams):
  number_of_splits = 4
  cv_kfold = StratifiedKFold(n_splits=number_of_splits, shuffle=True, random_state=100)
  models = []
  epoch_full_model = 3
  epoch_dense_part = 7
  for train_index, validation_index in cv_kfold.split(input_ids_dataset_kfold, dataset_kfold_labels):
    set_seed(random_seed=True)
    num_train_steps = int(len(input_ids_dataset_kfold) / BATCH_SIZE * (epoch_full_model+epoch_dense_part))+1
    num_warmup_steps = int((epoch_full_model+epoch_dense_part) * WARMUP_PROPORTION)
    bert_classifier, optimizer, scheduler = initialize_model(epochs=(epoch_full_model+epoch_dense_part), num_warmup_steps= num_warmup_steps, len_data = len(input_ids_dataset_kfold))

    # Create the DataLoader for our training set
    train_data_kfold = TensorDataset(input_ids_dataset_kfold[train_index], attention_mask_dataset_kfold[train_index], dataset_kfold_labels[train_index])
    train_sampler_kfold = RandomSampler(train_data_kfold)
    train_dataloader_kfold = DataLoader(train_data_kfold, sampler=train_sampler_kfold, batch_size=BATCH_SIZE)

    # Create the DataLoader for our validation set
    val_data_kfold = TensorDataset(input_ids_dataset_kfold[validation_index], attention_mask_dataset_kfold[validation_index], dataset_kfold_labels[validation_index])
    val_sampler_kfold = SequentialSampler(val_data_kfold)
    val_dataloader_kfold = DataLoader(val_data_kfold, sampler=val_sampler_kfold, batch_size=BATCH_SIZE)

    train(bert_classifier, train_dataloader_kfold, optimizer, scheduler,val_dataloader_kfold, epochs=epoch_full_model, evaluation=True,)

    bert_classifier.freeze(True)

    train(bert_classifier, train_dataloader_kfold, optimizer, scheduler, val_dataloader_kfold, epochs=epoch_dense_part, evaluation=True)
    bert_classifier.to("cpu")
    models.append(bert_classifier)

  return models

In [19]:
def predict_with_ensemble(models, test_dataloader):
  # make predictions
  results = []
  y_predict = []
  for model in models:
    model.to(device)
    y_predict.append(evaluate_for_kfold(model, test_dataloader))
    model.to("cpu")

  #y_predict = [np.squeeze(evaluate_for_kfold(model, test_dataloader)) for model in models]
  # sum across ensemble members
  y_predict = np.array(y_predict)

  for i in range(y_predict.shape[1]):
    counts = np.bincount(y_predict[:,i])
    results.append(np.argmax(counts))
  # argmax across classes
  return results

In [20]:
def run_with_kfold(hparams = None):

    models = train_test_model_with_kfold(hparams)
    y_test_pred_tweets = predict_with_ensemble(models, test_tweets_dataloader)
    y_test_pred_news = predict_with_ensemble(models, test_news_dataloader)

    print("f1_score test tweets: {}".format(f1_score(test_tweets_labels, y_test_pred_tweets,average="macro")))
    print("f1_score test news: {}".format(f1_score(test_news_labels, y_test_pred_news,average="macro")))
    return models

# Load and tokenization test datasets

In [21]:
csv_test_tweets_file = open(input_test_dir+"haspeede2_reference_taskAB-tweets.tsv")

testset_tweets = pd.read_csv(csv_test_tweets_file,sep='\t', header=None)
testset_tweets.rename(columns={0: "id"}, inplace=True)
testset_tweets.rename(columns={1: "text"}, inplace=True)
testset_tweets.rename(columns={2: "hs"}, inplace=True)
testset_tweets.rename(columns={3: "stereotype"}, inplace=True)

csv_test_news_file = open(input_test_dir+"haspeede2_reference_taskAB-news.tsv")

testset_news = pd.read_csv(csv_test_news_file,sep='\t', header=None)
testset_news.rename(columns={0: "id"}, inplace=True)
testset_news.rename(columns={1: "text"}, inplace=True)
testset_news.rename(columns={2: "hs"}, inplace=True)
testset_news.rename(columns={3: "stereotype"}, inplace=True)


In [22]:
testset_tweets['text_preprocessed'] = testset_tweets['text'].apply(preprocess)
testset_news['text_preprocessed'] = testset_news['text'].apply(preprocess)

In [23]:
encoding_test_tweets = tokenizer(list(testset_tweets["text_preprocessed"].values), padding=True, truncation=True, max_length=64)
encoding_test_news = tokenizer(list(testset_news["text_preprocessed"].values), padding=True, truncation=True, max_length=64)

In [24]:
input_ids_test_tweets = torch.tensor(encoding_test_tweets['input_ids'])
attention_mask_test_tweets = torch.tensor(encoding_test_tweets['attention_mask'])
input_ids_test_news = torch.tensor(encoding_test_news['input_ids'])
attention_mask_test_news = torch.tensor(encoding_test_news['attention_mask'])

In [25]:
# Convert other data types to torch.Tensor
test_tweets_labels = torch.tensor(list(testset_tweets["hs"].values))
test_news_labels = torch.tensor(list(testset_news["hs"].values))

# Create the DataLoader for our training set
test_tweets_data = TensorDataset(input_ids_test_tweets, attention_mask_test_tweets, test_tweets_labels)
test_tweets_sampler = SequentialSampler(test_tweets_data)
test_tweets_dataloader = DataLoader(test_tweets_data, sampler=test_tweets_sampler, batch_size=len(test_tweets_data))

# Create the DataLoader for our validation set
test_news_data = TensorDataset(input_ids_test_news, attention_mask_test_news, test_news_labels)
test_news_sampler = SequentialSampler(test_news_data)
test_news_dataloader = DataLoader(test_news_data, sampler=test_news_sampler, batch_size=len(test_tweets_data))

# Run k-fold cross validation

In [26]:
models = run_with_kfold()

Start training...

 Epoch  |  Batch  |  Train Loss  | F1 Train  |  Val Loss  |  Val Acc  |  F1 Val   |  Elapsed 
-----------------------------------------------------------------------------------------------
   1    |   20    |   0.702577   |   0.50    |     -      |     -     |     -     |   4.64   
   1    |   40    |   0.674774   |   0.57    |     -      |     -     |     -     |   4.40   
   1    |   60    |   0.635106   |   0.62    |     -      |     -     |     -     |   4.42   
   1    |   80    |   0.634515   |   0.62    |     -      |     -     |     -     |   4.41   
   1    |   100   |   0.609636   |   0.63    |     -      |     -     |     -     |   4.41   
   1    |   120   |   0.581992   |   0.67    |     -      |     -     |     -     |   4.41   
   1    |   140   |   0.523572   |   0.73    |     -      |     -     |     -     |   4.41   
   1    |   160   |   0.590075   |   0.71    |     -      |     -     |     -     |   4.29   
---------------------------------------

## Results for each model

In [27]:
for i in range(len(models)):
  models[i].to(device)
  val_loss_tweets, val_accuracy_tweets, f1_value_tweets = evaluate(models[i], test_tweets_dataloader)
  val_loss_news, val_accuracy_news, f1_value_news = evaluate(models[i], test_news_dataloader)
  models[i].to('cpu')
  print("Bert model {0} tweets testset result => Loss: {1} Accuracy: {2} F1 score: {3}".format(i, val_loss_tweets, val_accuracy_tweets, f1_value_tweets))
  print("Bert model {0} news testset result => Loss: {1} Accuracy: {2} F1 score: {3}".format(i, val_loss_news, val_accuracy_news, f1_value_news))


Bert model 0 tweets testset result => Loss: 0.6804441809654236 Accuracy: 73.00079176563737 F1 score: 72.83390410493762
Bert model 0 news testset result => Loss: 0.9204741716384888 Accuracy: 74.6 F1 score: 72.19597245732034
Bert model 1 tweets testset result => Loss: 0.6492424607276917 Accuracy: 73.39667458432304 F1 score: 73.28976793944528
Bert model 1 news testset result => Loss: 0.6158900260925293 Accuracy: 78.2 F1 score: 76.82230172756489
Bert model 2 tweets testset result => Loss: 0.6604871153831482 Accuracy: 73.15914489311164 F1 score: 73.08383965226407
Bert model 2 news testset result => Loss: 0.746150016784668 Accuracy: 74.6 F1 score: 71.92074671363999
Bert model 3 tweets testset result => Loss: 0.6813445687294006 Accuracy: 72.60490894695171 F1 score: 72.32035492491438
Bert model 3 news testset result => Loss: 1.0090466737747192 Accuracy: 75.2 F1 score: 72.63002585649645


## Save models

In [28]:
#for i in range(len(models)):
  #torch.save(models[i], "{0}dr_0.7_dr_0.7/model_{1}".format(model_path,i))