In [None]:
!pip install -qq transformers

[K     |████████████████████████████████| 1.1MB 3.4MB/s 
[K     |████████████████████████████████| 3.0MB 22.0MB/s 
[K     |████████████████████████████████| 1.1MB 40.5MB/s 
[K     |████████████████████████████████| 890kB 49.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


# DATASET DOWNLOAD

In [None]:
# SENTIPOLC dataset: two files fetched from Google Drive by file id
# (presumably the training and test sets read later -- TODO confirm)
!gdown --id 1OjAR47V-rVYwaSFtu-OWCbxpRhfhORv7
!gdown --id 1iD_MhCqoqUPeg8f3I3QN0PF_FJPTkCFT

# Hate speech dataset: a single file
!gdown --id 16fiKO4XSjlYz0zt9AMiW5h-Kbb2tyJ9w

# Irony detection dataset: two files
# (presumably the training and test sets read later -- TODO confirm)
!gdown --id 1tQIofTUHkflMjhky5UKwc7FaYucTClCT
!gdown --id 10bTHDmrn_-_u3_P6jLIjZqkbNkQOr9vk

# Mount Google Drive at /content/gdrive (Colab only); the model paths used
# later in the notebook point inside this mount.
from google.colab import drive
drive.mount('/content/gdrive')



Downloading...
From: https://drive.google.com/uc?id=1FZ4teXVlw1PBQsM3e6Fxaz-cSDU82a6w
To: /content/training_set_sentipolc16.csv
100% 1.03M/1.03M [00:00<00:00, 69.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QHCgrXE-Ys3eOGub4RdenvpRcE77t7gj
To: /content/test_set_sentipolc16_gold2000.csv
100% 329k/329k [00:00<00:00, 119MB/s]
Downloading...
From: https://drive.google.com/uc?id=14iWpHn47t5h9-67ruC5TWmAVXchjTOQh
To: /content/haspeede_TW-train.tsv
100% 391k/391k [00:00<00:00, 52.2MB/s]
Mounted at /content/gdrive


# IMPORT SECTION
This section imports all the libraries used in the notebook and defines the Google Drive path (`MODEL_PATH`) where the trained models are saved.

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup, AutoModel, AutoTokenizer
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split, TensorDataset
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Plot styling shared by every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Seed both NumPy and PyTorch: the train/validation split uses RANDOM_SEED,
# while the classifier head initialisation and dropout draw from torch's RNG.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Run on the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Default location for saved models; the task-selection cell overrides it.
MODEL_PATH = "/content/gdrive/My Drive/Models/"

  import pandas.util.testing as tm


# SELECTION OF TASK

In [None]:
# Choose one of the three supporting tasks: "HATE", "SENTIMENT" or "IRONY".
SUPPORTING_TASK = "HATE"

# Load the data for the selected task and set where its best model is saved.
# Comparisons must use `==`; a single `=` inside an `if` is a SyntaxError.
if SUPPORTING_TASK == "HATE":
  # Only a training file is read here; the validation split is carved out of
  # it later with train_test_split.
  df = pd.read_csv("haspeede_TW-train.tsv", sep="\t", names= ['id','tweet','label'])
  MODEL_PATH =  F"/content/gdrive/My Drive/hatespeech_best"

elif SUPPORTING_TASK == "SENTIMENT":
  df = pd.read_csv("training_set_sentipolc16.csv")
  # The test file is read with the training column names.
  # NOTE(review): `error_bad_lines` is deprecated in recent pandas releases
  # (replaced by `on_bad_lines='skip'`) -- adjust if pandas is upgraded.
  df_test = pd.read_csv("test_set_sentipolc16_gold2000.csv", error_bad_lines=False, encoding='latin', names=df.columns)
  MODEL_PATH = F"/content/gdrive/My Drive/Models/sentipolc_best"

elif SUPPORTING_TASK == "IRONY":
  df = pd.read_csv("training_ironita2018.csv", sep="\t")
  df_test = pd.read_csv("test_gold_ironita2018.csv", sep="\t")
  MODEL_PATH = F"/content/gdrive/My Drive/Models/irony_best"

else:
  # Fail fast instead of leaving `df` undefined for the following cells.
  raise ValueError(
    f"Unknown SUPPORTING_TASK {SUPPORTING_TASK!r}: expected 'HATE', 'SENTIMENT' or 'IRONY'"
  )


Unnamed: 0,id,tweet,label
0,6847,@matteorenzi ...all'invasione di questi animal...,1
1,2066,"È terrorismo anche questo, per mettere in uno ...",0
2,2045,@Nanoalto @FedeAngeli infatti finché ci hanno ...,0
3,6630,@dinofarnesi Piovegovernolad In Italia a...,1
4,5556,#londonattack chiedete ai buonisti del cavolo ...,1


# SETTING PRE TRAINED MODEL AND TOKENIZER

In [None]:
# Hugging Face checkpoint identifier; it is used to load both the tokenizer
# here and the encoder weights inside the classifier.
PRE_TRAINED_MODEL_NAME = 'Musixmatch/umberto-commoncrawl-cased-v1'

# Tokenizer matching the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Maximum number of tokens per tweet. It is passed to the tokenizer as
# `max_length`, so longer tweets are truncated and shorter ones are padded.
MAX_LEN = 70

# DATASET CLASS
We now define the dataset class that holds the information for each tweet; it is then used to build the data loaders that yield the tokenized tweets.

In [None]:
class TaskDataset(Dataset):
  """Torch dataset that tokenizes one tweet at a time for classification.

  Args:
    tweet_id: indexable collection of tweet identifiers (converted with ``int``).
    label: indexable collection of class labels.
    tweet: indexable collection of tweet texts (converted with ``str``).
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len: length every tokenized tweet is padded or truncated to.
  """

  def __init__(self, tweet_id, label, tweet, tokenizer, max_len):
    self.tweet_id = tweet_id
    self.label = label
    self.tweet = tweet
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of tweets in the dataset."""
    return len(self.tweet)

  def __getitem__(self, item):
    """Return the encoded sample at position ``item``.

    Returns:
      dict with keys ``tweet_id`` (int64 scalar tensor), ``tweet`` (raw text),
      ``label`` (int64 scalar tensor), ``input_ids`` and ``attention_mask``
      (1-D tensors obtained by flattening the tokenizer output).
    """
    tweet = str(self.tweet[item])
    tweet_id = int(self.tweet_id[item])
    label = self.label[item]
    encoding = self.tokenizer.encode_plus(
      tweet,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',
    )
    return {
      # int64 rather than int32: 32-bit storage overflows for large ids
      # (e.g. real Twitter status ids).
      'tweet_id': torch.tensor(tweet_id, dtype=torch.long),
      'tweet' : tweet,
      'label': torch.tensor(label, dtype=torch.long),
      # The tokenizer returns a leading batch dimension of 1; flatten it away
      # so the DataLoader can stack samples itself.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
    }

In [None]:
# The hate speech task only has a training file, so 20% of it is held out for
# validation. The other tasks ship a separate test file that is used as the
# validation set instead.
if SUPPORTING_TASK == "HATE":
  df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED
  )
else:
  df_train = df
  df_val = df_test

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe into a ``DataLoader`` of tokenized tweets.

  Args:
    df: dataframe exposing ``tweet``, ``label`` and ``id`` columns.
    tokenizer: tokenizer forwarded to ``TaskDataset``.
    max_len: padded/truncated length of every tokenized tweet.
    batch_size: number of samples per batch.
    shuffle: whether to reshuffle the samples at every epoch. Defaults to
      ``False``, which keeps the original sequential order.
    num_workers: number of loader worker processes. Defaults to 4, the value
      that was previously hard-coded.

  Returns:
    A ``DataLoader`` over a ``TaskDataset`` built from ``df``.
  """
  ds = TaskDataset(
    tweet=df.tweet.to_numpy(),
    label=df.label.to_numpy(),
    tweet_id=df.id.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Batch size (number of tweets per batch) shared by both loaders.
BATCH_SIZE = 32
# NOTE(review): create_data_loader builds its DataLoader without shuffling, so
# the training batches are visited in the same order at every epoch.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

# TASK CLASSIFIER
This class loads the pre-trained UmBERTo model and adds a dropout layer followed by a linear classification layer on top of its pooled output.
Dropout is active only during training and acts as a regularizer, which helps the model generalize better at test time.

In [None]:
class TaskClassifier(nn.Module):
  """Pre-trained encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (size of the logits vector).
  """

  def __init__(self, n_classes):
    # Zero-argument super(): the previous call named a class that does not
    # exist (HateClassifier) and raised NameError on instantiation.
    super().__init__()
    self.bert = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    # Not applied in forward(): the model returns raw logits, which is what
    # nn.CrossEntropyLoss expects. Kept so existing attribute access still works.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalised class logits for a batch of token ids."""
    encoder_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Index rather than tuple-unpack: position 1 is the pooled output both
    # when the encoder returns a plain tuple and when it returns a ModelOutput
    # (unpacking a ModelOutput would yield its string keys instead).
    pooled_output = encoder_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

  def save_pretrained(self, path):
    """Save the encoder to ``path`` and the head to ``path + "_out_layer"``."""
    self.bert.save_pretrained(path)
    # The whole Linear module is pickled (not just its state_dict).
    torch.save(self.out,path + "_out_layer" )


# Legacy name still used by other cells of this notebook.
HateClassifier = TaskClassifier

In [None]:
# The sentiment task is trained with three output classes; the hate speech and
# irony tasks are binary.
n_classes = 3 if SUPPORTING_TASK == "SENTIMENT" else 2

# Build the classifier (the class defined above is TaskClassifier) and move it
# to the selected device.
model = TaskClassifier(n_classes)
model = model.to(device)

2


# SETTING OF HYPER PARAMETERS
The following code sets the main hyperparameters, such as the number of epochs, the optimizer and the learning-rate schedule; you should be able to change them easily.

In [None]:
# Number of full passes over the training data.
EPOCHS = 5

# AdamW as imported from transformers at the top of the notebook.
# NOTE(review): correct_bias=False presumably disables Adam's bias correction
# (as in the original BERT optimizer) -- confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# One scheduler step is taken per batch, so the schedule spans
# (batches per epoch) * EPOCHS steps, the first epoch's worth being warm-up.
total_steps = int((len(train_data_loader) * EPOCHS)) 
warmup_step = int(len(train_data_loader))  

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_step,
  num_training_steps=total_steps
)

# Cross-entropy over the raw logits returned by the classifier.
loss_fn = nn.CrossEntropyLoss().to(device)

# MODEL TRAINING
Here is a helper function that trains the model for one epoch.

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one training pass over ``data_loader`` and report its metrics.

  For every batch the function performs a forward pass, back-propagates the
  loss, clips the gradient norm to 1.0 and then steps the optimizer and the
  scheduler.

  Args:
    model: classifier called with ``input_ids`` and ``attention_mask``.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``
      and ``label`` entries.
    loss_fn: callable taking ``(outputs, labels)`` and returning the loss.
    optimizer: optimizer updating ``model``'s parameters.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used to turn the correct count into an accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss, precision, recall, f1)``; the last three are
    macro-averaged, and ``accuracy`` is a double tensor.
  """
  model.train()

  batch_losses = []
  pred_chunks, label_chunks = [], []
  n_correct = 0

  for batch in data_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    targets = batch["label"].to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    # Predicted class = index of the largest logit in each row.
    preds = torch.max(logits, dim=1)[1]
    loss = loss_fn(logits, targets)

    # Keep CPU copies so the epoch-level metrics can be computed at the end.
    pred_chunks.append(preds.cpu().data)
    label_chunks.append(targets.cpu().data)

    n_correct += torch.sum(preds == targets)
    batch_losses.append(loss.item())

    # Backward pass, gradient clipping, then parameter and schedule updates.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  y_pred = np.concatenate(pred_chunks, axis=0)
  y_true = np.concatenate(label_chunks, axis=0)

  # Macro-averaged metrics over the whole epoch.
  f1, precision, recall = (
    metric(y_true, y_pred, average="macro")
    for metric in (f1_score, precision_score, recall_score)
  )

  accuracy = n_correct.double() / n_examples
  mean_loss = np.mean(batch_losses)
  return accuracy, mean_loss, precision, recall, f1

# MODEL EVALUATION
This function evaluates the model on the data provided by a given data loader.


In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: classifier called with ``input_ids`` and ``attention_mask``.
    data_loader: iterable of batches with ``input_ids``, ``attention_mask``,
      ``label`` and ``tweet_id`` entries.
    loss_fn: callable taking ``(outputs, labels)`` and returning the loss.
    device: device the batch tensors are moved to.
    n_examples: denominator used to turn the correct count into an accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss, predictions, precision, recall, f1)``.
    ``predictions`` holds one dict per batch with the keys ``tweet_id``,
    ``label`` (predicted classes) and ``exact`` (gold labels); the last three
    values are macro-averaged.
  """
  model.eval()

  batch_losses = []
  predictions = []
  pred_chunks, label_chunks = [], []
  n_correct = 0

  # Gradients are not needed for evaluation.
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      targets = batch["label"].to(device)

      logits = model(input_ids=input_ids, attention_mask=attention_mask)
      # Predicted class = index of the largest logit in each row.
      preds = torch.max(logits, dim=1)[1]

      batch_losses.append(loss_fn(logits, targets).item())
      n_correct += torch.sum(preds == targets)
      predictions.append({"tweet_id": batch["tweet_id"], "label": preds, "exact": targets})

      pred_chunks.append(preds.cpu().data)
      label_chunks.append(targets.cpu().data)

  y_pred = np.concatenate(pred_chunks, axis=0)
  y_true = np.concatenate(label_chunks, axis=0)

  # Macro-averaged metrics over the whole evaluation set.
  f1, precision, recall = (
    metric(y_true, y_pred, average="macro")
    for metric in (f1_score, precision_score, recall_score)
  )

  accuracy = n_correct.double() / n_examples
  mean_loss = np.mean(batch_losses)
  return accuracy, mean_loss, predictions, precision, recall, f1

# RUNNING SECTION AND SAVING OF MODEL
The following code trains the model for the configured number of epochs and saves the checkpoint that achieves the best validation accuracy.

In [None]:
# Per-epoch accuracy and loss curves for training and validation.
history = defaultdict(list)

# Best validation accuracy seen so far; a checkpoint is written only when it improves.
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss, p, r, f1 = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc} precision {p} recall {r} f1 {f1}')
  val_acc, val_loss,predictions, p,r,f1 = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  # Report the same metrics as for training; recall used to be omitted here
  # even though eval_model returns it.
  print(f'Val   loss {val_loss} accuracy {val_acc} precision {p} recall {r} f1 {f1}')

  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  # Keep only the checkpoint with the highest validation accuracy.
  if val_acc > best_accuracy:
    model.save_pretrained(MODEL_PATH)
    best_accuracy = val_acc

Epoch 1/5
----------
Train loss 0.580657813946406 accuracy 0.6970833333333334 precision 0.6372200871585725 recall 0.5793292924896633 f1 0.5758780391700258
Val   loss 0.6103149056434631 accuracy 0.6533333333333333 precision 0.5771812080536913 f1 0.4041708043694141 

Epoch 2/5
----------
Train loss 0.4985181001822154 accuracy 0.7591666666666668 precision 0.7230388679209587 recall 0.7263757856603388 f1 0.7246253101736972
Val   loss 0.6575407385826111 accuracy 0.7733333333333334 precision 0.8216364258815474 f1 0.6977777777777778 

Epoch 3/5
----------
Train loss 0.4061372861266136 accuracy 0.8295833333333333 precision 0.8032232126501997 recall 0.8059322956003021 f1 0.8045446553536465
Val   loss 0.44912097720723404 accuracy 0.8333333333333334 precision 0.8390018053695183 f1 0.8022125819169557 

Epoch 4/5
----------
Train loss 0.2844910051425298 accuracy 0.89375 precision 0.877154593343055 recall 0.8784626659327437 f1 0.877803115661148
Val   loss 0.41392031233561666 accuracy 0.83833333333333