In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
#import seaborn as sns
#import transformers
#import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)
from imblearn.over_sampling import RandomOverSampler
import time 

# Check GPU Utility
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup


In [3]:
#tweets = pd.read_csv('data/cleaned/nohashtag_cleaned_lemmatized_english.csv')
#tweets.head()

In [4]:
# Old mapping, keeping for now unless
#mapping = {'religion':0,'age':1,'ethnicity':2,'gender':3,'not_cyberbullying':4, 'other_cyberbullying':5}


tweets = pd.read_csv('data/cleaned/nohashtag_cleaned_lemmatized_english.csv')


label_to_int = {"not_cyberbullying":0,
           "religion":1,
           "age":2,
           "gender":3,
           "ethnicity":4,
           "other_cyberbullying":5}

def map_label(label):
    return label_to_int[label]

#tweets = lem_english#[['tweet_text','cyberbullying_type']]
tweets['cyberbullying_type'] = tweets['cyberbullying_type'].apply(map_label)

tweets['tweet_text'] = tweets['tweet_text'].fillna('<UNK>')
tweets['tweet_text'] = tweets['tweet_text'].replace('nan', '<UNK>')

tweets_x = tweets['tweet_text'].values
tweets_y = tweets['cyberbullying_type'].values


tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,lang,mentioned_users,hashtags
0,0,word food crapilicious,0,en,,katandandre mkr
1,1,white,0,en,,aussietv MKR theblock ImACelebrityAU today sun...
2,2,@username classy whore red velvet cupcake,0,en,XochitlSuckkks,
3,3,@username meh p thanks head concern angry dude...,0,en,Jason_Gio,
4,4,@username isi account pretend kurdish account ...,0,en,RudhoeEnglish,


In [5]:
X_train, X_test, y_train, y_test = train_test_split(tweets_x, tweets_y, test_size=0.2, stratify=tweets_y, random_state=20)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, stratify=y_train, random_state=20)

ran_overSamp = RandomOverSampler()

X_train_os, y_train_os = ran_overSamp.fit_resample(np.array(X_train).reshape(-1,1),np.array(y_train).reshape(-1,1))

X_train_os = X_train_os.flatten()
y_train_os = y_train_os.flatten()

(unique, counts) = np.unique(y_train_os, return_counts=True)
np.asarray((unique, counts)).T

array([[   0, 4781],
       [   1, 4781],
       [   2, 4781],
       [   3, 4781],
       [   4, 4781],
       [   5, 4781]], dtype=int64)

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

In [8]:
MAX_LEN = 512
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=MAX_LEN) for sent in X_train]
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Max length:  512


In [9]:
def roberta_tokenizer(data):
    input_ids = []
    attention_masks = []
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
            max_length=MAX_LEN,             # Choose max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length 
            return_attention_mask=True      # Return attention mask
            )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [11]:
train_inputs, train_masks = roberta_tokenizer(X_train_os)
val_inputs, val_masks = roberta_tokenizer(X_valid)
test_inputs, test_masks = roberta_tokenizer(X_test)


In [12]:
train_labels = torch.from_numpy(y_train_os)
val_labels = torch.from_numpy(y_valid)
test_labels = torch.from_numpy(y_test)

In [13]:
batch_size = 5


# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Create the DataLoader for our test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [14]:
%%time
class Roberta_Classifier(nn.Module):
    def __init__(self, freeze_roberta=False):
        super(Roberta_Classifier, self).__init__()
        # Specify hidden size of RoBERTa, hidden size of the classifier, and number of labels
        n_input = 768
        n_hidden = 50
        n_output = 6
        # Instantiate RoBERTa model
        self.roberta = RobertaModel.from_pretrained('xlm-roberta-base')

        # Add dense layers to perform the classification
        self.classifier = nn.Sequential(
            nn.Linear(n_input,  n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_output)
        )
        # Add possibility to freeze the RoBERTa model
        # to avoid fine tuning RoBERTa params (usually leads to worse results)
        if freeze_roberta:
            for param in self.roberta.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        # Feed input data to BERT
        outputs = self.roberta(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: total: 0 ns
Wall time: 0 ns


In [15]:
def initialize_model(epochs=4):
    # Instantiate RoBERTa Classifier
    roberta_classifier = Roberta_Classifier(freeze_roberta=False)
    
    roberta_classifier.to(device)

    # Set up optimizer
    optimizer = AdamW(roberta_classifier.parameters(),
                      lr=5e-5,    # learning rate, set to default value
                      eps=1e-8    # decay, set to default value
                      )
    
    ### Set up learning rate scheduler ###

    # Calculate total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Defint the scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return roberta_classifier, optimizer, scheduler

In [16]:
EPOCHS=4

In [18]:
roberta_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
loss_fn = nn.CrossEntropyLoss().to(device)

def roberta_train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):

    print("Start training...\n")
    for epoch_i in range(epochs):
        print("-"*10)
        print("Epoch : {}".format(epoch_i+1))
        print("-"*10)
        print("-"*38)
        print(f"{'BATCH NO.':^7} | {'TRAIN LOSS':^12} | {'ELAPSED (s)':^9}")
        print("-"*38)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0
        
        ###TRAINING###

        # Put the model into the training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass and get logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update model parameters:
            # fine tune RoBERTa params and train additional dense layers
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Print the loss values and time elapsed for every 100 batches
            if (step % 100 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch
                
                print(f"{step:^9} | {batch_loss / batch_counts:^12.6f} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        ###EVALUATION###
        
        # Put the model into the evaluation mode
        model.eval()
        
        # Define empty lists to host accuracy and validation for each batch
        val_accuracy = []
        val_loss = []

        for batch in val_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = tuple(t.to(device) for t in batch)
            
            # We do not want to update the params during the evaluation,
            # So we specify that we dont want to compute the gradients of the tensors
            # by calling the torch.no_grad() method
            with torch.no_grad():
                logits = model(batch_input_ids, batch_attention_mask)

            loss = loss_fn(logits, batch_labels)

            val_loss.append(loss.item())

            # Get the predictions starting from the logits (get index of highest logit)
            preds = torch.argmax(logits, dim=1).flatten()

            # Calculate the validation accuracy 
            accuracy = (preds == batch_labels).cpu().numpy().mean() * 100
            val_accuracy.append(accuracy)

        # Compute the average accuracy and loss over the validation set
        val_loss = np.mean(val_loss)
        val_accuracy = np.mean(val_accuracy)
        
        # Print performance over the entire training data
        time_elapsed = time.time() - t0_epoch
        print("-"*61)
        print(f"{'AVG TRAIN LOSS':^12} | {'VAL LOSS':^10} | {'VAL ACCURACY (%)':^9} | {'ELAPSED (s)':^9}")
        print("-"*61)
        print(f"{avg_train_loss:^14.6f} | {val_loss:^10.6f} | {val_accuracy:^17.2f} | {time_elapsed:^9.2f}")
        print("-"*61)
        print("\n")
    
    print("Training complete!")

In [20]:
def get_predictions(roberta_classifier, val_dataloader):
    pred_vec = np.zeros((len(val_dataloader), 5))
    label_vec = np.zeros((len(val_dataloader), 5))
    for batch in val_dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(batch_input_ids, batch_attention_mask)

        # Get the predictions starting from the logits (get index of highest logit)
        preds = torch.argmax(logits, dim=1).flatten()
        
    return preds, batch_labels

In [21]:
roberta_train(roberta_classifier, train_dataloader, val_dataloader, epochs=EPOCHS)

Start training...

----------
Epoch : 1
----------
--------------------------------------
BATCH NO. |  TRAIN LOSS  | ELAPSED (s)
--------------------------------------
   100    |   1.790853   |   18.01  
   200    |   1.674218   |   15.01  
   300    |   1.383587   |   14.98  
   400    |   1.014792   |   15.02  
   500    |   0.807930   |   14.99  
   600    |   0.783020   |   15.01  
   700    |   0.645348   |   15.01  
   800    |   0.687578   |   15.02  
   900    |   0.606161   |   15.03  
  1000    |   0.900756   |   15.15  
  1100    |   0.575013   |   15.20  
  1200    |   0.649389   |   15.10  
  1300    |   0.745383   |   15.15  
  1400    |   0.636626   |   14.97  
  1500    |   0.658923   |   15.00  
  1600    |   0.653201   |   15.02  
  1700    |   0.666398   |   15.03  
  1800    |   0.696570   |   15.23  
  1900    |   0.652333   |   16.07  
  2000    |   0.621738   |   16.90  
  2100    |   0.607865   |   15.76  
  2200    |   0.580013   |   16.82  
  2300    |   0.65

   700    |   0.349279   |   16.56  
   800    |   0.525785   |   16.52  
   900    |   0.448333   |   16.51  
  1000    |   0.476736   |   16.46  
  1100    |   0.463183   |   16.50  
  1200    |   0.433211   |   16.50  
  1300    |   0.519769   |   16.55  
  1400    |   0.499458   |   16.65  
  1500    |   0.490348   |   16.58  
  1600    |   0.464520   |   16.88  
  1700    |   0.497374   |   16.55  
  1800    |   0.461478   |   16.60  
  1900    |   0.390176   |   16.51  
  2000    |   0.460366   |   16.51  
  2100    |   0.419902   |   16.50  
  2200    |   0.395214   |   16.53  
  2300    |   0.390720   |   16.50  
  2400    |   0.443550   |   16.57  
  2500    |   0.460629   |   16.58  
  2600    |   0.473155   |   16.63  
  2700    |   0.460911   |   16.60  
  2800    |   0.426610   |   16.60  
  2900    |   0.363549   |   16.59  
  3000    |   0.528330   |   16.63  
  3100    |   0.389533   |   16.62  
  3200    |   0.478484   |   16.62  
  3300    |   0.382110   |   16.68  
 

In [23]:
#preds, labels = get_predictions(roberta_classifier, val_dataloader)
torch.save(roberta_classifier.state_dict(), "models/trained.pt")

In [42]:
torch.cuda.memory_summary(device=None, abbreviated=False)
torch.cuda.empty_cache()
