In [1]:
import pandas as pd
import numpy as np
import sklearn
from time import time
import datetime
import pickle
import matplotlib.pyplot as plt
import nltk
from collections import Counter
import re
from collections import Counter
from nltk.corpus import words
%matplotlib inline

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
                BertConfig, BertForSequenceClassification, BertTokenizer,
              XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

import random
import warnings
warnings.filterwarnings('ignore')

In [2]:
# print gpu devices
# Enumerate every CUDA device visible to this process (prints nothing when
# there are none, because the range is then empty).
for gpu_index in range(torch.cuda.device_count()):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print('cuda {}: {}'.format(gpu_index, gpu_name))

cuda 0: Tesla V100-PCIE-32GB
cuda 1: Tesla M40
cuda 2: Tesla M40
cuda 3: GeForce GTX 1080
cuda 4: GeForce GTX 1080
cuda 5: Tesla V100-PCIE-32GB
cuda 6: Tesla P100-PCIE-16GB
cuda 7: Tesla P100-PCIE-16GB


In [3]:
# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# torch.cuda.get_device_name() is only meaningful for CUDA devices, so the CPU
# fallback is reported with a fixed label instead of querying the CUDA API.
device_label = torch.cuda.get_device_name(device) if device.type == 'cuda' else 'CPU'
print('you are now using {}.'.format(device_label))

you are now using Tesla V100-PCIE-32GB.


In [4]:
# Set the seed value all over the place to make this reproducible.
def setup_seed(seed):
    """Seed every random number generator used in this notebook with `seed`.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU, the
    current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) disabled.

    :param seed: integer seed passed unchanged to each library
    """
    seeders = (
        torch.manual_seed,           # PyTorch CPU generator
        torch.cuda.manual_seed,      # current CUDA device
        torch.cuda.manual_seed_all,  # every CUDA device (multi-GPU)
        np.random.seed,              # NumPy global generator
        random.seed,                 # Python standard library
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


setup_seed(42)

In [5]:
# Central configuration: dataset locations per task (A/B) and the
# fine-tuning hyper-parameters read by the training function.
CONFIG = {
    # SemEval-2018 Task 3 data files (tab-separated).
    'A_train_path': "datasets/train/SemEval2018-T3-train-taskA_emoji.txt",
    'A_test_path': "datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt",
    'B_train_path': "datasets/train/SemEval2018-T3-train-taskB_emoji.txt",
    'B_test_path': "datasets/goldtest_TaskB/SemEval2018-T3_gold_test_taskB_emoji.txt",
    # Number of training epochs per model family.
    'bert_epochs': 4,
    'xlnet_epochs': 4,
    # Mini-batch size and maximum tokenized sequence length.
    'batch_size': 32,
    'max_len': 128,
    # Pretrained checkpoint names available for each model family.
    'bert_models': ['bert-base-uncased', 'bert-base-cased', 'bert-large-uncased', 'bert-large-cased'],
    'xlnet_models': ['xlnet-base-cased', 'xlnet-large-cased'],
}

In [6]:
# some util functions, just run it

# get score
def print_score(true, predicted, task='A'):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    :param true: sequence of gold labels
    :param predicted: sequence of predicted labels, aligned with `true`
    :param task: "A" scores the positive class (label 1) of a binary problem;
                 "B" macro-averages over the four labels 0-3
    :raises ValueError: if `task` is neither "A" nor "B"
    """
    if task == "A":
        p, r, f = precision_recall_fscore(true, predicted, beta=1, labels=[0,1], pos_label=1)
    elif task == "B":
        p, r, f = precision_recall_fscore(true, predicted, beta=1, labels=[0,1,2,3])
    else:
        # Previously an unknown task fell through and failed later with an
        # UnboundLocalError on p/r/f; fail early with a clear message instead.
        raise ValueError("task must be 'A' or 'B', got {!r}".format(task))
    acc = calc_accuracy(true, predicted)
    print("Accuracy:{0}\nPrecision:{1}\nRecall:{2}\nF1-score:{3}\n".format(acc, p,r,f))
            

def calc_accuracy(true, predicted):
    """Return the fraction of positions where `true` and `predicted` agree.

    :param true: sized sequence of gold labels
    :param predicted: sequence of predicted labels, aligned with `true`
    :return: number of matching pairs divided by len(true); 0.0 when `true`
             is empty (instead of raising ZeroDivisionError), mirroring how
             precision/recall treat an empty denominator in this notebook
    """
    total = len(true)
    if total == 0:
        return 0.0
    correct = sum(1 for t, p in zip(true, predicted) if t == p)
    return correct / float(total)


def precision_recall_fscore(true, predicted, beta=1, labels=None, pos_label=None, average=None, each=None):
    """Calculates the precision, recall and F-score of a classifier.

    :param true: iterable of the true class labels
    :param predicted: iterable of the predicted labels
    :param beta: the beta value for F-score calculation
    :param labels: iterable containing the possible class labels; when None,
                   the sorted set of labels seen in `true` and `predicted` is used
    :param pos_label: the positive label (i.e. 1 label for binary classification);
                      any value other than None selects that single class,
                      including label 0
    :param average: accepted for interface compatibility but not used; results
                    for multiple labels are always macro-averaged
    :param each: when truthy, return the list of per-class F-scores in the
                 order of `labels` instead of a (precision, recall, fscore) tuple
    :return: list of per-class F-scores if `each`; otherwise a
             (precision, recall, fscore) tuple for `pos_label` if given, else
             the macro average over all labels ((0.0, 0.0, 0.0) if there are none)
    :raises KeyError: if a value in `true`/`predicted` (or `pos_label`) is not in `labels`
    """
    # Materialise the inputs so they can be scanned twice when labels are inferred.
    true, predicted = list(true), list(predicted)
    if labels is None:
        labels = sorted(set(true) | set(predicted))
    else:
        labels = list(labels)

    # Build contingency table as ldict
    ldict = {l: {"tp": 0., "fp": 0., "fn": 0., "support": 0.} for l in labels}

    for t, p in zip(true, predicted):
        if t == p:
            ldict[t]["tp"] += 1
        else:
            ldict[t]["fn"] += 1
            ldict[p]["fp"] += 1
        ldict[t]["support"] += 1

    def _ratio(numerator, denominator):
        # An empty denominator (class never predicted / never present) scores 0.0.
        return numerator / denominator if denominator else 0.0

    # Calculate precision, recall and F-beta score per class
    beta2 = beta ** 2
    for d in ldict.values():
        d["precision"] = _ratio(d["tp"], d["tp"] + d["fp"])
        d["recall"] = _ratio(d["tp"], d["tp"] + d["fn"])
        d["fscore"] = _ratio((1 + beta2) * (d["precision"] * d["recall"]),
                             beta2 * d["precision"] + d["recall"])

    if each:
        return [ldict[l]["fscore"] for l in labels]

    # If there is only 1 label of interest, return the scores. No averaging needs to be done.
    # Compare against None so that a positive label of 0 is honoured.
    if pos_label is not None:
        d = ldict[pos_label]
        return (d["precision"], d["recall"], d["fscore"])

    # If there are multiple labels of interest, macro-average scores.
    n_labels = len(ldict)
    if n_labels == 0:
        return (0.0, 0.0, 0.0)
    avg_precision = sum(d["precision"] for d in ldict.values()) / n_labels
    avg_recall = sum(d["recall"] for d in ldict.values()) / n_labels
    avg_fscore = sum(d["fscore"] for d in ldict.values()) / n_labels
    return (avg_precision, avg_recall, avg_fscore)

# get ids and masks
def get_ids_mask(sents, tokenizer, max_len=None):
    """Encode sentences and split the encodings into ids and attention masks.

    Every sentence is passed to `tokenizer.encode_plus` with special tokens
    enabled, `max_length=max_len` and `pad_to_max_length='right'`.

    :param sents: iterable of sentences to encode
    :param tokenizer: object exposing `encode_plus` whose result can be indexed
                      with 'input_ids' and 'attention_mask'
    :param max_len: value forwarded as `max_length`
    :return: (input_ids, attention_masks), two lists with one entry per sentence
    """
    encode_kwargs = dict(
        max_length=max_len,
        add_special_tokens=True,
        pad_to_max_length='right',
    )
    encodings = [tokenizer.encode_plus(sentence, **encode_kwargs) for sentence in sents]

    input_ids = [encoding['input_ids'] for encoding in encodings]
    attention_masks = [encoding['attention_mask'] for encoding in encodings]
    return input_ids, attention_masks

# format time for training time
def format_time(elapsed):
    """Render a duration given in seconds as the string form of a timedelta.

    The value is rounded to the nearest whole second first, so the result
    looks like ``h:mm:ss`` (with a leading day count for long durations).

    :param elapsed: duration in seconds (int or float)
    :return: ``str(datetime.timedelta(...))`` of the rounded duration
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [7]:
# fine-tuning function
def _collect_predictions(model, dataloader):
    """Run `model` over `dataloader` without gradients and gather the results.

    Each batch must unpack into (input_ids, attention_mask, labels); the
    tensors are moved to the module-level `device` before the forward pass.
    The model is switched to evaluation mode first.

    :return: (predicted, gold) - two flat lists with the arg-max class of the
             logits and the corresponding label for every example
    """
    model.eval()
    predicted, gold = [], []
    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # No gradients are needed for inference; this saves memory and time.
        with torch.no_grad():
            # token_type_ids is None because every input is a single sentence.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()
        predicted.extend(np.argmax(logits, axis=1).tolist())
        gold.extend(b_labels.to('cpu').numpy().tolist())
    return predicted, gold


def ft(task='A', model_type='BERT', name='bert-base-uncased', verbose=False, each=None):
    """Fine-tune a pretrained BERT or XLNet classifier and print test-set scores.

    Steps: read the train/test TSV files configured in ``CONFIG`` for `task`,
    load the sentences from ``normalized_sents.pickle``, tokenize them, hold
    out 10% of the training data for validation, train for the configured
    number of epochs (running validation after each epoch) and finally print
    accuracy/precision/recall/F1 on the test set. Nothing is returned.

    :param task: 'A' (2 labels) or 'B' (4 labels); selects the CONFIG paths
    :param model_type: 'BERT' or 'XLNet'
    :param name: pretrained checkpoint name passed to ``from_pretrained``
    :param verbose: when truthy, print per-epoch progress and validation
                    scores, plot the training loss and print timings
    :param each: when truthy, additionally print the per-class F-scores
    :raises ValueError: if `model_type` is not 'BERT' or 'XLNet'
    """
    if model_type not in ('BERT', 'XLNet'):
        # Previously this surfaced much later as a NameError on `tokenizer`.
        raise ValueError("model_type must be 'BERT' or 'XLNet', got {!r}".format(model_type))

    df = pd.read_csv(CONFIG[task+'_train_path'], delimiter='\t', index_col=0)
    df_test = pd.read_csv(CONFIG[task+'_test_path'], delimiter='\t', index_col=0)

    print('Training Dataset has {} sentences.'.format(df.shape[0]))
    print('Test Dataset has {} sentences.'.format(df_test.shape[0]))

    # Sentences come from a pre-computed pickle, labels from the TSV files.
    # NOTE(review): presumably the pickled sentences are aligned row-for-row
    # with the TSV rows for both tasks - verify against the script that wrote it.
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open('normalized_sents.pickle', 'rb') as f:
        tv_sents, test_sents = pickle.load(f)
    tv_labels = df['Label'].values
    test_labels = df_test['Label'].values

    # Initialize tokenizer
    if model_type == 'BERT':
        if 'uncased' in name:
            tokenizer = BertTokenizer.from_pretrained(name, do_lower_case=True)
        else:
            tokenizer = BertTokenizer.from_pretrained(name)
    else:
        tokenizer = XLNetTokenizer.from_pretrained(name)

    # Token ids and attention masks for train+validation and for test.
    tv_ids, tv_masks = get_ids_mask(tv_sents, tokenizer, max_len=CONFIG['max_len'])
    test_ids, test_masks = get_ids_mask(test_sents, tokenizer, max_len=CONFIG['max_len'])

    # Use 90% for training and 10% for validation. Splitting ids, masks and
    # labels in one call guarantees the three stay aligned.
    (train_inputs, validation_inputs,
     train_masks, validation_masks,
     train_labels, validation_labels) = train_test_split(
        tv_ids, tv_masks, tv_labels, random_state=42, test_size=0.1)

    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)

    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)

    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    prediction_inputs = torch.tensor(test_ids)
    prediction_masks = torch.tensor(test_masks)
    prediction_labels = torch.tensor(test_labels)

    # Shuffling the validation/test loaders does not affect the aggregated
    # metrics; the setting is kept as in the original runs.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_dataloader = DataLoader(prediction_data,
                    shuffle=True, batch_size=CONFIG['batch_size'])

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_data,
                    shuffle=True, batch_size=CONFIG['batch_size'])

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_dataloader = DataLoader(validation_data,
                    shuffle=True, batch_size=CONFIG['batch_size'])

    # Initialize model
    num_labels = 2 if task=='A' else 4
    if model_type == 'BERT':
        model = BertForSequenceClassification.from_pretrained(name, num_labels=num_labels)
    else:
        model = XLNetForSequenceClassification.from_pretrained(name, num_labels=num_labels)

    # Run the model on the selected device (single device, no DataParallel).
    model.to(device)

    # Initialize optimizer (and, for BERT only, a linear LR schedule).
    scheduler = None
    if model_type == 'BERT':
        epochs = CONFIG['bert_epochs']
        optimizer = AdamW(model.parameters(),
              lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
              eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
            )
        # Total number of training steps is number of batches * number of epochs.
        total_steps = len(train_dataloader) * epochs

        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)
    else:
        epochs = CONFIG['xlnet_epochs']
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        # The optimizer reads the per-group key 'weight_decay'; the former key
        # 'weight_decay_rate' was ignored, so no weight decay was ever applied.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                        )

    # Average training loss of each epoch, for plotting.
    loss_values = []

    model.zero_grad()

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================
        if verbose:
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

        # Measure how long the training epoch takes.
        t0 = time()

        # Reset the total loss for this epoch.
        total_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if verbose and step % 40 == 0 and not step == 0:
                elapsed = format_time(time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # `batch` contains three tensors: input ids, attention masks, labels.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Forward pass; passing labels makes the model return the loss first.
            outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

            loss = outputs[0]

            # Accumulate the scalar loss value for the epoch average.
            total_loss += loss.item()

            # Backward pass to compute the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0 (BERT only).
            if model_type == 'BERT':
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters using the computed gradients.
            optimizer.step()

            # Update the learning rate (only BERT has a scheduler).
            if scheduler is not None:
                scheduler.step()

            # Clear out the gradients (by default they accumulate).
            model.zero_grad()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        loss_values.append(avg_train_loss)

        if verbose:
            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epcoh took: {:}".format(time() - t0))

        # ========================================
        #               Validation
        # ========================================
            print("")
            print("Running Validation...")

        t0 = time()

        preds, labels = _collect_predictions(model, validation_dataloader)

        if verbose:
            # print_score expects (true, predicted); the arguments used to be
            # swapped, which exchanged the reported precision and recall.
            print_score(labels, preds, task)
            print("  Validation took: {:}".format(format_time(time() - t0)))

    if verbose:
        print("")
        print("Training complete!")

        # Show the training loss figure.
        plt.rcParams["figure.figsize"] = (12,6)

        # Plot the learning curve (one point per epoch).
        plt.plot(loss_values, 'b-o')

        plt.title("Training loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")

        plt.show()

        # Prediction on test set
        print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    t0 = time()

    predictions, true_labels = _collect_predictions(model, prediction_dataloader)

    print("Epochs: {}, batch size: {}".format(epochs, CONFIG['batch_size']))
    print("The {} model's result for task {} is:".format(name, task))
    print_score(true_labels, predictions, task)
    if each:
        # Report per-class F-scores for exactly the classes of this task.
        print(precision_recall_fscore(true_labels, predictions, beta=1,
                                      labels=list(range(num_labels)), each=True))
    if verbose:
        print("  Prediction took: {:}".format(format_time(time() - t0)))
        print('    DONE.')

In [8]:
# Fine-tune BERT-large (uncased) on task B and report per-class F-scores.
# XLNet alternative: ft(task='B', model_type='XLNet', name='xlnet-base-cased', verbose=False, each=True)
ft(
    task='B',
    model_type='BERT',
    name='bert-large-uncased',
    verbose=False,
    each=True,
)

Training Dataset has 3817 sentences.
Test Dataset has 784 sentences.
Epochs: 4, batch size: 32
The bert-large-uncased model's result for task B is:
Accuracy:0.6645408163265306
Precision:0.44743937791473576
Recall:0.4685547165290084
F1-score:0.4449276661884266

[0.7583333333333334, 0.6390243902439025, 0.38235294117647056, 0.0]
