In [None]:
! pip install sentencepiece

In [None]:
! pip install transformers

In [None]:
import numpy as np
import pandas as pd
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Seed PyTorch's global RNG. The training cell further down re-seeds torch
# (together with `random` and NumPy) using its own seed value.
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

import transformers
from transformers import BertTokenizer, BertForSequenceClassification 
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import RobertaTokenizer, RobertaModel

from transformers import AdamW, Adafactor

import warnings
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")


# Record the installed PyTorch version in the cell output.
print(torch.__version__)

# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare trailing expression: the notebook displays the selected device.
device

In [None]:
!wget https://www.dropbox.com/s/89axyb0eabafct0/XNLI.zip?dl=0 -O XNLI.zip
!unzip XNLI.zip

In [None]:
# Both files are tab-separated tables.
# NOTE(review): the frame named `train` is read from xnli.test.tsv and the one
# named `test` from xnli.dev.tsv -- presumably deliberate (using the larger file
# for training); confirm before renaming anything.
train = pd.read_csv('/content/XNLI-1.0/xnli.test.tsv', sep='\t')
test = pd.read_csv('/content/XNLI-1.0/xnli.dev.tsv', sep='\t')

In [None]:
# Columns kept from the raw XNLI tables: the two tokenized sentences, the gold label and the language tag.
XNLI_COLUMNS = ['sentence1_tokenized', 'sentence2_tokenized', 'gold_label', 'language']

# .copy() makes each frame independent of the raw table it was sliced from, so the
# `label` column added in a later cell is written to a real frame rather than to a
# slice (which triggers SettingWithCopyWarning on pandas without copy-on-write).
train = train[XNLI_COLUMNS].copy()
test = test[XNLI_COLUMNS].copy()

In [None]:
# Display the training frame (pandas truncates the rendering of long frames).
train

In [None]:
# List the distinct values of the `language` column in the training frame.
train['language'].unique()

In [None]:
# Display the evaluation frame (pandas truncates the rendering of long frames).
test

In [None]:
class TextDataset(Dataset):
  """Sentence-pair dataset that tokenizes each pair lazily, on access.

  Args:
    sense_source: indexable collection holding the first sentence of each pair.
    sense_target: indexable collection holding the second sentence of each pair.
    targets: indexable collection of labels, aligned with the sentences.
    tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
    max_len: every encoded pair is padded or truncated to exactly this length.
  """

  def __init__(self, sense_source, sense_target, targets, tokenizer, max_len):
    self.sense_source = sense_source
    self.sense_target = sense_target
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.sense_source)

  def __getitem__(self, item):
    """Encode pair ``item``.

    Returns:
      Tuple ``(input_ids, attention_mask, target, item)``. The first two are
      1-D tensors taken from the tokenizer output, ``target`` is the label as
      a tensor and ``item`` is the index that was requested.
    """
    # str() guards against non-string cells (for example NaN) in the source table.
    sense_source = str(self.sense_source[item])
    sense_target = str(self.sense_target[item])

    encoded_dict = self.tokenizer.encode_plus(
      sense_source,
      sense_target,
      add_special_tokens=True,
      max_length=self.max_len,
      # Explicit padding/truncation replaces the deprecated `pad_to_max_length`
      # flag and the implicit truncation that a bare `max_length` relied on.
      # Fixed-length outputs are what lets the DataLoader stack samples.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
    padded_token_list = encoded_dict['input_ids'][0]
    att_mask = encoded_dict['attention_mask'][0]

    # Convert the target to a torch tensor
    target = torch.tensor(self.targets[item])

    return (padded_token_list, att_mask, target, item)

In [None]:
# Run configuration shared by the cells below.
MODEL_TYPE = 'xlm-roberta-base'  # checkpoint name handed to from_pretrained
MAX_LEN = 100                    # max_len given to TextDataset (tokenizer max_length)
BATCH_SIZE = 64                  # batch size of both DataLoaders
NUM_EPOCHS = 50                  # number of passes of the training loop

In [None]:
# Number of distinct gold labels in the training frame; dropna=False keeps a
# missing value counted as its own class, exactly as len(unique()) would.
NUM_CLASSES = train['gold_label'].nunique(dropna=False)

In [None]:
# List the distinct gold labels present in the training frame.
train['gold_label'].unique()

In [None]:
# Integer class ids for the three NLI labels (single source of truth for both frames).
LABEL_TO_ID = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

# .assign returns a new frame, so this cell is re-runnable and never writes a
# column into a slice of another frame.
train = train.assign(label=train['gold_label'].map(LABEL_TO_ID))
test = test.assign(label=test['gold_label'].map(LABEL_TO_ID))

# Series.map yields NaN for any value missing from the dict; fail here with a
# clear message instead of deep inside batching or the loss computation.
assert train['label'].notna().all(), 'train has gold_label values missing from LABEL_TO_ID'
assert test['label'].notna().all(), 'test has gold_label values missing from LABEL_TO_ID'

In [None]:
# Pretrained tokenizer plus a sequence-classification model with one output per label.
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)
model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=NUM_CLASSES)

# Mark every parameter of the model as trainable.
model.requires_grad_(True)

# Move the model to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

In [None]:
# Frame columns at this point:
# sentence1_tokenized	sentence2_tokenized	gold_label	language	label

# to_numpy() hands the dataset plain positional arrays, so lookups by integer
# position do not depend on the frames' index labels.
train_data = TextDataset(
  sense_source=train.sentence1_tokenized.to_numpy(),
  sense_target=train.sentence2_tokenized.to_numpy(),
  targets=train.label.to_numpy(),
  tokenizer=tokenizer,
  max_len=MAX_LEN
  )
test_data = TextDataset(
  sense_source=test.sentence1_tokenized.to_numpy(),
  sense_target=test.sentence2_tokenized.to_numpy(),
  targets=test.label.to_numpy(),
  tokenizer=tokenizer,
  max_len=MAX_LEN
  )

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_data,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
# Evaluation only aggregates loss and accuracy over the whole set, so order is
# irrelevant: keep it fixed so predictions line up with the frame's row order
# and validation does not consume random state between training epochs.
test_dataloader = DataLoader(test_data,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

In [None]:
# Define the optimizer.
# The AdamW class shipped with transformers is deprecated in favour of PyTorch's
# own implementation. eps and weight_decay are set explicitly to the defaults of
# the transformers class (1e-6 and 0.0) so the update rule stays close to what
# this notebook used before; torch's own defaults are 1e-8 and 0.01.
optimizer = optim.AdamW(model.parameters(),
                        lr=1e-4,
                        eps=1e-6,
                        weight_decay=0.0)

In [None]:
# Set the seed.
seed_val = 101

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


def _train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over `dataloader` and return the summed batch loss."""
    model.train()
    total_loss = 0

    # enable_grad() makes the pass independent of any global grad mode left
    # behind by an earlier cell run, and restores that mode on exit.
    with torch.enable_grad():
        for i, batch in enumerate(dataloader):
            train_status = 'Batch ' + str(i) + ' of ' + str(len(dataloader))
            print(train_status, end='\r')

            # Zero the gradients accumulated by the previous step.
            optimizer.zero_grad()

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # With labels supplied, the first output is the loss.
            loss = outputs[0]
            total_loss += loss.item()

            # Backward pass to compute the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update the weights.
            optimizer.step()

    return total_loss


def _evaluate(model, dataloader, device):
    """Score `dataloader` without gradients.

    Returns (summed loss, stacked logits array, list of labels, list of item ids).
    """
    model.eval()
    total_loss = 0
    logit_chunks = []
    targets = []
    item_ids = []

    # no_grad() is scoped to this block, so autograd is not left switched off
    # for whatever runs after validation.
    with torch.no_grad():
        for j, batch in enumerate(dataloader):
            val_status = 'Batch ' + str(j) + ' of ' + str(len(dataloader))
            print(val_status, end='\r')

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_item_ids = batch[3]

            outputs = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # Outputs are (loss, logits) when labels are supplied.
            total_loss += outputs[0].item()

            # Collect per-batch results on the CPU; they are stacked once below
            # instead of re-copying the growing array on every batch.
            logit_chunks.append(outputs[1].detach().cpu().numpy())
            targets.extend(b_labels.to('cpu').numpy())
            item_ids.extend(b_item_ids.numpy())

    return total_loss, np.vstack(logit_chunks), targets, item_ids


# Store the average training loss after each epoch so we can plot them.
loss_values = []


# For each epoch...
for epoch in range(NUM_EPOCHS):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))

    # ========================================
    #               Training
    # ========================================

    print('Training...')

    total_train_loss = _train_one_epoch(model, train_dataloader, optimizer, device)
    # max(..., 1) avoids a division by zero if the loader is empty.
    loss_values.append(total_train_loss / max(len(train_dataloader), 1))

    print('Train loss:' ,total_train_loss)

    # ========================================
    #               Validation
    # ========================================

    print('\nValidation...')

    total_val_loss, stacked_val_preds, targets_list, stacked_val_item_ids = _evaluate(
        model, test_dataloader, device)

    # Calculate the validation accuracy: predicted class = arg-max over the logits.
    y_true = targets_list
    y_pred = np.argmax(stacked_val_preds, axis=1)

    val_acc = accuracy_score(y_true, y_pred)

    print('Val loss:' ,total_val_loss)
    print('Val acc: ', val_acc)

    # Save the model after every epoch (the file is overwritten each time).
    torch.save(model.state_dict(), 'XNLI_xlm_model.pth')

    # Use the garbage collector to save memory.
    gc.collect()

In [None]:
# Copy the checkpoint written by the training cell to a Google Drive path.
# NOTE(review): no Drive mount call is visible in this notebook -- this assumes
# /content/drive is already mounted; confirm before relying on it.
!cp XNLI_xlm_model.pth "/content/drive/My Drive/term_model"