In [1]:
import pandas as pd
import numpy as np
import os
import gc
import random
from tqdm import tqdm

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AutoTokenizer, AdamW, AutoModelForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

In [2]:
INPUT_FOLDER = "/kaggle/input/contradictory-my-dear-watson/"
TRAIN_PATH = os.path.join(INPUT_FOLDER, "train.csv")
TEST_PATH = os.path.join(INPUT_FOLDER, "test.csv")

In [3]:
MODEL_TYPE = 'xlm-roberta-base'
L_RATE = 1e-5
MAX_LEN = 256

NUM_EPOCHS = 5 #training on more epochs, the validation accuracy seems to be stable
BATCH_SIZE = 32
NUM_FOLDS = 10
NUM_FOLDS_TO_TRAIN = 3 
NUM_CORES = os.cpu_count()

NUM_CORES

tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)
path_model = 'model_unique.bin'

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [4]:
# set the seed
seed_val = 2022
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

# LOADING DATA

In [5]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
print(f"Number of instances in the training set: {len(train)}")
print(f"Number of instances in the test set: {len(test)}")
train.head()

Number of instances in the training set: 12120
Number of instances in the test set: 5195


Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


### Validation set out of the training set

In [6]:
train, validation = train_test_split(train, train_size=0.9, shuffle=True, random_state=seed_val)
train

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
5918,58817ac035,Bir Fransız salonundaki pervazların arabeskler...,Kadınların giysilerinde büyük uzun kurdeleler ...,tr,Turkish,1
11325,ea87264b91,"year, they gave morethan a half million dollar...",They make annual donations to legal services.,en,English,1
10820,0906d4f661,"Όχι, γεννήθηκε το 1900 επειδή ήταν 16 ετών, κα...",γενήθηκε στην αλλαγή του αιώνα.,el,Greek,0
7254,45b0dda4ad,Continue along the Quai Saint-Nicolas to the M...,There are houses only from the 10th century on...,en,English,2
7210,165f1529a7,The new rights are nice enough,Everyone really likes the newest benefits,en,English,1
...,...,...,...,...,...,...
6384,6ce715093c,Morrison se ha ganado el derecho a ser tan idi...,Gaddis y Pynchon no son tan idiosincráticos co...,es,Spanish,2
4720,838784c0b2,Postal Service could increase those same rates...,The rates could go up by 13 during a Postal Se...,en,English,0
173,cfcc3db071,Περιπλανηθείτε στους ορόφους και μιλήστε στους...,Οι ηθοποιοί πληρώνονται δέκα δολάρια την ημέρα...,el,Greek,1
1244,9850f913bc,في الاحتجاز ، ينفي KSM أن القاعدة لديها أي عمل...,كان للقاعدة ثلاثة عملاء يعملون في أريزونا.,ar,Arabic,1


### Data Augmentation

In [7]:
!pip install translators --upgrade
import translators as ts

Collecting translators
  Downloading translators-5.4.8-py3-none-any.whl (30 kB)
Collecting PyExecJS>=1.5.1
  Downloading PyExecJS-1.5.1.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting cryptography>=38.0.1
  Downloading cryptography-38.0.1-cp36-abi3-manylinux_2_28_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: PyExecJS
  Building wheel for PyExecJS (setup.py) ... [?25l- \ done
[?25h  Created wheel for PyExecJS: filename=PyExecJS-1.5.1-py3-none-any.whl size=14598 sha256=33f37704f1d3c3c41d12f8fe3dfab92a73e703a79b774dc1f8af87854ff7ef7f
  Stored in directory: /root/.cache/pip/wheels/9a/ee/03/da5c0b4a8c13362beeb844eb913bbe58a89bde1de2b9157007
Successfully built PyExecJS
Installing collected packages: PyExecJS, cryptography, translators
  Attempting uninstall: cryptography
    Found existing installation: cry

Using state New South Wales server backend.


In [8]:
import translators as ts 
from dask import bag, diagnostics

In [9]:
display(train, train.lang_abv.value_counts())

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
5918,58817ac035,Bir Fransız salonundaki pervazların arabeskler...,Kadınların giysilerinde büyük uzun kurdeleler ...,tr,Turkish,1
11325,ea87264b91,"year, they gave morethan a half million dollar...",They make annual donations to legal services.,en,English,1
10820,0906d4f661,"Όχι, γεννήθηκε το 1900 επειδή ήταν 16 ετών, κα...",γενήθηκε στην αλλαγή του αιώνα.,el,Greek,0
7254,45b0dda4ad,Continue along the Quai Saint-Nicolas to the M...,There are houses only from the 10th century on...,en,English,2
7210,165f1529a7,The new rights are nice enough,Everyone really likes the newest benefits,en,English,1
...,...,...,...,...,...,...
6384,6ce715093c,Morrison se ha ganado el derecho a ser tan idi...,Gaddis y Pynchon no son tan idiosincráticos co...,es,Spanish,2
4720,838784c0b2,Postal Service could increase those same rates...,The rates could go up by 13 during a Postal Se...,en,English,0
173,cfcc3db071,Περιπλανηθείτε στους ορόφους και μιλήστε στους...,Οι ηθοποιοί πληρώνονται δέκα δολάρια την ημέρα...,el,Greek,1
1244,9850f913bc,في الاحتجاز ، ينفي KSM أن القاعدة لديها أي عمل...,كان للقاعدة ثلاثة عملاء يعملون في أريزونا.,ar,Arabic,1


en    6177
zh     370
fr     354
ar     343
ru     342
el     341
th     340
ur     340
vi     339
hi     338
es     338
sw     337
tr     323
de     319
bg     307
Name: lang_abv, dtype: int64

In [10]:
import timeit
import time
from functools import wraps
def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
#         print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return result
    return timeit_wrapper

In [11]:
@timeit
def translate(words, dest):
    decoded = ts.google(query_text=words, to_language=dest)
    return decoded


#TODO: use a dask dataframe instead of all this
def trans_parallel(df, dest):
    dest_choices = ['zh-CN','ar','fr','sw','ur','vi','ru','hi','el','th','es','de','tr','bg']
    if not dest:
        dest = np.random.choice(dest_choices)
    premise_bag = bag.from_sequence(df.premise.tolist()).map(translate, dest)
    hypo_bag =  bag.from_sequence(df.hypothesis.tolist()).map(translate, dest)
    with diagnostics.ProgressBar():
        premises = premise_bag.compute()
        hypos = hypo_bag.compute()
    df[['premise', 'hypothesis']] = list(zip(premises, hypos))
    return df



In [12]:
eng = train.loc[train.lang_abv == "en"].iloc[:1000].copy() \
           .pipe(trans_parallel, dest=None)

non_eng = train.loc[train.lang_abv != "en"].iloc[:1000].copy() \
        .pipe(trans_parallel, dest='en')

[########################################] | 100% Completed | 21min 12.6s
[                                        ] | 0% Completed |  0.0s

Using state New South Wales server backend.
Using state New South Wales server backend.


[########################################] | 100% Completed | 21min  5.3s
[                                        ] | 0% Completed |  0.0s

Using state New South Wales server backend.
Using state New South Wales server backend.


[########################################] | 100% Completed | 20min 20.8s
[                                        ] | 0% Completed |  0.0s

Using state New South Wales server backend.
Using state New South Wales server backend.


[########################################] | 100% Completed | 19min 46.8s


Using state New South Wales server backend.
Using state New South Wales server backend.


In [13]:
eng.to_csv("train_aug_eng.csv")
non_eng.to_csv("train_aug_non_eng.csv")
train = train.append([eng, non_eng]).reset_index(drop=True)

In [14]:
eng

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
11325,ea87264b91,Година те дадоха повече от половин милион дола...,Те правят годишни дарения за правни услуги.,en,English,1
7254,45b0dda4ad,Продължете по протежение на Quai Saint-Nicolas...,Има къщи само от 10 век на Quai.,en,English,2
7210,165f1529a7,Новите права са достатъчно хубави,Всички наистина харесват най -новите предимства,en,English,1
8490,997f6f7d3c,Да и след това на всеки пет години трябва да г...,Те продължават вечно.,en,English,2
9934,d349c3ef63,Прекарах няколко години в услугата като анализ...,Бях анализатор на разузнаването от доста време.,en,English,0
...,...,...,...,...,...,...
5283,6f1277e3f5,Втората половина на книгата се занимаваше с из...,Първата част се занимава с използването на фал...,en,English,1
1819,b607c97e57,"Тук се предполага, че други функционални компо...","Предполага се, че пощенската услуга няма значи...",en,English,0
6586,15a62aef3d,"Той твърди, че районът е благословен с най -ви...",Той говореше зле за естествените характеристик...,en,English,2
2284,0c1d674391,Китайският календар беше използван за изчисляв...,Изчисляването на годината на фондацията на Япо...,en,English,1


In [15]:
class CompDataset(Dataset):

    def __init__(self, df):
        self.df_data = df
        
    def __getitem__(self, index):
        # get the sentence from the dataframe
        sentence1 = self.df_data.iloc[index]['premise']
        sentence2 = self.df_data.iloc[index]['hypothesis']

        # Process the sentence
        encoded_dict = tokenizer.encode_plus(
                    sentence1, sentence2,           # Sentences to encode.
                    add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                    max_length = MAX_LEN,           # Pad or truncate all sentences.
                    pad_to_max_length = True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )  
        
        # These are torch tensors already.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        
        # Convert the target to a torch tensor
        target = torch.tensor(self.df_data.loc[index, 'label'])
        sample = (padded_token_list, att_mask, target)
        
        return sample

    def __len__(self):
        return len(self.df_data)
    
class TestDataset(Dataset):

    def __init__(self, df):
        self.df_data = df

    def __getitem__(self, index):

        # get the sentence from the dataframe
        sentence1 = self.df_data.iloc[index]['premise']
        sentence2 = self.df_data.iloc[index]['hypothesis']

        # Process the sentence
        encoded_dict = tokenizer.encode_plus(
                    sentence1, sentence2,           # Sentence to encode.
                    add_special_tokens = True,      # Add '[CLS] [\s' and '[SEP] \s'
                    max_length = MAX_LEN,           # Pad or truncate all sentences.
                    pad_to_max_length = True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )
        
        # These are torch tensors already.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
    
        sample = (padded_token_list, att_mask)

        return sample
    
    def __len__(self):
        return len(self.df_data)
    
train.reset_index(drop=True, inplace=True)
validation.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train_data = CompDataset(train)
val_data = CompDataset(validation)
test_data = TestDataset(test)

train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        num_workers=NUM_CORES)
val_dataloader = torch.utils.data.DataLoader(val_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                        num_workers=NUM_CORES)
test_dataloader = torch.utils.data.DataLoader(test_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=False,
                                        num_workers=NUM_CORES)

# MODEL

In [16]:
tokenizer.vocab_size
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [17]:
print('bos_token_id <s>:', tokenizer.bos_token_id)
print('eos_token_id </s>:', tokenizer.eos_token_id)
print('sep_token_id </s>:', tokenizer.sep_token_id)
print('pad_token_id <pad>:', tokenizer.pad_token_id)

bos_token_id <s>: 0
eos_token_id </s>: 2
sep_token_id </s>: 2
pad_token_id <pad>: 1


# TRAINING

In [18]:
max_val_acc = 0

for epoch in range(0, NUM_EPOCHS):
    
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    
    # For this epoch, store the val acc scores for each fold in this list.
    # We will use this list to calculate the cv at the end of the epoch.
    epoch_acc_scores_list = []
    
    if epoch == 0:
        # define the model and the optimizer
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels = 3)
        model.to(device)
        optimizer = AdamW(model.parameters(), lr=L_RATE, eps=1e-8)
    else:
        # Get the fold model
        model.load_state_dict(torch.load(path_model))
        model.to(device)

    # ========================================
    #               Training
    # ========================================
    print('Training...')

    # put the model into train mode
    model.train()
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(True)

    # Reset the total loss for this epoch.
    total_train_loss = 0

    for i, batch in enumerate(tqdm(train_dataloader)):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]

        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_train_loss = total_train_loss + loss.item()

        # Zero the gradients
        optimizer.zero_grad()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Use the optimizer to update Weights

        # Optimizer for GPU
        optimizer.step() 

        # Optimizer for TPU
        # https://pytorch.org/xla/
#             xm.optimizer_step(optimizer, barrier=True)
    print('Train loss:', total_train_loss)


    # ========================================
    #               Validation
    # ========================================

    print('\nValidation...')
    targets_list = []
    # Put the model in evaluation mode.
    model.eval()

    # Turn off the gradient calculations.
    # This tells the model not to compute or store gradients.
    # This step saves memory and speeds up validation.
    torch.set_grad_enabled(False)

    # Reset the total loss for this epoch.
    total_val_loss = 0

    for j, val_batch in enumerate(tqdm(val_dataloader)):
        b_input_ids = val_batch[0].to(device)
        b_input_mask = val_batch[1].to(device)
        b_labels = val_batch[2].to(device)      


        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]

        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_val_loss = total_val_loss + loss.item()

        # Get the preds
        preds = outputs[1]


        # Move preds to the CPU
        val_preds = preds.detach().cpu().numpy()

        # Move the labels to the cpu
        targets_np = b_labels.to('cpu').numpy()

        # Append the labels to a numpy list
        targets_list.extend(targets_np)

        if j == 0:  # first batch
            stacked_val_preds = val_preds

        else:
            stacked_val_preds = np.vstack((stacked_val_preds, val_preds))

    # .........................................
    # Calculate the val accuracy for this fold
    # .........................................      


    # Calculate the validation accuracy
    y_true = targets_list
    y_pred = np.argmax(stacked_val_preds, axis=1)

    val_acc = accuracy_score(y_true, y_pred)

    print('Val loss:' ,total_val_loss)
    print('Val acc: ', val_acc)


    # .........................
    # Save the model
    # .........................
    if val_acc > max_val_acc:
        max_val_acc = val_acc
        torch.save(model.state_dict(), path_model)
        print(f'Saved model as {path_model} with new best validation accuracy score: {max_val_acc}')
    
    # Use the garbage collector to save memory.
    gc.collect()



Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Training...


  0%|          | 0/404 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 404/404 [05:14<00:00,  1.28it/s]


Train loss: 417.8368291258812

Validation...


  0%|          | 0/38 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 38/38 [00:09<00:00,  4.08it/s]


Val loss: 35.431178629398346
Val acc:  0.6039603960396039
Saved model as model_unique.bin with new best validation accuracy score: 0.6039603960396039
Training...


  0%|          | 0/404 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 404/404 [05:13<00:00,  1.29it/s]


Train loss: 324.7466068267822

Validation...


  0%|          | 0/38 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 38/38 [00:09<00:00,  4.04it/s]


Val loss: 29.34390652179718
Val acc:  0.6666666666666666
Saved model as model_unique.bin with new best validation accuracy score: 0.6666666666666666
Training...


  0%|          | 0/404 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 404/404 [05:13<00:00,  1.29it/s]


Train loss: 267.30894735455513

Validation...


  0%|          | 0/38 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 38/38 [00:09<00:00,  4.05it/s]


Val loss: 28.094684332609177
Val acc:  0.6963696369636964
Saved model as model_unique.bin with new best validation accuracy score: 0.6963696369636964
Training...


  0%|          | 0/404 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 404/404 [05:13<00:00,  1.29it/s]


Train loss: 220.49330435693264

Validation...


  0%|          | 0/38 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 38/38 [00:09<00:00,  4.04it/s]


Val loss: 31.786369800567627
Val acc:  0.7103960396039604
Saved model as model_unique.bin with new best validation accuracy score: 0.7103960396039604
Training...


  0%|          | 0/404 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 404/404 [05:14<00:00,  1.29it/s]


Train loss: 177.11117209494114

Validation...


  0%|          | 0/38 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 38/38 [00:09<00:00,  4.05it/s]


Val loss: 31.526852279901505
Val acc:  0.7070957095709571


# TESTING

In [19]:
# ========================================
#               Test Set
# ========================================

print('\nTest Set...')
print('Total batches:', len(test_dataloader))

model_preds_list = []

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels = 3)
model.load_state_dict(torch.load(path_model))
# Send the model to the device.
model.to(device)

# Put the model in evaluation mode.
model.eval()

# Turn off the gradient calculations.
# This tells the model not to compute or store gradients.
# This step saves memory and speeds up validation.
torch.set_grad_enabled(False)

# Reset the total loss for this epoch.
total_val_loss = 0

for j, h_batch in enumerate(tqdm(test_dataloader)):

#         inference_status = 'Batch ' + str(j + 1)
#         print(inference_status, end='\r')

    b_input_ids = h_batch[0].to(device)
    b_input_mask = h_batch[1].to(device)  

    outputs = model(b_input_ids, attention_mask=b_input_mask)

    # Get the preds
    preds = outputs[0]

    # Move preds to the CPU
    val_preds = preds.detach().cpu().numpy()

    # Stack the predictions.
    if j == 0:  # first batch
        stacked_val_preds = val_preds
    else:
        stacked_val_preds = np.vstack((stacked_val_preds, val_preds))

            
print('\nPrediction complete.')   


Test Set...
Total batches: 163


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p


Prediction complete.





In [20]:
stacked_val_preds

array([[-2.1786685 , -0.4552829 ,  2.9270284 ],
       [-3.0232186 ,  1.3627121 ,  1.7784853 ],
       [ 3.147342  , -0.997078  , -1.7299685 ],
       ...,
       [ 1.7311021 , -0.08891276, -1.2928483 ],
       [ 3.3076186 , -1.466726  , -1.4329134 ],
       [-1.7255358 , -1.121924  ,  3.0675635 ]], dtype=float32)

# SUBMISSION

In [21]:
test_preds = np.argmax(stacked_val_preds, axis=1)
test_preds

array([2, 2, 0, ..., 0, 0, 2])

In [22]:
path = '/kaggle/input/contradictory-my-dear-watson/sample_submission.csv'

df_sample = pd.read_csv(path)
df_sample['prediction'] = test_preds
df_sample.head()

Unnamed: 0,id,prediction
0,c6d58c3f69,2
1,cefcc82292,2
2,e98005252c,0
3,58518c10ba,1
4,c32b0d16df,0


In [23]:
df_sample.to_csv('submission.csv', index=False)