In [1]:
import random
import gc
from math import floor, ceil

from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from scipy.stats import spearmanr

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, AdamW

pd.set_option('max_column', 50)
gc.collect()

22

In [2]:
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_df = pd.read_csv('../input/google-quest-challenge/train.csv')

print(train_df.info())
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6079 entries, 0 to 6078
Data columns (total 41 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   qa_id                                  6079 non-null   int64  
 1   question_title                         6079 non-null   object 
 2   question_body                          6079 non-null   object 
 3   question_user_name                     6079 non-null   object 
 4   question_user_page                     6079 non-null   object 
 5   answer                                 6079 non-null   object 
 6   answer_user_name                       6079 non-null   object 
 7   answer_user_page                       6079 non-null   object 
 8   url                                    6079 non-null   object 
 9   category                               6079 non-null   object 
 10  host                                   6079 non-null   object 
 11  ques

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,photo.stackexchange.com,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,rpg.stackexchange.com,1.0,1.0,0.0,0.5,1.0,1.0,0.444444,0.444444,0.666667,0.0,0.0,0.666667,0.666667,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,electronics.stackexchange.com,0.888889,0.666667,0.0,1.0,1.0,1.0,0.666667,0.444444,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.333333,0.0,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,judaism.stackexchange.com,0.888889,0.666667,0.666667,1.0,1.0,1.0,0.444444,0.444444,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,graphicdesign.stackexchange.com,1.0,0.666667,0.0,1.0,1.0,1.0,0.666667,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [4]:
test_df = pd.read_csv('../input/google-quest-challenge/test.csv')

print(test_df.info())
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   qa_id               476 non-null    int64 
 1   question_title      476 non-null    object
 2   question_body       476 non-null    object
 3   question_user_name  476 non-null    object
 4   question_user_page  476 non-null    object
 5   answer              476 non-null    object
 6   answer_user_name    476 non-null    object
 7   answer_user_page    476 non-null    object
 8   url                 476 non-null    object
 9   category            476 non-null    object
 10  host                476 non-null    object
dtypes: int64(1), object(10)
memory usage: 41.0+ KB
None


Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [5]:
sub = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')
TARGET_COLUMNS = sub.columns.values[1:].tolist()
TARGET_COLUMNS
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308
1,46,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448
2,70,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673
3,132,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401
4,200,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074


In [6]:
MAX_LEN = 512
SEP_TOKEN = 102

class QUESTDataset(Dataset):
    def __init__(self, df, tokenizer, train_mode=True, labeled=True):
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        input_ids, seg_ids, attention_mask = self.get_token_ids(row)
#         attention_mask = (input_ids > 0)
        if self.labeled:
            labels = self.get_label(row)
            return input_ids, seg_ids, attention_mask, labels
        else:
            return input_ids, seg_ids, attention_mask

    def __len__(self):
        return len(self.df)
    
    def select_tokens(self, tokens, max_num):
        if len(tokens) <= max_num:
            return tokens
        if self.train_mode:
            num_remove = len(tokens) - max_num
            remove_start = random.randint(0, len(tokens)-num_remove-1)
            return tokens[:remove_start] + tokens[remove_start + num_remove:]
        else:
            return tokens[:max_num//2] + tokens[-(max_num - max_num//2):]
            
    def trim_input(self, title, question, answer, max_sequence_length=MAX_LEN, 
                t_max_len=30, q_max_len=238, a_max_len=238):
        t = self.tokenizer.tokenize(title)
        q = self.tokenizer.tokenize(question)
        a = self.tokenizer.tokenize(answer)
        t_len = len(t)
        q_len = len(q)
        a_len = len(a)
        
        if (t_len+q_len+a_len+6) > max_sequence_length:
            if t_max_len > t_len:
                t_new_len = t_len
                a_max_len = a_max_len + floor((t_max_len - t_len)/2)
                q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
            else:
                t_new_len = t_max_len

            if a_max_len > a_len:
                a_new_len = a_len 
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len


            if t_new_len+a_new_len+q_new_len+6 != max_sequence_length:
                raise ValueError("New sequence length should be %d, but is %d" 
                                 % (max_sequence_length, (t_new_len+a_new_len+q_new_len+6)))

            if t_len > t_new_len:
                ind1 = floor(t_new_len/2)
                ind2 = ceil(t_new_len/2)
                t = t[:ind1]+t[-ind2:]
            else:
                t = t[:t_new_len]

            if q_len > q_new_len:
                ind1 = floor(q_new_len/2)
                ind2 = ceil(q_new_len/2)
                q = q[:ind1]+q[-ind2:]
            else:
                q = q[:q_new_len]

            if a_len > a_new_len:
                ind1 = floor(a_new_len/2)
                ind2 = ceil(a_new_len/2)
                a = a[:ind1]+a[-ind2:]
            else:
                a = a[:a_new_len]

        return t, q, a

    def get_attention_masks(self, tokens):
        if len(tokens) > 512:
            raise IndexError("Token length more than max seq length!")
        return [1]*len(tokens) + [0] * (512 - len(tokens))

    def get_seg_ids(self, ids):
        seq_ids = torch.zeros_like(ids)
        seq_idx = 0
        first_sep = True
        sep_token = self.tokenizer.sep_token
        for i, e in enumerate(ids):
            seq_ids[i] = seq_idx
            if e == sep_token:
                if first_sep:
                    first_sep = False
                else:
                    seq_idx = 1
        pad_idx = torch.nonzero(ids == 0)
        seq_ids[pad_idx] = 0

        return seq_ids
    
    def get_token_ids(self, row):
        t_tokens, q_tokens, a_tokens = self.trim_input(row.question_title, row.question_body, row.answer)
        tokens = ['<s>'] + t_tokens + ['</s>','</s>'] + q_tokens + ['</s>','</s>'] + a_tokens + ['</s>']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < MAX_LEN:
            token_ids += [self.tokenizer.pad_token_id] * (MAX_LEN - len(token_ids)) # padding
        
        ids = torch.tensor(token_ids)
        seg = self.get_seg_ids(ids)
        attention_masks = self.get_attention_masks(tokens)
        attention_masks = torch.tensor(attention_masks)
        return ids, seg, attention_masks

    def get_label(self, row):
        return torch.tensor(row[TARGET_COLUMNS].values.astype(np.float32))
    
    def collate_fn(self, batch):
        input_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        attention_mask = torch.stack([x[2] for x in batch])
        
        if self.labeled:
            labels = torch.stack([x[3] for x in batch])
            return input_ids, seg_ids, attention_mask, labels
        else:
            return input_ids, seg_ids, attention_mask

In [7]:
def get_test_loader(tokenizer, batch_size=4):
    test_dataset = QUESTDataset(test_df, tokenizer, train_mode=False, labeled=False)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=test_dataset.collate_fn, drop_last=False)
    test_loader.num = len(test_df)
    
    return test_loader


def get_train_val_loaders(df, tokenizer, batch_size=4, val_batch_size=4, ifold=0):
    dataset = QUESTDataset(df, tokenizer)
    custom_collat_fn = dataset.collate_fn

    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=custom_collat_fn, drop_last=True)
    loader.num = len(loader)

    return loader

In [8]:
def train(train_loader, val_loader):
    total_loss = []
    total_spearman_val = []
    for i in range(5): # 5
        epoch_loss = []
        epoch_spearman_val = []
        for input_ids, seg_ids, attention_mask, target in tqdm(train_loader):
            input_ids, seg_ids, attention_mask, target = (input_ids.to(device), seg_ids.to(device), 
                                                          attention_mask.to(device), target.to(device))
            optimizer.zero_grad()
            y_pred = model(input_ids, seg_ids, attention_mask)
            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()
            epoch_loss.append(loss.item())
        mean_loss = sum(epoch_loss) / len(epoch_loss)
        total_loss.append(mean_loss)

        del input_ids, seg_ids, attention_mask, target
        gc.collect()
        for input_ids, seg_ids, attention_mask, target in tqdm(val_loader):
            input_ids, seg_ids, attention_mask = (input_ids.to(device), seg_ids.to(device), attention_mask.to(device))
            y_pred = model(input_ids, seg_ids, attention_mask)
            y_pred = y_pred.cpu().detach().numpy()
            target = target.cpu().detach().numpy()
            rho_val = np.mean([spearmanr(target[:, ind] + np.random.normal(0, 1e-7, target.shape[0]), y_pred[:, ind] + np.random.normal(0, 1e-7, y_pred.shape[0]), nan_policy='omit').correlation for ind in range(y_pred.shape[1])])
            rho_val = round(rho_val, 4)
            epoch_spearman_val.append(rho_val)

        mean_spearman_val = sum(epoch_spearman_val) / len(epoch_spearman_val)
        total_spearman_val.append(mean_spearman_val)
        print("EPOCH", i+1, "-- TRAIN Loss:", mean_loss, "-- Spearman VAL:", mean_spearman_val)
    
    torch.cuda.empty_cache()
    gc.collect()
    
    return total_loss, total_spearman_val

In [9]:
class QUESTModel(nn.Module):
    def __init__(self, transformer):
        super(QUESTModel, self).__init__()
        self.roberta = AutoModel.from_pretrained(transformer)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 30)
    
    def forward(self, input_ids, seg_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids, token_type_ids=seg_ids, attention_mask=attention_mask)
        pooler_output = roberta_output[1]
        out = self.classifier(pooler_output)
        return out
gc.collect()

60

In [10]:
gc.collect()
RoBERTa = '../input/robertalarge'
tokenizer = AutoTokenizer.from_pretrained(RoBERTa)
criterion = nn.BCEWithLogitsLoss()
kf = KFold(n_splits=5, random_state=0)
for fold, (train_index, val_index) in enumerate(kf.split(train_df)):
    print("Fold:", fold+1)
    print("Creating model...")
    model = QUESTModel(RoBERTa)
    model = model.to(device)
    print("Model has been creating and moved to GPU")
    optimizer = AdamW(model.parameters(), lr=2e-5)
#     lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=2e-6)
    sub_train_df, sub_val_df = train_df.iloc[train_index], train_df.iloc[val_index]
    print("Train and Valid Shapes are", sub_train_df.shape, sub_val_df.shape)
    print("Preparing DataLoaders...")
    train_loader = get_train_val_loaders(sub_train_df, tokenizer)
    val_loader = get_train_val_loaders(sub_val_df, tokenizer)
    
    print("Started training...")
    for i in range(4):
        epoch_loss = []
        epoch_spearman_val = []
        for input_ids, seg_ids, attention_mask, target in tqdm(train_loader):
            input_ids, seg_ids, attention_mask, target = (input_ids.to(device), seg_ids.to(device), 
                                                          attention_mask.to(device), target.to(device))
            optimizer.zero_grad()
            y_pred = model(input_ids, seg_ids, attention_mask)
            loss = criterion(y_pred, target)
            loss.backward()
            optimizer.step()
            epoch_loss.append(loss.item())
        
#         lr_scheduler.step()
        mean_loss = sum(epoch_loss) / len(epoch_loss)
        del input_ids, seg_ids, attention_mask, target
        gc.collect()

        for input_ids, seg_ids, attention_mask, target in tqdm(val_loader):
            input_ids, seg_ids, attention_mask = (input_ids.to(device), seg_ids.to(device), attention_mask.to(device))
            y_pred = model(input_ids, seg_ids, attention_mask)
            y_pred = y_pred.cpu().detach().numpy()
            target = target.cpu().detach().numpy()
            rho_val = np.mean([spearmanr(target[:, ind] + np.random.normal(0, 1e-7, target.shape[0]), y_pred[:, ind] + np.random.normal(0, 1e-7, y_pred.shape[0]), nan_policy='omit').correlation for ind in range(y_pred.shape[1])])
            rho_val = round(rho_val, 4)
            epoch_spearman_val.append(rho_val)

        mean_spearman_val = sum(epoch_spearman_val) / len(epoch_spearman_val)
        print("EPOCH", i+1, "-- TRAIN Loss:", mean_loss, "-- Spearman VAL:", mean_spearman_val)

    print("Saving model....")
    torch.save(model.state_dict(), f'questRoBERTa_{fold}.pt')
    del model
    torch.cuda.empty_cache()
    gc.collect()
    print("Model has been saved!")
    print("Fold", fold+1, "is done!")



Fold: 1
Creating model...


Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1215 [00:00<?, ?it/s]

Model has been creating and moved to GPU
Train and Valid Shapes are (4863, 41) (1216, 41)
Preparing DataLoaders...
Started training...


100%|██████████| 1215/1215 [19:08<00:00,  1.06it/s]
100%|██████████| 304/304 [02:15<00:00,  2.24it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 1 -- TRAIN Loss: 0.3907098813312044 -- Spearman VAL: 0.2408536184210527


100%|██████████| 1215/1215 [19:13<00:00,  1.05it/s]
100%|██████████| 304/304 [02:16<00:00,  2.23it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 2 -- TRAIN Loss: 0.36400956047905814 -- Spearman VAL: 0.2493200657894737


100%|██████████| 1215/1215 [19:20<00:00,  1.05it/s]
100%|██████████| 304/304 [02:17<00:00,  2.22it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 3 -- TRAIN Loss: 0.3500045690026303 -- Spearman VAL: 0.27554999999999996


100%|██████████| 1215/1215 [19:22<00:00,  1.05it/s]
100%|██████████| 304/304 [02:16<00:00,  2.23it/s]


EPOCH 4 -- TRAIN Loss: 0.3419937912819317 -- Spearman VAL: 0.26324407894736834
Saving model....
Model has been saved!
Fold 1 is done!
Fold: 2
Creating model...


Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1215 [00:00<?, ?it/s]

Model has been creating and moved to GPU
Train and Valid Shapes are (4863, 41) (1216, 41)
Preparing DataLoaders...
Started training...


100%|██████████| 1215/1215 [19:20<00:00,  1.05it/s]
100%|██████████| 304/304 [02:16<00:00,  2.23it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 1 -- TRAIN Loss: 0.39221839904785155 -- Spearman VAL: 0.23210394736842088


100%|██████████| 1215/1215 [19:22<00:00,  1.04it/s]
100%|██████████| 304/304 [02:16<00:00,  2.22it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 2 -- TRAIN Loss: 0.36655432304727686 -- Spearman VAL: 0.26112039473684184


100%|██████████| 1215/1215 [19:26<00:00,  1.04it/s]
100%|██████████| 304/304 [02:18<00:00,  2.20it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 3 -- TRAIN Loss: 0.35318150629477246 -- Spearman VAL: 0.2805049342105265


100%|██████████| 1215/1215 [19:28<00:00,  1.04it/s]
100%|██████████| 304/304 [02:18<00:00,  2.20it/s]


EPOCH 4 -- TRAIN Loss: 0.3396624461613565 -- Spearman VAL: 0.2748901315789475
Saving model....
Model has been saved!
Fold 2 is done!
Fold: 3
Creating model...


Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1215 [00:00<?, ?it/s]

Model has been creating and moved to GPU
Train and Valid Shapes are (4863, 41) (1216, 41)
Preparing DataLoaders...
Started training...


100%|██████████| 1215/1215 [19:35<00:00,  1.03it/s]
100%|██████████| 304/304 [02:21<00:00,  2.15it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 1 -- TRAIN Loss: 0.39209907101505576 -- Spearman VAL: 0.23519868421052617


100%|██████████| 1215/1215 [19:36<00:00,  1.03it/s]
100%|██████████| 304/304 [02:19<00:00,  2.17it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 2 -- TRAIN Loss: 0.3649274524347282 -- Spearman VAL: 0.2584871710526316


100%|██████████| 1215/1215 [19:23<00:00,  1.04it/s]
100%|██████████| 304/304 [02:16<00:00,  2.23it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 3 -- TRAIN Loss: 0.3532154031741766 -- Spearman VAL: 0.27576611842105264


100%|██████████| 1215/1215 [19:24<00:00,  1.04it/s]
100%|██████████| 304/304 [02:19<00:00,  2.17it/s]


EPOCH 4 -- TRAIN Loss: 0.3404721587528417 -- Spearman VAL: 0.26644934210526344
Saving model....
Model has been saved!
Fold 3 is done!
Fold: 4
Creating model...


Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1215 [00:00<?, ?it/s]

Model has been creating and moved to GPU
Train and Valid Shapes are (4863, 41) (1216, 41)
Preparing DataLoaders...
Started training...


100%|██████████| 1215/1215 [19:31<00:00,  1.04it/s]
100%|██████████| 304/304 [02:19<00:00,  2.18it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 1 -- TRAIN Loss: 0.3918881716796891 -- Spearman VAL: 0.23504440789473677


100%|██████████| 1215/1215 [19:32<00:00,  1.04it/s]
100%|██████████| 304/304 [02:18<00:00,  2.19it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 2 -- TRAIN Loss: 0.3697597707372634 -- Spearman VAL: 0.26495690789473697


100%|██████████| 1215/1215 [19:27<00:00,  1.04it/s]
100%|██████████| 304/304 [02:18<00:00,  2.20it/s]
  0%|          | 0/1215 [00:00<?, ?it/s]

EPOCH 3 -- TRAIN Loss: 0.3578720175802953 -- Spearman VAL: 0.2617976973684211


100%|██████████| 1215/1215 [19:27<00:00,  1.04it/s]
100%|██████████| 304/304 [02:19<00:00,  2.18it/s]


EPOCH 4 -- TRAIN Loss: 0.3467442948263859 -- Spearman VAL: 0.2631138157894738
Saving model....
Model has been saved!
Fold 4 is done!
Fold: 5
Creating model...


Some weights of the model checkpoint at ../input/robertalarge were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1216 [00:00<?, ?it/s]

Model has been creating and moved to GPU
Train and Valid Shapes are (4864, 41) (1215, 41)
Preparing DataLoaders...
Started training...


100%|██████████| 1216/1216 [19:27<00:00,  1.04it/s]
100%|██████████| 303/303 [02:17<00:00,  2.21it/s]
  0%|          | 0/1216 [00:00<?, ?it/s]

EPOCH 1 -- TRAIN Loss: 0.38927347211804436 -- Spearman VAL: 0.23522508250825075


100%|██████████| 1216/1216 [19:27<00:00,  1.04it/s]
100%|██████████| 303/303 [02:17<00:00,  2.21it/s]
  0%|          | 0/1216 [00:00<?, ?it/s]

EPOCH 2 -- TRAIN Loss: 0.3630618649828983 -- Spearman VAL: 0.25718316831683163


100%|██████████| 1216/1216 [19:29<00:00,  1.04it/s]
100%|██████████| 303/303 [02:17<00:00,  2.20it/s]
  0%|          | 0/1216 [00:00<?, ?it/s]

EPOCH 3 -- TRAIN Loss: 0.35091120530361014 -- Spearman VAL: 0.26314785478547864


100%|██████████| 1216/1216 [19:33<00:00,  1.04it/s]
100%|██████████| 303/303 [02:19<00:00,  2.17it/s]


EPOCH 4 -- TRAIN Loss: 0.33921235881892864 -- Spearman VAL: 0.2821765676567657
Saving model....
Model has been saved!
Fold 5 is done!
