# Overview
XLNet-base fine-tuned with 5-fold cross-validation; the fold models are then used for prediction. <br>

In [None]:
import numpy as np
import pandas as pd
import sys, gc, os, re, tqdm, datetime, random, itertools, copy, math, html

sys.path.extend([
    '../../input/sacremoses/', 
    '../../input/transformers/'
])
import sacremoses, transformers

import torch
from torchvision import datasets, models, transforms
from sklearn.utils import shuffle


In [None]:
# Tag embedded in the checkpoint file names written and read below.
VERSION = 'XLNetMD005'

# Directory holding train.csv / test.csv / sample_submission.csv.
LOCAL_PATH = '../../input/google-quest-challenge'
# Directory holding the XLNet sentencepiece vocab, config JSON and weights.
MODEL_PRETRAINED_WEIGHTS_PATH = '../../input/xlnet-base-cased-huggingface-weights'
# Directory where the per-fold checkpoints are saved and loaded.
WEIGHT_PATH = '../../input/weights'
# Number of GroupKFold splits.
N_SPLIT = 5
# Fold indices that are actually trained and used for inference.
FOLD_ID = [0,1,2,3,4]
# Seed passed to seed_everything and to the train-frame shuffle.
SEED = 9253
# Length every token id sequence is padded to (special tokens included).
MAX_SEQUENCE_LENGTH = 512
# Default batch size of the train and test loaders.
BATCH_SIZE = 8
# Number of batches whose gradients are accumulated per optimizer step.
BATCH_ACCUMULATION_COUNT = 4
# Upper bound on epochs per fold (training also stops early, see EARLY_STOPPING).
EPOCHS = 100
# Epoch index at which all transformer parameters are made trainable.
EPOCH_RELEASE = 2
# Consecutive non-improving epochs after which a fold stops training.
EARLY_STOPPING = 3
# Learning rate handed to AdamW.
LR = 4e-5
# First CUDA device when available, otherwise the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Worker processes per DataLoader.
NUM_WORKERS = 4
# When False, the `if TRAINING:` cells are skipped.
TRAINING = True

def seed_everything(seed: int):
    """Seed every random number generator used in this file.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic kernel selection.

    Args:
        seed: Seed value applied to every generator.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this export affects
    # child processes rather than the interpreter that is already running.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
    
# Seed all generators before any data or model code runs.
seed_everything(SEED)


# 1. Load Datasets

In [None]:
# Load train.csv and show its shape and first two rows as a sanity check.
train = pd.read_csv(LOCAL_PATH+'/train.csv')
print(train.shape)
train.head(2)

In [None]:
# Load test.csv and show its shape and first two rows as a sanity check.
test = pd.read_csv(LOCAL_PATH+'/test.csv')
print(test.shape)
test.head(2)

In [None]:
# Load sample_submission.csv and show its shape and first two rows.
sample_submission = pd.read_csv(LOCAL_PATH+'/sample_submission.csv')
print(sample_submission.shape)
sample_submission.head(2)

# 2. Preprocessing
Credit to [Bert-base TF2.0 (now Huggingface transformer)](https://www.kaggle.com/akensert/bert-base-tf2-0-now-huggingface-transformer)

In [None]:
##########################
# Define Tokenizer and some utility variables
##########################

# Module-level tokenizer; its length is what the model uses to size the
# embedding matrix.
tokenizer = transformers.XLNetTokenizer(
    vocab_file=MODEL_PRETRAINED_WEIGHTS_PATH + '/xlnet-base-cased-spiece.model'
)
TOKEN_CLS, TOKEN_SEP = tokenizer.cls_token, tokenizer.sep_token
SEP_TOKEN_ID = tokenizer.sep_token_id

# Extra vocabulary: section markers, one token per category / domain,
# the newline character and every host seen in the training data.
_SECTION_MARKER_TOKENS = [
    '[TITLE]',
    '[BODY]',
    '[CATEGORY]',
    '[DOMAIN]',
    '[HOST]',
]
_CATEGORY_TOKENS = [
    '[category:LIFE_ARTS]',
    '[category:CULTURE]',
    '[category:SCIENCE]',
    '[category:STACKOVERFLOW]',
    '[category:TECHNOLOGY]',
]
_DOMAIN_TOKENS = [
    '[domain:stackexchange]',
    '[domain:stackoverflow]',
    '[domain:askubuntu]',
    '[domain:serverfault]',
    '[domain:superuser]',
    '[domain:mathoverflow]',
]
ADD_TOKEN_LIST = (
    _SECTION_MARKER_TOKENS
    + _CATEGORY_TOKENS
    + _DOMAIN_TOKENS
    + ['\n']
    + list(train.host.unique())
)
num_added_tokens = tokenizer.add_tokens(ADD_TOKEN_LIST)
print('Number of Tokens Added : ', num_added_tokens)

# Target columns are split by position: columns 11..31 belong to the
# question model, columns 32.. to the answer model.
output_categories_question = list(train.columns[11:32])
output_categories_answer = list(train.columns[32:])

# Decode HTML entities in the free-text columns of the in-memory frames.
# NOTE(review): the loader functions in this file re-read the CSV files from
# disk, so they do not see this unescaping -- confirm that is intended.
for _frame in (train, test):
    for _text_column in ('question_title', 'question_body', 'answer'):
        _frame[_text_column] = _frame[_text_column].apply(html.unescape)


In [None]:
##########################
# Define Datasets and Dataloaders
##########################
from math import floor, ceil

class QuestDataset(torch.utils.data.Dataset):
    """Dataset turning rows of a QUEST frame into fixed-length token id tensors.

    Every item is built from the question title plus either the question body
    (``target_level=0``) or the answer (``target_level=1``), prefixed with
    category / domain / host marker tokens and right-padded with id 0 up to
    ``max_sequence_length``.

    NOTE(review): id 0 doubles as the padding id here and in ``get_seg_ids``;
    confirm that no real token of the tokenizer maps to 0.
    """

    # Number of non-text tokens get_token_ids places around title and body.
    NUM_SPECIAL_TOKENS = 12

    def __init__(self, df, target_columns, max_sequence_length=MAX_SEQUENCE_LENGTH, 
                 target_level=0, train_mode=True, labeled=True):
        '''
        Args:
            df: frame providing the text columns (and the target columns
                when ``labeled`` is True).
            target_columns: names of the label columns; unused when
                ``labeled`` is False.
            max_sequence_length: length every id sequence is padded to.
            target_level
                0 : question only
                1 : answer only
            train_mode: selects the random branch of ``select_tokens``.
            labeled: when True, items and batches also carry the labels.
        '''
        self.df = df
        self.target_columns = target_columns
        self.max_sequence_length = max_sequence_length
        self.target_level = target_level
        self.train_mode = train_mode
        self.labeled = labeled
        # Each dataset owns its tokenizer, extended with the same extra tokens
        # as the module-level one.
        self.tokenizer =  transformers.XLNetTokenizer(
            vocab_file = MODEL_PRETRAINED_WEIGHTS_PATH + '/xlnet-base-cased-spiece.model'
        )
        self.tokenizer.add_tokens(ADD_TOKEN_LIST)

    def __getitem__(self, index):
        """Return ``(token_ids, seg_ids[, labels])`` for the row at ``index``."""
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_label(row)
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids

    def __len__(self):
        return len(self.df)

    def select_tokens(self, tokens, max_num):
        """Shrink ``tokens`` to ``max_num`` entries.

        In train mode a random contiguous span is removed; otherwise the head
        and the tail of the list are kept. Not called by the other methods of
        this class.
        """
        if len(tokens) <= max_num:
            return tokens
        if self.train_mode:
            num_remove = len(tokens) - max_num
            remove_start = random.randint(0, len(tokens)-num_remove-1)
            return tokens[:remove_start] + tokens[remove_start + num_remove:]
        else:
            return tokens[:max_num//2] + tokens[-(max_num - max_num//2):]

    def _trim_pair(self, title, body, t_max_len, b_max_len):
        """Tokenize ``title`` and ``body`` and trim them to the token budget.

        When the pair plus the special tokens fits into
        ``max_sequence_length`` nothing is removed. Otherwise the side that is
        shorter than its own budget is kept whole and donates its unused
        budget to the other side. A trimmed body keeps its first three
        quarters and the remainder from its end.

        Returns:
            ``(title_tokens, body_tokens)``.

        Raises:
            ValueError: if the trimmed lengths do not add up to exactly
                ``max_sequence_length`` (the two budgets plus the special
                tokens must equal it).
        """
        t = self.tokenizer.tokenize(title)
        b = self.tokenizer.tokenize(body)
        t_len, b_len = len(t), len(b)
        overhead = self.NUM_SPECIAL_TOKENS

        if t_len + b_len + overhead <= self.max_sequence_length:
            return t, b

        if t_max_len > t_len:
            # Short title: the body inherits the title's unused budget.
            t_new_len = t_len
            b_new_len = b_max_len + t_max_len - t_len
        elif b_max_len > b_len:
            # Long title but short body: keep the whole body and let the
            # title use the slack, instead of slicing the body with
            # overlapping head/tail windows (which duplicated tokens).
            b_new_len = b_len
            t_new_len = t_max_len + b_max_len - b_len
        else:
            t_new_len = t_max_len
            b_new_len = b_max_len

        if t_new_len + b_new_len + overhead != self.max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (self.max_sequence_length, (t_new_len + b_new_len + overhead)))

        t = t[:t_new_len]

        head_len = round(b_new_len*3/4)
        tail_len = b_new_len - head_len
        # A positive start index avoids the `b[-0:]` pitfall, which would
        # return the whole list when the tail is empty.
        b = b[:head_len] + b[b_len - tail_len:]

        return t, b

    def trim_input_q(self, title, question, t_max_len=58, q_max_len=442):
        """Tokenize and trim a (title, question body) pair; see ``_trim_pair``."""
        return self._trim_pair(title, question, t_max_len, q_max_len)

    def trim_input_a(self, title, answer, t_max_len=58, a_max_len=442):
        """Tokenize and trim a (title, answer) pair; see ``_trim_pair``."""
        return self._trim_pair(title, answer, t_max_len, a_max_len)

    def get_token_ids(self, row):
        """Build the padded token id tensor and its segment ids for ``row``.

        Raises:
            ValueError: if ``target_level`` is neither 0 nor 1.
        """
        if self.target_level == 0:
            t_tokens, b_tokens = self.trim_input_q(row.question_title, row.question_body)
        elif self.target_level == 1:
            t_tokens, b_tokens = self.trim_input_a(row.question_title, row.answer)
        else:
            raise ValueError('target_level should be 0 or 1')

        # Metadata prefix (10 tokens), title, body marker, body, closing
        # separator: NUM_SPECIAL_TOKENS non-text tokens in total.
        tokens = [
            TOKEN_CLS,
            '[CATEGORY]', '[category:{}]'.format(row['category']),
            '[DOMAIN]', '[domain:{}]'.format(row['host'].split('.')[-2]),
            '[HOST]', row['host'],
            TOKEN_SEP, '[TITLE]', TOKEN_SEP,
        ] + t_tokens + ['[BODY]'] + b_tokens + [TOKEN_SEP]

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < self.max_sequence_length:
            token_ids += [0] * (self.max_sequence_length - len(token_ids))
        ids = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(ids)
        return ids, seg_ids

    def get_seg_ids(self, ids):
        """Return segment ids for ``ids``.

        A position gets segment 1 when at least two separator tokens occur
        strictly before it, otherwise segment 0. Positions holding id 0
        (padding) are always segment 0.
        """
        is_sep = (ids == SEP_TOKEN_ID).long()
        # Number of separators strictly before each position.
        seps_before = torch.cumsum(is_sep, dim=0) - is_sep
        seg_ids = (seps_before >= 2).to(ids.dtype)
        seg_ids[ids == 0] = 0
        return seg_ids

    def get_label(self, row):
        """Return the target columns of ``row`` as a float32 tensor."""
        return torch.tensor(row[self.target_columns].values.astype(np.float32))

    def collate_fn(self, batch):
        """Stack the per-item tensors of ``batch`` along a new first axis."""
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        if self.labeled:
            labels = torch.stack([x[2] for x in batch])
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids


def get_test_loader(batch_size=BATCH_SIZE, target_level=0):
    """Build an unshuffled, unlabeled DataLoader over test.csv.

    The frame is read straight from disk, so the html.unescape applied to the
    module-level ``test`` frame is not applied here.

    Args:
        batch_size: number of rows per batch.
        target_level: 0 for question inputs, 1 for answer inputs.

    Returns:
        The DataLoader, with the number of test rows attached as ``.num``.
    """
    test_frame = pd.read_csv(LOCAL_PATH+'/test.csv')
    dataset = QuestDataset(
        test_frame,
        None,
        target_level=target_level,
        train_mode=False,
        labeled=False,
    )
    loader_options = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        collate_fn=dataset.collate_fn,
        drop_last=False,
    )
    data_loader = torch.utils.data.DataLoader(dataset, **loader_options)
    data_loader.num = len(test_frame)
    return data_loader


def get_train_val_loaders(target_cols, batch_size=BATCH_SIZE, target_level=0, val_batch_size=4, ifold=0):
    """Build the train and validation DataLoaders for one GroupKFold fold.

    train.csv is re-read from disk, shuffled with ``SEED`` and split with
    ``GroupKFold`` grouped by ``question_body``, so rows sharing a question
    body never straddle the train/validation boundary.

    Args:
        target_cols: names of the label columns.
        batch_size: batch size of the training loader.
        target_level: 0 for question inputs, 1 for answer inputs.
        val_batch_size: batch size of the validation loader.
        ifold: index of the fold to use for validation.

    Returns:
        ``(train_loader, val_loader)``. Both carry the row count as ``.num``;
        ``val_loader`` also carries the validation frame as ``.df``.

    Raises:
        ValueError: if ``ifold`` is not in ``range(N_SPLIT)``.
    """
    # Imported locally so the function does not depend on an import that
    # only happens in a later cell.
    from sklearn.model_selection import GroupKFold

    if not 0 <= ifold < N_SPLIT:
        raise ValueError(
            'ifold should be in [0, {}), but is {}'.format(N_SPLIT, ifold))

    df = pd.read_csv(LOCAL_PATH+'/train.csv')
    df = shuffle(df, random_state=SEED)
    gkf = GroupKFold(n_splits=N_SPLIT).split(X=df.question_body, groups=df.question_body)
    # Skip ahead to the requested fold.
    train_idx, valid_idx = next(itertools.islice(gkf, ifold, None))
    df_train = df.iloc[train_idx]
    df_val = df.iloc[valid_idx]

    ds_train = QuestDataset(df_train, target_cols, target_level=target_level)
    train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, collate_fn=ds_train.collate_fn, drop_last=True)
    train_loader.num = len(df_train)

    ds_val = QuestDataset(df_val, target_cols, target_level=target_level, train_mode=False)
    val_loader = torch.utils.data.DataLoader(ds_val, batch_size=val_batch_size, shuffle=False, num_workers=NUM_WORKERS, collate_fn=ds_val.collate_fn, drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val

    return train_loader, val_loader


# 3. Model Definition

In [None]:
import torch
import torch.nn as nn
from torch.nn import Identity
import torch.nn.functional as F

class SequenceSummary(nn.Module):
    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
        Args of the config class:
            summary_type:
                - 'last' => [default] take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj: Add a projection after the vector extraction
            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default: no activation.
            summary_first_dropout: Add a dropout before the projection and activation
            summary_last_dropout: Add a dropout after the projection and activation
    """
    def __init__(self, config):
        super(SequenceSummary, self).__init__()

        # Fall back to 'last' only when the config has no summary_type at all
        # (the check used to look at an unrelated attribute).
        self.summary_type = getattr(config, 'summary_type', 'last')
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
            raise NotImplementedError

        self.summary = Identity()
        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        self.activation = Identity()
        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
            self.activation = nn.Tanh()

        self.first_dropout = Identity()
        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = Identity()
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(self, hidden_states, cls_index=None):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token

            Raises NotImplementedError for summary_type 'attn' and ValueError
            for any other unknown summary_type.
        """
        if self.summary_type == 'last':
            output = hidden_states[:, -1]
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = hidden_states.mean(dim=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError('Unknown summary_type: {!r}'.format(self.summary_type))

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output

class XLNetForGoogleQuest(nn.Module):
    """Pretrained XLNet encoder followed by a sequence summary and a
    dropout + linear head that emits one logit per target column.

    NOTE(review): positions holding id 0 are masked out of attention; the
    datasets in this file pad with id 0 -- confirm no real token uses id 0.
    """

    def __init__(self, n_classes=30):
        super().__init__()
        self.model_name = 'XLNetForGoogleQuest'

        config_file = MODEL_PRETRAINED_WEIGHTS_PATH + '/xlnet-base-cased-config.json'
        weights_file = MODEL_PRETRAINED_WEIGHTS_PATH + '/xlnet-base-cased-pytorch_model.bin'

        self.config = transformers.XLNetConfig.from_json_file(config_file)
        self.transformer_model = transformers.XLNetModel.from_pretrained(
            weights_file, config=self.config)
        # Grow the embedding matrix to cover the tokens added to the
        # module-level tokenizer.
        self.transformer_model.resize_token_embeddings(len(tokenizer))
        self.sequence_summary = SequenceSummary(config=self.config)

        head_layers = [nn.Dropout(p=0.2), nn.Linear(768, n_classes)]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, ids, seg_ids):
        """Return the raw (pre-sigmoid) logits for a batch of token ids."""
        attention_mask = (ids > 0).float()
        encoder_outputs = self.transformer_model(
            input_ids=ids,
            token_type_ids=seg_ids,
            attention_mask=attention_mask,
        )
        summary = self.sequence_summary(encoder_outputs[0])
        return self.fc(summary)
    
# def test_model():
#     x = torch.tensor([[1,2,3,4,5, 0, 0], [1,2,3,4,5, 0, 0]])
#     seg_ids = torch.tensor([[0,0,0,0,0, 0, 0], [0,0,0,0,0, 0, 0]])
#     model = XLNetForGoogleQuest()
#     y = model(x, seg_ids)
#     print(y)
    
# test_model()

# 4. Training Utility Functions

In [None]:
##########################
# Training Utility Functions
##########################
from scipy.stats import spearmanr

def training_with_accumulation(model, train_loader, optimizer, criterion, scheduler):
    """Run one training epoch with gradient accumulation.

    Gradients are accumulated over ``BATCH_ACCUMULATION_COUNT`` batches per
    optimizer / scheduler step. The final, possibly shorter, window of the
    epoch is applied as well instead of being thrown away by the next
    ``zero_grad``; this can add one scheduler step per epoch compared with
    ``len(train_loader) // BATCH_ACCUMULATION_COUNT``.

    NOTE(review): the loss is not divided by the accumulation count, so the
    accumulated gradient is a sum over the window rather than a mean. Left
    unchanged because the learning rate was presumably tuned with it --
    confirm.

    Args:
        model: module called as ``model(token_ids, seg_ids)``.
        train_loader: loader yielding ``(token_ids, seg_ids, labels)``.
        optimizer: optimizer stepped once per accumulation window.
        criterion: loss called as ``criterion(logits, labels)``.
        scheduler: learning-rate scheduler stepped with the optimizer.

    Returns:
        The mean per-batch loss of the epoch.
    """
    model.train()
    avg_loss = 0.
    optimizer.zero_grad()
    num_batches = len(train_loader)

    bar = tqdm.tqdm_notebook(
        enumerate(train_loader), 
        total=num_batches, 
        postfix={"train_loss":0.0,}
    )
    for idx, batch in bar:

        token_ids, seg_ids, labels = batch
        token_ids, seg_ids, labels = token_ids.to(DEVICE), seg_ids.to(DEVICE), labels.to(DEVICE)

        logits = model(token_ids.long(),seg_ids.long())
        loss = criterion(logits, labels)
        loss.backward()

        window_complete = (idx + 1) % BATCH_ACCUMULATION_COUNT == 0
        last_batch = (idx + 1) == num_batches
        if window_complete or last_batch:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        batch_loss = loss.item()
        avg_loss += batch_loss / num_batches

        bar.set_postfix(ordered_dict={
            "train_loss":batch_loss,
        })
        del token_ids, seg_ids, labels

    torch.cuda.empty_cache()
    gc.collect()

    return avg_loss


def validate_model(model, val_loader, target_cols, batch_size=4, verbose=False, loss_fn=None):
    """Evaluate ``model`` on ``val_loader``.

    The loss is computed on the raw model outputs, exactly as in
    ``training_with_accumulation``; the sigmoid is applied only to obtain the
    predictions that are scored. Rows are written with a running offset, so
    the result does not depend on ``batch_size`` matching the loader.

    Args:
        model: module called as ``model(token_ids, seg_ids)``.
        val_loader: loader yielding ``(token_ids, seg_ids, labels)`` and
            carrying the number of rows as ``.num``.
        target_cols: names of the target columns (one per model output).
        batch_size: unused; kept so existing calls keep working.
        verbose: when True, print the Spearman correlation of every column.
        loss_fn: loss called as ``loss_fn(logits, labels)``. Defaults to the
            module-level ``criterion`` created by the training cells.

    Returns:
        ``(mean per-batch loss, mean Spearman correlation over the columns)``.
        Columns whose correlation is NaN count as 0.
    """
    if loss_fn is None:
        loss_fn = criterion

    avg_val_loss = 0.
    model.eval()

    num_targets = len(target_cols)
    num_batches = len(val_loader)
    y_preds = np.zeros((val_loader.num, num_targets))
    y_true = np.zeros((val_loader.num, num_targets))

    offset = 0
    with torch.no_grad():

        for batch in val_loader:
            token_ids, seg_ids, labels = batch
            token_ids, seg_ids, labels = token_ids.to(DEVICE), seg_ids.to(DEVICE), labels.to(DEVICE)

            logits = model(token_ids.long(),seg_ids.long())
            # Loss on raw logits first; sigmoid only for the predictions.
            avg_val_loss += loss_fn(logits, labels).item() / num_batches
            probs = torch.sigmoid(logits)

            num_rows = probs.shape[0]
            y_preds[offset : offset + num_rows] = probs.detach().cpu().numpy().reshape(num_rows, -1)
            y_true[offset : offset + num_rows] = labels.detach().cpu().numpy().reshape(num_rows, -1)
            offset += num_rows

            del token_ids, seg_ids, labels

    torch.cuda.empty_cache()
    gc.collect()

    score = 0
    for i in range(num_targets):
        spear = np.nan_to_num(spearmanr(y_true[:, i], y_preds[:, i]).correlation)
        score += spear
        if verbose:
            print('Target Column {} : {}'.format(target_cols[i], spear))

    return avg_val_loss, score/num_targets


def predict(model, test_loader, target_cols, batch_size=BATCH_SIZE):
    """Return the sigmoid predictions of ``model`` for every row of ``test_loader``.

    Rows are written with a running offset, so the result does not depend on
    ``batch_size`` matching the loader's batch size.

    Args:
        model: module called as ``model(token_ids, seg_ids)``.
        test_loader: loader yielding ``(token_ids, seg_ids)`` and carrying
            the number of rows as ``.num``.
        target_cols: names of the target columns (one per model output).
        batch_size: unused; kept so existing calls keep working.

    Returns:
        Array of shape ``(test_loader.num, len(target_cols))``.
    """
    test_preds = np.zeros((test_loader.num, len(target_cols)))

    model.eval()
    offset = 0
    tk0 = tqdm.tqdm_notebook(test_loader, total=len(test_loader))
    with torch.no_grad():
        for x_batch in tk0:
            token_ids, seg_ids = x_batch
            token_ids, seg_ids = token_ids.to(DEVICE), seg_ids.to(DEVICE)
            predictions = model(token_ids.long(),seg_ids.long())
            predictions = torch.sigmoid(predictions)

            num_rows = predictions.shape[0]
            test_preds[offset : offset + num_rows] = predictions.detach().cpu().numpy().reshape(num_rows, -1)
            offset += num_rows

    return test_preds


# 5. Fitting Starts Here
Inspired by Nirjhar's [kernel](https://www.kaggle.com/phoenix9032/pytorch-bert-plain)

## 5.1 Question Related Targets

In [None]:
# Validation score of every trained fold; the question cell appends its
# folds first, the answer cell appends its folds afterwards.
best_scores = []

In [None]:
##########################
# Question Related Targets
##########################
from sklearn.model_selection import GroupKFold

target_cols = output_categories_question
target_string = 'questions'
target_level = 0

if TRAINING:

    # torch.save does not create missing directories, so make sure the
    # checkpoint directory exists before any training time is spent.
    os.makedirs(WEIGHT_PATH, exist_ok=True)

    for fold in range(N_SPLIT):

        if fold not in FOLD_ID:
            continue

        checkpoint_path = os.path.join(
            WEIGHT_PATH, "model_{}_{}_{}.ckpt".format(VERSION, target_string, fold))

        train_loader, val_loader = get_train_val_loaders(target_cols=target_cols, target_level=target_level, ifold=fold)

        model = XLNetForGoogleQuest(n_classes=len(target_cols))
        model.zero_grad()
        model.to(DEVICE)
        torch.cuda.empty_cache()

        if EPOCH_RELEASE > 0:
            # Freeze the encoder for the first EPOCH_RELEASE epochs.
            for param in model.transformer_model.parameters():
                param.requires_grad = False
            # NOTE(review): parameters() yields whole parameter tensors, so
            # `i` counts tensors rather than vocabulary rows. This looks like
            # it was meant to keep only the rows of the added tokens
            # trainable -- confirm intent.
            for i, param in enumerate(model.transformer_model.word_embedding.parameters()):
                if i >= len(tokenizer)-num_added_tokens:
                    param.requires_grad = True

        model.train()

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.9},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LR, eps=4e-5)
        criterion = nn.BCEWithLogitsLoss()
        # NOTE(review): num_warmup_steps is a step count in the scheduler
        # API; 0.05 looks like it was meant as a ratio of the training
        # steps. Left unchanged because it alters the schedule -- confirm.
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=0.05, 
            num_training_steps= EPOCHS*len(train_loader)//BATCH_ACCUMULATION_COUNT)

        train_start_time = datetime.datetime.now()
        best_score = 0.0
        # Must be reset per fold; otherwise the counter is undefined (or
        # carried over from the previous fold) when the first epoch does not
        # improve on best_score.
        early_stopping_count = 0
        for epoch in range(EPOCHS):
            epoch_start_time = datetime.datetime.now()
            torch.cuda.empty_cache()

            if epoch == EPOCH_RELEASE:
                # Unfreeze the whole encoder.
                for param in model.transformer_model.parameters():
                    param.requires_grad = True

            avg_loss = training_with_accumulation(
                model, train_loader, optimizer, criterion, scheduler)
            avg_val_loss, val_spearmanr = validate_model(
                model, val_loader, target_cols=target_cols, batch_size=4)

            print("Epoch {} : {} seconds : train loss {:.4f} : valid loss {:.4f} : valid spearmanr {:.4f}".format(
                epoch, (datetime.datetime.now() - epoch_start_time).seconds, avg_loss, avg_val_loss, val_spearmanr))

            if val_spearmanr > best_score:
                # New best epoch: keep its weights and reset the patience.
                best_score = val_spearmanr
                torch.save(model.state_dict(), checkpoint_path)
                early_stopping_count = 0
            else:
                early_stopping_count += 1
                if early_stopping_count == EARLY_STOPPING:
                    print("Early Stopping : ", epoch)
                    break

        print('-'*20)
        print("Fold {} : Total Training Time {}, Best Score : {}".format(
            fold, datetime.datetime.now()-train_start_time, best_score))
        print('-'*20)

        # Re-score the best checkpoint of this fold with per-column output.
        model.load_state_dict(torch.load(checkpoint_path))
        avg_val_loss, val_spearmanr = validate_model(
            model, val_loader, target_cols=target_cols, batch_size=4, verbose=True)
        best_scores.append(val_spearmanr)

        del model
        gc.collect()
    

## 5.2 Answer Related Targets

In [None]:
##########################
# Answer Related Targets
##########################
from sklearn.model_selection import GroupKFold

target_cols = output_categories_answer
target_string = 'answers'
target_level = 1

if TRAINING:

    # torch.save does not create missing directories, so make sure the
    # checkpoint directory exists before any training time is spent.
    os.makedirs(WEIGHT_PATH, exist_ok=True)

    for fold in range(N_SPLIT):

        if fold not in FOLD_ID:
            continue

        checkpoint_path = os.path.join(
            WEIGHT_PATH, "model_{}_{}_{}.ckpt".format(VERSION, target_string, fold))

        train_loader, val_loader = get_train_val_loaders(target_cols=target_cols, target_level=target_level, ifold=fold)

        model = XLNetForGoogleQuest(n_classes=len(target_cols))
        model.zero_grad()
        model.to(DEVICE)
        torch.cuda.empty_cache()

        if EPOCH_RELEASE > 0:
            # Freeze the encoder for the first EPOCH_RELEASE epochs.
            for param in model.transformer_model.parameters():
                param.requires_grad = False
            # NOTE(review): parameters() yields whole parameter tensors, so
            # `i` counts tensors rather than vocabulary rows. This looks like
            # it was meant to keep only the rows of the added tokens
            # trainable -- confirm intent.
            for i, param in enumerate(model.transformer_model.word_embedding.parameters()):
                if i >= len(tokenizer)-num_added_tokens:
                    param.requires_grad = True

        model.train()

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.9},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LR, eps=4e-5)
        criterion = nn.BCEWithLogitsLoss()
        # NOTE(review): num_warmup_steps is a step count in the scheduler
        # API; 0.05 looks like it was meant as a ratio of the training
        # steps. Left unchanged because it alters the schedule -- confirm.
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=0.05, 
            num_training_steps= EPOCHS*len(train_loader)//BATCH_ACCUMULATION_COUNT)

        train_start_time = datetime.datetime.now()
        best_score = 0.0
        early_stopping_count = 0
        for epoch in range(EPOCHS):
            epoch_start_time = datetime.datetime.now()
            torch.cuda.empty_cache()

            if epoch == EPOCH_RELEASE:
                # Unfreeze the whole encoder.
                for param in model.transformer_model.parameters():
                    param.requires_grad = True

            avg_loss = training_with_accumulation(
                model, train_loader, optimizer, criterion, scheduler)
            avg_val_loss, val_spearmanr = validate_model(
                model, val_loader, target_cols=target_cols, batch_size=4)

            print("Epoch {} : {} seconds : train loss {:.4f} : valid loss {:.4f} : valid spearmanr {:.4f}".format(
                epoch, (datetime.datetime.now() - epoch_start_time).seconds, avg_loss, avg_val_loss, val_spearmanr))

            if val_spearmanr > best_score:
                # New best epoch: keep its weights and reset the patience.
                best_score = val_spearmanr
                torch.save(model.state_dict(), checkpoint_path)
                early_stopping_count = 0
            else:
                early_stopping_count += 1
                if early_stopping_count == EARLY_STOPPING:
                    print("Early Stopping : ", epoch)
                    break

        print('-'*20)
        print("Fold {} : Total Training Time {}, Best Score : {}".format(
            fold, datetime.datetime.now()-train_start_time, best_score))
        print('-'*20)

        # Re-score the best checkpoint of this fold with per-column output.
        model.load_state_dict(torch.load(checkpoint_path))
        avg_val_loss, val_spearmanr = validate_model(
            model, val_loader, target_cols=target_cols, batch_size=4, verbose=True)
        best_scores.append(val_spearmanr)

        del model
        gc.collect()
    

In [None]:
if TRAINING:
    # best_scores holds the question folds first and the answer folds after
    # them, so the weights are laid out in the same order. Every fold score
    # is weighted by the share of target columns its model predicts.
    # (The two weight lists have to be concatenated; passing the second one
    # positionally after `weights=` is a syntax error.)
    n_question_targets = len(output_categories_question)
    n_answer_targets = len(output_categories_answer)
    n_targets = n_question_targets + n_answer_targets
    fold_weights = (
        [n_question_targets / n_targets] * len(FOLD_ID)
        + [n_answer_targets / n_targets] * len(FOLD_ID)
    )
    print('Fold {}: Cross Validation Spearman Correlation Coefficient : {}'.format(
        FOLD_ID, np.average(best_scores, weights=fold_weights)
    ))

# 6. Inference

In [None]:
##########################
# Inference Question
##########################

test_loader = get_test_loader(target_level=0)

y_preds_question = np.zeros((test_loader.num, len(output_categories_question)))
n_folds_used = 0
for fold in range(N_SPLIT):
    
    if fold not in FOLD_ID:
        continue
        
    model = XLNetForGoogleQuest(n_classes=len(output_categories_question))
    model.to(DEVICE)
    torch.cuda.empty_cache()
    
    model.load_state_dict(torch.load(os.path.join(WEIGHT_PATH, "model_{}_questions_{}.ckpt".format(VERSION, fold))))
    y_preds_question += predict(model, test_loader, output_categories_question)
    n_folds_used += 1

    # Release this fold's model before the next one is built.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Average the fold predictions; a plain sum of sigmoid outputs would leave
# the [0, 1] range as soon as more than one fold is used.
if n_folds_used:
    y_preds_question /= n_folds_used
    
##########################
# Inference Answer
##########################

test_loader = get_test_loader(target_level=1)

y_preds_answer = np.zeros((test_loader.num, len(output_categories_answer)))
n_folds_used = 0
for fold in range(N_SPLIT):
    
    if fold not in FOLD_ID:
        continue
        
    model = XLNetForGoogleQuest(n_classes=len(output_categories_answer))
    model.to(DEVICE)
    torch.cuda.empty_cache()
    
    model.load_state_dict(torch.load(os.path.join(WEIGHT_PATH, "model_{}_answers_{}.ckpt".format(VERSION, fold))))
    y_preds_answer += predict(model, test_loader, output_categories_answer)
    n_folds_used += 1

    # Release this fold's model before the next one is built.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Average the fold predictions (see the question block above the banner).
if n_folds_used:
    y_preds_answer /= n_folds_used
    

# 7. Postprocessing

In [None]:
##########################
# Initial Submission DataFrame
##########################

# Question predictions first, answer predictions after them.
# NOTE(review): assumes the target columns of sample_submission.csv follow
# the same order -- confirm.
y_preds = np.hstack((y_preds_question, y_preds_answer))

submission = pd.read_csv(LOCAL_PATH+'/sample_submission.csv')
# Overwrite every column from the first target column to the last one.
_first_target_column = 'question_asker_intent_understanding'
submission.loc[:, _first_target_column:] = y_preds
submission.head()


In [None]:
##########################
# Postprocessing for question_type_spelling
##########################

# Keywords whose occurrences are scored with rule_large.
vocab_list_large = [
    'pronounced',
    'pronounce',
    'pronunciation',
    'correct adjective',
    'How many syllables',
    'spell',
]


def rule_large(x):
    """Map a keyword hit count to a score: 0 -> 0.0, 1 -> 1/3, otherwise 2/3."""
    if x == 0:
        return 0.0
    return 1/3 if x == 1 else 2/3
    
# Keywords whose occurrences are scored with rule_base.
vocab_list_base = [
    'sound',
    'prefix',
    'adjective',
    'verb',
    'noun',
    'word',
    'Ngram',
    'conversation',
    'syllable',
]


def rule_base(x):
    """Map a keyword hit count to a score: 0 -> 0.0, 1 -> 1/64, otherwise 1/32."""
    if x == 0:
        return 0.0
    return 1/64 if x == 1 else 1/32
    
def _keyword_hits(text, vocab_list):
    """Count the occurrences of every entry of vocab_list in text and add them up."""
    return sum([text.count(vocab) for vocab in vocab_list])


# Hits in title and body are added before the rule is applied to the total.
_large_hits = (
    test['question_title'].apply(lambda x: _keyword_hits(x, vocab_list_large))
    + test['question_body'].apply(lambda x: _keyword_hits(x, vocab_list_large))
)
_base_hits = (
    test['question_title'].apply(lambda x: _keyword_hits(x, vocab_list_base))
    + test['question_body'].apply(lambda x: _keyword_hits(x, vocab_list_base))
)
y_preds_question_type_spelling = _large_hits.apply(rule_large) + _base_hits.apply(rule_base)

# Rows whose url points at one of the two language sites get a fixed bonus.
stackexchange_particles = [
    ('ell.stackexchange.com' in url) or ('english.stackexchange.com' in url)
    for url in test['url']
]
spelling = [1/6 if is_language_site else 0. for is_language_site in stackexchange_particles]

y_preds_question_type_spelling = y_preds_question_type_spelling + np.array(spelling)

# The rule-based score replaces the model output for this column, and is
# forced to zero for every row outside the CULTURE category.
submission['question_type_spelling'] = y_preds_question_type_spelling
submission.loc[test['category']!='CULTURE', 'question_type_spelling'] = 0.0
submission.head()


# 8. Submission

In [None]:
# Write the final predictions to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)