# Overview
T5-base: 5-split grouped cross-validation; all 5 folds are used to predict. <br>
### Versions
- T5MD004<br>
CV : 0.40192<br>

In [None]:
import numpy as np
import pandas as pd
import sys, gc, os, re, tqdm, datetime, random, itertools, copy, math, html

sys.path.extend([
    '../../input/sacremoses/', 
    '../../input/transformers/'
])
import sacremoses, transformers

import torch
from torchvision import datasets, models, transforms
from sklearn.utils import shuffle


In [None]:
# Run configuration for this notebook version.
VERSION = 'T5MD004'

# Directory holding train.csv / test.csv / sample_submission.csv.
LOCAL_PATH = '../../input/google-quest-challenge'
# Directory holding the T5 SentencePiece model ('t5-spiece.model').
MODEL_PRETRAINED_WEIGHTS_PATH = '../../input/t5-base-huggingface-weights'
# NOTE(review): presumably where fine-tuned fold weights are stored; not used in this part of the file.
WEIGHT_PATH = '../../input/weights'
# Number of GroupKFold splits built in get_train_val_loaders.
N_SPLIT = 5
# NOTE(review): presumably the fold indices to train/predict; confirm against the training loop.
FOLD_ID = [0,1,2,3,4]
# Seed passed to seed_everything and to the dataframe shuffle.
SEED = 9253
# Default token budget of one QuestDataset sample (padded up to this length).
MAX_SEQUENCE_LENGTH = 512
# Default DataLoader batch size for the train and test loaders.
BATCH_SIZE = 4
# NOTE(review): the names below suggest gradient accumulation, epoch limits, early stopping
# and the learning rate; they are not referenced in this part of the file - confirm there.
BATCH_ACCUMULATION_COUNT = 8
EPOCHS = 100
EPOCH_RELEASE = 2
EARLY_STOPPING = 3
LR = 1e-3
# First CUDA device when one is available, otherwise the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Worker processes used by every DataLoader built below.
NUM_WORKERS = 4
# NOTE(review): presumably toggles training vs. inference-only; confirm against the training loop.
TRAINING = True

def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, the hash seed environment variable, NumPy and
    PyTorch (CPU and all CUDA devices), and puts cuDNN in deterministic mode.

    Args:
        seed: Value used for all generators.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so keep it off.
    torch.backends.cudnn.benchmark = False
    
# Seed all generators once, before any data is shuffled or any model is built.
seed_everything(SEED)


# 1. Load Datasets

In [None]:
# Labelled training data; show its shape and the first two rows.
train = pd.read_csv('{}/train.csv'.format(LOCAL_PATH))
print(train.shape)
train.head(2)

In [None]:
# Test data; show its shape and the first two rows.
test = pd.read_csv('{}/test.csv'.format(LOCAL_PATH))
print(test.shape)
test.head(2)

In [None]:
# Submission template; show its shape and the first two rows.
sample_submission = pd.read_csv('{}/sample_submission.csv'.format(LOCAL_PATH))
print(sample_submission.shape)
sample_submission.head(2)

# 2. Preprocessing
Credit to [Bert-base TF2.0 (now Huggingface transformer)](https://www.kaggle.com/akensert/bert-base-tf2-0-now-huggingface-transformer)

In [None]:
from shutil import copyfile
from transformers.tokenization_utils import PreTrainedTokenizer

SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

# All five checkpoint names map to the same SentencePiece file ...
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": dict.fromkeys(
        ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"),
        "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
    )
}

# ... and to the same maximum input size.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"),
    512,
)

class T5Tokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:
            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
            - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels.
                These tokens are accessible as `<extra_id_{%d}>` where `{%d}` is a number between 0 and extra_ids-1.
                Extra tokens are indexed from the end of the vocabulary up to beginnning (<extra_id_0> is the last token in the vocabulary)
                (like in T5 preprocessing
                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)

        Args:
            vocab_file: Path of the SentencePiece model file to load.
            eos_token, unk_token, pad_token: Special tokens forwarded to the base class.
            extra_ids: Number of `<extra_id_N>` sentinel tokens appended to the vocabulary.
            additional_special_tokens: Optional list of further special tokens; the
                caller's list is copied, never modified.
            **kwargs: Forwarded unchanged to the base class constructor.

        Raises:
            ImportError: If the `sentencepiece` package is not installed.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        **kwargs
    ):
        # Add extra_ids to the special token list (on a copy, so the list the
        # caller passed in is left untouched).
        if extra_ids > 0:
            additional_special_tokens = list(additional_special_tokens or [])
            additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids
        self.sp_model = self._load_sp_model(vocab_file)

    @staticmethod
    def _load_sp_model(vocab_file):
        """Return a SentencePieceProcessor loaded from ``vocab_file``; re-raise ImportError with a hint."""
        try:
            import sentencepiece as spm
        except ImportError:
            import logging

            logging.getLogger(__name__).warning(
                "You need to install SentencePiece to use T5Tokenizer: "
                "https://github.com/google/sentencepiece "
                "(pip install sentencepiece)"
            )
            raise
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(vocab_file)
        return sp_model

    @property
    def vocab_size(self):
        """Number of SentencePiece pieces plus the sentinel `<extra_id_N>` tokens."""
        return self.sp_model.get_piece_size() + self._extra_ids

    def __getstate__(self):
        # The SentencePiece processor is dropped from the pickled state and
        # rebuilt from `vocab_file` in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = self._load_sp_model(self.vocab_file)

    def _tokenize(self, text, sample=False):
        """ Take as input a string and return a list of strings (tokens) for words/sub-words
        """
        if sample:
            return self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token.startswith("<extra_id_"):
            match = re.match(r"<extra_id_(\d+)>", token)
            # A malformed sentinel such as "<extra_id_x>" does not match; let the
            # SentencePiece lookup below handle it instead of failing on `None`.
            if match is not None:
                return self.vocab_size - int(match.group(1)) - 1
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index < self.sp_model.get_piece_size():
            return self.sp_model.IdToPiece(index)
        # Sentinels are numbered downwards from the end of the vocabulary.
        return "<extra_id_{}>".format(self.vocab_size - 1 - index)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        return self.sp_model.decode_pieces(tokens)

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.

            Returns a 1-tuple with the written path, or None (after logging an
            error) when `save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            import logging

            logging.getLogger(__name__).error(
                "Vocabulary path ({}) should be a directory".format(save_directory)
            )
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the vocabulary already lives at the destination.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

In [None]:
##########################
# Shared tokenizer, marker tokens, target column lists and text clean-up
##########################

tokenizer = T5Tokenizer(vocab_file=MODEL_PRETRAINED_WEIGHTS_PATH + '/t5-spiece.model')
tokenizer.cls_token = '[CLS]'
tokenizer.sep_token = '[SEP]'
TOKEN_CLS = tokenizer.cls_token
TOKEN_SEP = tokenizer.sep_token
# NOTE(review): '[CLS]' and '[SEP]' are not part of ADD_TOKEN_LIST below and this id is
# read before any token is added - confirm it is not simply the unknown-token id.
SEP_TOKEN_ID = tokenizer.sep_token_id

# Marker tokens, one token per category / domain value, the newline character,
# and every host name seen in the training data.
ADD_TOKEN_LIST = (
    ['[TITLE]', '[BODY]', '[CATEGORY]', '[DOMAIN]', '[HOST]']
    + ['[category:{}]'.format(_name) for _name in (
        'LIFE_ARTS', 'CULTURE', 'SCIENCE', 'STACKOVERFLOW', 'TECHNOLOGY')]
    + ['[domain:{}]'.format(_name) for _name in (
        'stackexchange', 'stackoverflow', 'askubuntu',
        'serverfault', 'superuser', 'mathoverflow')]
    + ['\n']
    + list(train.host.unique())
)
num_added_tokens = tokenizer.add_tokens(ADD_TOKEN_LIST)
print('Number of Tokens Added : ', num_added_tokens)

# Target columns, split by position: columns 11..31 and columns 32 onwards.
output_categories_question = train.columns[11:32].tolist()
output_categories_answer = train.columns[32:].tolist()

# Decode HTML entities in every free-text column of both dataframes.
for _frame in (train, test):
    for _column in ('question_title', 'question_body', 'answer'):
        _frame[_column] = _frame[_column].apply(html.unescape)


In [None]:
##########################
# Define Datasets and Dataloaders
##########################
from math import floor, ceil

class QuestDataset(torch.utils.data.Dataset):
    """Turn rows of the QUEST dataframe into fixed-length token id tensors.

    Each sample is ``(token_ids, seg_ids)`` or, when ``labeled`` is true,
    ``(token_ids, seg_ids, labels)``. The token sequence is built from the
    category / domain / host markers, the question title, and either the
    question body (``target_level=0``) or the answer (``target_level=1``).

    Args:
        df: Dataframe with the columns read in ``get_token_ids``.
        target_columns: Label columns returned by ``get_label``.
        max_sequence_length: Length every id sequence is padded to.
        target_level: 0 for question text, 1 for answer text.
        train_mode: Only consulted by ``select_tokens``.
        labeled: Whether samples carry labels.
    """

    # get_token_ids wraps title + text in 12 extra tokens: [CLS], the three
    # marker/value pairs for category, domain and host, [SEP], [TITLE], [SEP],
    # [BODY] and the closing [SEP].
    N_WRAPPER_TOKENS = 12

    def __init__(self, df, target_columns, max_sequence_length=MAX_SEQUENCE_LENGTH, 
                 target_level=0, train_mode=True, labeled=True):
        '''
        target_level
            0 : question only
            1 : answer only
        '''
        self.df = df
        self.target_columns = target_columns
        self.max_sequence_length = max_sequence_length
        self.target_level = target_level
        self.train_mode = train_mode
        self.labeled = labeled
        # Each dataset owns its tokenizer, extended with the same added tokens
        # as the module-level one.
        self.tokenizer = T5Tokenizer(
            vocab_file = MODEL_PRETRAINED_WEIGHTS_PATH + '/t5-spiece.model'
        )
        self.tokenizer.add_tokens(ADD_TOKEN_LIST)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_label(row)
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids

    def __len__(self):
        return len(self.df)

    def select_tokens(self, tokens, max_num):
        """Cut ``tokens`` down to ``max_num`` entries.

        In train mode a random contiguous span is removed; otherwise the head
        and the tail of the sequence are kept.
        """
        if len(tokens) <= max_num:
            return tokens
        if self.train_mode:
            num_remove = len(tokens) - max_num
            remove_start = random.randint(0, len(tokens)-num_remove-1)
            return tokens[:remove_start] + tokens[remove_start + num_remove:]
        else:
            return tokens[:max_num//2] + tokens[-(max_num - max_num//2):]

    def _trim_pair(self, title, text, t_max_len, x_max_len):
        """Tokenize title and text and trim them to fit the sequence budget.

        Nothing is trimmed when both fit. Otherwise whichever side is under
        its own budget hands the unused room to the other side; the title is
        cut at the end, the text keeps its first 3/4 and its last 1/4.

        Raises:
            ValueError: If the two budgets plus the wrapper tokens do not add
                up to ``max_sequence_length``.
        """
        t = self.tokenizer.tokenize(title)
        x = self.tokenizer.tokenize(text)
        t_len, x_len = len(t), len(x)
        overhead = self.N_WRAPPER_TOKENS

        if t_len + x_len + overhead <= self.max_sequence_length:
            return t, x

        if t_len < t_max_len:
            # Short title: the text inherits the title's unused budget.
            t_new_len = t_len
            x_new_len = x_max_len + (t_max_len - t_len)
        elif x_len < x_max_len:
            # Short text but over-long title: keep the whole text and give the
            # title the remainder. Slicing head + tail out of a text that is
            # shorter than its budget would repeat tokens.
            x_new_len = x_len
            t_new_len = t_max_len + (x_max_len - x_len)
        else:
            t_new_len = t_max_len
            x_new_len = x_max_len

        if t_new_len + x_new_len + overhead != self.max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (self.max_sequence_length, (t_new_len + x_new_len + overhead)))

        t = t[:t_new_len]
        if x_len > x_new_len:
            head_len = round(x_new_len * 3 / 4)
            tail_len = x_new_len - head_len
            # x[-0:] would return the whole list, so guard the empty tail.
            x = x[:head_len] + (x[-tail_len:] if tail_len else [])
        return t, x

    def trim_input_q(self, title, question, t_max_len=58, q_max_len=442):
        """Return trimmed ``(title_tokens, question_tokens)``; see ``_trim_pair``."""
        return self._trim_pair(title, question, t_max_len, q_max_len)

    def trim_input_a(self, title, answer, t_max_len=58, a_max_len=442):
        """Return trimmed ``(title_tokens, answer_tokens)``; see ``_trim_pair``."""
        return self._trim_pair(title, answer, t_max_len, a_max_len)

    def get_token_ids(self, row):
        """Build the padded id tensor and the segment id tensor for one row.

        Raises:
            ValueError: If ``target_level`` is neither 0 nor 1.
        """
        if self.target_level == 0:
            t_tokens, x_tokens = self.trim_input_q(row.question_title, row.question_body)
        elif self.target_level == 1:
            t_tokens, x_tokens = self.trim_input_a(row.question_title, row.answer)
        else:
            raise ValueError('target_level should be 0 or 1')

        # Both levels share one layout; the text part is tagged '[BODY]' in either case.
        tokens = [
            TOKEN_CLS,
            '[CATEGORY]', '[category:{}]'.format(row['category']),
            '[DOMAIN]', '[domain:{}]'.format(row['host'].split('.')[-2]),
            '[HOST]', row['host'],
            TOKEN_SEP, '[TITLE]', TOKEN_SEP,
        ] + t_tokens + ['[BODY]'] + x_tokens + [TOKEN_SEP]

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < self.max_sequence_length:
            # Id 0 is used as padding.
            token_ids += [0] * (self.max_sequence_length - len(token_ids))
        ids = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(ids)
        return ids, seg_ids

    def get_seg_ids(self, ids):
        """Return segment ids: 1 for every position after the second separator, else 0.

        Positions holding the padding id 0 are always 0.
        """
        is_sep = (ids == SEP_TOKEN_ID).long()
        # Number of separators strictly before each position.
        seps_before = torch.cumsum(is_sep, dim=0) - is_sep
        seg_ids = (seps_before >= 2).to(ids.dtype)
        seg_ids[ids == 0] = 0
        return seg_ids

    def get_label(self, row):
        """Return the row's target columns as a float32 tensor."""
        return torch.tensor(row[self.target_columns].values.astype(np.float32))

    def collate_fn(self, batch):
        """Stack per-sample tensors into batch tensors, with labels when the dataset is labeled."""
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        if self.labeled:
            labels = torch.stack([x[2] for x in batch])
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids

def get_test_loader(batch_size=BATCH_SIZE, target_level=0):
    """Build an unshuffled, unlabeled DataLoader over test.csv.

    The number of test rows is attached to the loader as ``loader.num``.
    """
    test_df = pd.read_csv('{}/test.csv'.format(LOCAL_PATH))
    dataset = QuestDataset(
        test_df,
        None,
        target_level=target_level,
        train_mode=False,
        labeled=False,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        collate_fn=dataset.collate_fn,
        drop_last=False,
    )
    test_loader.num = len(test_df)
    return test_loader


def get_train_val_loaders(target_cols, batch_size=BATCH_SIZE, target_level=0, val_batch_size=4, ifold=0):
    """Build the train and validation DataLoaders for one cross-validation fold.

    train.csv is shuffled with ``SEED`` and split with ``GroupKFold`` using the
    question body as the group, so rows sharing a question body never end up
    on both sides of a split.

    Args:
        target_cols: Label columns handed to ``QuestDataset``.
        batch_size: Batch size of the (shuffled, drop_last) train loader.
        target_level: Forwarded to ``QuestDataset``.
        val_batch_size: Batch size of the validation loader.
        ifold: Index of the fold to use, ``0 <= ifold < N_SPLIT``.

    Returns:
        ``(train_loader, val_loader)``. Both carry a ``num`` attribute with
        their row count; ``val_loader.df`` is the validation dataframe.

    Raises:
        ValueError: If ``ifold`` does not name an existing fold.
    """
    # Imported here because the file's import cells do not bring GroupKFold into scope.
    from sklearn.model_selection import GroupKFold

    df = pd.read_csv(LOCAL_PATH+'/train.csv')
    df = shuffle(df, random_state=SEED)
    gkf = GroupKFold(n_splits=N_SPLIT).split(X=df.question_body, groups=df.question_body)
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        if fold == ifold:
            break
    else:
        # The loop ran out without finding the requested fold.
        raise ValueError('ifold should be in [0, {}), but is {}'.format(N_SPLIT, ifold))
    df_train = df.iloc[train_idx]
    df_val = df.iloc[valid_idx]

    ds_train = QuestDataset(df_train, target_cols, target_level=target_level)
    train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS, collate_fn=ds_train.collate_fn, drop_last=True)
    train_loader.num = len(df_train)

    ds_val = QuestDataset(df_val, target_cols, target_level=target_level, train_mode=False)
    val_loader = torch.utils.data.DataLoader(ds_val, batch_size=val_batch_size, shuffle=False, num_workers=NUM_WORKERS, collate_fn=ds_val.collate_fn, drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val

    return train_loader, val_loader


# 3. Model Definition

In [None]:
import torch
import torch.nn as nn
from torch.nn import Identity
import torch.nn.functional as F
from transformers import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig

    
# Config JSON location for each checkpoint name.
T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    size: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(size)
    for size in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
}
# PyTorch weight file location for each checkpoint name.
T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
    size: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-pytorch_model.bin".format(size)
    for size in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
}
# Small fixed id batch and a mask of the same shape.
DUMMY_INPUTS = [
    [7, 6, 0, 0, 1],
    [1, 2, 3, 0, 0],
    [0, 0, 0, 4, 5],
]
DUMMY_MASK = [
    [1, 1, 1, 1, 1],
    [1, 1, 1, 0, 0],
    [0, 0, 0, 1, 1],
]

class SequenceSummary(nn.Module):
    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
        Args of the config class (every one is optional):
            summary_type:
                - 'last' => [default] take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj: Add a projection after the vector extraction
            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default
            summary_first_dropout: Add a dropout before the projection and activation
            summary_last_dropout: Add a dropout after the projection and activation
    """
    def __init__(self, config):
        super(SequenceSummary, self).__init__()

        # Read summary_type from its own attribute; falling back to 'last' only
        # when the config does not define it.
        self.summary_type = getattr(config, 'summary_type', 'last')
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
            raise NotImplementedError

        self.summary = Identity()
        if getattr(config, 'summary_use_proj', False):
            if getattr(config, 'summary_proj_to_labels', False) and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = nn.Linear(config.hidden_size, num_classes)

        self.activation = Identity()
        if getattr(config, 'summary_activation', None) == 'tanh':
            self.activation = nn.Tanh()

        self.first_dropout = Identity()
        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = Identity()
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(self, hidden_states, cls_index=None):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token

            Raises NotImplementedError for 'attn' and ValueError for an unknown summary_type.
        """
        if self.summary_type == 'last':
            output = hidden_states[:, -1]
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = hidden_states.mean(dim=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError
        else:
            # Without this branch an unknown type fails later on an unbound local.
            raise ValueError("Unknown summary_type: {!r}".format(self.summary_type))

        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        output = self.last_dropout(output)

        return output

class T5Config(PretrainedConfig):
    r"""
        :class:`~transformers.T5Config` is the configuration class to store the configuration of a
        `T5Model`.
        Arguments:
            vocab_size: Vocabulary size, stored as ``vocab_size``.
            n_positions: Stored as ``n_positions`` and also exposed as ``max_position_embeddings``.
            d_model: Model width; input size of the attention and feed-forward projections.
                Also exposed as ``hidden_size``.
            d_kv: Per-head size of the attention projections (``T5Attention`` uses
                ``num_heads * d_kv`` as its inner dimension).
            d_ff: Width of the intermediate feed-forward layer in ``T5DenseReluDense``.
            num_layers: Stored as ``num_layers`` and also exposed as ``num_hidden_layers``.
            num_heads: Number of attention heads; also exposed as ``num_attention_heads``.
            relative_attention_num_buckets: Number of rows of the relative attention bias
                embedding in ``T5Attention``.
            dropout_rate: Dropout probability used by the ``nn.Dropout`` layers built from this config.
            layer_norm_epsilon: The epsilon passed to ``T5LayerNorm``.
            initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """
    pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "t5"

    def __init__(
        self,
        vocab_size=32128,
        n_positions=512,
        d_model=512,
        d_kv=64,
        d_ff=2048,
        num_layers=6,
        num_heads=8,
        relative_attention_num_buckets=32,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.dropout_rate = dropout_rate
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor

    # The read-only properties below expose the T5 names under the generic
    # names (max_position_embeddings, hidden_size, ...).
    @property
    def max_position_embeddings(self):
        return self.n_positions

    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.num_heads

    @property
    def num_hidden_layers(self):
        return self.num_layers

def add_start_docstrings(*docstr):
    """Return a decorator that prepends the joined ``docstr`` pieces to a function's docstring.

    The decorated function itself is returned (no wrapper is created); a
    missing docstring is treated as empty.
    """
    def docstring_decorator(fn):
        existing = fn.__doc__
        if existing is None:
            existing = ""
        fn.__doc__ = "".join(docstr) + existing
        return fn

    return docstring_decorator

def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.

    Every variable of the TensorFlow checkpoint is matched to a parameter of
    ``model`` by walking the '/'-separated variable name as an attribute path,
    then copied in as float32.

    Args:
        model: PyTorch module whose parameters are overwritten in place.
        config: Not used by this function.
        tf_checkpoint_path: Path of the TensorFlow checkpoint.

    Returns:
        The same ``model`` object.

    Raises:
        ImportError: If TensorFlow is not installed.
        AssertionError: If a checkpoint array and its target parameter differ
            in shape; both shapes are appended to the exception args.
    """
    # The notebook never defines a module-level `logger`, so create one here.
    import logging

    logger = logging.getLogger(__name__)
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    for txt_name in names:
        name = txt_name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            logger.info("Skipping {}".format("/".join(name)))
            tf_weights.pop(txt_name, None)
            continue
        if "_slot_" in name[-1]:
            logger.info("Skipping {}".format("/".join(name)))
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
        array = tf_weights[txt_name]
        for m_name in name:
            # "block_3"-style components split into an attribute name and an index.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ["kernel", "scale", "embedding"]:
                pointer = getattr(pointer, "weight")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    # NOTE(review): this only skips the current path component,
                    # not the whole variable - confirm that is intended.
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        if scope_names[0] != "embedding":
            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    # Whatever is still in tf_weights was never popped by the loop above.
    logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
    return model

class T5LayerNorm(nn.Module):
    """Scale-only layer normalisation: divide by the root mean square, multiply by a learned weight."""

    def __init__(self, hidden_size, eps=1e-6):
        """ Construct a layernorm module in the T5 style
            No bias and no substraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        # Mean of squares over the last dimension; the mean itself is not subtracted.
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        denominator = torch.sqrt(mean_square + self.variance_epsilon)
        normalised = x / denominator
        return self.weight * normalised


class T5DenseReluDense(nn.Module):
    """Bias-free feed-forward block: d_model -> d_ff, ReLU, dropout, d_ff -> d_model."""

    def __init__(self, config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        # Dropout is applied to the activated intermediate, before projecting back.
        activated = F.relu(self.wi(hidden_states))
        return self.wo(self.dropout(activated))


class T5LayerFF(nn.Module):
    """Feed-forward sub-layer: layer norm, then T5DenseReluDense, added back to the input through dropout."""

    def __init__(self, config):
        super().__init__()
        self.DenseReluDense = T5DenseReluDense(config)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        # The residual branch uses the un-normalised input.
        transformed = self.DenseReluDense(self.layer_norm(hidden_states))
        return hidden_states + self.dropout(transformed)


class T5Attention(nn.Module):
    """Multi-head attention with an optional learned, bucketed relative position bias.

    The same module serves as self-attention (``kv`` is None) and as attention
    over another sequence (``kv`` given). Scores are not scaled before the softmax.
    """

    # Shared counter: every instance takes the next value as its `layer_id`,
    # which is also the key used in the optional `cache` dict of `forward`.
    NEW_ID = itertools.count()

    def __init__(self, config, has_relative_attention_bias=False):
        """Read the attention hyper-parameters from ``config`` and build the projections.

        Args:
            config: object exposing ``is_decoder``, ``output_attentions``,
                ``relative_attention_num_buckets``, ``d_model``, ``d_kv``,
                ``num_heads`` and ``dropout_rate``.
            has_relative_attention_bias: when True, this instance owns the
                embedding table used by :meth:`compute_bias`.
        """
        super().__init__()
        self.layer_id = next(T5Attention.NEW_ID)
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias

        self.output_attentions = config.output_attentions
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.d_model = config.d_model
        self.d_kv = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.d_kv

        # Mesh TensorFlow initialization to avoid scaling before softmax
        model_dim, heads_dim = self.d_model, self.inner_dim
        self.q = nn.Linear(model_dim, heads_dim, bias=False)
        self.k = nn.Linear(model_dim, heads_dim, bias=False)
        self.v = nn.Linear(model_dim, heads_dim, bias=False)
        self.o = nn.Linear(heads_dim, model_dim, bias=False)

        if self.has_relative_attention_bias:
            # One learned scalar per (bucket, head).
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the q/k/v/o projections.

        Heads already recorded in ``self.pruned_heads`` are ignored. An empty
        ``heads`` is a no-op.
        """
        if len(heads) == 0:
            return
        keep = torch.ones(self.n_heads, self.d_kv)
        new_heads = set(heads) - self.pruned_heads
        for head in new_heads:
            # Shift the index left by the number of lower-numbered heads already removed.
            shift = sum(1 for pruned in self.pruned_heads if pruned < head)
            keep[head - shift] = 0
        keep = keep.view(-1).contiguous().eq(1)
        index = torch.arange(len(keep))[keep].long()
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(new_heads)
        self.inner_dim = self.d_kv * self.n_heads
        self.pruned_heads = self.pruned_heads.union(new_heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Map relative positions to bucket indices for the relative attention bias.

        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        The relative position is memory_position - query_position, i.e. the
        distance in tokens from the attending position to the attended-to
        position. With bidirectional=False, positive relative positions are
        invalid and all collapse onto bucket 0.

        Small absolute distances each get their own bucket; larger distances
        share logarithmically growing buckets. Every distance >= max_distance
        lands in the same (last) bucket, and likewise on the negative side, so
        the scheme extends to sequences longer than those seen in training.

        Args:
            relative_position: an integer Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer
        Returns:
            a Tensor with the same shape as relative_position, containing
            integer values in the range [0, num_buckets)
        """
        bucket = 0
        distance = -relative_position
        if not bidirectional:
            # Only look-back distances are meaningful; clamp the rest to zero.
            distance = torch.max(distance, torch.zeros_like(distance))
        else:
            # Half of the buckets go to each sign of the offset.
            num_buckets //= 2
            bucket += (distance < 0).to(torch.long) * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
            distance = torch.abs(distance)
        # from here on `distance` is in the range [0, inf)

        # The first half of the (remaining) buckets map one-to-one onto exact distances.
        max_exact = num_buckets // 2
        is_small = distance < max_exact

        # The second half covers logarithmically wider bins up to max_distance.
        log_scaled = torch.log(distance.float() / max_exact) / math.log(max_distance / max_exact) * (
            num_buckets - max_exact
        )
        large_bucket = max_exact + log_scaled.to(torch.long)
        ceiling = torch.full_like(large_bucket, num_buckets - 1)
        large_bucket = torch.min(large_bucket, ceiling)

        bucket += torch.where(is_small, distance, large_bucket)
        return bucket

    def compute_bias(self, qlen, klen):
        """ Compute the binned relative position bias, shape (1, num_heads, qlen, klen) """
        query_pos = torch.arange(qlen, dtype=torch.long)[:, None]
        key_pos = torch.arange(klen, dtype=torch.long)[None, :]
        offsets = key_pos - query_pos  # shape (qlen, klen)
        buckets = self._relative_position_bucket(
            offsets,  # shape (qlen, klen)
            bidirectional=not self.is_decoder,
            num_buckets=self.relative_attention_num_buckets,
        )
        # The buckets are built on the default device; move them next to the embedding table.
        buckets = buckets.to(self.relative_attention_bias.weight.device)
        bias = self.relative_attention_bias(buckets)  # shape (qlen, klen, num_heads)
        return bias.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)

    def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
        """
        Self-attention (if kv is None) or attention over source sentence (provided by kv).

        Args:
            input: tensor of shape (bs, qlen, dim).
            mask: additive mask; it is folded into the position bias only when
                the bias is computed here (``position_bias`` is None).
            kv: optional tensor to attend over instead of ``input``.
            position_bias: precomputed bias added to the raw scores; required
                when this instance has no relative attention bias weights.
            cache: optional dict holding ``"slen"`` and per-``layer_id``
                ``(k, v)`` tuples; it is updated in place.
            head_mask: optional multiplier applied to the attention weights.
        Returns:
            Tuple ``(context, [weights], [position_bias])``: ``weights`` is
            included when ``output_attentions`` is set, ``position_bias`` when
            this instance has relative attention bias weights.
        Raises:
            ValueError: no ``position_bias`` given and none can be computed.
        """
        # Input is (bs, qlen, dim)
        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
        bs, qlen, _ = input.size()
        if kv is not None:
            klen = kv.size(1)
        elif cache is None:
            klen = qlen
        else:
            klen = cache["slen"] + qlen

        def split_heads(projected):
            """ (bs, len, inner_dim) -> (bs, n_heads, len, d_kv) """
            return projected.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)

        def merge_heads(per_head):
            """ (bs, n_heads, len, d_kv) -> (bs, len, inner_dim) """
            return per_head.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)

        q = split_heads(self.q(input))  # (bs, n_heads, qlen, dim_per_head)
        if kv is None:
            k = split_heads(self.k(input))  # (bs, n_heads, qlen, dim_per_head)
            v = split_heads(self.v(input))  # (bs, n_heads, qlen, dim_per_head)
        elif cache is None or self.layer_id not in cache:
            k = split_heads(self.k(kv))  # (bs, n_heads, klen, dim_per_head)
            v = split_heads(self.v(kv))  # (bs, n_heads, klen, dim_per_head)

        if cache is not None:
            if self.layer_id in cache:
                past_k, past_v = cache[self.layer_id]
                if kv is None:
                    # Self-attention: append the new keys/values to the cached ones.
                    k = torch.cat([past_k, k], dim=2)  # (bs, n_heads, klen, dim_per_head)
                    v = torch.cat([past_v, v], dim=2)  # (bs, n_heads, klen, dim_per_head)
                else:
                    # Attention over `kv`: reuse the cached projections as they are.
                    k, v = past_k, past_v
            cache[self.layer_id] = (k, v)

        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
        scores = torch.einsum("bnqd,bnkd->bnqk", q, k)  # (bs, n_heads, qlen, klen)

        if position_bias is None:
            if not self.has_relative_attention_bias:
                raise ValueError("No position_bias provided and no weights to compute position_bias")
            position_bias = self.compute_bias(qlen, klen)
            if mask is not None:
                position_bias = position_bias + mask  # (bs, n_heads, qlen, klen)

        scores += position_bias
        # Softmax in float32, then cast back to the dtype of the scores.
        weights = F.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = merge_heads(torch.matmul(weights, v))  # (bs, qlen, dim)
        context = self.o(context)

        results = [context]
        if self.output_attentions:
            results.append(weights)
        if self.has_relative_attention_bias:
            results.append(position_bias)
        return tuple(results)


class T5LayerSelfAttention(nn.Module):
    """Self-attention sub-layer: layer norm, T5Attention, dropout, residual add."""

    def __init__(self, config, has_relative_attention_bias=False):
        """Create the attention module (optionally owning the relative bias), norm and dropout."""
        super().__init__()
        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
        """Return ``(hidden_states + dropout(attention), *extra attention outputs)``."""
        attended, *extras = self.SelfAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            position_bias=position_bias,
            head_mask=head_mask,
        )
        residual = hidden_states + self.dropout(attended)
        # extras carries the attention weights and/or position bias when the attention module returns them
        return (residual, *extras)


class T5LayerCrossAttention(nn.Module):
    """Attention-over-``kv`` sub-layer: layer norm, T5Attention, dropout, residual add."""

    def __init__(self, config, has_relative_attention_bias=False):
        """Create the attention module (optionally owning the relative bias), norm and dropout."""
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
        """Attend from the normalised ``hidden_states`` over ``kv`` and add the residual.

        Returns ``(hidden_states + dropout(attention), *extra attention outputs)``.
        """
        attended, *extras = self.EncDecAttention(
            self.layer_norm(hidden_states),
            mask=attention_mask,
            kv=kv,
            position_bias=position_bias,
            head_mask=head_mask,
        )
        residual = hidden_states + self.dropout(attended)
        # extras carries the attention weights and/or position bias when the attention module returns them
        return (residual, *extras)


class T5Block(nn.Module):
    """One transformer block.

    ``self.layer`` holds, in order: self-attention, cross-attention (only
    when ``config.is_decoder`` is true), feed-forward.
    """

    def __init__(self, config, has_relative_attention_bias=False):
        """Assemble the sub-layers; the bias flag is forwarded to both attention sub-layers."""
        super().__init__()
        self.is_decoder = config.is_decoder
        sublayers = [T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)]
        if self.is_decoder:
            sublayers.append(
                T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)
            )
        sublayers.append(T5LayerFF(config))
        self.layer = nn.ModuleList(sublayers)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        head_mask=None,
    ):
        """Run self-attention, then (decoder only) cross-attention, then the feed-forward layer.

        Returns:
            hidden-states, (self-attention weights), (self-attention position bias),
            (cross-attention weights), (cross-attention position bias)
        """
        self_attn = self.layer[0](
            hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask
        )
        hidden_states = self_attn[0]
        # Everything past the hidden states: self-attention weights and/or position bias.
        extras = self_attn[1:]

        if self.is_decoder:
            cross_attn = self.layer[1](
                hidden_states,
                kv=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                head_mask=head_mask,
            )
            hidden_states = cross_attn[0]
            # Append the cross-attention weights and/or position bias.
            extras = extras + cross_attn[1:]
            feed_forward = self.layer[2]
        else:
            feed_forward = self.layer[1]

        hidden_states = feed_forward(hidden_states)
        return (hidden_states,) + extras


class T5PreTrainedModel(PreTrainedModel):
    """ Abstract base class that wires the T5 config, archive map and TF weight
        loader into ``PreTrainedModel`` and defines the weight initialisation.
    """

    config_class = T5Config
    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = "transformer"

    @property
    def dummy_inputs(self):
        """Small keyword-argument dict built from ``DUMMY_INPUTS`` / ``DUMMY_MASK``."""
        ids = torch.tensor(DUMMY_INPUTS)
        mask = torch.tensor(DUMMY_MASK)
        return {
            "decoder_input_ids": ids,
            "encoder_input_ids": ids,
            "decoder_attention_mask": mask,
        }

    def _init_weights(self, module):
        """ Initialize the weights of ``module`` according to its type """
        scale = self.config.initializer_factor  # Used for testing weights initialization

        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(scale * 1.0)
            return

        if isinstance(module, (T5Model, T5WithLMHeadModel)):
            # Mesh TensorFlow embeddings initialization
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
            module.shared.weight.data.normal_(mean=0.0, std=scale * 1.0)
            return

        if isinstance(module, T5DenseReluDense):
            # Mesh TensorFlow FF initialization
            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
            # Each projection is scaled by the inverse square root of its input width.
            for linear, fan_in in ((module.wi, self.config.d_model), (module.wo, self.config.d_ff)):
                linear.weight.data.normal_(mean=0.0, std=scale * ((fan_in) ** -0.5))
                if getattr(linear, "bias", None) is not None:
                    linear.bias.data.zero_()
            return

        if isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
            d_model = self.config.d_model
            d_kv = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=scale * ((d_model * d_kv) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=scale * (d_model ** -0.5))
            module.v.weight.data.normal_(mean=0.0, std=scale * (d_model ** -0.5))
            module.o.weight.data.normal_(mean=0.0, std=scale * ((n_heads * d_kv) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=scale * ((d_model) ** -0.5))


class T5Stack(T5PreTrainedModel):
    """A stack of ``config.num_layers`` T5Block modules followed by a final layer norm.

    Acts as a decoder (causal self-attention mask plus cross-attention inputs)
    when ``config.is_decoder`` is true, otherwise as an encoder.
    """

    def __init__(self, config):
        super().__init__(config)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.is_decoder = config.is_decoder

        # Only the first block owns relative-attention-bias weights; forward()
        # hands the bias it returns to every later block.
        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.init_weights()

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
    ):
        """Run the embedded inputs through every block.

        Args:
            hidden_states: input embeddings; the first two dimensions are read
                as (batch_size, seq_length).
            attention_mask: 2-D padding mask or 3-D mask, with 1 for positions to
                attend to and 0 for masked positions. Defaults to all ones.
            encoder_hidden_states: states the decoder blocks attend over.
            encoder_attention_mask: 2-D or 3-D mask for ``encoder_hidden_states``
                (decoder only). Defaults to all ones.
            head_mask: 1-D (num_heads) or 2-D (num_layers, num_heads) tensor,
                1.0 meaning the head is kept.
        Returns:
            Tuple: last-layer hidden state, (all hidden states), (all attentions).
        Raises:
            ValueError: if a mask is neither 2-D nor 3-D, or if the stack is a
                decoder and neither ``encoder_hidden_states`` nor
                ``encoder_attention_mask`` is given.
        """
        batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
        if attention_mask is None:
            attention_mask = torch.ones(batch_size, seq_length, device=hidden_states.device)
        if self.is_decoder and encoder_attention_mask is None:
            if encoder_hidden_states is None:
                raise ValueError(
                    "A decoder stack needs encoder_hidden_states (or an explicit encoder_attention_mask)"
                )
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length, device=hidden_states.device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.config.is_decoder:
                seq_ids = torch.arange(seq_length, device=hidden_states.device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                causal_mask = causal_mask.to(attention_mask)
                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            # Without this branch the code below would fail with an opaque UnboundLocalError.
            raise ValueError(
                "attention_mask must have 2 or 3 dimensions, got shape {}".format(tuple(attention_mask.shape))
            )

        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
        # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -1e9 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9

        if self.is_decoder:
            # If a 2D or 3D attention mask is provided for the cross-attention
            # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if encoder_attention_mask.dim() == 3:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
            elif encoder_attention_mask.dim() == 2:
                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
            else:
                raise ValueError(
                    "encoder_attention_mask must have 2 or 3 dimensions, got shape {}".format(
                        tuple(encoder_attention_mask.shape)
                    )
                )

            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
            # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))

            encoder_extended_attention_mask = encoder_extended_attention_mask.to(
                dtype=next(self.parameters()).dtype
            )  # fp16 compatibility
            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = (
                    head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
                )  # We can specify head_mask for each layer
            head_mask = head_mask.to(
                dtype=next(self.parameters()).dtype
            )  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_layers

        all_hidden_states = ()
        all_attentions = ()
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(hidden_states)
        for i, layer_module in enumerate(self.block):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                head_mask=head_mask[i],
            )
            # layer_outputs is a tuple with:
            # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            hidden_states = layer_outputs[0]
            if i == 0:
                # We share the position biases between the layers - the first layer stores them.
                # The index of each bias shifts by one when attention weights are also returned.
                position_bias = layer_outputs[2 if self.output_attentions else 1]
                if self.is_decoder:
                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)


# Shared docstring preamble (paper reference and `config` parameter description).
# It is passed to `add_start_docstrings` when decorating the T5 model classes below.
T5_START_DOCSTRING = r"""    The T5 model was proposed in
    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
    refer to the PyTorch documentation for all matter related to general usage and behavior.
    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
        https://arxiv.org/abs/1910.10683
    .. _`torch.nn.Module`:
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

# Shared description of the common model inputs (input_ids, attention_mask, head_mask).
# It is passed to `add_start_docstrings` when decorating the T5 model classes below.
# NOTE(review): the [CLS]/[SEP] formatting example in this text looks carried over from
# BERT-style documentation; this file's tokenizer is SentencePiece-based, so these tokens
# may not exist for T5 — confirm before relying on it.
T5_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
            (a) For sequence pairs:
                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
            (b) For single sequences:
                ``tokens:         [CLS] the dog is hairy . [SEP]``
            T5 is a model with relative position embeddings so you should be able to pad the inputs on
            the right or the left.
            Indices can be obtained using :class:`transformers.T5Tokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""


@add_start_docstrings(
    "The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
    T5_START_DOCSTRING,
    T5_INPUTS_DOCSTRING,
)
class T5Model(T5PreTrainedModel):
    r"""
    Encoder-decoder model: a shared token embedding feeding an encoder ``T5Stack`` and a decoder ``T5Stack``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = T5Model.from_pretrained('t5-small')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids=input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """

    def __init__(self, config):
        super().__init__(config)
        # Token embedding used for both encoder and decoder inputs.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        self.encoder = T5Stack(encoder_config)

        # The decoder gets its own config copy so that is_decoder can be flipped
        # without affecting the encoder or the caller's config.
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = T5Stack(decoder_config)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the shared token embedding module."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared token embedding module."""
        self.shared = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the encoder self-attention layers.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            # T5Stack keeps its blocks in `.block`; inside a block the self-attention
            # sub-layer is `.layer[0]` and the attention module is `.SelfAttention`.
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    def forward(self, **kwargs):
        """Encode (unless encoder states are supplied) and decode.

        Keyword arguments come in 3 flavors: encoder-specific (prefixed by
        ``encoder_``), decoder-specific (prefixed by ``decoder_``) and those
        that apply to the model as a whole. The prefix is stripped before the
        argument is forwarded, and a prefixed argument overrides the common one
        of the same name.

        Returns:
            The decoder output tuple followed by the encoder output tuple. The
            encoder part is empty when ``encoder_hidden_states`` was supplied.
        """
        kwargs_common = {
            k: v for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
        }
        kwargs_encoder = kwargs_common.copy()
        kwargs_decoder = kwargs_common.copy()
        kwargs_encoder.update({k[len("encoder_") :]: v for k, v in kwargs.items() if k.startswith("encoder_")})
        kwargs_decoder.update({k[len("decoder_") :]: v for k, v in kwargs.items() if k.startswith("decoder_")})

        # Encode if needed (training, first prediction pass)
        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
        encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
        if encoder_hidden_states is None:
            # Convert encoder inputs in embeddings if needed
            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
            if hidden_states is None:
                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings

            if encoder_attention_mask is not None:
                # Zero out the embeddings of masked positions before encoding.
                encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
                hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)

            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
            encoder_hidden_states = encoder_outputs[0]
        else:
            encoder_outputs = ()

        # Decode
        # Convert decoder inputs in embeddings if needed
        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
        if hidden_states is None:
            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
            hidden_states = self.shared(decoder_inputs_ids)

        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
        kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)

        return decoder_outputs + encoder_outputs


@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5WithLMHeadModel(T5PreTrainedModel):
    r"""
        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the language modeling loss. ``forward`` reads them from the
            ``decoder_lm_labels`` keyword argument.
            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``.
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when labels are provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Language modeling loss.
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = T5Tokenizer.from_pretrained('t5-small')
        model = T5WithLMHeadModel.from_pretrained('t5-small')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids=input_ids, decoder_lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]
    """

    def __init__(self, config):
        """Build the shared embedding, encoder stack, decoder stack and LM head."""
        super().__init__(config)
        self.model_dim = config.d_model

        # One embedding table serves both the encoder and the decoder inputs.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # Each stack gets its own config copy so the decoder flag does not
        # leak into the encoder.
        encoder_config = copy.deepcopy(config)
        self.encoder = T5Stack(encoder_config)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        self.decoder = T5Stack(decoder_config)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the shared token embedding module."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared token embedding module."""
        self.shared = new_embeddings

    def get_output_embeddings(self):
        """Return the linear layer projecting decoder states onto the vocabulary."""
        return self.lm_head

    def forward(self, **kwargs):
        """Run encoder (unless its output is supplied), decoder and LM head.

        Keyword arguments are routed by prefix: ``encoder_*`` go to the
        encoder, ``decoder_*`` go to the decoder, and un-prefixed ones go to
        both. A prefixed argument overrides the un-prefixed one of the same
        name. ``decoder_lm_labels`` is consumed here to compute the loss.

        Returns the decoder outputs (with logits first, preceded by the loss
        when labels were given) followed by the encoder outputs; the encoder
        part is empty when ``encoder_hidden_states`` was passed in.
        """
        labels = kwargs.pop("decoder_lm_labels", None)

        enc_prefix, dec_prefix = "encoder_", "decoder_"
        encoder_kwargs, decoder_kwargs = {}, {}

        # First pass: arguments without a prefix are shared by both stacks.
        for key, value in kwargs.items():
            if not key.startswith(enc_prefix) and not key.startswith(dec_prefix):
                encoder_kwargs[key] = value
                decoder_kwargs[key] = value

        # Second pass: prefixed arguments win over the shared ones.
        for key, value in kwargs.items():
            if key.startswith(enc_prefix):
                encoder_kwargs[key[len(enc_prefix):]] = value
            elif key.startswith(dec_prefix):
                decoder_kwargs[key[len(dec_prefix):]] = value

        # Encode if needed (training, first prediction pass)
        encoder_outputs = ()
        encoder_hidden_states = encoder_kwargs.pop("hidden_states", None)
        if encoder_hidden_states is None:
            encoder_embeds = encoder_kwargs.pop("inputs_embeds", None)
            if encoder_embeds is None:
                # Token ids were given: look them up in the shared table.
                encoder_embeds = self.shared(encoder_kwargs.pop("input_ids"))

            encoder_outputs = self.encoder(encoder_embeds, **encoder_kwargs)
            encoder_hidden_states = encoder_outputs[0]

        # Decode
        decoder_embeds = decoder_kwargs.pop("inputs_embeds", None)
        if decoder_embeds is None:
            decoder_embeds = self.shared(decoder_kwargs.pop("input_ids"))

        decoder_kwargs["encoder_hidden_states"] = encoder_hidden_states
        decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None)
        decoder_outputs = self.decoder(decoder_embeds, **decoder_kwargs)

        # Rescale output before projecting on vocab
        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
        scaled_output = decoder_outputs[0] * (self.model_dim ** -0.5)
        lm_logits = self.lm_head(scaled_output)

        # Logits replace the raw decoder states; hidden states and attentions
        # (if present) are kept behind them.
        outputs = (lm_logits,) + decoder_outputs[1:]
        if labels is not None:
            # Position t predicts label t + 1.
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            flat_logits = shift_logits.view(-1, shift_logits.size(-1))
            flat_labels = shift_labels.view(-1)
            loss = CrossEntropyLoss(ignore_index=-100)(flat_logits, flat_labels)
            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
            outputs = (loss,) + outputs

        return outputs + encoder_outputs



In [None]:
class T5ForGoogleQuest(nn.Module):
    """T5 encoder-decoder with a pooled linear head for the QUEST targets.

    The pretrained T5 weights and config are read from
    ``MODEL_PRETRAINED_WEIGHTS_PATH``; the token embedding is resized to the
    module-level ``tokenizer``. The transformer output is pooled with
    ``SequenceSummary`` and projected to ``n_classes`` raw logits (no sigmoid
    is applied here).

    Args:
        n_classes: number of output logits per example.
    """

    def __init__(self, n_classes=30):
        super(T5ForGoogleQuest, self).__init__()
        self.model_name = 'T5ForGoogleQuest'
        self.config = T5Config.from_json_file(
            MODEL_PRETRAINED_WEIGHTS_PATH + '/t5-base-config.json'
        )
        self.transformer_model = T5Model.from_pretrained(
            MODEL_PRETRAINED_WEIGHTS_PATH + '/t5-base-pytorch_model.bin', 
            config=self.config
        )
        # Make room in the embedding table for every token the tokenizer knows.
        self.transformer_model.resize_token_embeddings(len(tokenizer))
        self.sequence_summary = SequenceSummary(
            config=self.config
        )
        # Size the head from the loaded config instead of a literal 768 so the
        # class keeps working if a differently sized T5 config is swapped in.
        # Assumes SequenceSummary keeps the feature width at d_model, as the
        # previous hard-coded 768 already did for t5-base — TODO confirm for
        # configs that enable a summary projection.
        self.fc = nn.Sequential(
            nn.Dropout(p = 0.2),
            nn.Linear(self.config.d_model, n_classes)
        )

    def forward(self, ids):
        """Return raw logits for a batch of token ids.

        ``ids`` is passed un-prefixed, so the same ids and mask go to both
        the encoder and the decoder of the T5 model.
        """
        # Every id greater than 0 is treated as a real token, 0 as padding.
        attention_mask = (ids > 0).float()
        seq_out = self.transformer_model(
            input_ids=ids, attention_mask=attention_mask)[0]
        pool_out = self.sequence_summary(seq_out)
        logits = self.fc(pool_out)
        return logits
    
# def test_model():
#     x = torch.tensor([[1,2,3,4,5, 0, 0], [1,2,3,4,5, 0, 0]])
#     model = T5ForGoogleQuest()
#     y = model(x)
#     print(y)
    
# test_model()

# 4. Training Utility Functions

In [None]:
##########################
# Training Utility Functions
##########################
from scipy.stats import spearmanr

def training_with_accumulation(model, train_loader, optimizer, criterion, scheduler):
    """Train `model` for one epoch with gradient accumulation.

    Gradients are accumulated over ``BATCH_ACCUMULATION_COUNT`` consecutive
    batches before each optimizer/scheduler step. The per-batch loss is not
    rescaled, so the accumulated gradient is the sum over the window.

    When the number of batches is not a multiple of the accumulation count,
    the gradients of the trailing partial window are applied on the last
    batch. Previously they were silently thrown away by the next epoch's
    ``optimizer.zero_grad()``. This adds at most one optimizer/scheduler step
    per epoch.

    Args:
        model: module called as ``model(token_ids.long())``.
        train_loader: sized iterable yielding ``(token_ids, _, labels)``.
        optimizer: optimizer to step.
        criterion: loss called as ``criterion(logits, labels)``.
        scheduler: learning-rate scheduler stepped after each optimizer step.

    Returns:
        Mean of the per-batch losses over the epoch.
    """
    model.train()
    avg_loss = 0.
    n_batches = len(train_loader)
    optimizer.zero_grad()
    
    bar = tqdm.tqdm_notebook(
        enumerate(train_loader), 
        total=n_batches, 
        postfix={"train_loss":0.0,}
    )
    for idx, batch in bar:
        
        token_ids, _, labels = batch
        token_ids, labels = token_ids.to(DEVICE), labels.to(DEVICE)
        
        logits = model(token_ids.long())
        loss = criterion(logits, labels)
        loss.backward()

        window_complete = (idx + 1) % BATCH_ACCUMULATION_COUNT == 0
        last_batch = (idx + 1) == n_batches
        # Step at the end of every full window, and once more at the end of
        # the epoch so a trailing partial window is not lost.
        if window_complete or last_batch:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        batch_loss = loss.item()
        avg_loss += batch_loss / n_batches
        
        bar.set_postfix(ordered_dict={
            "train_loss":batch_loss,
        })
        del token_ids, labels

    torch.cuda.empty_cache()
    gc.collect()
    
    return avg_loss


def validate_model(model, val_loader, target_cols, batch_size=4, verbose=False):
    """Evaluate `model` on `val_loader`; return (mean loss, mean Spearman).

    The loss is computed with the module-level ``criterion`` on the raw
    logits, the same way the training loop calls it. The previous version
    applied ``torch.sigmoid`` first and then passed the result to the
    criterion, which this notebook defines as ``nn.BCEWithLogitsLoss``, so the
    reported validation loss was computed on doubly-squashed values. The
    sigmoid is now only used for the predictions that feed the Spearman score.

    Rows are written at a running offset based on each batch's actual size,
    and outputs are reshaped instead of squeezed, so a short final batch or a
    single target column cannot misalign or break the assignment.

    Args:
        model: module called as ``model(token_ids.long())``.
        val_loader: sized iterable of ``(token_ids, _, labels)`` exposing a
            ``num`` attribute used as the total number of rows.
        target_cols: names of the target columns (one per model output).
        batch_size: kept for backward compatibility; no longer used.
        verbose: when True, print the Spearman correlation of every target.

    Returns:
        Tuple ``(avg_val_loss, mean_spearman)``; NaN correlations count as 0.
    """
    avg_val_loss = 0.
    model.eval()

    n_targets = len(target_cols)
    y_preds = np.zeros((val_loader.num, n_targets))
    y_true = np.zeros((val_loader.num, n_targets))

    offset = 0
    with torch.no_grad():

        for batch in val_loader:
            token_ids, _, labels = batch
            token_ids, labels = token_ids.to(DEVICE), labels.to(DEVICE)

            logits = model(token_ids.long())

            # NOTE: `criterion` is read from module scope (set by the
            # training cell), not passed in.
            avg_val_loss += criterion(logits, labels).item() / len(val_loader)

            probs = torch.sigmoid(logits)
            n_rows = probs.shape[0]
            y_preds[offset:offset + n_rows] = probs.detach().cpu().numpy().reshape(n_rows, -1)
            y_true[offset:offset + n_rows] = labels.detach().cpu().numpy().reshape(n_rows, -1)
            offset += n_rows

            del token_ids, labels

        torch.cuda.empty_cache()
        gc.collect()

        score = 0
        for i, col in enumerate(target_cols):
            # spearmanr yields NaN for a constant column; count that as 0.
            spear = np.nan_to_num(spearmanr(y_true[:, i], y_preds[:, i]).correlation)
            score += spear
            if verbose:
                print('Target Column {} : {}'.format(col, spear))

    return avg_val_loss, score/len(target_cols)


def predict(model, test_loader, target_cols, batch_size=BATCH_SIZE):
    """Return sigmoid predictions of `model` for every row of `test_loader`.

    Rows are written at a running offset based on each batch's actual size,
    and outputs are reshaped instead of squeezed, so the result no longer
    depends on `batch_size` matching the loader and a single target column
    does not break the assignment.

    Args:
        model: module called as ``model(token_ids.long())``.
        test_loader: iterable of ``(token_ids, _)`` exposing a ``num``
            attribute used as the total number of rows.
        target_cols: names of the target columns (one per model output).
        batch_size: kept for backward compatibility; no longer used.

    Returns:
        ``numpy`` array of shape ``(test_loader.num, len(target_cols))``.
    """
    test_preds = np.zeros((test_loader.num, len(target_cols)))

    model.eval()
    offset = 0
    tk0 = tqdm.tqdm_notebook(enumerate(test_loader))
    with torch.no_grad():
        for _, x_batch in tk0:
            token_ids, _ = x_batch
            token_ids = token_ids.to(DEVICE)
            predictions = torch.sigmoid(model(token_ids.long()))
            n_rows = predictions.shape[0]
            test_preds[offset:offset + n_rows] = predictions.detach().cpu().numpy().reshape(n_rows, -1)
            offset += n_rows

    return test_preds


# 5. Fitting Starts Here
Inspired by Nirjhar's [kernel](https://www.kaggle.com/phoenix9032/pytorch-bert-plain)

## 5.1 Question Related Targets

In [None]:
# Best validation Spearman of every trained fold, appended by the training
# cells below and averaged in the cross-validation summary cell.
# Re-running a training cell appends again, so reset this list first.
best_scores = []

In [None]:
##########################
# Question Related Targets
##########################
# Trains one model per selected fold on the question-level targets, keeps the
# checkpoint with the best validation Spearman and records that score.
from sklearn.model_selection import GroupKFold

target_cols = output_categories_question
target_string = 'questions'
target_level = 0

if TRAINING:
    
    # NOTE(review): `gkf` is not read anywhere in this cell; the loaders are
    # selected through `ifold` below. Kept in case other code relies on it.
    gkf = GroupKFold(n_splits=N_SPLIT).split(
        X=train.question_body, groups=train.question_body
    )
    for fold in range(N_SPLIT):

        if fold not in FOLD_ID:
            continue

        train_loader, val_loader = get_train_val_loaders(target_cols=target_cols, target_level=target_level, ifold=fold)
        ckpt_path = os.path.join(WEIGHT_PATH, "model_{}_{}_{}.ckpt".format(VERSION, target_string, fold))

        model = T5ForGoogleQuest(n_classes=len(target_cols))
        model.zero_grad()
        model.to(DEVICE)
        torch.cuda.empty_cache()

        if EPOCH_RELEASE > 0:
            # Freeze the transformer for the first EPOCH_RELEASE epochs.
            for param in model.transformer_model.parameters():
                param.requires_grad = False
            # NOTE(review): `shared.parameters()` enumerates parameter
            # tensors, not vocabulary rows. If `shared` is a plain embedding
            # it yields a single tensor (i == 0), so this condition looks
            # like it never fires — confirm the intent.
            for i, param in enumerate(model.transformer_model.shared.parameters()):
                if i >= len(tokenizer)-num_added_tokens:
                    param.requires_grad = True

        model.train()

        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.9},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LR, eps=4e-5)
        # Keep the name `criterion`: validate_model reads it as a global.
        criterion = nn.BCEWithLogitsLoss()
        # NOTE(review): `num_warmup_steps` is a step count in the scheduler
        # API; 0.05 appears to act as a single warmup step rather than 5% of
        # training — confirm the intent before changing it.
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=0.05, 
            num_training_steps= EPOCHS*len(train_loader)//BATCH_ACCUMULATION_COUNT)

        train_start_time = datetime.datetime.now()
        # Start below any reachable score so the first epoch always writes a
        # checkpoint; otherwise the reload after the loop could fail or pick
        # up a stale file when the score never exceeds 0.
        best_score = float('-inf')
        # Must be reset per fold. It used to be left uninitialised here
        # (`reduce_lr_count` was set instead), which raised NameError on a
        # non-improving first epoch or carried the count over from the
        # previous fold.
        early_stopping_count = 0
        for epoch in range(EPOCHS):
            epoch_start_time = datetime.datetime.now()
            torch.cuda.empty_cache()

            if epoch == EPOCH_RELEASE:
                # Unfreeze the whole transformer from this epoch on.
                for param in model.transformer_model.parameters():
                    param.requires_grad = True

            avg_loss = training_with_accumulation(
                model, train_loader, optimizer, criterion, scheduler)
            avg_val_loss, val_spearmanr = validate_model(
                model, val_loader, target_cols=target_cols, batch_size=4)

            print("Epoch {} : {} seconds : train loss {:.4f} : valid loss {:.4f} : valid spearmanr {:.4f}".format(
                epoch, (datetime.datetime.now() - epoch_start_time).seconds, avg_loss, avg_val_loss, val_spearmanr))

            if val_spearmanr > best_score:
                best_score = val_spearmanr
                torch.save(model.state_dict(), ckpt_path)
                early_stopping_count = 0
            else:
                early_stopping_count += 1
                if early_stopping_count == EARLY_STOPPING:
                    print("Early Stopping : ", epoch)
                    break

        print('-'*20)
        print("Fold {} : Total Training Time {}, Best Score : {}".format(
            fold, datetime.datetime.now()-train_start_time, best_score))
        print('-'*20)
        
        # Re-score the best checkpoint and record it for the CV summary.
        model.load_state_dict(torch.load(ckpt_path, map_location=DEVICE))
        avg_val_loss, val_spearmanr = validate_model(
            model, val_loader, target_cols=target_cols, batch_size=4, verbose=True)
        best_scores.append(val_spearmanr)
        
        del model
        gc.collect()
    

## 5.2 Answer Related Targets

In [None]:
##########################
# Answer Related Targets
##########################
# Trains one model per selected fold on the answer-level targets, keeps the
# checkpoint with the best validation Spearman and records that score.
from sklearn.model_selection import GroupKFold

target_cols = output_categories_answer
target_string = 'answers'
target_level = 1

if TRAINING:
    
    # NOTE(review): `gkf` is not read anywhere in this cell; the loaders are
    # selected through `ifold` below. Kept in case other code relies on it.
    gkf = GroupKFold(n_splits=N_SPLIT).split(
        X=train.question_body, groups=train.question_body
    )
    for fold in range(N_SPLIT):

        if fold not in FOLD_ID:
            continue

        train_loader, val_loader = get_train_val_loaders(target_cols=target_cols, target_level=target_level, ifold=fold)
        ckpt_path = os.path.join(WEIGHT_PATH, "model_{}_{}_{}.ckpt".format(VERSION, target_string, fold))

        model = T5ForGoogleQuest(n_classes=len(target_cols))
        model.zero_grad()
        model.to(DEVICE)
        torch.cuda.empty_cache()

        if EPOCH_RELEASE > 0:
            # Freeze the transformer for the first EPOCH_RELEASE epochs.
            for param in model.transformer_model.parameters():
                param.requires_grad = False
            # NOTE(review): `shared.parameters()` enumerates parameter
            # tensors, not vocabulary rows. If `shared` is a plain embedding
            # it yields a single tensor (i == 0), so this condition looks
            # like it never fires — confirm the intent.
            for i, param in enumerate(model.transformer_model.shared.parameters()):
                if i >= len(tokenizer)-num_added_tokens:
                    param.requires_grad = True

        model.train()

        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.9},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LR, eps=4e-5)
        # Keep the name `criterion`: validate_model reads it as a global.
        criterion = nn.BCEWithLogitsLoss()
        # NOTE(review): `num_warmup_steps` is a step count in the scheduler
        # API; 0.05 appears to act as a single warmup step rather than 5% of
        # training — confirm the intent before changing it.
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=0.05, 
            num_training_steps= EPOCHS*len(train_loader)//BATCH_ACCUMULATION_COUNT)

        train_start_time = datetime.datetime.now()
        # Start below any reachable score so the first epoch always writes a
        # checkpoint; otherwise the reload after the loop could fail or pick
        # up a stale file when the score never exceeds 0.
        best_score = float('-inf')
        early_stopping_count = 0
        for epoch in range(EPOCHS):
            epoch_start_time = datetime.datetime.now()
            torch.cuda.empty_cache()

            if epoch == EPOCH_RELEASE:
                # Unfreeze the whole transformer from this epoch on.
                for param in model.transformer_model.parameters():
                    param.requires_grad = True

            avg_loss = training_with_accumulation(
                model, train_loader, optimizer, criterion, scheduler)
            avg_val_loss, val_spearmanr = validate_model(
                model, val_loader, target_cols=target_cols, batch_size=4)

            print("Epoch {} : {} seconds : train loss {:.4f} : valid loss {:.4f} : valid spearmanr {:.4f}".format(
                epoch, (datetime.datetime.now() - epoch_start_time).seconds, avg_loss, avg_val_loss, val_spearmanr))

            if val_spearmanr > best_score:
                best_score = val_spearmanr
                torch.save(model.state_dict(), ckpt_path)
                early_stopping_count = 0
            else:
                early_stopping_count += 1
                if early_stopping_count == EARLY_STOPPING:
                    print("Early Stopping : ", epoch)
                    break

        print('-'*20)
        print("Fold {} : Total Training Time {}, Best Score : {}".format(
            fold, datetime.datetime.now()-train_start_time, best_score))
        print('-'*20)
        
        # Re-score the best checkpoint and record it for the CV summary.
        model.load_state_dict(torch.load(ckpt_path, map_location=DEVICE))
        avg_val_loss, val_spearmanr = validate_model(
            model, val_loader, target_cols=target_cols, batch_size=4, verbose=True)
        best_scores.append(val_spearmanr)
        
        del model
        gc.collect()
    

In [None]:
if TRAINING:
    # `best_scores` holds the question folds first, then the answer folds.
    # Each fold score is weighted by its share of the 30 targets, spread
    # evenly over the trained folds.
    n_folds = len(FOLD_ID)
    question_weight = len(output_categories_question) / (30 * n_folds)
    answer_weight = len(output_categories_answer) / (30 * n_folds)
    fold_weights = [question_weight] * n_folds + [answer_weight] * n_folds

    cv_score = np.average(best_scores, weights=fold_weights)
    print('Fold {}: Cross Validation Spearman Correlation Coefficient : {}'.format(FOLD_ID, cv_score))

# 6. Inference

In [None]:
##########################
# Inference Question
##########################
# Average the question-target predictions of every selected fold's checkpoint.

test_loader = get_test_loader(target_level=0)

y_preds_question = np.zeros((test_loader.num, len(output_categories_question)))
for fold in range(N_SPLIT):
    
    if fold not in FOLD_ID:
        continue
        
    model = T5ForGoogleQuest(n_classes=len(output_categories_question))
    model.to(DEVICE)
    torch.cuda.empty_cache()
    
    # map_location lets checkpoints written on a GPU load on whatever DEVICE
    # resolved to here (including the CPU fallback).
    model.load_state_dict(torch.load(
        os.path.join(WEIGHT_PATH, "model_{}_questions_{}.ckpt".format(VERSION, fold)),
        map_location=DEVICE))
    y_preds_question += predict(model, test_loader, output_categories_question) / len(FOLD_ID)
    
##########################
# Inference Answer
##########################
# Average the answer-target predictions of every selected fold's checkpoint.

test_loader = get_test_loader(target_level=1)

y_preds_answer = np.zeros((test_loader.num, len(output_categories_answer)))
for fold in range(N_SPLIT):
    
    if fold not in FOLD_ID:
        continue
        
    model = T5ForGoogleQuest(n_classes=len(output_categories_answer))
    model.to(DEVICE)
    torch.cuda.empty_cache()
    
    # map_location lets checkpoints written on a GPU load on whatever DEVICE
    # resolved to here (including the CPU fallback).
    model.load_state_dict(torch.load(
        os.path.join(WEIGHT_PATH, "model_{}_answers_{}.ckpt".format(VERSION, fold)),
        map_location=DEVICE))
    y_preds_answer += predict(model, test_loader, output_categories_answer) / len(FOLD_ID)
    

# 7. Postprocessing

In [None]:
##########################
# Initial Submission DataFrame
##########################
# Start from the sample submission and overwrite every column from the first
# target onwards with the model predictions (question columns first, then
# answer columns, in prediction order).

submission = pd.read_csv('{}/sample_submission.csv'.format(LOCAL_PATH))

y_preds = np.hstack((y_preds_question, y_preds_answer))

first_target_col = 'question_asker_intent_understanding'
submission.loc[:, first_target_col:] = y_preds
submission.head()


In [None]:
##########################
# Postprocessing for question_type_spelling
##########################

# Substrings counted in the question title and body; the total count is
# mapped to a score contribution by `rule_large`. Matching is case-sensitive
# substring counting, so 'pronounced' also counts as a hit for 'pronounce'.
vocab_list_large = [
    'pronounced', 'pronounce', 'pronunciation', 'correct adjective', 'How many syllables', 'spell'
]
def rule_large(x):
    """Map a keyword hit count to a score: 0 -> 0.0, 1 -> 1/3, otherwise 2/3."""
    if x == 0:
        return 0.0
    return 1/3 if x == 1 else 2/3
    
# Substrings counted in the question title and body; the total count is
# mapped to a (much smaller) score contribution by `rule_base`. Matching is
# case-sensitive substring counting.
vocab_list_base = [
    'sound', 'prefix', 'adjective', 'verb', 'noun', 'word', 'Ngram', 'conversation', 'syllable'
]
def rule_base(x):
    """Map a keyword hit count to a score: 0 -> 0.0, 1 -> 1/64, otherwise 1/32."""
    if x == 0:
        return 0.0
    return 1/64 if x == 1 else 1/32
    
def _keyword_hits(text, keywords):
    """Total number of substring occurrences of every keyword in `text`."""
    return sum([text.count(keyword) for keyword in keywords])


def _question_keyword_hits(keywords):
    """Per-row keyword hits over the question title plus the question body."""
    title_hits = test['question_title'].apply(lambda text: _keyword_hits(text, keywords))
    body_hits = test['question_body'].apply(lambda text: _keyword_hits(text, keywords))
    return title_hits + body_hits


# Rule-based score: strong keywords and weak keywords each contribute through
# their own count-to-score rule.
y_preds_question_type_spelling = (
    _question_keyword_hits(vocab_list_large).apply(rule_large)
    + _question_keyword_hits(vocab_list_base).apply(rule_base)
)

# Questions from the two English-language sites get an extra 1/6.
stackexchange_particles = [
    ('ell.stackexchange.com' in url) or ('english.stackexchange.com' in url)
    for url in test['url']
]
spelling = [1/6 if from_english_site else 0. for from_english_site in stackexchange_particles]

y_preds_question_type_spelling = y_preds_question_type_spelling + np.array(spelling)

# The rule-based score replaces the model output for this column; every row
# outside the CULTURE category is then forced to 0.
submission['question_type_spelling'] = y_preds_question_type_spelling
submission.loc[test['category']!='CULTURE', 'question_type_spelling'] = 0.0
submission.head()

# 8. Submission

In [None]:
# Write the final predictions to submission.csv in the working directory,
# without the DataFrame index column.
submission.to_csv('submission.csv', index=False)