# Correlation Networks for Extreme Multi-label Text Classification in Google Colaboratory
This is a Python implementation of the paper  "[Correlation Networks for Extreme Multi-label Text Classification](https://www.semanticscholar.org/paper/Correlation-Networks-for-Extreme-Multi-label-Text-Xun-Jha/2528e161a0e2d4bdb2b0482ec5866a3914d0758b)" by *Guangxu Xun, Kishlay Jha, Jianhui Sun, Aidong Zhang*.

## Baselines
This implementation is based on the work by [XunGuangxu](https://github.com/XunGuangxu); the code for the baseline model is adapted from the repository available [here](https://github.com/XunGuangxu/CorNet).

# Acknowledgments
Machine Learning Project © Course held by Professor [Paolo Frasconi](https://www.unifi.it/p-doc2-2016-200006-F-3f2a3d2f332b2c-0.html) - Computer Engineering Master Degree @[University of Florence](https://www.unifi.it/changelang-eng.html)

Mount Google Drive and set the base directory where the models are stored, so that all saved model parameters are always available on the shared drive.

In [None]:
# Mount Google Drive at /content/gdrive; force_remount=True is passed so the
# mount is requested again even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Drive folder used as the root for loading/saving models; edit it to match your own Drive layout.
base_dir = "/content/gdrive/MyDrive/Machine_Learning/" # Specify your Drive dir where to load/save models

# Datasets
Below we will extract and manage the datasets that have been used for the experiments.
We use two benchmark datasets:

1.   One medium-scale dataset: ***AmazonCat-13k***
2.   One small-scale dataset: ***EUR-Lex***

For each dataset, the vocabulary size is limited to 500,000 words according to the word frequency in the training set.

Word embeddings are initialized with the 300-dimensional pretrained GloVe embeddings. Word embeddings are frozen for the EUR-Lex dataset.

Input text sequences are truncated to 500 words if longer. 



In [None]:
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize(sentence: str, sep='/SEP/'):
    """Split *sentence* into word tokens with NLTK and normalise them.

    Tokens that contain no word character at all (pure punctuation) are
    dropped. Every remaining token is lower-cased, except a token equal to
    *sep*, which is kept verbatim.
    """
    # A /SEP/ symbol is placed between titles and descriptions (e.g. in the Amazon datasets).
    kept = []
    for token in word_tokenize(sentence):
        if not re.sub(r'[^\w]', '', token):
            # nothing left once non-word characters are removed
            continue
        kept.append(token if token == sep else token.lower())
    return kept

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from typing import Union, Iterable
from gensim.models import KeyedVectors
from collections import Counter
import numpy as np
def build_vocab(texts: Iterable, w2v_model: Union[KeyedVectors, str], vocab_size=500000,
                pad='<PAD>', unknown='<UNK>', sep='/SEP/', max_times=1, freq_times=1):
    """Build a vocabulary and its initial embedding matrix from *texts*.

    Parameters
    ----------
    texts : iterable of whitespace-tokenised strings.
    w2v_model : a loaded ``KeyedVectors`` model, or a path given to ``KeyedVectors.load``.
    vocab_size : maximum number of entries, *pad* and *unknown* included.
    pad, unknown : special tokens placed at indices 0 and 1.
    sep : separator token; it receives the embedding of ``'.'``.
    max_times : iteration stops at the first token whose frequency is below this value.
    freq_times : minimum frequency for tokens missing from *w2v_model*.

    Returns
    -------
    ``(vocab, emb_init)`` as NumPy arrays. *pad* gets a zero vector; *unknown*
    and tokens missing from *w2v_model* get uniform random vectors in [-1, 1).
    """
    if isinstance(w2v_model, str):
        w2v_model = KeyedVectors.load(w2v_model)
    dim = w2v_model.vector_size

    def random_vector():
        return np.random.uniform(-1.0, 1.0, dim)

    vocab = [pad, unknown]
    emb_init = [np.zeros(dim), random_vector()]

    # Document frequency: a token counts at most once per text.
    doc_freq = Counter()
    for text in texts:
        doc_freq.update(set(text.split()))

    def rank(item):
        token, count = item
        # most frequent first; for equal counts, tokens known to the model first
        return count, token in w2v_model

    for word, freq in sorted(doc_freq.items(), key=rank, reverse=True):
        if freq >= freq_times or word in w2v_model:
            vocab.append(word)
            # The embedding of '.' is used for the '/SEP/' symbol.
            lookup = '.' if word == sep else word
            emb_init.append(w2v_model[lookup] if lookup in w2v_model else random_vector())
        # Stop when the frequency drops below max_times or the vocabulary is full.
        if freq < max_times or len(vocab) == vocab_size:
            break
    return np.asarray(vocab), np.asarray(emb_init)

In [None]:
from tqdm import tqdm
import joblib
from sklearn.preprocessing import MultiLabelBinarizer
def truncate_text(texts, max_len=500, padding_idx=0, unknown_idx=1):
    """Clip or right-pad every sequence in *texts* to exactly *max_len* ids.

    When *max_len* is ``None`` the input is returned untouched. Otherwise a
    2-D array is returned; a row consisting only of *padding_idx* gets
    *unknown_idx* in its first position so that no row is entirely padding.
    """
    if max_len is None:
        return texts
    rows = []
    for seq in texts:
        clipped = list(seq[:max_len])
        # a negative repeat count yields an empty list, so long rows get no padding
        clipped.extend([padding_idx] * (max_len - len(seq)))
        rows.append(clipped)
    fixed = np.asarray(rows)
    only_padding = np.all(fixed == padding_idx, axis=1)
    fixed[only_padding, 0] = unknown_idx
    return fixed

def _rows_to_array(rows):
    """Turn a list of per-line lists into an array, tolerating rows of different lengths."""
    try:
        return np.asarray(rows)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array from ragged rows; fall back
        # to a 1-D object array whose elements are the original lists.
        ragged = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            ragged[i] = row
        return ragged


def convert_to_binary(text_file, label_file=None, max_len=None, vocab=None, pad='<PAD>', unknown='<UNK>'):
    """Read tokenised texts (and optionally labels) and convert tokens to ids.

    Parameters
    ----------
    text_file : path of a file with one whitespace-tokenised text per line.
    label_file : optional path of a file with one whitespace-separated label list per line.
    max_len : forwarded to ``truncate_text``; ``None`` disables truncation/padding.
    vocab : mapping token -> id; it must contain the *pad* and *unknown* tokens.
    pad, unknown : names of the padding and out-of-vocabulary tokens.

    Returns
    -------
    ``(texts, labels)``. *texts* is the output of ``truncate_text``; *labels*
    is an array with one label list per line, or ``None`` without *label_file*.
    """
    unknown_idx = vocab[unknown]
    with open(text_file) as fp:
        texts = _rows_to_array([[vocab.get(word, unknown_idx) for word in line.split()]
                                for line in tqdm(fp, desc='Converting token to id', leave=False)])
    labels = None
    if label_file is not None:
        with open(label_file) as fp:
            labels = _rows_to_array([line.split()
                                     for line in tqdm(fp, desc='Converting labels', leave=False)])
    return truncate_text(texts, max_len, vocab[pad], unknown_idx), labels


def get_word_emb(vec_path, vocab_path=None):
    """Load the embedding matrix stored at *vec_path*.

    Parameters
    ----------
    vec_path : path of a ``.npy`` file readable by ``np.load``.
    vocab_path : optional path of a text file with one word per line.

    Returns
    -------
    The embedding array alone when *vocab_path* is ``None``; otherwise the
    tuple ``(embeddings, vocab)`` where *vocab* maps each word to its line index.
    """
    if vocab_path is None:
        return np.load(vec_path)
    with open(vocab_path) as fp:
        # Drop the line terminator so keys are the bare words, not "word\n".
        vocab = {line.rstrip('\n'): idx for idx, line in enumerate(fp)}
    return np.load(vec_path), vocab

      
def get_mlb(mlb_path, labels=None) -> MultiLabelBinarizer:
    """Return the label binarizer stored at *mlb_path*, creating it if missing.

    When no file exists at *mlb_path*, a sparse-output ``MultiLabelBinarizer``
    is fitted on *labels*, dumped to *mlb_path* with joblib and returned.
    """
    if not os.path.exists(mlb_path):
        binarizer = MultiLabelBinarizer(sparse_output=True)
        binarizer.fit(labels)
        joblib.dump(binarizer, mlb_path)
        return binarizer
    # NOTE: joblib.load unpickles the file; only point it at trusted paths.
    return joblib.load(mlb_path)

In [None]:
import os


def preprocessing(text_path=None, tokenized_path=None, label_path=None, truncate_label_path=None, vocab_path=None, emb_path=None, w2v_model=None, vocab_size=500000, max_len=500, truncate=False, max_lines=100000):
    """Tokenise, build the vocabulary and convert a dataset split to id arrays.

    Steps, in order:
      1. if *truncate* is True and *truncate_label_path* is given, copy the
         first *max_lines* lines of *label_path* there and use that file;
      2. if *tokenized_path* is given, tokenise *text_path* line by line into
         it (stopping after *max_lines* lines when *truncate* is True) and use
         that file as the text source;
      3. if *vocab_path* does not exist, build the vocabulary and initial
         embeddings from the text and save them to *vocab_path* / *emb_path*;
      4. convert texts (and labels, when *label_path* is set) with
         ``convert_to_binary`` and save them with ``np.save`` next to their
         source files, using the source path without its extension.

    Note that texts are only truncated inside step 2, so *truncate* without
    *tokenized_path* shortens the labels but not the texts.

    Raises
    ------
    AssertionError : if the number of texts and labels differ.
    """
    # Step 1: optionally keep only the first max_lines label rows.
    if truncate is True and truncate_label_path is not None:
        print('Decreasing the size of the labels...')
        with open(label_path) as lb, open(truncate_label_path, 'w') as lbout:
            for number_lines_labels, line in enumerate(tqdm(lb, desc='Decreasing'), start=1):
                lbout.write(line)
                if number_lines_labels == max_lines:
                    print(number_lines_labels)
                    break
        label_path = truncate_label_path

    # Step 2: tokenise the raw text, optionally stopping after max_lines lines.
    if tokenized_path is not None:
        print(F'Tokenizing Text... {text_path}')
        with open(text_path) as fp, open(tokenized_path, 'w') as fout:
            for number_lines_text, line in enumerate(tqdm(fp, desc='Tokenizing'), start=1):
                print(*tokenize(line), file=fout)
                if truncate is True and number_lines_text == max_lines:
                    break
        text_path = tokenized_path

    # Step 3: build the vocabulary once; later calls reuse the saved file.
    if not os.path.exists(vocab_path):
        print(F'Building Vocab. {text_path}')
        with open(text_path) as fp:
            vocab, emb_init = build_vocab(fp, w2v_model, vocab_size=vocab_size)
        np.save(vocab_path, vocab)
        np.save(emb_path, emb_init)
    vocab = {word: i for i, word in enumerate(np.load(vocab_path))}
    print(F'Vocab Size: {len(vocab)}')

    # Step 4: convert to ids and persist.
    print(F'Getting Dataset: {text_path} Max Length: {max_len}')
    texts, labels = convert_to_binary(text_path, label_path, max_len, vocab)
    print(F'Size of Samples: {len(texts)}')
    if labels is not None:
        # labels is None when no label file was given, so len() must be guarded
        print(F'Size of Labels: {len(labels)}')
    np.save(os.path.splitext(text_path)[0], texts)
    if labels is not None:
        assert len(texts) == len(labels)
        np.save(os.path.splitext(label_path)[0], labels)

In [None]:
def get_data(text_file, label_file=None):
    """Load the saved text array and, when given, the saved label array.

    Returns ``(texts, labels)``; *labels* is ``None`` if *label_file* is
    ``None``. Labels are loaded with ``allow_pickle=True``, texts without it.
    """
    texts = np.load(text_file, allow_pickle=False)
    if label_file is None:
        return texts, None
    return texts, np.load(label_file, allow_pickle=True)

In [None]:
from torch.utils.data import Dataset
from scipy.sparse import csr_matrix
from typing import Sequence, Optional

TDataX = Sequence[Sequence]
TDataY = Optional[csr_matrix]


class MultiLabelDataset(Dataset):
    """Dataset pairing input sequences with rows of a sparse label matrix.

    In training mode with labels available, indexing yields
    ``(data_x[i], dense float32 label row)``; otherwise only ``data_x[i]``.
    """

    def __init__(self, data_x: TDataX, data_y: TDataY = None, training=True):
        self.data_x = data_x
        self.data_y = data_y
        self.training = training

    def __getitem__(self, item):
        features = self.data_x[item]
        if not self.training or self.data_y is None:
            return features
        # densify the single sparse row and drop its leading axis
        dense_row = self.data_y[item].toarray()
        return features, dense_row.squeeze(0).astype(np.float32)

    def __len__(self):
        return len(self.data_x)

##  EUR-Lex
The EUR-Lex dataset is already tokenized, so `tokenized_path` is left as `None`.
The initial embeddings (`emb_init`) and the vocabulary are created and saved at the paths below if they do not already exist.

In [None]:
print('Unzip EUR-Lex Dataset and Word Embedding...')

!tar -xvzf /content/gdrive/MyDrive/Machine_Learning/EUR-Lex.tar.gz 
!unzip /content/gdrive/MyDrive/Machine_Learning/glove.zip

                                     #####################################################################################################

# EUR-Lex training split: texts and labels
TRAIN_TEXT_PATH = "/content/EUR-Lex/train_texts.txt"
TRAIN_LABEL_PATH = "/content/EUR-Lex/train_labels.txt"
# Pretrained embedding model, passed to preprocessing() as w2v_model
TRAIN_W2V_MODEL_PATH = "/content/glove.840B.300d.gensim"
# Where preprocessing() saves/loads the vocabulary and the initial embedding matrix
VOCAB_PATH = "/content/EUR-Lex/vocabulary.npy"
EMB_PATH = "/content/EUR-Lex/emb_init.npy"

# EUR-Lex test split: texts and labels
TEST_TEXT_PATH = "/content/EUR-Lex/test_texts.txt"
TEST_LABEL_PATH = "/content/EUR-Lex/test_labels.txt"
# NOTE(review): presumably where test predictions are written; its use is not visible in this part of the file
TEST_RESULTS_PATH = "/content/EUR-Lex/results"


# NOTE(review): presumably the path given to get_mlb() for the fitted label binarizer; confirm against the caller
LABELS_BINARIZER_PATH = "/content/EUR-Lex/labels_binarizer"

In [None]:
print('Running Preprocessing...')

# Preprocess the EUR-Lex training set; this call also builds the vocabulary and initial embeddings if VOCAB_PATH is missing
preprocessing(text_path=TRAIN_TEXT_PATH,label_path=TRAIN_LABEL_PATH,vocab_path=VOCAB_PATH,emb_path=EMB_PATH,w2v_model=TRAIN_W2V_MODEL_PATH)
# Preprocess the EUR-Lex test set, reusing the vocabulary saved at VOCAB_PATH by the call above
preprocessing(text_path=TEST_TEXT_PATH,label_path=TEST_LABEL_PATH,vocab_path=VOCAB_PATH)

print('Preprocessing Completed!')

## AmazonCat-13k
The AmazonCat-13k dataset is not tokenized in advance, so you need to specify `tokenized_path`.

In [None]:

print('Unzip AmazonCat-13k Dataset and Word Embedding...')


!tar -xvzf /content/gdrive/MyDrive/Machine_Learning/AmazonCat-13K.tar.gz
!unzip /content/gdrive/MyDrive/Machine_Learning/glove.zip


                                     #####################################################################################################


# AmazonCat-13K training split: raw texts, tokenised output, labels and truncated-label output
TRAIN_TEXT_PATH = "/content/AmazonCat-13K/train_raw_texts.txt"
TRAIN_TOKENIZED_PATH = "/content/AmazonCat-13K/train_tokenized_texts.txt"
TRAIN_LABEL_PATH = "/content/AmazonCat-13K/train_labels.txt"
TRAIN_TRUNCATE_LABEL_PATH = "/content/AmazonCat-13K/train_truncate_labels.txt"

# Pretrained embedding model, passed to preprocessing() as w2v_model
TRAIN_W2V_MODEL_PATH = "/content/glove.840B.300d.gensim"
# Where preprocessing() saves/loads the vocabulary and the initial embedding matrix
VOCAB_PATH = "/content/AmazonCat-13K/vocabulary.npy"
EMB_PATH = "/content/AmazonCat-13K/emb_init.npy"


# AmazonCat-13K test split: raw texts, tokenised output, labels and truncated-label output
TEST_TEXT_PATH = "/content/AmazonCat-13K/test_raw_texts.txt"
TEST_TOKENIZED_PATH = "/content/AmazonCat-13K/test_tokenized_texts.txt"
TEST_LABEL_PATH = "/content/AmazonCat-13K/test_labels.txt"
TEST_TRUNCATE_LABEL_PATH = "/content/AmazonCat-13K/test_truncate_labels.txt"

# NOTE(review): presumably where test predictions are written; its use is not visible in this part of the file
TEST_RESULTS_PATH = "/content/AmazonCat-13K/results"


# NOTE(review): presumably the path given to get_mlb() for the fitted label binarizer; confirm against the caller
LABELS_BINARIZER_PATH = "/content/AmazonCat-13K/labels_binarizer"

In [None]:
print('Running Preprocessing...')

# Preprocess the AmazonCat-13K training set: tokenise the raw text, keep at most
# max_lines samples (default of preprocessing) and build the vocabulary/embeddings if missing.
preprocessing(text_path=TRAIN_TEXT_PATH,tokenized_path=TRAIN_TOKENIZED_PATH,label_path=TRAIN_LABEL_PATH,truncate_label_path=TRAIN_TRUNCATE_LABEL_PATH,vocab_path=VOCAB_PATH,emb_path=EMB_PATH,w2v_model=TRAIN_W2V_MODEL_PATH, truncate=True)
# Preprocess the AmazonCat-13K test set with the same vocabulary, keeping at most 30000 samples.
preprocessing(text_path=TEST_TEXT_PATH,tokenized_path=TEST_TOKENIZED_PATH,label_path=TEST_LABEL_PATH, truncate_label_path=TEST_TRUNCATE_LABEL_PATH, vocab_path=VOCAB_PATH, truncate=True, max_lines=30000)

print('Preprocessing Completed!')


# CorNet

This cell contains the code to create a CorNet block.

Several blocks can be stacked together to obtain a deep CorNet
module.

A CorNet block is a computational unit which maps raw label predictions to enhanced label predictions based on label correlations.

Formally, a CorNet building block is defined as:

***𝒚 = 𝐹 (𝒙) + 𝒙***

where 𝒙, 𝒚 are the input and the output of this CorNet block and 𝐹 stands for the underlying mapping function. Specifically, 𝒙 denotes the raw label predictions before the CorNet block and 𝒚 denotes the enhanced label predictions after the CorNet block.

𝐹 is the correlation enhancing function to be learned. The most straightforward design for function 𝐹 is one fully connected layer:

 ***𝐹 (𝒙) = 𝑾𝒙***

where 𝑾 denotes the weight matrix of the layer, and the bias term and the activation function are omitted to simplify the notation. In this way, the 𝑖-th enhanced prediction 𝑦ᵢ is a linear combination of all raw predictions
*{𝑥₁, 𝑥₂, ..., 𝑥ᵢ, ...}* and hence all possible linear correlations between the 𝑖-th label and the other labels are taken into consideration.

We insert a bottleneck layer between 𝒙 and 𝒚. Let 𝑅 denote the dimension of the bottleneck layer and 𝑉 the number of labels.
By setting 𝑅 ≪ 𝑉, the model size is significantly reduced, and more complex correlations can be captured by the additional layer.
Therefore, our design for function 𝐹 can be formally defined as:

***𝐹(𝒙) = 𝑾₂𝛿(𝑾₁𝜎(𝒙) + 𝒃₁) + 𝒃₂***

where 𝑾₁, 𝑾₂ are the weight matrices, 𝒃₁, 𝒃₂ are the biases, and 𝜎, 𝛿 are the sigmoid activation function and the ELU activation function respectively. Recall that 𝒙 are the raw label prediction logits, hence we first need to use the sigmoid activation to convert label logits 𝒙 to label probabilities 𝜎(𝒙) ranging from 0 to 1, representing the confidence level of each label prediction. The correlations are then calculated based on the label probabilities. By doing so, the interpretability of label correlations is also achieved.



---


We have assumed that 𝒙 is the output label predictions of a deep XMTC network, but actually 𝒙 could also be the output label predictions of a previous CorNet block. This means we can stack any number of CorNet blocks to form a deep CorNet module and the output of each block is a correlation enhancement over the output of the previous block.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

ACT2FN = {'elu': F.elu, 'relu': F.relu, 'sigmoid': torch.sigmoid, 'tanh': torch.tanh}


class CorNetBlock(nn.Module):
    """Residual correlation block: ``y = W2 * elu(W1 * act(x) + b1) + b2 + x``.

    Parameters
    ----------
    context_size : width of the bottleneck layer.
    output_size : number of label logits entering and leaving the block.
    cornet_act : key of ``ACT2FN`` selecting the activation applied to the
        incoming logits (``'sigmoid'`` by default).
    """

    def __init__(self, context_size, output_size, cornet_act='sigmoid', **kwargs):
        super().__init__()
        self.dstbn2cntxt = nn.Linear(output_size, context_size)
        self.cntxt2dstbn = nn.Linear(context_size, output_size)
        self.act_fn = ACT2FN[cornet_act]

    def forward(self, output_dstrbtn):
        # logits -> activated label distribution
        activated = self.act_fn(output_dstrbtn)
        # project into the bottleneck and apply ELU
        bottleneck = F.elu(self.dstbn2cntxt(activated))
        # project back to label space and add the residual connection
        correction = self.cntxt2dstbn(bottleneck)
        return correction + output_dstrbtn
    
    
class CorNet(nn.Module):
    """Stack of ``n_cornet_blocks`` CorNet blocks applied one after another.

    Parameters
    ----------
    output_size : number of label logits.
    cornet_dim : bottleneck width of every block.
    n_cornet_blocks : how many blocks are chained.
    **kwargs : forwarded to every ``CorNetBlock``.
    """

    def __init__(self, output_size, cornet_dim=1000, n_cornet_blocks=2, **kwargs):
        super().__init__()
        blocks = []
        for _ in range(n_cornet_blocks):
            blocks.append(CorNetBlock(cornet_dim, output_size, **kwargs))
        self.intlv_layers = nn.ModuleList(blocks)
        # Xavier-initialise both projections of every block once all exist.
        for block in self.intlv_layers:
            nn.init.xavier_uniform_(block.dstbn2cntxt.weight)
            nn.init.xavier_uniform_(block.cntxt2dstbn.weight)

    def forward(self, logits):
        enhanced = logits
        for block in self.intlv_layers:
            enhanced = block(enhanced)
        return enhanced

# AttentionXML

The main AttentionXML components are:

1.   Bidirectional LSTMs;
2.   Multi-label attention layer.

---

The goal is to improve the performance of this model by introducing the Module CorNet.

Using the correlation, the hope is to improve the assignment of the labels.

### Word Representation
The input of AttentionXML is raw tokenized text of length T̂. Each word is represented by a deep semantic dense vector, called a word embedding.

In these experiments, we use the pre-trained 300-dimensional ***GloVe word embeddings*** as our initial word representation.


In [None]:
import numpy as np

class Embedding(nn.Module):
    """Sparse-gradient embedding layer with dropout and padding bookkeeping.

    When *emb_init* (a 2-D NumPy array) is given, its shape overrides
    *vocab_size* / *emb_size* and its values become the initial weights.
    *emb_trainable* sets ``requires_grad`` on the weight.
    """

    def __init__(self, vocab_size=None, emb_size=None, emb_init=None, emb_trainable=True, padding_idx=0, dropout=0.2):
        super().__init__()
        pretrained = None
        if emb_init is not None:
            vocab_size, emb_size = emb_init.shape
            pretrained = torch.from_numpy(emb_init).float()
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx, sparse=True,
                                _weight=pretrained)
        self.emb.weight.requires_grad = emb_trainable
        self.dropout = nn.Dropout(dropout)
        self.padding_idx = padding_idx

    def forward(self, inputs):
        """Return ``(embeddings, lengths, masks)`` cut to the longest sequence in the batch."""
        masks = inputs != self.padding_idx
        lengths = masks.sum(dim=-1)
        longest = lengths.max()
        emb_out = self.dropout(self.emb(inputs))
        return emb_out[:, :longest], lengths, masks[:, :longest]

### Bidirectional LSTM
We use a Bidirectional LSTM (BiLSTM) to capture
both the left and right-sides context, where at each time step t the output h_t is obtained by concatenating the forward output h_t→ and the backward output h_t←.

In [None]:
class LSTMEncoder(nn.Module):
    """Bidirectional multi-layer LSTM with a learnable initial state.

    ``init_state`` holds ``2 * 2 * layers_num`` rows: the first half is used
    as the initial cell state and the second half as the initial hidden state.
    """

    def __init__(self, input_size, hidden_size, layers_num, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, layers_num, batch_first=True, bidirectional=True)
        self.init_state = nn.Parameter(torch.zeros(2*2*layers_num, 1, hidden_size))
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, lengths, **kwargs):
        self.lstm.flatten_parameters()
        # replicate the learned initial state once per sample of the batch
        batch_size = inputs.size(0)
        tiled_state = self.init_state.repeat([1, batch_size, 1])
        half = tiled_state.size(0) // 2
        c0, h0 = tiled_state[:half], tiled_state[half:]
        # Sequences are sorted by decreasing length before packing; packing
        # makes the LSTM skip the padded time steps.
        order = torch.argsort(lengths, descending=True)
        restore = torch.argsort(order)
        packed = nn.utils.rnn.pack_padded_sequence(inputs[order], lengths[order].to('cpu'), batch_first=True)
        packed_out, _ = self.lstm(packed, (h0, c0))
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        # undo the sort so rows line up with the original batch order
        return self.dropout(outputs[restore])

### Attention
Attention mechanisms in neural networks have recently been used successfully in many NLP tasks, such as machine translation, machine comprehension, relation extraction, and speech recognition. In XMTC, the most relevant context can be different for each label.

AttentionXML computes a (linear) combination of the context vectors ĥᵢ for each label through a multi-label attention mechanism, to capture the various informative parts of a text.

In [None]:
class MLAttention(nn.Module):
    """Multi-label attention: one attention distribution over time steps per label."""

    def __init__(self, labels_num, hidden_size):
        super().__init__()
        self.attention = nn.Linear(hidden_size, labels_num, bias=False)
        nn.init.xavier_uniform_(self.attention.weight)

    def forward(self, inputs, masks):
        blocked = ~masks.unsqueeze(1)  # N, 1, L
        scores = self.attention(inputs).transpose(1, 2)  # N, labels_num, L
        # positions where the mask is False get -inf so softmax gives them zero weight
        scores = scores.masked_fill(blocked, -np.inf)
        weights = F.softmax(scores, -1)
        return torch.matmul(weights, inputs)  # N, labels_num, hidden_size

AttentionXML has one (or two) fully connected layers and one output layer. The same parameter values are used for all labels at the fully connected (and output) layers, to emphasize differences of attention among all labels. Also sharing the parameter values of fully connected layers among all labels can largely reduce the number of parameters to avoid overfitting and keep the model scale small.

In [None]:
class MLLinear(nn.Module):
    """Chain of ReLU-activated linear layers followed by an output layer.

    ``linear_size`` lists the layer widths: consecutive pairs define the
    hidden layers and the last entry feeds the output layer of width
    ``output_size``. A trailing axis of size 1 is squeezed from the result.
    """

    def __init__(self, linear_size, output_size):
        super().__init__()
        hidden = [nn.Linear(in_s, out_s) for in_s, out_s in zip(linear_size[:-1], linear_size[1:])]
        self.linear = nn.ModuleList(hidden)
        for layer in self.linear:
            nn.init.xavier_uniform_(layer.weight)
        self.output = nn.Linear(linear_size[-1], output_size)
        nn.init.xavier_uniform_(self.output.weight)

    def forward(self, inputs):
        hidden_out = inputs
        for layer in self.linear:
            hidden_out = F.relu(layer(hidden_out))
        scores = self.output(hidden_out)
        return torch.squeeze(scores, -1)

Now let's create the AttentionXML model on which we will then add the CorNet module.

In [None]:
class AttentionXML(nn.Module):
    """Embedding -> BiLSTM -> multi-label attention -> shared linear scorer.

    The attention and linear layers operate on ``hidden_size * 2`` features
    because the LSTM is bidirectional; the scorer outputs one logit per label.
    """

    def __init__(self, labels_num, emb_size, hidden_size, layers_num, linear_size, dropout,
                 vocab_size=None, emb_init=None, emb_trainable=True, padding_idx=0, emb_dropout=0.2, **kwargs):
        super().__init__()
        bi_hidden = hidden_size * 2
        self.emb = Embedding(vocab_size, emb_size, emb_init, emb_trainable, padding_idx, emb_dropout)
        self.lstm = LSTMEncoder(emb_size, hidden_size, layers_num, dropout)
        self.attention = MLAttention(labels_num, bi_hidden)
        self.linear = MLLinear([bi_hidden] + linear_size, 1)

    def forward(self, inputs, **kwargs):
        embedded, lengths, masks = self.emb(inputs, **kwargs)
        encoded = self.lstm(embedded, lengths)        # N, L, hidden_size * 2
        attended = self.attention(encoded, masks)     # N, labels_num, hidden_size * 2
        return self.linear(attended)

In [None]:
class CorNetAttentionXML(nn.Module):
    """AttentionXML whose raw label logits are refined by a CorNet module.

    ``**kwargs`` is forwarded to both ``AttentionXML`` and ``CorNet``.
    """

    def __init__(self, labels_num, emb_size, hidden_size, layers_num, linear_size, dropout, **kwargs):
        super().__init__()
        self.attnrnn = AttentionXML(labels_num, emb_size, hidden_size, layers_num, linear_size, dropout, **kwargs)
        self.cornet = CorNet(labels_num, **kwargs)

    def forward(self, input_variables):
        # raw logits from AttentionXML, then correlation enhancement
        return self.cornet(self.attnrnn(input_variables))

# Optimizer


In [None]:
import math
import torch
from torch.optim.optimizer import Optimizer


class DenseSparseAdam(Optimizer):
    """Adam optimizer that accepts both dense and sparse gradients.

    Dense gradients follow the usual Adam update. For sparse gradients only
    the entries present in the gradient have their moment estimates and
    parameter values updated. Weight decay, when positive, is applied after
    the Adam step as ``p -= lr * weight_decay * p`` (restricted to the
    touched entries for sparse gradients).

    Parameters
    ----------
    params : iterable of parameters or parameter-group dicts.
    lr : learning rate (must be >= 0).
    betas : coefficients of the running averages of the gradient and its square.
    eps : term added to the denominator for numerical stability (must be >= 0).
    weight_decay : weight-decay coefficient.

    Raises
    ------
    ValueError : if ``lr``, ``eps`` or one of ``betas`` is out of range.
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super(DenseSparseAdam, self).__init__(params, defaults)

    def step(self, closure=None):
        """
        Performs a single optimization step.
        Parameters
        ----------
        closure : ``callable``, optional.
            A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                state = self.state[p]

                # State initialization
                if 'step' not in state:
                    state['step'] = 0
                if 'exp_avg' not in state:
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                if 'exp_avg_sq' not in state:
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                state['step'] += 1

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                weight_decay = group['weight_decay']

                if grad.is_sparse:
                    grad = grad.coalesce()  # the update is non-linear so indices must be unique
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        # Wrap `values` in a sparse tensor sharing the gradient's indices and shape
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, size)

                    # Decay the first and second moment running average coefficient
                    #      old <- b * old + (1 - b) * new
                    # <==> old += (1 - b) * (new - old)
                    old_exp_avg_values = exp_avg.sparse_mask(grad)._values()
                    exp_avg_update_values = grad_values.sub(old_exp_avg_values).mul_(1 - beta1)
                    exp_avg.add_(make_sparse(exp_avg_update_values))
                    old_exp_avg_sq_values = exp_avg_sq.sparse_mask(grad)._values()
                    exp_avg_sq_update_values = grad_values.pow(2).sub_(old_exp_avg_sq_values).mul_(1 - beta2)
                    exp_avg_sq.add_(make_sparse(exp_avg_sq_update_values))

                    # Dense addition again is intended, avoiding another sparse_mask
                    numer = exp_avg_update_values.add_(old_exp_avg_values)
                    exp_avg_sq_update_values.add_(old_exp_avg_sq_values)
                    denom = exp_avg_sq_update_values.sqrt_().add_(group['eps'])
                    del exp_avg_update_values, exp_avg_sq_update_values

                    bias_correction1 = 1 - beta1 ** state['step']
                    bias_correction2 = 1 - beta2 ** state['step']
                    step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                    p.data.add_(make_sparse(-step_size * numer.div_(denom)))
                    if weight_decay > 0.0:
                        # keyword `alpha` replaces the deprecated add_(scalar, tensor) overload
                        p.data.add_(p.data.sparse_mask(grad), alpha=-group['lr'] * weight_decay)
                else:
                    # Decay the first and second moment running average coefficient
                    # (keyword `alpha` / `value` replace the deprecated scalar-first overloads)
                    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                    bias_correction1 = 1 - beta1 ** state['step']
                    bias_correction2 = 1 - beta2 ** state['step']
                    step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
                    if weight_decay > 0.0:
                        p.data.add_(p.data, alpha=-group['lr'] * weight_decay)

        return loss

Utility type aliases and a helper function used by the evaluation code below.

In [None]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MultiLabelBinarizer
from typing import Union, Optional, List, Iterable, Hashable
import numpy as np


TPredict = np.ndarray
TTarget = Union[Iterable[Iterable[Hashable]], csr_matrix] # variable that can take on both types
TMlb = Optional[MultiLabelBinarizer]
TClass = Optional[List[Hashable]]



def get_mlb_evaluation(classes: TClass = None, mlb: TMlb = None, targets: TTarget = None):
    """Return a sparse-output ``MultiLabelBinarizer`` suitable for evaluation.

    Resolution order:
      1. *classes* given: a new binarizer restricted to those classes;
      2. otherwise *mlb* if it is not ``None``;
      3. otherwise one derived from *targets*: column indices when *targets*
         is a ``csr_matrix``, else fitted on the label lists.
    ``None`` is returned when none of the three is available.
    """
    if classes is not None:
        # `classes` must be passed by keyword: recent scikit-learn releases
        # make the constructor arguments keyword-only.
        mlb = MultiLabelBinarizer(classes=classes, sparse_output=True)
        # With explicit classes, fit() ignores its argument; it is still needed
        # so that transform() does not fail on an unfitted estimator.
        mlb.fit(None)
    if mlb is None and targets is not None:
        if isinstance(targets, csr_matrix):
            mlb = MultiLabelBinarizer(classes=range(targets.shape[1]), sparse_output=True)
            mlb.fit(None)
        else:
            mlb = MultiLabelBinarizer(sparse_output=True)
            mlb.fit(targets)
    return mlb

# Evaluation
We adopt two instance-based ranking metrics to evaluate the models: the precision at top k (precision@k) and the normalized Discounted Cumulative Gain at top k (nDCG@k).

In [None]:
#PRECISION @K

from functools import partial

def get_precision(prediction: TPredict, targets: TTarget, mlb: TMlb = None, classes: TClass = None, top=5):
    """Precision@top: share of the first *top* predicted labels that are true labels.

    *prediction* is a 2-D array of predicted labels per sample, best first.
    *targets* is either a ``csr_matrix`` or label lists, which are binarized
    with the resolved binarizer.
    """
    mlb = get_mlb_evaluation(classes, mlb, targets)
    if not isinstance(targets, csr_matrix):
        targets = mlb.transform(targets)
    top_predictions = mlb.transform(prediction[:, :top])
    hits = top_predictions.multiply(targets).sum()
    return hits / (top * targets.shape[0])


get_p_1 = partial(get_precision, top=1)
get_p_3 = partial(get_precision, top=3)
get_p_5 = partial(get_precision, top=5)

In [None]:
# Normalized Discounted Cumulative Gain


def get_ndcg(prediction: TPredict, targets: TTarget, mlb: TMlb = None, classes: TClass = None, top=5):
    """nDCG@top averaged over all samples.

    *prediction* is a 2-D array of predicted labels per sample, best first.
    *targets* is either a ``csr_matrix`` or label lists. Each sample's DCG is
    divided by the ideal DCG for ``min(number of true labels, top)`` hits.
    """
    mlb = get_mlb_evaluation(classes, mlb, targets)
    # Binarize first: label lists have no `.shape`, which is needed just below.
    if not isinstance(targets, csr_matrix):
        targets = mlb.transform(targets)
    log = 1.0 / np.log2(np.arange(top) + 2) # add 2 because python 0-index
    dcg = np.zeros((targets.shape[0], 1))
    for i in range(top):
        # gain of the i-th ranked prediction, discounted by its position
        p = mlb.transform(prediction[:, i: i+1])
        dcg += p.multiply(targets).sum(axis=-1) * log[i]
    return np.average(dcg / log.cumsum()[np.minimum(targets.sum(axis=-1), top) - 1])


get_n_1 = partial(get_ndcg, top=1)
get_n_3 = partial(get_ndcg, top=3)
get_n_5 = partial(get_ndcg, top=5)


Function that computes the two metrics used to compare the model against the other configurations.

Both metrics are reported for k = 1, 3 and 5.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def testing(results, targets, train_labels):
    """Print Precision@{1,3,5} and nDCG@{1,3,5} for saved predictions.

    *results* and *targets* are paths of ``.npy`` files loaded with
    ``allow_pickle=True``. The binarizer is fitted on the loaded targets.
    *train_labels* is accepted but not used by this function.
    """
    res = np.load(results, allow_pickle=True)
    targets = np.load(targets, allow_pickle=True)
    mlb = MultiLabelBinarizer(sparse_output=True)
    targets = mlb.fit_transform(targets)
    precisions = [metric(res, targets, mlb) for metric in (get_p_1, get_p_3, get_p_5)]
    print('Precision@1,3,5:', *precisions)
    ndcgs = [metric(res, targets, mlb) for metric in (get_n_1, get_n_3, get_n_5)]
    print('nDCG@1,3,5:', *ndcgs)

# Model
Class that manages the model and performs training and testing.

In [None]:
from collections import deque
from typing import Optional, Mapping
from torch.utils.data import DataLoader


class Model(object):
    """Training / inference wrapper around a network executed through ``nn.DataParallel``.

    The wrapper owns the loss function (``nn.BCEWithLogitsLoss``), the optimizer
    (``DenseSparseAdam``, created lazily by :meth:`get_optimizer`), an optional
    running average of the weights (the ``'swa'`` entry of ``self.state``) and
    the checkpoint path used by :meth:`save_model` / :meth:`load_model`.
    """

    def __init__(self, network, model_path, gradient_clip_value=5.0, device_ids=None, **kwargs):
        """Build the network on GPU and prepare the checkpoint directory.

        Args:
            network: callable returning the network; invoked as ``network(**kwargs)``.
            model_path: file the weights are saved to / loaded from.
            gradient_clip_value: multiplier used by :meth:`clip_gradient`;
                ``None`` disables clipping there.
            device_ids: forwarded to ``nn.DataParallel``.
            **kwargs: forwarded to ``network``.
        """
        self.model = nn.DataParallel(network(**kwargs).cuda(), device_ids=device_ids)
        self.loss_fn = nn.BCEWithLogitsLoss()  # loss function
        self.model_path, self.state = model_path, {}
        # A bare file name has an empty directory part, and os.makedirs('') raises.
        model_dir = os.path.dirname(self.model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        self.gradient_clip_value = gradient_clip_value
        # Starts at +inf, so the clipping threshold is infinite until that entry leaves the queue.
        self.gradient_norm_queue = deque([np.inf], maxlen=5)
        self.optimizer = None

    def get_optimizer(self, **kwargs):
        """Create the ``DenseSparseAdam`` optimizer over all model parameters."""
        self.optimizer = DenseSparseAdam(self.model.parameters(), **kwargs)

    def swa_init(self):
        """Start weight averaging by snapshotting the current parameters on CPU.

        Does nothing when averaging has already been initialised.
        """
        if 'swa' in self.state:
            return
        print('SWA Initializing')
        swa_state = self.state['swa'] = {'models_num': 1}
        for name, param in self.model.named_parameters():
            # copy=True guarantees an independent snapshot even if the parameter
            # already lives on CPU (a plain .cpu() would alias the live tensor).
            swa_state[name] = param.data.to('cpu', copy=True)

    def swa_step(self):
        """Fold the current parameters into the running average (no-op before ``swa_init``)."""
        if 'swa' not in self.state:
            return
        swa_state = self.state['swa']
        swa_state['models_num'] += 1
        beta = 1.0 / swa_state['models_num']
        with torch.no_grad():
            for name, param in self.model.named_parameters():
                # avg <- (1 - beta) * avg + beta * param, using the keyword form of add_.
                swa_state[name].mul_(1.0 - beta).add_(param.data.cpu(), alpha=beta)

    def swap_swa_params(self):
        """Exchange live parameters and averaged parameters (no-op before ``swa_init``).

        Calling it twice restores the original assignment.
        """
        if 'swa' not in self.state:
            return
        swa_state = self.state['swa']
        for name, param in self.model.named_parameters():
            device = param.device
            averaged = swa_state[name]
            swa_state[name] = param.data.cpu()
            param.data = averaged.to(device)

    # Methods to save and load model

    def save_model(self, last_epoch):
        """Write the wrapped module's ``state_dict`` to ``self.model_path``.

        Args:
            last_epoch: nothing is saved when this is falsy.

        Saving is best effort: up to five attempts are made and failures are
        only reported on stdout.
        """
        if not last_epoch:
            return
        for _ in range(5):
            try:
                torch.save(self.model.module.state_dict(), self.model_path)
                break
            except Exception as exc:  # keep retrying, but let KeyboardInterrupt/SystemExit through
                print(F'saving failed: {exc}')

    def load_model(self):
        """Load the weights stored at ``self.model_path`` into the wrapped module."""
        self.model.module.load_state_dict(torch.load(self.model_path))

    def clip_gradient(self):
        """Clip the gradient norm relative to recently observed norms.

        The threshold is ``max(self.gradient_norm_queue) * self.gradient_clip_value``.
        Does nothing when ``gradient_clip_value`` is ``None``.
        """
        if self.gradient_clip_value is not None:
            max_norm = max(self.gradient_norm_queue)
            total_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm * self.gradient_clip_value)
            self.gradient_norm_queue.append(min(total_norm, max_norm * 2.0, 1.0))
            if total_norm > max_norm * self.gradient_clip_value:
                print(F'Clipping gradients with total norm {total_norm} '
                            F'and max norm {max_norm}')

    # step of the Training Phase

    def train_step(self, train_x: torch.Tensor, train_y: torch.Tensor):
        """Run one optimisation step on a batch and return the loss as a float."""
        self.optimizer.zero_grad()
        self.model.train()
        scores = self.model(train_x)
        loss = self.loss_fn(scores, train_y)
        loss.backward()
        # NOTE: gradient clipping is intentionally not applied here; call
        # self.clip_gradient() at this point to enable it.
        self.optimizer.step(closure=None)
        return loss.item()

    # step of the Testing Phase

    def predict_step(self, data_x: torch.Tensor, k: int):
        """Return ``(sigmoid(top-k scores), top-k label indices)`` for a batch, both on CPU."""
        self.model.eval()
        with torch.no_grad():
            scores, labels = torch.topk(self.model(data_x), k)
            return torch.sigmoid(scores).cpu(), labels.cpu()

    def _validate(self, valid_loader: DataLoader, k: int):
        """Return ``(mean batch loss, P@5, nDCG@5)`` of the current weights on ``valid_loader``."""
        labels, valid_loss = [], 0.0
        self.model.eval()
        with torch.no_grad():
            for valid_x, valid_y in valid_loader:
                logits = self.model(valid_x)
                valid_loss += self.loss_fn(logits, valid_y.cuda()).item()
                labels.append(torch.topk(logits, k)[1].cpu())
        valid_loss /= len(valid_loader)
        labels = np.concatenate(labels)
        targets = valid_loader.dataset.data_y
        return valid_loss, get_p_5(labels, targets), get_n_5(labels, targets)

    # method of the training
    def train(self, train_loader: DataLoader, valid_loader: DataLoader, opt_params: Optional[Mapping] = None,
              nb_epoch=100, step=100, k=5, early=100, verbose=True, swa_warmup=None, **kwargs):
        """Train the network, validating and checkpointing every ``step`` batches.

        Args:
            train_loader: yields ``(train_x, train_y)`` batches.
            valid_loader: yields ``(valid_x, valid_y)`` batches; its dataset must
                expose the targets as ``data_y``.
            opt_params: keyword arguments for :meth:`get_optimizer`.
            nb_epoch: number of epochs.
            step: number of training batches between two validations.
            k: number of top labels extracted during validation.
            early: stop once nDCG@5 has failed to improve in more than this many
                consecutive validations; ``None`` disables early stopping.
            verbose: print a progress line after every validation.
            swa_warmup: epoch index at which weight averaging starts.
            **kwargs: accepted and ignored.
        """
        self.get_optimizer(**({} if opt_params is None else opt_params))
        global_step, best_n5, e = 0, 0.0, 0
        print_loss = 0.0
        for epoch_idx in range(nb_epoch):
            print(F'Epoch Number: {epoch_idx}')
            if epoch_idx == swa_warmup:
                self.swa_init()
            for i, (train_x, train_y) in enumerate(train_loader, 1):
                global_step += 1
                print_loss += self.train_step(train_x, train_y.cuda())
                if global_step % step != 0:
                    continue
                # Validate (and possibly checkpoint) with the averaged weights swapped in.
                self.swa_step()
                self.swap_swa_params()
                valid_loss, p5, n5 = self._validate(valid_loader, k)
                if n5 > best_n5:
                    self.save_model(True)
                    best_n5, e = n5, 0
                else:
                    e += 1
                    if early is not None and e > early:
                        # NOTE(review): returns while the averaged weights are still swapped in.
                        return
                self.swap_swa_params()
                if verbose:
                    log_msg = '%d %d train loss: %.7f valid loss: %.7f P@5: %.5f N@5: %.5f early stop: %d' % \
                        (epoch_idx, i * train_loader.batch_size, print_loss / step, valid_loss,
                         round(p5, 5), round(n5, 5), e)
                    print(log_msg)
                    print_loss = 0.0

    def predict(self, data_loader: DataLoader, k=100, desc='Predict', **kwargs):
        """Load the saved weights and return ``(scores, labels)`` arrays for ``data_loader``.

        Args:
            data_loader: yields input batches only (no targets).
            k: number of top labels returned per sample.
            desc: label of the progress bar.
            **kwargs: accepted and ignored.
        """
        self.load_model()
        scores_list, labels_list = zip(*(self.predict_step(data_x, k) for data_x in tqdm(data_loader, desc=desc, leave=False)))
        return np.concatenate(scores_list), np.concatenate(labels_list)



In [None]:
# method to save the results obtained in a specific path

def output_res(output_path, name, scores, labels):
    """Save ``scores`` and ``labels`` as ``<name>-scores`` / ``<name>-labels`` in ``output_path``.

    The directory is created when missing; ``np.save`` appends the ``.npy`` extension.
    """
    os.makedirs(output_path, exist_ok=True)
    for suffix, array in (('scores', scores), ('labels', labels)):
        np.save(os.path.join(output_path, F'{name}-{suffix}'), array)

#Main Function
This method takes care of managing all the model and dataset that we want to use.

In particular, it loads the datasets (Training-Validation-Test), then builds the model and finally trains and tests it.

Run the cell corresponding to the dataset you are using. 
The corresponding dictionary will be passed into the main method via **kwargs taking the dictionary keys and values.


### CorNetAttentionXML-EUR-Lex Parameters and Paths

In [None]:
# CorNetAttentionXML-EUR-Lex Parameters

# Keyword arguments that main() forwards to the network constructor (Model(..., **model_param)).
model_param = {"hidden_size": 256, "layers_num": 1, "linear_size": [256], "dropout": 0.5, "emb_trainable": False}
# Also forwarded to the network constructor by main() (**data_param).
data_param = {"emb_size" : 300}
# main() only looks up the optional "k" entry (predict_param.get('k', 100));
# "batch_size" is not read by the code visible in this section -- TODO confirm whether it should be.
predict_param = {"batch_size" : 40}

# Passed to main() as `valid_size`, which hands it to train_test_split(test_size=...).
VALID_SIZE=200

In [None]:
# CorNetAttentionXML-EUR-Lex Paths
# Training texts and labels, read by main() through get_data() in 'train' mode.
TRAIN_TEXT_PATH_NPY = "/content/EUR-Lex/train_texts.npy"
TRAIN_LABEL_PATH_NPY = "/content/EUR-Lex/train_labels.npy"


# main() reads only the test texts in 'eval' mode; the test labels are meant for testing().
TEST_TEXT_PATH_NPY = "/content/EUR-Lex/test_texts.npy"
TEST_LABEL_PATH_NPY= "/content/EUR-Lex/test_labels.npy"


# Predicted-labels file; presumably the `results` argument of testing() for an EUR-Lex run -- confirm.
EURLEX_PREDICTION_RESULTS= '/content/EUR-Lex/results/CorNetAttentionXML-EUR-Lex-labels.npy'


###CorNetAttentionXML-AmazonCat-13K Parameters and Paths

In [None]:
# CorNetAttentionXML-AmazonCat-13K Parameters
# NOTE: these assignments reuse the names of the EUR-Lex cell, so whichever cell ran last wins.

# Keyword arguments that main() forwards to the network constructor (Model(..., **model_param)).
model_param = {"hidden_size": 200, "layers_num": 1, "linear_size": [100], "dropout": 0.5}
# Also forwarded to the network constructor by main() (**data_param).
data_param = {"emb_size" : 300}
# main() only looks up the optional "k" entry (predict_param.get('k', 100));
# "batch_size" is not read by the code visible in this section -- TODO confirm whether it should be.
predict_param = {"batch_size" : 400}

# Passed to main() as `valid_size`, which hands it to train_test_split(test_size=...).
VALID_SIZE=4000

In [None]:
# CorNetAttentionXML-AmazonCat-13K Paths
# Training texts and labels, read by main() through get_data() in 'train' mode.
TRAIN_TEXT_PATH_NPY = "/content/AmazonCat-13K/train_tokenized_texts.npy"
TRAIN_LABEL_PATH_NPY = "/content/AmazonCat-13K/train_truncate_labels.npy"


# main() reads only the test texts in 'eval' mode; the test labels are passed to testing().
TEST_TEXT_PATH_NPY = "/content/AmazonCat-13K/test_tokenized_texts.npy"
TEST_LABEL_PATH_NPY= "/content/AmazonCat-13K/test_truncate_labels.npy"


# Predicted-labels file passed to testing() as `results`.
AMAZON_CAT_13K_PREDICTION_RESULTS= '/content/AmazonCat-13K/results/CorNetAttentionXML-AmazonCat-13K-labels.npy'


##Main Method

In [None]:

from torch.utils.data import DataLoader
import os
from pathlib import Path
from sklearn.model_selection import train_test_split



def main(valid_size, model_name, data_name, nb_epoch, mode='train', batch_size=40):
    """Train and/or evaluate CorNetAttentionXML on the currently configured dataset.

    Dataset paths (``TRAIN_*_PATH_NPY``, ``TEST_TEXT_PATH_NPY``), ``EMB_PATH``,
    ``LABELS_BINARIZER_PATH``, ``TEST_RESULTS_PATH`` and the ``model_param`` /
    ``data_param`` / ``predict_param`` dictionaries are read from module-level
    globals, so the matching configuration cells must be run first.

    Args:
        valid_size: forwarded to ``train_test_split`` as ``test_size``.
        model_name: used in the checkpoint and result file names.
        data_name: used in the checkpoint and result file names.
        nb_epoch: number of training epochs.
        mode: ``'train'``, ``'eval'`` or ``None`` (train, then evaluate).
        batch_size: batch size of the training, validation and test loaders.
            The notebook's own note uses 40 for EUR-Lex and 200 for AmazonCat-13K.
    """
    # NOTE(review): this duplicates the Drive directory configured at the top of the notebook.
    model_path = F'/content/gdrive/MyDrive/Machine_Learning/{model_name}_{data_name}'
    emb_init = get_word_emb(EMB_PATH)  # path of the embedding
    model = None

    def build_model(labels_num):
        # Single place where the network wrapper is configured for both modes.
        return Model(network=CorNetAttentionXML, labels_num=labels_num, model_path=model_path,
                     emb_init=emb_init, **data_param, **model_param)

    print(F'Model Name: {model_name}')

    if mode is None or mode == 'train':
        # loading current dataset
        print('Loading Training and Validation Set')
        train_x, train_labels = get_data(TRAIN_TEXT_PATH_NPY, TRAIN_LABEL_PATH_NPY)
        # Fixed seed keeps the train/validation split reproducible across runs.
        train_x, valid_x, train_labels, valid_labels = train_test_split(
            train_x, train_labels, test_size=valid_size, random_state=1240)
        mlb = get_mlb(LABELS_BINARIZER_PATH, np.hstack((train_labels, valid_labels)))
        train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels)

        labels_num = len(mlb.classes_)
        print(F'Number of Labels: {labels_num}')
        print(F'Size of Training Set: {len(train_x)}')
        print(F'Size of Validation Set: {len(valid_x)}')
        print('Training')

        train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
                                  batch_size=batch_size, shuffle=True, num_workers=4)
        valid_loader = DataLoader(MultiLabelDataset(valid_x, valid_y, training=True),
                                  batch_size=batch_size, num_workers=4)
        model = build_model(labels_num)

        model.train(train_loader, valid_loader, nb_epoch=nb_epoch, swa_warmup=10)
        print('Finish Training')

    # section for evaluation
    if mode is None or mode == 'eval':
        print('Loading Test Set')
        mlb = get_mlb(LABELS_BINARIZER_PATH)
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(TEST_TEXT_PATH_NPY, None)
        print(F'Size of Test Set: {len(test_x)}')

        print('Predicting')
        test_loader = DataLoader(MultiLabelDataset(test_x), batch_size=batch_size, num_workers=4)
        if model is None:
            model = build_model(labels_num)
        scores, labels = model.predict(test_loader, k=predict_param.get('k', 100))
        print('Finish Predicting')
        # Map predicted class indices back to the original label values before saving.
        labels = mlb.classes_[labels]
        output_res(TEST_RESULTS_PATH, F'{model_name}-{data_name}', scores, labels)


In [None]:
# TRAINING
# main() reads the dataset paths and model_param/data_param from module-level globals,
# so the parameter and path cells of the dataset named here must be the last ones executed.
main(valid_size=VALID_SIZE, model_name='CorNetAttentionXML', data_name='EUR-Lex',nb_epoch=30, mode='train')

In [None]:
# PREDICTING AND TESTING
# 'eval' mode loads the saved checkpoint, predicts on the test set and writes the result files.
main(valid_size=VALID_SIZE, model_name='CorNetAttentionXML', data_name='AmazonCat-13K', nb_epoch=1, mode='eval')

# Score the saved predicted labels against the test labels (Precision@k and nDCG@k).
testing(results=AMAZON_CAT_13K_PREDICTION_RESULTS, targets= TEST_LABEL_PATH_NPY, train_labels=TRAIN_LABEL_PATH_NPY)
