In [219]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import os
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report

In [220]:

def make_dataframe(input_folder, labels_folder=None):
    text = []

    for fil in tqdm(filter(lambda x: x.endswith('.txt'),
                           os.listdir(input_folder))):
        iD, txt = fil[7:].split('.')[0], open(os.path.join(input_folder, fil),
                                              'r', encoding='utf-8').read()
        text.append((iD, txt))

    df_text = pd.DataFrame(text, columns=['id','text']).set_index('id')
    df = df_text

    if labels_folder:
        labels = pd.read_csv(labels_folder, sep='\t', header=None)
        labels = labels.rename(columns={0:'id', 1:'frames'})
        labels.id = labels.id.apply(str)
        labels = labels.set_index('id')

        df = labels.join(df_text)[['text', 'frames']]

    return df


# def read_data(data):
#     X_data = data['text'].values
#     Y_data = data['frames'].str.split(',').values
#     Y_data = encoder.fit_transform(Y_data)
#
#     return (X_data, Y_data)

In [221]:
import re

mispell_dict = {"ain’t": "is not", "aren’t": "are not","can’t": "cannot", "’cause": "because", "could’ve": "could have", "couldn’t": "could not", "didn’t": "did not",  "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not", "hasn’t": "has not", "haven’t": "have not", "he’d": "he would","he’ll": "he will", "he’s": "he is", "how’d": "how did", "how’d’y": "how do you", "how’ll": "how will", "how’s": "how is",  "I’d": "I would", "I’d’ve": "I would have", "I’ll": "I will", "I’ll’ve": "I will have","I’m": "I am", "I’ve": "I have", "i’d": "i would", "i’d’ve": "i would have", "i’ll": "i will",  "i’ll’ve": "i will have","i’m": "i am", "i’ve": "i have", "isn’t": "is not", "it’d": "it would", "it’d’ve": "it would have", "it’ll": "it will", "it’ll’ve": "it will have","it’s": "it is", "let’s": "let us", "ma’am": "madam", "mayn’t": "may not", "might’ve": "might have","mightn’t": "might not","mightn’t’ve": "might not have", "must’ve": "must have", "mustn’t": "must not", "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have","o’clock": "of the clock", "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not", "shan’t’ve": "shall not have", "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will", "she’ll’ve": "she will have", "she’s": "she is", "should’ve": "should have", "shouldn’t": "should not", "shouldn’t’ve": "should not have", "so’ve": "so have","so’s": "so as", "this’s": "this is","that’d": "that would", "that’d’ve": "that would have", "that’s": "that is", "there’d": "there would", "there’d’ve": "there would have", "there’s": "there is", "here’s": "here is","they’d": "they would", "they’d’ve": "they would have", "they’ll": "they will", "they’ll’ve": "they will have", "they’re": "they are", "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would", "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are", "we’ve": "we have", "weren’t": "were not", "what’ll": "what will", "what’ll’ve": "what will have", "what’re": "what are",  "what’s": "what is", "what’ve": "what have", "when’s": "when is", "when’ve": "when have", "where’d": "where did", "where’s": "where is", "where’ve": "where have", "who’ll": "who will", "who’ll’ve": "who will have", "who’s": "who is", "who’ve": "who have", "why’s": "why is", "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t’ve": "will not have", "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have", "y’all": "you all", "y’all’d": "you all would","y’all’d’ve": "you all would have","y’all’re": "you all are","y’all’ve": "you all have","you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have", "you’re": "you are", "you’ve": "you have", "she`s": "she is", "\n": " "}

def replace_typical_misspell(text):
    for key in mispell_dict.keys():
        text = text.replace(key, mispell_dict[key])
    return text

# train_df = pd.DataFrame({'text': train_data.text})
# train_df.text = train_df['text'].progress_apply(lambda x: replace_typical_misspell(x.lower()))

In [222]:
train = '../data/en/train-articles-subtask-2'
train_label = '../data/en/train-labels-subtask-2.txt'
test = '../data/en/dev-articles-subtask-2'
test_label = '../data/en/dev-labels-subtask-2.txt'

data = make_dataframe(train, train_label)
train_data = make_dataframe(train, train_label)[:400]
dev_data = make_dataframe(train, train_label)[400:]

X_train = data['text'].values[:400]
X_dev = data['text'].values[400:]

encoder = MultiLabelBinarizer()

Y = data['frames'].str.split(',').values
Y_1 = encoder.fit_transform(Y)[:, [0, 2, 3, 5, 6, 9, 11, 12]]
Y_2 = encoder.fit_transform(Y)[:, [1, 4, 7, 8, 10, 13]]
Y_train = Y_1[:400]
Y_dev = Y_1[400:]
Y_2_train = Y_2[:400]
Y_2_dev = Y_2[400:]

433it [00:00, 7715.33it/s]
433it [00:00, 35676.22it/s]
433it [00:00, 35558.87it/s]


In [223]:
test_data = make_dataframe(test, test_label)

X_test = test_data['text'].values

Y_test = test_data['frames'].str.split(',').values
Y_2_test = encoder.fit_transform(Y_test)[:, [1, 4, 7, 8, 10, 13]]
Y_test = encoder.fit_transform(Y_test)[:,[0, 2, 3, 5, 6, 9, 11, 12]]

# Y_test = encoder.fit_transform(Y_test)

83it [00:00, 9503.36it/s]


In [224]:
X_train = [replace_typical_misspell(x.lower()) for x in X_train]
X_dev = [replace_typical_misspell(x.lower()) for x in X_dev]
X_test = [replace_typical_misspell(x.lower()) for x in X_test]

In [225]:
from transformers import PreTrainedTokenizerFast, XLNetTokenizerFast

from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Digits, Whitespace, Punctuation
from tokenizers.trainers import WordLevelTrainer


# train a tokenizer, initialize WordLevel tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
# we first define a normalizer applied before tokenization
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
# pre-tokenizer defines a "preprocessing" before the tokenization.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Punctuation(),
                                                   Digits(individual_digits=True)])
# training a tokenizer is effectively building a vocabulary in this case
trainer = WordLevelTrainer(vocab_size=50000, special_tokens=["[PAD]", "[UNK]"])
tokenizer.train_from_iterator(train_data.text.values, trainer=trainer)
tokenizer.save("tokenizer.json")

#load a tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    unk_token="[UNK]",
    pad_token="[PAD]"
)

# tokenizer = XLNetTokenizerFast(
#     tokenizer_file="tokenizer.json",
#     unk_token="[UNK]",
#     pad_token="[PAD]"
# )

In [226]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


seed = 0
torch.manual_seed(seed)


class SemEvalTask3Subtask2(Dataset):
    def __init__(self, texts, labels, tokenizer, max_token_len=512, labels_1=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.labels_1=labels_1
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        text = self.texts[idx]
        label = self.labels[idx]
        label_1 = self.labels_1[idx]

        encoding = self.tokenizer.encode(
            text,
            padding='max_length',
            max_length=self.max_token_len,
            truncation=True,
            return_tensors='pt'
        )

        return dict(
            input_ids=encoding,
            label=torch.FloatTensor(label),
            label_1=torch.FloatTensor(label_1),
            label_1_output=torch.FloatTensor(label_1)
        )

In [227]:
train_dataset = SemEvalTask3Subtask2(
    X_train, Y_train, tokenizer, labels_1=Y_2_train
)
dev_dataset = SemEvalTask3Subtask2(
    X_dev, Y_dev, tokenizer, labels_1=Y_2_dev
)
test_dataset = SemEvalTask3Subtask2(
    X_test, Y_test, tokenizer, labels_1=Y_2_test
)

In [228]:
BATCH_SIZE = 64 # batch size for training

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [229]:
train_dataset[0]['input_ids']

tensor([[ 4289,  6890,  2867,    74,    12,   170,   142,  1825,  3939,     5,
           576,   566,    11,   271,    16,   176, 13156,  4289,  6890,    37,
           872,    74,    12,    14,   142,  1825,  3939,    15,     5,   178,
           584,  1060,   576,   566,     3,   112,   969,    83,    26,  1533,
            32,  1509,   110,  1273,  1308,   276,   296,     7,   315,    55,
            11,    13,  1040,   367,     4,    20,   507,  2649,     3,     2,
           255,   211,   957,    10,   566,    51,  1533,    32,  1509,   110,
             2,   445,     7,    10,    26,    73,    29,    27,  7240,   103,
            86,  2676,   100,    55,    54,   541,   836,   581,     8,    32,
           367,     4,   353,   480,     2,  7505,     6,  7903,  1641,    28,
             2,  1065,    20,     2,  4402,   176,     3,  4289,  6890,   872,
            10,    76,  8499,    25,     2,    98,    95,    16,   144,    16,
           291,  1814,   108,    31,    63,   109,  

In [230]:
class CNNClassifier_1(nn.Module):
    def __init__(self,
                 vocab_size,
                 output_size,
                 embedding_size=300,
                 in_channels=1,
                 out_channels=100,
                 kernel_sizes=[3,4,5]):
        super(CNNClassifier_1, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_sizes = kernel_sizes

        self.embed = nn.Embedding(self.vocab_size, self.embedding_size)
        self.convs = nn.ModuleList(
            [nn.Conv2d(self.in_channels, self.out_channels,
                       (kernel_size, self.embedding_size))
             for kernel_size in self.kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(self.kernel_sizes) * self.out_channels,
                             self.output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embed(torch.squeeze(x))  # (batch_size, sequence_length, embedding_size)
        x = x.unsqueeze(1)  # (batch_size, in_channels, sequence_length, embedding_size)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(batch_size, out_channels, embedding_size), ...]*len(kernel_sizes)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(batch_size, out_channels), ...]*len(kernel_sizes)
        x = torch.cat(x, 1)  # (batch_size, len(kernel_sizes)*out_channels)
        x = self.dropout(x)  # (batch_size, len(kernel_sizes)*out_channels)
        y = self.fc1(x)  # (batch_size, output_size)
        return y

    def predict(self, x, threshold=0.5):
        preds = self.sigmoid(self.forward(x))
        preds = np.array(preds.cpu() > threshold, dtype=float)
        return preds

In [231]:
class CNNClassifier_2(nn.Module):
    def __init__(self,
                 vocab_size,
                 output_size,
                 embedding_size=300,
                 in_channels=1,
                 out_channels=100,
                 kernel_sizes=[3,4,5]):
        super(CNNClassifier_2, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_sizes = kernel_sizes
        self.output2=50
        self.embed = nn.Embedding(self.vocab_size, self.embedding_size)
        self.input2 = nn.Linear(6, self.output2)
        self.convs = nn.ModuleList(
            [nn.Conv2d(self.in_channels, self.out_channels,
                       (kernel_size, self.embedding_size))
             for kernel_size in self.kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(self.kernel_sizes) * self.out_channels+self.output2,
                             self.output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, labels):
        x = self.embed(torch.squeeze(x))  # (batch_size, sequence_length, embedding_size)
        input2 = self.input2(labels)
        x = x.unsqueeze(1)  # (batch_size, in_channels, sequence_length, embedding_size)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(batch_size, out_channels, embedding_size), ...]*len(kernel_sizes)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(batch_size, out_channels), ...]*len(kernel_sizes)
        x = torch.cat(x, 1)  # (batch_size, len(kernel_sizes)*out_channels)
        x = self.dropout(x)  # (batch_size, len(kernel_sizes)*out_channels)
        combined = torch.cat((x.view(x.size(0), -1),
                              input2.view(input2.size(0), -1)), dim=1)
        y = self.fc1(combined)  # (batch_size, output_size)
        return y

    def predict(self, x, labels, threshold=0.5):
        preds = self.sigmoid(self.forward(x, labels))
        preds = np.array(preds.cpu() > threshold, dtype=float)
        return preds

In [232]:
class RnnType:
    GRU = 1
    LSTM = 2

class AttentionModel:
    NONE = 0
    DOT = 1
    GENERAL = 2

class Parameters:
    def __init__(self, data_dict):
        for k, v in data_dict.items():
            exec("self.%s=%s" % (k, v))


class Attention(nn.Module):
    def __init__(self, device, method, hidden_size):
        super(Attention, self).__init__()
        self.device = device

        self.method = method
        self.hidden_size = hidden_size

        self.concat_linear = nn.Linear(self.hidden_size * 2, self.hidden_size)

        if self.method == AttentionModel.GENERAL:
            self.attn = nn.Linear(self.hidden_size, hidden_size)

    def forward(self, rnn_outputs, final_hidden_state):
        # rnn_output.shape:         (batch_size, seq_len, hidden_size)
        # final_hidden_state.shape: (batch_size, hidden_size)
        # NOTE: hidden_size may also reflect bidirectional hidden states (hidden_size = num_directions * hidden_dim)
        batch_size, seq_len, _ = rnn_outputs.shape
        if self.method == AttentionModel.DOT:
            attn_weights = torch.bmm(rnn_outputs, final_hidden_state.unsqueeze(2))
        elif self.method == AttentionModel.GENERAL:
            attn_weights = self.attn(rnn_outputs) # (batch_size, seq_len, hidden_dim)
            attn_weights = torch.bmm(attn_weights, final_hidden_state.unsqueeze(2))

        else:
            raise Exception("[Error] Unknown AttentionModel.")

        attn_weights = torch.softmax(attn_weights.squeeze(2), dim=1)

        context = torch.bmm(rnn_outputs.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)

        attn_hidden = torch.tanh(self.concat_linear(torch.cat((context, final_hidden_state), dim=1)))

        return attn_hidden, attn_weights


class RnnClassifier(nn.Module):
    def __init__(self, device, params):
        super(RnnClassifier, self).__init__()
        self.params = params
        self.device = device

        # Embedding layer
        self.word_embeddings = nn.Embedding(self.params.vocab_size, self.params.embed_dim)

        # Calculate number of directions
        self.num_directions = 2 if self.params.bidirectional == True else 1

        self.linear_dims = [self.params.rnn_hidden_dim * self.num_directions] + self.params.linear_dims
        self.linear_dims.append(self.params.label_size)

        # RNN layer
        rnn = None
        if self.params.rnn_type == RnnType.GRU:
            rnn = nn.GRU
        elif self.params.rnn_type == RnnType.LSTM:
            rnn = nn.LSTM
        else:
            raise Exception("[Error] Unknown RnnType. Currently supported: RnnType.GRU=1, RnnType.LSTM=2")
        self.rnn = rnn(self.params.embed_dim,
                       self.params.rnn_hidden_dim,
                       num_layers=self.params.num_layers,
                       bidirectional=self.params.bidirectional,
                       dropout=self.params.dropout,
                       batch_first=False)


        # Define set of fully connected layers (Linear Layer + Activation Layer) * #layers
        self.linears = nn.ModuleList()
        for i in range(0, len(self.linear_dims)-1):
            if self.params.dropout > 0.0:
                self.linears.append(nn.Dropout(p=self.params.dropout))
            linear_layer = nn.Linear(self.linear_dims[i], self.linear_dims[i+1])
            self.init_weights(linear_layer)
            self.linears.append(linear_layer)
            if i == len(self.linear_dims) - 1:
                break  # no activation after output layer!!!
            self.linears.append(nn.ReLU())

        self.hidden = None

        # Choose attention model
        if self.params.attention_model != AttentionModel.NONE:
            self.attn = Attention(self.device, self.params.attention_model, self.params.rnn_hidden_dim * self.num_directions)
        self.sigmoid = nn.Sigmoid()


    def init_hidden(self, batch_size):
        if self.params.rnn_type == RnnType.GRU:
            return torch.zeros(self.params.num_layers * self.num_directions, batch_size, self.params.rnn_hidden_dim).to(self.device)
        elif self.params.rnn_type == RnnType.LSTM:
            return (torch.zeros(self.params.num_layers * self.num_directions, batch_size, self.params.rnn_hidden_dim).to(self.device),
                    torch.zeros(self.params.num_layers * self.num_directions, batch_size, self.params.rnn_hidden_dim).to(self.device))
        else:
            raise Exception('Unknown rnn_type. Valid options: "gru", "lstm"')

    # def freeze_layer(self, layer):
    #     for param in layer.parameters():
    #         param.requires_grad = False


    def forward(self, inputs):
        batch_size, seq_len, ems = inputs.shape

        # Push through embedding layer
        X = self.word_embeddings(torch.squeeze(inputs)).transpose(0, 1)

        self.hidden = self.init_hidden(batch_size)
        # Push through RNN layer
        rnn_output, self.hidden = self.rnn(X, self.hidden)

        # Extract last hidden state
        final_state = None
        if self.params.rnn_type == RnnType.GRU:
            final_state = self.hidden.view(self.params.num_layers, self.num_directions, batch_size, self.params.rnn_hidden_dim)[-1]
        elif self.params.rnn_type == RnnType.LSTM:
            final_state = self.hidden[0].view(self.params.num_layers, self.num_directions, batch_size, self.params.rnn_hidden_dim)[-1]
        # Handle directions
        final_hidden_state = None
        if self.num_directions == 1:
            final_hidden_state = final_state.squeeze(0)
        elif self.num_directions == 2:
            h_1, h_2 = final_state[0], final_state[1]
            final_hidden_state = torch.cat((h_1, h_2), 1)  # Concatenate both states

        # Push through attention layer
        if self.params.attention_model != AttentionModel.NONE:
            rnn_output = rnn_output.permute(1, 0, 2)  #
            X = self.attn(rnn_output, final_hidden_state)[0]
        else:
            X = final_hidden_state

        # Push through linear layers
        for l in self.linears:
            X = l(X)

        return X


    def init_weights(self, layer):
        if type(layer) == nn.Linear:
            # print("Initialize layer with nn.init.xavier_uniform_: {}".format(layer))
            torch.nn.init.xavier_uniform_(layer.weight)
            layer.bias.data.fill_(0.01)

    def predict(self, x, threshold=0.5):
        preds = self.sigmoid(self.forward(x))
        preds = np.array(preds.cpu() > threshold, dtype=float)
        return preds

In [233]:
parameters_dictionary = {}
parameters = Parameters({'vocab_size': tokenizer.vocab_size, 'embed_dim': 300,
                         'rnn_hidden_dim': 500, 'bidirectional': True, 'linear_dims': [128, 300, 14],
                         'label_size': len(encoder.classes_), 'rnn_type': RnnType.LSTM, 'num_layers': 4,
                         'dropout': 0.0, 'attention_model': AttentionModel.GENERAL}
                        )

In [234]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters
EPOCHS = 30 # epoch
LR = 0.001  # learning rate

model = CNNClassifier_1(
    tokenizer.vocab_size,
    6
)

# model = RnnClassifier(
#     torch.device(device),
#     parameters
# )

# model = CNNClassifier_2(
#     tokenizer.vocab_size,
#     8
# )
model.to(device)

loss_fun = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


for epoch in range(1, EPOCHS + 1):
    epoch_loss = 0
    model.train()
    for idx, data in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(data['input_ids'].to(device))
        loss = loss_fun(outputs, data['label_1'].to(device))
        loss.backward()
        # print(model.linears[4].weight.grad)
        optimizer.step()
        epoch_loss += loss.item()

    model.eval()
    outputs = []
    targets = []
    with torch.no_grad():
        for idx, data in enumerate(dev_dataloader):
            output_batch = model.predict(data['input_ids'].to(device))
            target_batch = np.array(data['label_1'])
            outputs.extend(output_batch)
            targets.extend(target_batch)
            # dev_dataloader[idx]['label_1_output'] = outputs

    micro_f1 = f1_score(targets, outputs, average='micro')
    dev_loss = loss_fun(torch.FloatTensor(outputs), torch.FloatTensor(targets))
    print(f'\rEpoch: {epoch}/{EPOCHS}, Micro-f1: {micro_f1:.3f}, Train Loss: {epoch_loss/len(train_dataloader):.3f}, Dev Loss: {dev_loss:.3f}', end='')

Epoch: 30/30, Micro-f1: 0.713, Train Loss: 0.147, Dev Loss: 0.615

In [235]:
dev_dataset = SemEvalTask3Subtask2(
    X_dev, Y_dev, tokenizer, labels_1=outputs
)

dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(classification_report(targets, outputs))

              precision    recall  f1-score   support

           0       0.62      0.29      0.40        17
           1       1.00      0.50      0.67        16
           2       0.80      0.75      0.77        16
           3       0.70      0.50      0.58        14
           4       0.91      0.87      0.89        23
           5       0.83      0.77      0.80        13

   micro avg       0.83      0.63      0.71        99
   macro avg       0.81      0.61      0.69        99
weighted avg       0.82      0.63      0.70        99
 samples avg       0.76      0.61      0.65        99



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [236]:
model.eval()
train_outputs = []
targets = []
with torch.no_grad():
    for idx, data in enumerate(train_dataloader):
        output_batch = model.predict(data['input_ids'].to(device))
        target_batch = np.array(data['label_1'])
        train_outputs.extend(output_batch)
        targets.extend(target_batch)

print(classification_report(targets, train_outputs))
train_dataset = SemEvalTask3Subtask2(
    X_train, Y_train, tokenizer, labels_1=train_outputs
)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       210
           1       1.00      1.00      1.00       105
           2       1.00      1.00      1.00       187
           3       1.00      1.00      1.00       189
           4       1.00      1.00      1.00       212
           5       1.00      1.00      1.00       174

   micro avg       1.00      1.00      1.00      1077
   macro avg       1.00      1.00      1.00      1077
weighted avg       1.00      1.00      1.00      1077
 samples avg       0.95      0.95      0.95      1077



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [237]:
model2 = CNNClassifier_2(
    tokenizer.vocab_size,
    8
)
model2.to(device)

loss_fun = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=LR)
for epoch in range(1, EPOCHS + 1):
    epoch_loss = 0
    model2.train()
    for idx, data in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model2(data['input_ids'].to(device), data['label_1_output'])
        loss = loss_fun(outputs, data['label'].to(device))
        loss.backward()
        # print(model.linears[4].weight.grad)
        optimizer.step()
        epoch_loss += loss.item()

    model2.eval()
    outputs = []
    targets = []
    with torch.no_grad():
        for idx, data in enumerate(dev_dataloader):
            output_batch = model2.predict(data['input_ids'].to(device), data['label_1_output'])
            target_batch = np.array(data['label'])
            outputs.extend(output_batch)
            targets.extend(target_batch)

    micro_f1 = f1_score(targets, outputs, average='micro')
    dev_loss = loss_fun(torch.FloatTensor(outputs), torch.FloatTensor(targets))
    print(f'\rEpoch: {epoch}/{EPOCHS}, Micro-f1: {micro_f1:.3f}, Train Loss: {epoch_loss/len(train_dataloader):.3f}, Dev Loss: {dev_loss:.3f}', end='')

Epoch: 30/30, Micro-f1: 0.300, Train Loss: 0.072, Dev Loss: 0.689

In [238]:
model2.eval()
outputs_dev2 = []
targets = []
with torch.no_grad():
    for idx, data in enumerate(dev_dataloader):
        output_batch = model2.predict(data['input_ids'].to(device), data['label_1_output'])
        target_batch = np.array(data['label'])
        outputs_dev2.extend(output_batch)
        targets.extend(target_batch)

print(classification_report(targets, outputs_dev2))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       1.00      0.12      0.22         8
           4       1.00      0.50      0.67         2
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         0
           7       1.00      0.33      0.50         3

   micro avg       1.00      0.18      0.30        17
   macro avg       0.38      0.12      0.17        17
weighted avg       0.76      0.18      0.27        17
 samples avg       0.06      0.06      0.06        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [239]:
model.eval()
outputs_test = []
targets = []
with torch.no_grad():
    for idx, data in enumerate(test_dataloader):
        output_batch = model.predict(data['input_ids'].to(device), data['label_1_output'])
        target_batch = np.array(data['label_1'])
        outputs_test.extend(output_batch)
        targets.extend(target_batch)

print(classification_report(targets, outputs_test))

test_dataset = SemEvalTask3Subtask2(
    X_dev, Y_dev, tokenizer, labels_1=outputs_test
)

dev_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.00      0.00      0.00        77
           2       0.00      0.00      0.00        62
           3       0.00      0.00      0.00        16
           4       0.00      0.00      0.00        82
           5       0.00      0.00      0.00        10

   micro avg       0.00      0.00      0.00       282
   macro avg       0.00      0.00      0.00       282
weighted avg       0.00      0.00      0.00       282
 samples avg       0.00      0.00      0.00       282



In [240]:
model2.eval()
outputs_test2 = []
targets = []
with torch.no_grad():
    for idx, data in enumerate(test_dataloader):
        output_batch = model2.predict(data['input_ids'].to(device), data['label_1_output'])
        target_batch = np.array(data['label'], )
        outputs_test2.extend(output_batch)
        targets.extend(target_batch)

print(classification_report(targets, outputs_test2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        16
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00        61
           6       0.00      0.00      0.00        29
           7       0.00      0.00      0.00        11

   micro avg       0.00      0.00      0.00       138
   macro avg       0.00      0.00      0.00       138
weighted avg       0.00      0.00      0.00       138
 samples avg       0.00      0.00      0.00       138



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
 for idx, data in enumerate(train_dataloader):
    print(data['label'])
    break

In [246]:
np.array(outputs_test2)

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 

In [247]:
encoder.inverse_transform(np.array(outputs_test2))

ValueError: Expected indicator for 14 classes, but got 8

In [None]:
from transformers import XLNetModel
import torch.nn as nn
class XLNet_Model(nn.Module):
    def __init__(self, classes):
        super(XLNet_Model, self).__init__()
        self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
        self.out = nn.Linear(self.xlnet.config.hidden_size, classes)

    def forward(self, input):
        outputs = self.xlnet(input)
        out = self.out(outputs.last_hidden_state)
        return out

    def predict(self, x, threshold=0.5):
        preds = self.sigmoid(self.forward(x))
        preds = np.array(preds.cpu() > threshold, dtype=float)
        return preds

In [None]:
model = XLNet_Model(14)
model.to(device)

loss_fun = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


for epoch in range(1, EPOCHS + 1):
    epoch_loss = 0
    model.train()
    for idx, data in enumerate(train_dataloader):
        # model.hidden = model.init_hidden(BATCH_SIZE)
        optimizer.zero_grad()
        outputs = model(data['input_ids'].to(device))
        loss = loss_fun(outputs, data['label'].to(device))
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        # print(model.grad)
        # print(model.embs[0].weight.grad)
        optimizer.step()
        epoch_loss += loss.item()

    model.eval()
    outputs = []
    targets = []
    with torch.no_grad():
        for idx, data in enumerate(dev_dataloader):
            output_batch = model.predict(data['input_ids'].to(device))
            target_batch = np.array(data['label'])
            outputs.extend(output_batch)
            targets.extend(target_batch)

    micro_f1 = f1_score(targets, outputs, average='micro')
    dev_loss = loss_fun(torch.FloatTensor(outputs), torch.FloatTensor(targets))
    print(f'\rEpoch: {epoch}/{EPOCHS}, Micro-f1: {micro_f1:.3f}, Train Loss: {epoch_loss/len(train_dataloader):.3f}, Dev Loss: {dev_loss:.3f}', end='')