In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
from torch import nn, torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
def parse_csv(data_file,train_test = 'train'):
    if train_test == 'train':
        with open(data_file, 'r', encoding = 'utf-8') as fd:
            data = [l.strip().split('\t') for l in fd.readlines()][1:]
        X = [d[2] for d in data]
        y = [d[1] for d in data]
        return X, y
    elif train_test == "test":
        with open(data_file, 'r') as fd:
            data = [l.strip().split('\t') for l in fd.readlines()][1:]
        X = [d[1] for d in data]
        return X        

In [4]:
train_tweet, train_label = parse_csv('data/train/SemEval2018-T3-train-taskA_emoji_ironyHashtags.txt')

In [5]:
test_tweet, test_label = parse_csv('data/gold/SemEval2018-T3_gold_test_taskA_emoji.txt')

In [6]:
train_label_loc = list(map(int, train_label))
test_label_loc =  list(map(int, test_label))

In [7]:
from operator import itemgetter

In [8]:
data = itemgetter(*[i for i, e in enumerate(train_label_loc) if e])(train_tweet) + itemgetter(*[i for i, e in enumerate(test_label_loc) if e])(test_tweet)

In [9]:
data = list(data)

In [10]:
datasets = {
    "train": (train_tweet, train_label),
    "gold": (test_tweet, test_label),
}

In [11]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm

In [12]:
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time',
                   'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                  'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    ).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset {}...".format(name)

        data = [preprocessor(x)
                for x in tqdm(dataset, desc=desc)]
        return data

    return preprocess

In [13]:
pre = twitter_preprocess()

Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...
Reading twitter_2018 - 1grams ...


In [14]:
a = pre(None,train_tweet)

PreProcessing dataset None...: 100%|█████████████████████████████████████████████| 3834/3834 [00:02<00:00, 1819.34it/s]


In [15]:
import os

In [16]:
BASE_PATH = "./"

In [17]:
model_conf = {
    "name": "TASK3_B",
    "token_type": "word",
    "batch_train": 32,
    "batch_eval": 32,
    "epochs": 50,
    "embeddings_file": "ntua_twitter_affect_310",
    "embed_dim": 310,
    "embed_finetune": False,
    "embed_noise": 0.2,
    "embed_dropout": 0.1,
    "encoder_dropout": 0.2,
    "encoder_size": 150,
    "encoder_layers": 2,
    "encoder_bidirectional": True,
    "attention": True,
    "attention_layers": 1,
    "attention_context": False,
    "attention_activation": "tanh",
    "attention_dropout": 0.0,
    "base": 0.3,
    "patience": 10,
    "weight_decay": 0.0,
    "clip_norm": 1,
}
os.path.join(BASE_PATH, "embeddings",
                                "{}.txt".format(model_conf["embeddings_file"]))

'./embeddings\\ntua_twitter_affect_310.txt'

In [18]:
import errno
import pickle

def file_cache_name(file):
    head, tail = os.path.split(file)
    filename, ext = os.path.splitext(tail)
    return os.path.join(head, filename + ".p")


def write_cache_word_vectors(file, data):
    with open(file_cache_name(file), 'wb') as pickle_file:
        pickle.dump(data, pickle_file)


def load_cache_word_vectors(file):
    with open(file_cache_name(file), 'rb') as f:
        return pickle.load(f)


def load_word_vectors(file, dim):
    """
    Read the word vectors from a text file
    Args:
        file (): the filename
        dim (): the dimensions of the word vectors

    Returns:
        word2idx (dict): dictionary of words to ids
        idx2word (dict): dictionary of ids to words
        embeddings (np.ndarray): the word embeddings matrix

    """
    # in order to avoid this time consuming operation, cache the results
    try:
        cache = load_cache_word_vectors(file)
        print("Loaded word embeddings from cache.")
        return cache
    except OSError:
        print("Didn't find embeddings cache file {}".format(file))

    # create the necessary dictionaries and the word embeddings matrix
    if os.path.exists(file):
        print('Indexing file {} ...'.format(file))

        word2idx = {}  # dictionary of words to ids
        idx2word = {}  # dictionary of ids to words
        embeddings = []  # the word embeddings matrix

        # create the 2D array, which will be used for initializing
        # the Embedding layer of a NN.
        # We reserve the first row (idx=0), as the word embedding,
        # which will be used for zero padding (word with id = 0).
        embeddings.append(np.zeros(dim))

        # flag indicating whether the first row of the embeddings file
        # has a header
        header = False

        # read file, line by line
        with open(file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, 1):

                # skip the first row if it is a header
                if i == 1:
                    if len(line.split()) < dim:
                        header = True
                        continue

                values = line.split(" ")
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')

                index = i - 1 if header else i

                idx2word[index] = word
                word2idx[word] = index
                embeddings.append(vector)

            # add an unk token, for OOV words
            if "<unk>" not in word2idx:
                idx2word[len(idx2word) + 1] = "<unk>"
                word2idx["<unk>"] = len(word2idx) + 1
                embeddings.append(
                    np.random.uniform(low=-0.05, high=0.05, size=dim))

            print(set([len(x) for x in embeddings]))

            print('Found %s word vectors.' % len(embeddings))
            embeddings = np.array(embeddings, dtype='float32')

        # write the data to a cache file
        write_cache_word_vectors(file, (word2idx, idx2word, embeddings))

        return word2idx, idx2word, embeddings

    else:
        print("{} not found!".format(file))
        raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), file)


In [19]:
def load_embeddings(model_conf):
    word_vectors = os.path.join(BASE_PATH, "embeddings",
                                "{}.txt".format(model_conf["embeddings_file"]))
    word_vectors_size = model_conf["embed_dim"]

    # load word embeddings
    print("loading word embeddings...")
    return load_word_vectors(word_vectors, word_vectors_size)

In [20]:
word2idx, idx2word, embeddings = load_embeddings(model_conf)

loading word embeddings...
Loaded word embeddings from cache.


In [21]:
from torch.utils.data import Dataset
from collections import Counter

def vectorize(sequence, el2idx, max_length):
    """
    Covert array of tokens, to array of ids, with a fixed length
    and zero padding at the end
    Args:
        sequence (): a list of elements
        el2idx (): dictionary of word to ids
        max_length ():
        unk_policy (): how to handle OOV words
        spell_corrector (): if unk_policy = 'correct' then pass a callable
            which will try to apply spell correction to the OOV token


    Returns: list of ids with zero padding at the end

    """
    words = np.zeros(max_length).astype(int)

    # trim tokens after max length
    sequence = sequence[:max_length]

    for i, token in enumerate(sequence):
        if token in el2idx:
            words[i] = el2idx[token]
        else:
            words[i] = el2idx["<unk>"]

    return words

class WordDataset(Dataset):

    def __init__(self, X, word2idx, pre):
        """
        A PyTorch Dataset
        What we have to do is to implement the 2 abstract methods:

            - __len__(self): in order to let the DataLoader know the size
                of our dataset and to perform batching, shuffling and so on...
            - __getitem__(self, index): we have to return the properly
                processed data-item from our dataset with a given index

        Args:
            X (): list of training samples
            y (): list of training labels
            max_length (int): the max length for each sentence.
                if 0 then use the maximum length in the dataset
            word2idx (dict): a dictionary which maps words to indexes
            label_transformer (LabelTransformer):
        """
        self.word2idx = word2idx
        
        self.data = X
        
        self.data = pre(None, self.data)
        
        self.set_max_length()

        self.dataset_statistics()

    def set_max_length(self):
        self.max_length = max([len(x) for x in self.data])

    def dataset_statistics(self):
        words = Counter()
        for x in self.data:
            words.update(x)
        unks = {w: v for w, v in words.items() if w not in self.word2idx}
        # unks = sorted(unks.items(), key=lambda x: x[1], reverse=True)
        total_words = sum(words.values())
        total_unks = sum(unks.values())

        print("Total words: {}, Total unks:{} ({:.2f}%)".format(
            total_words, total_unks, total_unks * 100 / total_words))

        print("Unique words: {}, Unique unks:{} ({:.2f}%)".format(
            len(words), len(unks), len(unks) * 100 / len(words)))


        print()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """
        Returns the _transformed_ item from the dataset

        Args:
            index (int):

        Returns:
            (tuple):
                * example (ndarray): vector representation of a training sample
                * label (string): the class label
                * length (int): the length (tokens) of the sentence
                * index (int): the index of the dataitem in the dataset.
                  It is useful for getting the raw input for visualizations.
        """
        sample = self.data[index]

        # transform the sample and the label,
        # in order to feed them to the model
        sample = vectorize(sample, self.word2idx, self.max_length)


        return sample, len(self.data[index]), index

In [22]:
from torch.utils.data import DataLoader
print("Building word-level datasets...")
dataset = WordDataset(data, word2idx, pre)
batch_size = model_conf["batch_train"]
loaders = DataLoader(dataset, batch_size, shuffle=True, drop_last=True)

Building word-level datasets...


PreProcessing dataset None...: 100%|█████████████████████████████████████████████| 2222/2222 [00:00<00:00, 3315.17it/s]


Total words: 48166, Total unks:89 (0.18%)
Unique words: 6230, Unique unks:89 (1.43%)



In [46]:
class ModelHelper:
    @staticmethod
    def _sort_by(lengths):
        """
        Sort batch data and labels by length.
        Useful for variable length inputs, for utilizing PackedSequences
        Args:
            lengths (nn.Tensor): tensor containing the lengths for the data

        Returns:
            - sorted lengths Tensor
            - sort (callable) which will sort a given iterable
                according to lengths
            - unsort (callable) which will revert a given iterable to its
                original order

        """
        batch_size = lengths.size(0)

        sorted_lengths, sorted_idx = lengths.sort()
        _, original_idx = sorted_idx.sort(0, descending=True)
        reverse_idx = torch.linspace(batch_size - 1, 0, batch_size).long()

        reverse_idx = reverse_idx.to(device)

        sorted_lengths = sorted_lengths[reverse_idx]

        def sort(iterable):
            if len(iterable.shape) > 1:
                return iterable[sorted_idx.data][reverse_idx]
            else:
                return iterable

        def unsort(iterable):
            if len(iterable.shape) > 1:
                return iterable[reverse_idx][original_idx][reverse_idx]
            else:
                return iterable

        return sorted_lengths, sort, unsort
    
class RNN(nn.Module, ModelHelper):
    def __init__(self, input_size, rnn_size, num_layers,
                 bidirectional, dropout, embd):
        """
        A simple RNN Encoder.

        Args:
            input_size (int): the size of the input features
            rnn_size (int):
            num_layers (int):
            bidirectional (bool):
            dropout (float):

        Returns: outputs, last_outputs
        - **outputs** of shape `(batch, seq_len, hidden_size)`:
          tensor containing the output features `(h_t)`
          from the last layer of the LSTM, for each t.
        - **last_outputs** of shape `(batch, hidden_size)`:
          tensor containing the last output features
          from the last layer of the LSTM, for each t=seq_len.

        """
        super(RNN, self).__init__()
        self.n_layers = num_layers
        self.n_hidden = rnn_size
        
        self.embedding = nn.Embedding(num_embeddings=embd.shape[0],embedding_dim=embd.shape[1])
        
        self.init_embeddings(embd)

        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=rnn_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)

        # the dropout "layer" for the output of the RNN
        self.drop_rnn = nn.Dropout(dropout)

        # define output feature size
        self.feature_size = rnn_size

        self.linear = nn.Linear(self.feature_size, input_size)
            
    def init_embeddings(self, weights):
        self.embedding.weight = nn.Parameter(torch.from_numpy(weights),requires_grad=False)


    def forward(self, x, h):
        """
        This is the heart of the model. This function, defines how the data
        passes through the network.
        Args:
            embs (): word embeddings
            lengths (): the lengths of each sentence

        Returns: the logits for each class

        """
        # lengths, sort, unsort = self._sort_by(lengths)
        # x = sort(x)
        
        embd = self.embedding(x.long())
        
        if embd.shape[1] != 1:
            embd_input = embd[:,:-1,:]
            embd_target = embd[:,1:,:]
        else:
            embd_input = embd
            embd_target = embd
        
        # pack the batch
        # packed = pack_padded_sequence(embd, list(lengths.data),
        #                       batch_first=True)
        outputs, h = self.rnn(embd_input, h)

        # unpack output - no need if we are going to use only the last outputs
        # outputs, _ = pad_packed_sequence(out_puts, batch_first=True)

        # get the outputs from the last *non-masked* timestep for each sentence
        #last_outputs = self.last_timestep(outputs, lengths,
        #                                  self.rnn.bidirectional)

        # apply dropout to the outputs of the RNN
        outputs = self.drop_rnn(outputs)


        logits = self.linear(outputs)

        return logits, h, embd_target
    
    def init_hidden(self, batch_size):

        weight = next(self.parameters()).data
        
        
        if (gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [67]:
def model_training(model, loaders, epochs=10, lr=0.001, clip=5, print_every=50):
    model.train()
    
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_f = nn.CosineSimilarity(dim=2, eps=1e-08)
    
    gpu = torch.cuda.is_available()
    
    if gpu:
        device = torch.device('cuda')
        model.to(device)
    else:
        device = torch.device('cpu')
    
    for e in range(epochs):
        
        counter = 0
        h = model.init_hidden(model_conf['batch_train'])
        
        for i_batch, (X, lengths, index) in enumerate(loaders):
            
            counter += 1
            
            gpu = torch.cuda.is_available()
            
            
            if gpu:
                X, lengths = X.to(device), lengths.to(device)
                
            h = tuple([each.data for each in h])

            opt.zero_grad()
            
            
            linear_outputs, h, embd_target = model(X, h)
            # print(linear_outputs.shape, embd_target.shape)
            loss = loss_f(linear_outputs, embd_target)
            loss = (1 - loss)/2
            
            loss.sum().backward()
            
            
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()
        
            if counter % print_every == 0:
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.sum()))
        
        
        
        

In [68]:
model = RNN(model_conf['embed_dim'], model_conf['encoder_size'], model_conf['encoder_layers'], False, model_conf['encoder_dropout'], embeddings)

In [69]:
model

RNN(
  (embedding): Embedding(804871, 310)
  (rnn): LSTM(310, 150, num_layers=2, batch_first=True, dropout=0.2)
  (drop_rnn): Dropout(p=0.2)
  (linear): Linear(in_features=150, out_features=310, bias=True)
)

In [27]:
gpu = torch.cuda.is_available()
    
if(gpu):
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [70]:
model_training(model, loaders)

Epoch: 1/10... Step: 50... Loss: 565.2349...
Epoch: 2/10... Step: 50... Loss: 532.6721...
Epoch: 3/10... Step: 50... Loss: 529.7601...
Epoch: 4/10... Step: 50... Loss: 553.4858...
Epoch: 5/10... Step: 50... Loss: 512.8134...
Epoch: 6/10... Step: 50... Loss: 524.2103...
Epoch: 7/10... Step: 50... Loss: 557.9632...
Epoch: 8/10... Step: 50... Loss: 536.8785...
Epoch: 9/10... Step: 50... Loss: 522.8715...
Epoch: 10/10... Step: 50... Loss: 539.6664...


In [71]:
loss_f_1 = nn.CosineSimilarity(dim=0, eps=1e-08)
def predict(model, word, h=None, f = loss_f):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        np.random.seed(4)
        # tensor inputs
        x = word2idx[word]
        inputs = torch.tensor(x).view(1,1)
        # print(inputs.shape)
        
        if(gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h, _ = model(inputs, h)

        # get the character probabilities
 
        if(gpu):
            out = out.cpu() # move to cpu
        
        loss_min = float("inf")
        i = 0
        for embd in embeddings:
            loss = loss_f_1(torch.from_numpy(embd), torch.tensor(out)).mean()
            loss = (1 - loss)/2
            if loss < loss_min:
                loss_min = loss
                min_index = i
            i += 1
        
        # return the encoded value of the predicted char and the hidden state
        return idx2word[min_index] ,h

In [73]:
beginning = "you are"

size = 30

if(gpu):
    model.cuda()
else:
    model.cpu()

model.eval()

words = beginning.split()
h = model.init_hidden(1)
for word in words:
    word_out, h = predict(model, word, h, loss_f)

words.append(word_out)

for i in range(size):
    word_out, h = predict(model, words[-1], h)
    print(word_out)
    words.append(word_out)

print(' '.join(words))



that
'
s
the
.
<repeated>
<hashtag>
not
</hashtag>
<hashtag>
not
</hashtag>
<hashtag>
lol
</hashtag>
<hashtag>
not
</hashtag>
<hashtag>
lol
</hashtag>
<hashtag>
lol
</hashtag>
<hashtag>
lol
</hashtag>
<hashtag>
lol
</hashtag>
you are that that ' s the . <repeated> <hashtag> not </hashtag> <hashtag> not </hashtag> <hashtag> lol </hashtag> <hashtag> not </hashtag> <hashtag> lol </hashtag> <hashtag> lol </hashtag> <hashtag> lol </hashtag> <hashtag> lol </hashtag>


In [64]:
input1 = torch.randn(100, 128)
input1

tensor([[-1.9667e-03, -7.0143e-01, -7.3567e-01,  ..., -2.2321e-01,
          1.3087e-01, -3.3168e-01],
        [ 4.6314e-01,  2.4860e+00, -1.2168e+00,  ..., -1.5734e+00,
         -2.6471e-01, -1.7451e+00],
        [-1.8289e-01,  6.3258e-01, -4.5826e-01,  ...,  2.2944e+00,
         -2.3991e-01,  6.4642e-01],
        ...,
        [-1.0176e+00,  9.8853e-01,  8.9408e-01,  ...,  1.2193e+00,
          2.8966e-01, -3.0660e-01],
        [-8.2059e-01,  1.0942e+00, -4.7992e-01,  ..., -1.4636e+00,
         -8.7323e-01, -4.8448e-01],
        [ 2.9662e-01,  9.8677e-01, -1.0833e-01,  ..., -4.8979e-01,
         -8.4017e-02, -3.8445e-01]])

In [66]:
input1/2

tensor([[-9.8334e-04, -3.5071e-01, -3.6784e-01,  ..., -1.1161e-01,
          6.5433e-02, -1.6584e-01],
        [ 2.3157e-01,  1.2430e+00, -6.0840e-01,  ..., -7.8669e-01,
         -1.3236e-01, -8.7255e-01],
        [-9.1445e-02,  3.1629e-01, -2.2913e-01,  ...,  1.1472e+00,
         -1.1995e-01,  3.2321e-01],
        ...,
        [-5.0881e-01,  4.9427e-01,  4.4704e-01,  ...,  6.0965e-01,
          1.4483e-01, -1.5330e-01],
        [-4.1029e-01,  5.4708e-01, -2.3996e-01,  ..., -7.3181e-01,
         -4.3661e-01, -2.4224e-01],
        [ 1.4831e-01,  4.9339e-01, -5.4164e-02,  ..., -2.4490e-01,
         -4.2008e-02, -1.9223e-01]])

In [40]:
word = 'you'
x = word2idx[word]
inputs = torch.tensor(x).view(1,1)
print(inputs.shape)

if(gpu):
    inputs = inputs.cuda()
    
h = model.init_hidden(1)
embd = model.embedding(inputs.long())
if embd.shape[1] != 1:
    embd_input = embd[:,:-1,:]
    embd_target = embd[:,1:,:]
else:
    embd_input = embd
    embd_target = embd

# pack the batch
# packed = pack_padded_sequence(embd, list(lengths.data),
#                       batch_first=True)
print(embd_input.shape)

torch.Size([1, 1])
torch.Size([1, 1, 310])


In [46]:
embeddings.shape

(804871, 310)

In [54]:
del model

In [55]:
torch.cuda.empty_cache()

In [45]:
counter = 0
h = model.init_hidden(model_conf['batch_train'])

for i_batch, (X, lengths, index) in enumerate(loaders):
    counter += 1

    loss_f = nn.CosineSimilarity(dim=2, eps=1e-08)
    
    print(X)
    
    if(gpu):
        X, lengths = X.to(device), lengths.to(device)

    # opt.zero_grad()

    linear_outputs, h, embd_target = model(X, h, lengths)
    
    print(linear_outputs.shape, embd_target.shape)

    loss = loss_f(linear_outputs, embd_target)
    print(loss.shape)
    loss.sum().backward()

    nn.utils.clip_grad_norm_(model.parameters(), clip)
    # opt.step()

    if counter % print_every == 0:
        print("Epoch: {}/{}...".format(e+1, epochs),
              "Step: {}...".format(counter),
              "Loss: {:.4f}...".format(loss.item()))

tensor([[    1,   421,    68,  ...,     0,     0,     0],
        [    1,     1,    69,  ...,     0,     0,     0],
        [ 2442,   428, 26818,  ...,     0,     0,     0],
        ...,
        [  138,    10,     9,  ...,     0,     0,     0],
        [    1,    80, 40702,  ...,     0,     0,     0],
        [   70,  1036,   106,  ...,     0,     0,     0]], dtype=torch.int32)
torch.Size([32, 47, 310]) torch.Size([32, 47, 310])
torch.Size([32, 47])


NameError: name 'clip' is not defined

In [73]:
input1 = torch.abs(torch.randn(1,2,20, 20))
input2 = torch.abs(torch.randn(1,2,20, 20))
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
output = cos(input1, input2)
input1
print(input1.shape, input2.shape, output.size())

torch.Size([1, 2, 20, 20]) torch.Size([1, 2, 20, 20]) torch.Size([1, 20, 20])


In [60]:
gpu = torch.cuda.is_available()
    
if(gpu):
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [129]:
torch.save(model, 'model_b_310_10e.mdl')

  "type " + obj.__name__ + ". It won't be checked "


In [160]:
gold_data = WordDataset(datasets['gold'][0],datasets['gold'][1],word2idx,pre)

PreProcessing dataset None...: 100%|███████████████████████████████████████████████| 784/784 [00:00<00:00, 3417.35it/s]


Total words: 17462, Total unks:42 (0.24%)
Unique words: 3420, Unique unks:42 (1.23%)
Labels statistics:
{'0': '60.33%', '1': '20.92%', '2': '10.84%', '3': '7.91%'}



In [185]:
gold_size = len(gold_data)

In [204]:
Xs = []
ys = []
lengths = []
for X, y, length, index in gold_data:
    Xs.append(torch.from_numpy(X))
    ys.append(int(y))
    lengths.append(int(length))
Xs = torch.stack(Xs)
ys = torch.tensor(ys)
lengths = torch.tensor(lengths)

In [205]:
test_outputs = model(Xs.to(torch.device('cuda')), lengths.to(torch.device('cuda')))

In [211]:
test_argmax = test_outputs.max(1)[1].to(torch.device('cpu'))

In [234]:
precision = torch.mean((test_argmax == ys).float()).item()
precision

0.8443877696990967

In [218]:
from sklearn.metrics import f1_score

In [229]:
argmax_list = [tensor.item() for tensor in test_outputs.max(1)[1]]
ys_list = [tensor.item() for tensor in ys]

In [233]:
f1_score = f1_score(ys_list, argmax_list, average='micro')

0.8443877551020407

In [236]:
loss_f = torch.nn.MultiLabelSoftMarginLoss()
X_list = []
y_list = []
for i_batch, (X, y, lengths, index) in enumerate(loaders['gold'], 1):
    y_raw = np.array(list(map(int,y)))
    y_onehot = np.zeros((y_raw.shape[0], 4))
    y_onehot[np.arange(y_raw.shape[0]), y_raw] = 1
    y = torch.from_numpy(y_onehot).float()
    if(gpu):
        X, y, lengths = X.to(device), y.to(device), lengths.to(device)
            
    linear_outputs = model(X, lengths) 
    loss = loss_f(linear_outputs, y)
    argmax = linear_outputs.max(1)[1]
    print(np.mean(np.array(argmax.to(torch.device('cpu'))) == y_raw))

<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.8125
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.90625
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.8125
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.90625
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.8125
<class 'torch.Tensor'>
0.90625
<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.78125


In [32]:
for i_batch, (X, y, lengths, index) in enumerate(loaders['train'], 1):
    print(X)
    print(y)
    break

tensor([[  170,    77, 14161,  ...,     0,     0,     0],
        [    1,   184,    54,  ...,     0,     0,     0],
        [    6,    46,   331,  ...,     0,     0,     0],
        ...,
        [    1,     6,    45,  ...,     0,     0,     0],
        [  170,   850,    23,  ...,     0,     0,     0],
        [    6,   237,    25,  ...,     0,     0,     0]], dtype=torch.int32)
('1', '0', '1', '0', '0', '3', '0', '1', '1', '0', '1', '3', '1', '1', '0', '1', '3', '1', '1', '2', '1', '1', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1')
