In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
from torch import nn, torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
def parse_csv(data_file,train_test = 'train'):
    if train_test == 'train':
        with open(data_file, 'r', encoding = 'utf-8') as fd:
            data = [l.strip().split('\t') for l in fd.readlines()][1:]
        X = [d[2] for d in data]
        y = [d[1] for d in data]
        return X, y
    elif train_test == "test":
        with open(data_file, 'r') as fd:
            data = [l.strip().split('\t') for l in fd.readlines()][1:]
        X = [d[1] for d in data]
        return X        

In [4]:
train_tweet, train_label = parse_csv('data/train/SemEval2018-T3-train-taskB_emoji_ironyHashtags.txt')

In [5]:
test_tweet, test_label = parse_csv('data/gold/SemEval2018-T3_gold_test_taskB_emoji.txt')

In [6]:
datasets = {
    "train": (train_tweet, train_label),
    "gold": (test_tweet, test_label),
}

In [7]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm

In [8]:
def twitter_preprocess():
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time',
                   'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                  'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    ).pre_process_doc

    def preprocess(name, dataset):
        desc = "PreProcessing dataset {}...".format(name)

        data = [preprocessor(x)
                for x in tqdm(dataset, desc=desc)]
        return data

    return preprocess

In [9]:
pre = twitter_preprocess()

Reading twitter_2018 - 1grams ...
Reading twitter_2018 - 2grams ...
Reading twitter_2018 - 1grams ...


In [10]:
a = pre(None,train_tweet)

PreProcessing dataset None...: 100%|██████████████████████████████████████████████| 3834/3834 [00:05<00:00, 658.02it/s]


In [11]:
import os

In [12]:
BASE_PATH = "C:/Users/ABC/Desktop/630/project/ntua_slp/ntua-slp-semeval2018"

In [13]:
model_conf = {
    "name": "TASK3_B",
    "token_type": "word",
    "batch_train": 32,
    "batch_eval": 32,
   "epochs": 50,
    "embeddings_file": "ntua_twitter_affect_310",
    "embed_dim": 310,
    "embed_finetune": False,
    "embed_noise": 0.2,
    "embed_dropout": 0.1,
    "encoder_dropout": 0.2,
    "encoder_size": 150,
    "encoder_layers": 2,
    "encoder_bidirectional": True,
    "attention": True,
    "attention_layers": 1,
    "attention_context": False,
    "attention_activation": "tanh",
    "attention_dropout": 0.0,
    "base": 0.3,
    "patience": 10,
    "weight_decay": 0.0,
    "clip_norm": 1,
}
os.path.join(BASE_PATH, "embeddings",
                                "{}.txt".format(model_conf["embeddings_file"]))

'C:/Users/ABC/Desktop/630/project/ntua_slp/ntua-slp-semeval2018\\embeddings\\ntua_twitter_affect_310.txt'

In [14]:
import errno
import pickle

def file_cache_name(file):
    head, tail = os.path.split(file)
    filename, ext = os.path.splitext(tail)
    return os.path.join(head, filename + ".p")


def write_cache_word_vectors(file, data):
    with open(file_cache_name(file), 'wb') as pickle_file:
        pickle.dump(data, pickle_file)


def load_cache_word_vectors(file):
    with open(file_cache_name(file), 'rb') as f:
        return pickle.load(f)


def load_word_vectors(file, dim):
    """
    Read the word vectors from a text file
    Args:
        file (): the filename
        dim (): the dimensions of the word vectors

    Returns:
        word2idx (dict): dictionary of words to ids
        idx2word (dict): dictionary of ids to words
        embeddings (np.ndarray): the word embeddings matrix

    """
    # in order to avoid this time consuming operation, cache the results
    try:
        cache = load_cache_word_vectors(file)
        print("Loaded word embeddings from cache.")
        return cache
    except OSError:
        print("Didn't find embeddings cache file {}".format(file))

    # create the necessary dictionaries and the word embeddings matrix
    if os.path.exists(file):
        print('Indexing file {} ...'.format(file))

        word2idx = {}  # dictionary of words to ids
        idx2word = {}  # dictionary of ids to words
        embeddings = []  # the word embeddings matrix

        # create the 2D array, which will be used for initializing
        # the Embedding layer of a NN.
        # We reserve the first row (idx=0), as the word embedding,
        # which will be used for zero padding (word with id = 0).
        embeddings.append(np.zeros(dim))

        # flag indicating whether the first row of the embeddings file
        # has a header
        header = False

        # read file, line by line
        with open(file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, 1):

                # skip the first row if it is a header
                if i == 1:
                    if len(line.split()) < dim:
                        header = True
                        continue

                values = line.split(" ")
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')

                index = i - 1 if header else i

                idx2word[index] = word
                word2idx[word] = index
                embeddings.append(vector)

            # add an unk token, for OOV words
            if "<unk>" not in word2idx:
                idx2word[len(idx2word) + 1] = "<unk>"
                word2idx["<unk>"] = len(word2idx) + 1
                embeddings.append(
                    np.random.uniform(low=-0.05, high=0.05, size=dim))

            print(set([len(x) for x in embeddings]))

            print('Found %s word vectors.' % len(embeddings))
            embeddings = np.array(embeddings, dtype='float32')

        # write the data to a cache file
        write_cache_word_vectors(file, (word2idx, idx2word, embeddings))

        return word2idx, idx2word, embeddings

    else:
        print("{} not found!".format(file))
        raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), file)


In [15]:
# TASK3_A = {
#     "name": "TASK3_A",
#     "token_type": "word",
#     "batch_train": 64,
#     "batch_eval": 64,
#    "epochs": 50,
#     "embeddings_file": "ntua_twitter_affect_310",
#     "embed_dim": 310,
#     "embed_finetune": False,
#     "embed_noise": 0.05,
#     "embed_dropout": 0.1,
#     "encoder_dropout": 0.2,
#     "encoder_size": 150,
#     "encoder_layers": 2,
#     "encoder_bidirectional": True,
#     "attention": True,
#     "attention_layers": 1,
#     "attention_context": False,
#     "attention_activation": "tanh",
#     "attention_dropout": 0.0,
#     "base": 0.7,
#     "patience": 10,
#     "weight_decay": 0.0,
#     "clip_norm": 1,
# }
TASK3_B = {
    "name": "TASK3_B",
    "token_type": "word",
    "batch_train": 32,
    "batch_eval": 32,
   "epochs": 50,
    "embeddings_file": "ntua_twitter_affect_310",
    "embed_dim": 310,
    "embed_finetune": False,
    "embed_noise": 0.2,
    "embed_dropout": 0.1,
    "encoder_dropout": 0.2,
    "encoder_size": 150,
    "encoder_layers": 2,
    "encoder_bidirectional": True,
    "attention": True,
    "attention_layers": 1,
    "attention_context": False,
    "attention_activation": "tanh",
    "attention_dropout": 0.0,
    "base": 0.3,
    "patience": 10,
    "weight_decay": 0.0,
    "clip_norm": 1,
}
def load_embeddings(model_conf):
    word_vectors = os.path.join(BASE_PATH, "embeddings",
                                "{}.txt".format(model_conf["embeddings_file"]))
    word_vectors_size = model_conf["embed_dim"]

    # load word embeddings
    print("loading word embeddings...")
    return load_word_vectors(word_vectors, word_vectors_size)

In [16]:
word2idx, idx2word, embeddings = load_embeddings(TASK3_B)

loading word embeddings...
Loaded word embeddings from cache.


In [17]:
from torch.utils.data import Dataset
from collections import Counter

def vectorize(sequence, el2idx, max_length):
    """
    Covert array of tokens, to array of ids, with a fixed length
    and zero padding at the end
    Args:
        sequence (): a list of elements
        el2idx (): dictionary of word to ids
        max_length ():
        unk_policy (): how to handle OOV words
        spell_corrector (): if unk_policy = 'correct' then pass a callable
            which will try to apply spell correction to the OOV token


    Returns: list of ids with zero padding at the end

    """
    words = np.zeros(max_length).astype(int)

    # trim tokens after max length
    sequence = sequence[:max_length]

    for i, token in enumerate(sequence):
        if token in el2idx:
            words[i] = el2idx[token]
        else:
            words[i] = el2idx["<unk>"]

    return words

class WordDataset(Dataset):

    def __init__(self, X, y, word2idx, pre):
        """
        A PyTorch Dataset
        What we have to do is to implement the 2 abstract methods:

            - __len__(self): in order to let the DataLoader know the size
                of our dataset and to perform batching, shuffling and so on...
            - __getitem__(self, index): we have to return the properly
                processed data-item from our dataset with a given index

        Args:
            X (): list of training samples
            y (): list of training labels
            max_length (int): the max length for each sentence.
                if 0 then use the maximum length in the dataset
            word2idx (dict): a dictionary which maps words to indexes
            label_transformer (LabelTransformer):
        """
        self.word2idx = word2idx
        
        self.data = X
        self.labels = y
        
        self.data = pre(None, self.data)
        
        self.set_max_length()

        self.dataset_statistics()

    def set_max_length(self):
        self.max_length = max([len(x) for x in self.data])

    def dataset_statistics(self):
        words = Counter()
        for x in self.data:
            words.update(x)
        unks = {w: v for w, v in words.items() if w not in self.word2idx}
        # unks = sorted(unks.items(), key=lambda x: x[1], reverse=True)
        total_words = sum(words.values())
        total_unks = sum(unks.values())

        print("Total words: {}, Total unks:{} ({:.2f}%)".format(
            total_words, total_unks, total_unks * 100 / total_words))

        print("Unique words: {}, Unique unks:{} ({:.2f}%)".format(
            len(words), len(unks), len(unks) * 100 / len(words)))

        # label statistics
        print("Labels statistics:")
        if isinstance(self.labels[0], float):
            print("Mean:{:.4f}, Std:{:.4f}".format(np.mean(self.labels),
                                                   np.std(self.labels)))
        else:
            try:
                counts = Counter(self.labels)
                stats = {k: "{:.2f}%".format(v * 100 / len(self.labels))
                         for k, v in sorted(counts.items())}
                print(stats)
            except:
                print("Not implemented for mclf")
        print()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """
        Returns the _transformed_ item from the dataset

        Args:
            index (int):

        Returns:
            (tuple):
                * example (ndarray): vector representation of a training sample
                * label (string): the class label
                * length (int): the length (tokens) of the sentence
                * index (int): the index of the dataitem in the dataset.
                  It is useful for getting the raw input for visualizations.
        """
        sample, label = self.data[index], self.labels[index]

        # transform the sample and the label,
        # in order to feed them to the model
        sample = vectorize(sample, self.word2idx, self.max_length)

        if isinstance(label, (list, tuple)):
            label = np.array(label)

        return sample, label, len(self.data[index]), index

In [18]:
loaders={}
from torch.utils.data import DataLoader
print("Building word-level datasets...")
for k, v in datasets.items():
    dataset = WordDataset(v[0], v[1], word2idx, pre)
    batch_size = TASK3_B["batch_train"] if k == "train" else TASK3_B["batch_eval"]
    loaders[k] = DataLoader(dataset, batch_size, shuffle=True, drop_last=True)

Building word-level datasets...


PreProcessing dataset None...: 100%|█████████████████████████████████████████████| 3834/3834 [00:02<00:00, 1335.56it/s]


Total words: 81386, Total unks:199 (0.24%)
Unique words: 9111, Unique unks:197 (2.16%)
Labels statistics:
{'0': '50.16%', '1': '36.25%', '2': '8.24%', '3': '5.35%'}



PreProcessing dataset None...: 100%|████████████████████████████████████████████████| 784/784 [00:01<00:00, 728.23it/s]


Total words: 17462, Total unks:42 (0.24%)
Unique words: 3420, Unique unks:42 (1.23%)
Labels statistics:
{'0': '60.33%', '1': '20.92%', '2': '10.84%', '3': '7.91%'}



In [19]:
class ModelHelper:
    @staticmethod
    def _sort_by(lengths):
        """
        Sort batch data and labels by length.
        Useful for variable length inputs, for utilizing PackedSequences
        Args:
            lengths (nn.Tensor): tensor containing the lengths for the data

        Returns:
            - sorted lengths Tensor
            - sort (callable) which will sort a given iterable
                according to lengths
            - unsort (callable) which will revert a given iterable to its
                original order

        """
        batch_size = lengths.size(0)

        sorted_lengths, sorted_idx = lengths.sort()
        _, original_idx = sorted_idx.sort(0, descending=True)
        reverse_idx = torch.linspace(batch_size - 1, 0, batch_size).long()

        reverse_idx = reverse_idx.to(device)

        sorted_lengths = sorted_lengths[reverse_idx]

        def sort(iterable):
            if len(iterable.shape) > 1:
                return iterable[sorted_idx.data][reverse_idx]
            else:
                return iterable

        def unsort(iterable):
            if len(iterable.shape) > 1:
                return iterable[reverse_idx][original_idx][reverse_idx]
            else:
                return iterable

        return sorted_lengths, sort, unsort

class RNN(nn.Module, ModelHelper):
    def __init__(self, input_size, rnn_size, num_layers,
                 bidirectional, dropout, embd, task = 'B'):
        """
        A simple RNN Encoder.

        Args:
            input_size (int): the size of the input features
            rnn_size (int):
            num_layers (int):
            bidirectional (bool):
            dropout (float):

        Returns: outputs, last_outputs
        - **outputs** of shape `(batch, seq_len, hidden_size)`:
          tensor containing the output features `(h_t)`
          from the last layer of the LSTM, for each t.
        - **last_outputs** of shape `(batch, hidden_size)`:
          tensor containing the last output features
          from the last layer of the LSTM, for each t=seq_len.

        """
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=embd.shape[0],embedding_dim=embd.shape[1])
        
        self.init_embeddings(embd)

        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=rnn_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)

        # the dropout "layer" for the output of the RNN
        self.drop_rnn = nn.Dropout(dropout)

        # define output feature size
        self.feature_size = rnn_size

        if bidirectional:
            self.feature_size *= 2
            
        if task == 'B':
            self.linear = nn.Linear(self.feature_size, 4)
        else:
            self.linear = nn.Linear(self.feature_size, 1)
            
    def init_embeddings(self, weights):
        self.embedding.weight = nn.Parameter(torch.from_numpy(weights),requires_grad=False)

    @staticmethod
    def last_by_index(outputs, lengths):
        # Index of the last output for each sequence.
        idx = (lengths - 1).view(-1, 1).expand(outputs.size(0),
                                               outputs.size(2)).unsqueeze(1)
        return outputs.gather(1, idx).squeeze()

    @staticmethod
    def split_directions(outputs):
        direction_size = int(outputs.size(-1) / 2)
        forward = outputs[:, :, :direction_size]
        backward = outputs[:, :, direction_size:]
        return forward, backward

    def last_timestep(self, outputs, lengths, bi=False):
        if bi:
            forward, backward = self.split_directions(outputs)
            last_forward = self.last_by_index(forward, lengths)
            last_backward = backward[:, 0, :]
            return torch.cat((last_forward, last_backward), dim=-1)

        else:
            return self.last_by_index(outputs, lengths)

    def forward(self, x, lengths):
        """
        This is the heart of the model. This function, defines how the data
        passes through the network.
        Args:
            embs (): word embeddings
            lengths (): the lengths of each sentence

        Returns: the logits for each class

        """
        lengths, sort, unsort = self._sort_by(lengths)
        x = sort(x)

        embd = self.embedding(x.long())
        
        # pack the batch
        packed = pack_padded_sequence(embd, list(lengths.data),
                                      batch_first=True)

        out_packed, _ = self.rnn(packed)

        # unpack output - no need if we are going to use only the last outputs
        outputs, _ = pad_packed_sequence(out_packed, batch_first=True)

        # get the outputs from the last *non-masked* timestep for each sentence
        last_outputs = self.last_timestep(outputs, lengths,
                                          self.rnn.bidirectional)

        # apply dropout to the outputs of the RNN
        last_outputs = self.drop_rnn(last_outputs)

        # unsort
        last_outputs = unsort(last_outputs)

        logits = self.linear(last_outputs)

        return logits

In [20]:
def model_training(model, loaders, epochs=10, lr=0.001, clip=5, print_every=100):
    model.train()
    
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_f = nn.MultiLabelSoftMarginLoss()
    
    gpu = torch.cuda.is_available()
    
    if(gpu):
        device = torch.device('cuda')
        model.to(device)
    else:
        device = torch.device('cpu')
    
    for e in range(epochs):
        counter = 0
        for i_batch, (X, y, lengths, index) in enumerate(loaders['train'], 1):
            counter += 1
            
            y = np.array(list(map(int,y)))
            y_onehot = np.zeros((y.shape[0], 4))
            y_onehot[np.arange(y.shape[0]), y] = 1
            y = torch.from_numpy(y_onehot).float()
            if(gpu):
                X, y, lengths = X.to(device), y.to(device), lengths.to(device)

            opt.zero_grad()
            
            linear_outputs = model(X, lengths)
            
            loss = loss_f(linear_outputs, y)
            loss.backward()
            
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()
        
            if counter % print_every == 0:
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()))
        
        
        
        

In [21]:
model = RNN(TASK3_B['embed_dim'], TASK3_B['encoder_size'], TASK3_B['encoder_layers'], TASK3_B['encoder_bidirectional'], TASK3_B['encoder_dropout'], embeddings, task = 'B')

In [22]:
model
gpu = torch.cuda.is_available()
    
if(gpu):
    device = torch.device('cuda')
    model.to(device)
else:
    device = torch.device('cpu')

In [23]:
model_training(model, loaders)

Epoch: 1/10... Step: 100... Loss: 0.3814...
Epoch: 2/10... Step: 100... Loss: 0.2152...
Epoch: 3/10... Step: 100... Loss: 0.2992...
Epoch: 4/10... Step: 100... Loss: 0.1606...
Epoch: 5/10... Step: 100... Loss: 0.2329...
Epoch: 6/10... Step: 100... Loss: 0.1676...
Epoch: 7/10... Step: 100... Loss: 0.1334...
Epoch: 8/10... Step: 100... Loss: 0.1314...
Epoch: 9/10... Step: 100... Loss: 0.2312...
Epoch: 10/10... Step: 100... Loss: 0.1417...


In [24]:
torch.save(model, 'model_b_310_10e.mdl')

  "type " + obj.__name__ + ". It won't be checked "


In [25]:
gold_data = WordDataset(datasets['gold'][0],datasets['gold'][1],word2idx,pre)

PreProcessing dataset None...: 100%|███████████████████████████████████████████████| 784/784 [00:00<00:00, 1187.74it/s]


Total words: 17462, Total unks:42 (0.24%)
Unique words: 3420, Unique unks:42 (1.23%)
Labels statistics:
{'0': '60.33%', '1': '20.92%', '2': '10.84%', '3': '7.91%'}



In [26]:
gold_size = len(gold_data)

In [27]:
Xs = []
ys = []
lengths = []
for X, y, length, index in gold_data:
    Xs.append(torch.from_numpy(X))
    ys.append(int(y))
    lengths.append(int(length))
Xs = torch.stack(Xs)
ys = torch.tensor(ys)
lengths = torch.tensor(lengths)

In [28]:
model.eval()

RNN(
  (embedding): Embedding(804871, 310)
  (rnn): LSTM(310, 150, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (drop_rnn): Dropout(p=0.2)
  (linear): Linear(in_features=300, out_features=4, bias=True)
)

In [29]:
test_outputs = model(Xs.to(torch.device('cuda')), lengths.to(torch.device('cuda')))

In [30]:
test_argmax = test_outputs.max(1)[1].to(torch.device('cpu'))

In [31]:
precision = torch.mean((test_argmax == ys).float()).item()
precision

0.8609693646430969

In [38]:
from sklearn.metrics import f1_score

In [33]:
argmax_list = [tensor.item() for tensor in test_outputs.max(1)[1]]
ys_list = [tensor.item() for tensor in ys]

In [39]:
f1 = f1_score(ys_list, argmax_list, average='macro')

In [40]:
f1

0.7023853181274253

In [236]:
loss_f = torch.nn.MultiLabelSoftMarginLoss()
X_list = []
y_list = []
for i_batch, (X, y, lengths, index) in enumerate(loaders['gold'], 1):
    y_raw = np.array(list(map(int,y)))
    y_onehot = np.zeros((y_raw.shape[0], 4))
    y_onehot[np.arange(y_raw.shape[0]), y_raw] = 1
    y = torch.from_numpy(y_onehot).float()
    if(gpu):
        X, y, lengths = X.to(device), y.to(device), lengths.to(device)
            
    linear_outputs = model(X, lengths) 
    loss = loss_f(linear_outputs, y)
    argmax = linear_outputs.max(1)[1]
    print(np.mean(np.array(argmax.to(torch.device('cpu'))) == y_raw))

<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.8125
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.90625
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.8125
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.90625
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.875
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.8125
<class 'torch.Tensor'>
0.90625
<class 'torch.Tensor'>
0.84375
<class 'torch.Tensor'>
0.78125
<class 'torch.Tensor'>
0.9375
<class 'torch.Tensor'>
0.78125
