In [None]:
import copy
import time
import re
import string
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from unicodedata import bidirectional
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pack_sequence, pad_sequence, pad_packed_sequence
from torch import nn
import torch
import torchtext
import pandas as pd

In [None]:
# Human-readable name for each integer sentiment class (0-4).
# len(sentiment_dict) is also used below as the number of output classes.
sentiment_dict = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive'
}
def load_sentiment_data(path='res/train.tsv'):
    """Load the tab-separated phrase file and normalise the 'Phrase' column.

    Each phrase is lower-cased, punctuation and digits are replaced by blanks,
    and runs of whitespace are collapsed to single spaces (no leading or
    trailing blanks remain). Rows whose phrase is empty after cleaning are
    dropped and the index is reset.

    @param path: path to a TSV file with a header row; expected columns are
                 'PhraseId', 'SentenceId', 'Phrase', 'Sentiment'
    @return: pandas DataFrame with the cleaned 'Phrase' column
    """
    df = pd.read_csv(path, sep='\t', header=0)

    # Build the translation table once per load instead of once per phrase.
    stripped_chars = string.punctuation + string.digits
    to_blank = str.maketrans(stripped_chars, ' ' * len(stripped_chars))

    def process_phrase(phrase):
        # split()/join collapses every whitespace run and trims both ends
        return ' '.join(phrase.lower().translate(to_blank).split())

    # Missing phrases are parsed as NaN (float); treat them as empty text so
    # the cleaning step does not crash on them.
    df['Phrase'] = df['Phrase'].fillna('').astype(str).map(process_phrase)
    # Filter out phrases with no tokens left. One-character phrases such as
    # "a" are valid tokens and are kept.
    df = df[df['Phrase'].str.len() > 0]
    df = df.reset_index(drop=True)
    return df

In [None]:
# The first time you run this, it will download a ~823MB file
glove = torchtext.vocab.GloVe(name="6B", dim=50)
# The glove object acts as the vocabulary here; it is looked up with lower-case words,
# so make sure the text is lower-cased during preprocessing.
# glove.stoi maps a token string to its index
# glove.itos is a list of token strings indexed by their numerical identifiers

In [None]:
class RottenTomatoesDataset(Dataset):
    """Dataset of (token index sequence, label) pairs built from a DataFrame.

    Every phrase in df['Phrase'] is split on whitespace and each token is
    mapped to its index in `glove_vocab.stoi`; unknown tokens map to the index
    of `unk`. Each sequence is stored as a LongTensor of shape (num_tokens, 1).
    """

    def __init__(self, df, glove_vocab, label_col='Sentiment', unk='<unk>') -> None:
        """
        @param df: DataFrame with a 'Phrase' column and a `label_col` column
        @param glove_vocab: vocabulary object exposing `stoi` and `__len__`
        @param label_col: name of the label column
        @param unk: token whose index replaces out-of-vocabulary words
        @raise KeyError: if an out-of-vocabulary (or empty) phrase is met and
                         `unk` itself is not part of the vocabulary
        """
        super().__init__()
        self.df = df
        self.labels = self.df[label_col].values
        self.glove = glove_vocab
        self.vocab_size = len(glove_vocab)
        self.data = []

        # Use the vocabulary that was passed in, not a notebook-level global.
        stoi = glove_vocab.stoi
        unk_idx = stoi.get(unk)

        for phrase in self.df['Phrase'].values:
            # A phrase without any token is represented by a single unk token,
            # so that every item has at least one time step.
            ids = [stoi.get(token, unk_idx) for token in phrase.split()] or [unk_idx]
            if any(index is None for index in ids):
                raise KeyError(
                    'token {!r} is not in the vocabulary; add it before '
                    'building the dataset'.format(unk))
            self.data.append(torch.LongTensor(ids).unsqueeze(1))

    def __len__(self) -> int:
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.data[idx], self.labels[idx]

class SequencePadder():
    """Collate callable: orders a batch longest-first and pads it to equal length."""

    def __init__(self, symbol) -> None:
        # value written into the padded positions
        self.symbol = symbol

    def __call__(self, batch):
        """Return (padded sequences, labels, lengths) for a list of (sequence, label) items."""
        # stable sort, longest sequence first
        ordered = sorted(batch, key=lambda item: -item[0].size(0))
        sequences, labels = [], []
        for item in ordered:
            sequences.append(item[0])
            labels.append(item[1])
        lengths = torch.LongTensor([seq.size(0) for seq in sequences])
        padded = pad_sequence(sequences, padding_value=self.symbol)
        return padded, torch.LongTensor(labels), lengths


In [None]:
def get_metrics(model, data_loader, device, get_sentences=False):
    """Evaluate `model` on `data_loader` and return micro-averaged metrics.

    @param model: module whose forward(inputs, lengths) returns (logits, extra)
    @param data_loader: yields (inputs, labels, lengths) batches
    @param device: device the model and the inputs are moved to
    @param get_sentences: if True, also collect (tokens, extra) per batch; the
                          token lookup needs batch size 1 and a dataset that
                          exposes `glove.itos`
    @return: dict with keys 'f1', 'prec', 'recall', 'acc'; if `get_sentences`
             is True, the tuple (that dict, list of (tokens, extra))
    """
    model.to(device)
    model.eval()
    y_pred, y_true = [], []
    sentences = []
    with torch.no_grad():
        for inputs, labels, lengths in data_loader:
            inputs = inputs.to(device)
            out, weights = model(inputs, lengths)

            preds = out.argmax(dim=1)

            # Collect plain Python ints on the host: the sklearn metrics
            # cannot consume lists of (possibly CUDA) tensors.
            y_pred.extend(preds.cpu().tolist())
            y_true.extend(labels.cpu().tolist())
            if get_sentences:
                # use with batch size 1! (each time step must hold one index)
                sent = [data_loader.dataset.glove.itos[i.item()] for i in inputs]
                sentences.append((sent, weights))

    # Computed locally so that nothing depends on a notebook-level `scores`.
    scores = {
        'f1': f1_score(y_true, y_pred, average='micro'),
        'prec': precision_score(y_true, y_pred, average='micro'),
        'recall': recall_score(y_true, y_pred, average='micro'),
        'acc': accuracy_score(y_true, y_pred),
    }
    if get_sentences:
        return scores, sentences
    return scores


In [None]:
class LstmClassifierGloveEmbeddings(nn.Module):
    """LSTM sentence classifier on top of frozen pretrained embeddings.

    The final hidden state of the last LSTM layer (both directions
    concatenated when bidirectional) is fed to a linear layer that produces
    one logit per class. Inputs are sequence-first (batch_first is not used).
    """

    def __init__(self,
                hidden_size,
                output_size, # number of classes
                glove=None,
                num_layers=1,
                bidirectional=False):
        """
        @param hidden_size: size of the LSTM hidden state (per direction)
        @param output_size: number of classes
        @param glove: vocabulary object exposing `vectors`, `dim` and `__len__`
        @param num_layers: number of stacked LSTM layers
        @param bidirectional: whether the LSTM runs in both directions
        """
        super(LstmClassifierGloveEmbeddings, self).__init__()
        self.input_size = len(glove) # vocabulary size
        self.embedding_size = glove.dim
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        # nn.Embedding can also be used with your own embeddings
        # hint: if you want to do so, you need to adapt the Dataloader
        self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)

        self.lstm = nn.LSTM(
                        input_size=self.embedding_size,
                        hidden_size=hidden_size,
                        num_layers=self.num_layers,
                        dropout=0.2 if num_layers > 1 else 0,
                        bidirectional=bidirectional,
        )

        self.num_directions = 2 if bidirectional else 1

        fc_size = self.hidden_size * self.num_directions
        self.fc = nn.Linear(fc_size, output_size)

    def forward(self, x, lengths, h_n=None):
        """
        @param x: token indices, (seq_len, batch, 1) or (seq_len, batch)
        @param lengths: true length of every sequence, sorted in decreasing order
        @param h_n: optional initial state as a tuple (h_0, c_0)
        @return: (logits of shape (batch, output_size), final hidden states of
                 shape (num_layers * num_directions, batch, hidden_size))
        """
        batch_size = x.size(1)
        if h_n is None:
            h_0, c_0 = self.init_hidden(batch_size)
        else:
            # unpack both states before any name is rebound
            h_0, c_0 = h_n
        embed = self.embedding(x).squeeze(2)
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()  # pack_padded_sequence needs host lengths
        # The packed sequence marks padded elements by length, so sequences of
        # different lengths in one batch are processed without wasted steps.
        packed_seq = pack_padded_sequence(embed, lengths)
        _, (h_n, _) = self.lstm(packed_seq, (h_0, c_0))
        # h_n is laid out as (num_layers * num_directions, batch, hidden_size);
        # make it addressable per layer and keep only the last layer.
        last_layer = h_n.reshape(self.num_layers, self.num_directions,
                                 batch_size, self.hidden_size)[-1]
        # concatenate the direction(s): (batch, num_directions * hidden_size)
        features = torch.cat(last_layer.unbind(0), dim=1)
        logits = self.fc(features)
        return logits, h_n

    def init_hidden(self, batch_size=1):
        """Return zero-initialised (h_0, c_0) on the same device as the model."""
        # Follow the model's own device and dtype rather than "CUDA if
        # available", so a CPU model also works on a machine with a GPU.
        reference = self.fc.weight
        # h_0 of shape (num_layers * num_directions, batch, hidden_size)
        h_dim_0 = self.num_layers * self.num_directions
        hidden = (torch.zeros(h_dim_0, batch_size, self.hidden_size,
                              device=reference.device, dtype=reference.dtype),
                  torch.zeros(h_dim_0, batch_size, self.hidden_size,
                              device=reference.device, dtype=reference.dtype))

        return hidden

In [None]:
class LstmSelfAttentionGloveEmbeddings(nn.Module):
    """LSTM + single-head self-attention classifier on frozen embeddings.

    The per-step LSTM outputs attend to each other; the attention outputs of
    the real (non-padded) time steps are averaged and fed to a linear layer.
    Inputs are sequence-first (batch_first is not used).
    """

    def __init__(self,
                hidden_size,
                output_size, # number of classes
                glove=None,
                num_layers=1,
                bidirectional=False):
        """
        @param hidden_size: size of the LSTM hidden state (per direction)
        @param output_size: number of classes
        @param glove: vocabulary object exposing `vectors`, `dim` and `__len__`
        @param num_layers: number of stacked LSTM layers
        @param bidirectional: whether the LSTM runs in both directions
        """
        super(LstmSelfAttentionGloveEmbeddings, self).__init__()
        self.input_size = len(glove) # vocabulary size
        self.embedding_size = glove.dim
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        # nn.Embedding can also be used with your own embeddings
        # hint: if you want to do so, you need to adapt the Dataloader
        self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)

        self.lstm = nn.LSTM(
                        input_size=self.embedding_size,
                        hidden_size=hidden_size,
                        num_layers=self.num_layers,
                        dropout=0.2 if num_layers > 1 else 0,
                        bidirectional=bidirectional,
        )

        self.num_directions = 2 if bidirectional else 1

        self.attention = nn.MultiheadAttention(hidden_size * self.num_directions,
                                               1, dropout=0.2)

        fc_size = self.hidden_size * self.num_directions
        self.fc = nn.Linear(fc_size, output_size)

    def forward(self, x, lengths, h_n=None):
        """
        @param x: token indices, (seq_len, batch, 1) or (seq_len, batch)
        @param lengths: true length of every sequence, sorted in decreasing order
        @param h_n: optional initial state as a tuple (h_0, c_0)
        @return: (logits of shape (batch, output_size), attention weights as
                 returned by nn.MultiheadAttention)
        """
        if h_n is None:
            h_0, c_0 = self.init_hidden(x.size(1))
        else:
            # unpack both states before any name is rebound
            h_0, c_0 = h_n
        embed = self.embedding(x).squeeze(2)
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()  # pack_padded_sequence needs host lengths
        # The packed sequence marks padded elements by length, so sequences of
        # different lengths in one batch are processed without wasted steps.
        packed_seq = pack_padded_sequence(embed, lengths)
        out, _ = self.lstm(packed_seq, (h_0, c_0))

        # back to a padded (seq_len, batch, features) tensor for the attention
        padded_seq, lens = pad_packed_sequence(out)
        lens = lens.to(padded_seq.device)
        steps = torch.arange(padded_seq.size(0), device=padded_seq.device)
        # (batch, seq_len); True marks padding, which must not be attended to
        pad_mask = steps.unsqueeze(0) >= lens.unsqueeze(1)

        # Self-attention: q, k and v all are the sequence of LSTM outputs.
        # Do not use batch first here, seq_len must be the first dimension.
        attn_output, attn_weights = self.attention(
            padded_seq, padded_seq, padded_seq, key_padding_mask=pad_mask)

        # Average only over the real time steps, so that the result for a
        # sequence does not depend on how much padding its batch required.
        valid = (~pad_mask).t().unsqueeze(2).to(attn_output.dtype)
        pooled = (attn_output * valid).sum(0) / lens.unsqueeze(1).to(attn_output.dtype)
        logits = self.fc(pooled)
        return logits, attn_weights

    def init_hidden(self, batch_size=1):
        """Return zero-initialised (h_0, c_0) on the same device as the model."""
        # Follow the model's own device and dtype rather than "CUDA if
        # available", so a CPU model also works on a machine with a GPU.
        reference = self.fc.weight
        # h_0 of shape (num_layers * num_directions, batch, hidden_size)
        h_dim_0 = self.num_layers * self.num_directions
        hidden = (torch.zeros(h_dim_0, batch_size, self.hidden_size,
                              device=reference.device, dtype=reference.dtype),
                  torch.zeros(h_dim_0, batch_size, self.hidden_size,
                              device=reference.device, dtype=reference.dtype))

        return hidden

In [None]:
def train_rnn_model(model, data_loaders, criterion, optimizer, device, num_epochs=25):
    '''
    Train `model` and restore the weights of the epoch with the lowest test loss.

    @param: model: module whose forward(inputs, lengths) returns (logits, extra)
    @param: data_loaders: takes on data loader containing the test set and one containing the train set
            keys must be: 'train', 'test' in this case; each yields
            (inputs, labels, lengths) batches
    @param: criterion: loss called as criterion(logits, labels)
    @param: optimizer: optimizer over the model parameters
    @param: device: device the model and every batch are moved to
    @param: num_epochs: number of passes over both loaders
    @return: (model with the best weights loaded, accuracy of the best epoch
             formatted with four decimals, index of the best epoch)
    '''
    since = time.time()
    best_epoch = 0
    best_acc = 0.0
    best_loss = float('inf')
    print(model)
    # The batches are moved to `device` below, so the model has to live there too.
    model = model.to(device)
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(1, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            is_train = phase == 'train'
            model.train(is_train)   # set model mode
            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0
            for inputs, labels, lens in data_loaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                if is_train:
                    optimizer.zero_grad()
                # track history only in train
                with torch.set_grad_enabled(is_train):
                    out, _ = model(inputs, lens)
                    preds = out.argmax(dim=1)
                    loss = criterion(out, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics
                batch_n = labels.size(0)
                running_loss += loss.item() * batch_n
                running_corrects += (preds == labels).sum().item()
                samples_seen += batch_n

            if samples_seen == 0:
                # an empty loader gives nothing to average or to compare
                continue

            # Average over the samples that were really processed; the dataset
            # size is too large whenever the loader drops its last batch.
            epoch_loss = running_loss / samples_seen
            epoch_acc = running_corrects / samples_seen
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                                            phase, epoch_loss, epoch_acc))
            if (not is_train) and epoch_loss <= best_loss:
                best_loss = epoch_loss
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                best_epoch = epoch

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best test Acc: {:4f}'.format(best_acc))
    print('Best test loss: {:4f}'.format(best_loss))
    model.load_state_dict(best_model_wts)
    return model, f'{best_acc:3.4f}', best_epoch


In [None]:
# Smoke test of the plain LSTM classifier on a single dummy sequence.
mylstm = LstmClassifierGloveEmbeddings(100, len(sentiment_dict), glove, num_layers=2, bidirectional=True)
print(mylstm)
# input of shape (L=2, N=1); lengths=[1] marks only the first token as real
out, h_n = mylstm(torch.stack([torch.LongTensor([3]), torch.LongTensor([1])]), [1])
print(out.shape) # class logits computed from the final hidden state
print(h_n.shape) # final hidden state, (D * num_layers, N, H_out) final hidden state for each element in the batch
# N = batch_size
# L = sequence_length
# D = 2 if bidirectional else 1
# H_in = input size
# H_cell = hidden_size
# H_out = proj_size if proj_size > 0 else hidden_size

In [None]:
# Smoke test of the self-attention classifier on a single dummy sequence.
myatt = LstmSelfAttentionGloveEmbeddings(100, len(sentiment_dict), glove, num_layers=2, bidirectional=True)
print(myatt)
# input of shape (L=2, N=1); lengths=[1] marks only the first token as real
out, weights = myatt(torch.stack([torch.LongTensor([3]), torch.LongTensor([1])]), [1])
print(out.shape) # class logits
print(weights.shape) # attention weights returned by the model


In [None]:
# Hyper-parameters and run options shared by the training cells below.
n_epochs = 50  # passed as num_epochs to the training function
hidden_size = 64  # LSTM hidden state size
n_layers = 2  # number of stacked LSTM layers
batch_size = 8  # batch size of the train and test loaders
bi_direct = True  # build the LSTM as bidirectional
lr = 0.0001  # learning rate of the Adam optimizer
shuffle = True  # shuffle the training loader
debug = False  # if True, only the first 2500 rows of the data are used

In [None]:
# prep data
def append_special(glove, special, vec=None):
    """Append a special token and its vector to a GloVe-style vocabulary.

    The vocabulary is modified in place. If `special` is already present the
    vocabulary is returned unchanged, so calling this twice does not create
    duplicate entries.

    @param glove: object exposing `itos` (list), `stoi` (dict) and `vectors`
                  (2-D tensor with one row per token)
    @param special: token string to add
    @param vec: vector for the token, as one row or a flat vector; defaults
                to all zeros
    @return: the same `glove` object
    """
    if special in glove.stoi:
        return glove
    if vec is None:
        vec = torch.zeros(1, glove.vectors.size(1))
    vec = torch.as_tensor(vec, dtype=glove.vectors.dtype).reshape(1, -1)
    # Concatenate first: if the sizes do not match, this raises before the
    # token lists are touched, so the vocabulary stays consistent.
    extended = torch.cat((glove.vectors, vec))
    glove.itos.append(special)
    # the new token is the last entry; no need to scan the whole list
    glove.stoi[special] = len(glove.itos) - 1
    glove.vectors = extended
    return glove

# special tokens that are added to the GloVe vocabulary
pad_sym = '<pad>'
unk = '<unk>'

df = load_sentiment_data()

# load GloVe again and extend it with the unknown and the padding token
glove = append_special(
    append_special(torchtext.vocab.GloVe(name="6B", dim=50), unk),
    pad_sym,
)

# class distribution of the labels
print(df['Sentiment'].value_counts())

# debug runs work on the first 2500 rows only
if debug:
    df = df.iloc[:2500]

In [None]:
# Training of 'normal' classifier with lstm-> sentiment encoded to last hidden state, FC layer to classify
# 5 fold cv mindful of data distribution
# cpu training 1 fold took ~ 3hours on my machine(macbook pro 2017, i7, 16GB RAM),
# best acc 65.78 / test loss 0.839
# fixed random_state so that the folds are the same on every run
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results_dict = {}
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
pad_idx = glove.stoi[pad_sym]
for fold, (train, test) in enumerate(kf.split(df.index, df['Sentiment']), start=1):
    # the batches are moved to `device`, so the model has to live there too
    model = LstmClassifierGloveEmbeddings(
                                          hidden_size,
                                          output_size=df['Sentiment'].nunique(),
                                          glove=glove,
                                          num_layers=n_layers,
                                          bidirectional=bi_direct,).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_ds = RottenTomatoesDataset(df.iloc[train], glove)
    test_ds = RottenTomatoesDataset(df.iloc[test], glove)

    data_loaders = {
        'train': DataLoader(train_ds, drop_last=True,
                            collate_fn=SequencePadder(pad_idx),
                            batch_size=batch_size, shuffle=shuffle),
        # keep every test sample: no drop_last on the evaluation loader
        'test': DataLoader(test_ds, batch_size=batch_size,
                           collate_fn=SequencePadder(pad_idx))
                }
    # train model returns the best model for the current run
    model, acc, best_epoch = train_rnn_model(model, data_loaders, criterion, optimizer,
                                             device, num_epochs=n_epochs)
    # run eval for best model and save for this split
    scores = get_metrics(model, DataLoader(test_ds,
                                           collate_fn=SequencePadder(pad_idx),
                                           batch_size=1), device)
    # stored per fold so that the summary cell has something to aggregate
    results_dict[fold] = scores
    # only the first fold is trained (one fold is slow); remove the break to
    # run the full 5-fold cross-validation
    break

In [None]:
# one row per fold, one column per metric
fold_scores = pd.DataFrame(results_dict).T
print(fold_scores.mean())
print(fold_scores.std())

In [None]:
# mean metric results for 5 folds

In [None]:
# Training of the attention model: it returns (logits, attention weights), and
# the training loop only uses the logits, so the loop is the same as for the
# plain LSTM classifier.
def train_lstm_att_model(model, data_loaders, criterion, optimizer, device, num_epochs=25):
    '''
    Train the attention model; thin wrapper around train_rnn_model, which
    contains the one shared implementation of the training loop.

    @param: model: module whose forward(inputs, lengths) returns (logits, attention weights)
    @param: data_loaders: takes on data loader containing the test set and one containing the train set
            keys must be: 'train', 'test' in this case
    @param: criterion: loss called as criterion(logits, labels)
    @param: optimizer: optimizer over the model parameters
    @param: device: device the batches are moved to
    @param: num_epochs: number of passes over both loaders
    @return: whatever train_rnn_model returns: (model with the best weights
             loaded, formatted accuracy of the best epoch, best epoch)
    '''
    return train_rnn_model(model, data_loaders, criterion, optimizer, device,
                           num_epochs=num_epochs)

In [None]:
# Training of the self-attention classifier, same protocol as for the plain
# LSTM classifier: stratified 5 fold cv mindful of data distribution.
# fixed random_state so that the folds are the same on every run
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results_dict = {}
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
pad_idx = glove.stoi[pad_sym]
for fold, (train, test) in enumerate(kf.split(df.index, df['Sentiment']), start=1):
    # the batches are moved to `device`, so the model has to live there too
    model = LstmSelfAttentionGloveEmbeddings(
                                          hidden_size,
                                          output_size=df['Sentiment'].nunique(),
                                          glove=glove,
                                          num_layers=n_layers,
                                          bidirectional=bi_direct,).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_ds = RottenTomatoesDataset(df.iloc[train], glove)
    test_ds = RottenTomatoesDataset(df.iloc[test], glove)

    data_loaders = {
        'train': DataLoader(train_ds, drop_last=True,
                            collate_fn=SequencePadder(pad_idx),
                            batch_size=batch_size, shuffle=shuffle),
        # keep every test sample: no drop_last on the evaluation loader
        'test': DataLoader(test_ds, batch_size=batch_size,
                           collate_fn=SequencePadder(pad_idx))
                }
    # train_lstm_att_model returns the best model for the current run
    model, acc, best_epoch = train_lstm_att_model(model, data_loaders, criterion, optimizer,
                                                 device, num_epochs=n_epochs)
    # run eval for best model and save for this split
    scores = get_metrics(model, DataLoader(test_ds,
                                           collate_fn=SequencePadder(pad_idx),
                                           batch_size=1), device)
    # stored per fold so that the summary cell has something to aggregate
    results_dict[fold] = scores
    # only the first fold is trained (one fold is slow); remove the break to
    # run the full 5-fold cross-validation
    break

In [None]:
# one row per fold, one column per metric
fold_scores = pd.DataFrame(results_dict).T
print(fold_scores.mean())
print(fold_scores.std())