In [1]:
import os
import math
import codecs
from argparse import Namespace
from collections import Counter

import gensim
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from sklearn.cluster import MiniBatchKMeans
from sklearn import preprocessing

In [2]:
class Sentences:
    def __init__(self, filename: str):
        self.filename = filename
        self.num_lines = sum(1 for line in open(filename))

    def __iter__(self):
        for line in tqdm(
            codecs.open(self.filename, "r", encoding="utf-8"), 
            self.filename, 
            self.num_lines
        ):
            yield line.strip().split()

In [3]:
def read_data_batches(path, batch_size=50, minlength=5):
    """
        Reading batched texts of given min. length
    :param path: path to the text file ``one line -- one normalized sentence''
    :return: batches iterator
    """
    batch = []

    for line in open(path, encoding="utf-8"):
        line = line.strip().split()

        # lines with less than `minlength` words are omitted
        if len(line) >= minlength:
            batch.append(line)
            if len(batch) >= batch_size:
                yield batch
                batch = []

    if len(batch) > 0:
        yield batch

In [4]:
def get_num_batches(path, batch_size=50, minlength=5):
    count = 0
    batch_count = 0
    
    for line in open(path, encoding="utf-8"):
        
        if len(line) >= minlength:
            batch_count += 1
            if batch_count >= batch_size:
                count += 1
                batch_count = 0
    
    return count

In [5]:
def text2vectors(text, w2v_model, maxlen, vocabulary):
    """
        Token sequence -- to a list of word vectors;
        if token not in vocabulary, it is skipped; the rest of
        the slots up to `maxlen` are replaced with zeroes
    :param text: list of tokens
    :param w2v_model: gensim w2v model
    :param maxlen: max. length of the sentence; the rest is just cut away
    :return:
    """

    acc_vecs = []

    for word in text:
        if word in w2v_model.wv and (vocabulary is None or word in vocabulary):
            acc_vecs.append(w2v_model.wv[word])

    # padding for consistent length with ZERO vectors
    if len(acc_vecs) < maxlen:
        acc_vecs.extend([np.zeros(w2v_model.vector_size)] * (maxlen - len(acc_vecs)))

    return acc_vecs

In [6]:
def read_data_tensors(
    path, 
    batch_size=50, 
    vocabulary=None,
    maxlen=100, 
    pad_value=0, 
    minsentlength=5,
    w2v_model=None,
):
    """
        Data for training the NN -- from text file to word vectors sequences batches
    :param path:
    :param batch_size:
    :param vocabulary:
    :param maxlen:
    :param pad_value:
    :param minsentlength:
    :return:
    """
    for batch in read_data_batches(path, batch_size, minsentlength):
        batch_vecs = []
        batch_texts = []

        for text in batch:
            vectors_as_list = text2vectors(text, w2v_model, maxlen, vocabulary)
            batch_vecs.append(np.asarray(vectors_as_list[:maxlen], dtype=np.float32))
            batch_texts.append(text)

        yield np.stack(batch_vecs, axis=0), batch_texts

In [7]:
def get_centroids(w2v_model, aspects_count):
    """
        Clustering all word vectors with K-means and returning L2-normalizes
        cluster centroids; used for ABAE aspects matrix initialization
    """

    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    m = []

    for k in w2v_model.wv.vocab:
        m.append(w2v_model.wv[k])

    m = np.matrix(m)

    km.fit(m)
    clusters = km.cluster_centers_

    # L2 normalization
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)

    return norm_aspect_matrix

In [8]:
class SentenseDataset(Dataset):
    def __init__(self, df, vectorizer, maxlen):
        self.df = df
        self._vectorizer = vectorizer
        self.maxlen = maxlen
        
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)
        
        self.val_df = self.df[self.df.split=='val']
        self.validation_size = len(self.val_df)
        
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.validation_size),
        }
        
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, df, maxlen):
        """Load dataset and make a new vectorizer from scratch

        """
        train_df = df[df.split=='train']
        sentences = [s.split() for s in train_df['sentences']]

#         w2v = gensim.models.Word2Vec(
#             sentences, 
#             size=args.w2v_size, 
#             window=args.w2v_window, 
#             min_count=args.w2v_min_count, 
#             workers=args.w2v_workers, 
#             sg=args.w2v_sg,
#             negative=args.w2v_negative, 
#             iter=args.w2v_iter, 
#             max_vocab_size=args.w2v_max_vocab_size,
#         )
#         w2v.save(args.w2v_file)

        w2v = gensim.models.Word2Vec.load(args.w2v_file)

        return cls(df, w2v, maxlen)

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe 
        
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def get_vectorizer(self):
        return self._vectorizer

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
         
        vectors_as_list = text2vectors(row.sentences, self._vectorizer, self.maxlen, None)
        vector = np.asarray(vectors_as_list[:self.maxlen], dtype=np.float32)

        return {
            'x_data': vector,
            'y_target': 1.,
        }
    
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size  

In [9]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """A generator function which wraps the PyTorch DataLoader. 
    
    It will ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(
        dataset=dataset, 
        batch_size=batch_size,
        shuffle=shuffle, 
        drop_last=drop_last
    )

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)

        yield out_data_dict

In [10]:
def calculate_conv_output(input_, kernel, padding, stride):
    """Calculate the Output size in Convolution layer
    
    """
    return math.floor(((input_ - kernel + 2 * padding) / stride) + 1)

In [11]:
class ConvLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super(ConvLayer, self).__init__()

        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )

    def forward(self, x):
        return F.relu(self.conv(x))

In [12]:
class PrimaryCaps(nn.Module):
    def __init__(self, num_capsules, in_channels, out_channels, kernel_size, stride, conv_out_size):
        super(PrimaryCaps, self).__init__()

        self.out_channels = out_channels
        self.capsules = nn.ModuleList([
            nn.Conv1d(
                in_channels=in_channels, 
                out_channels=out_channels, 
                kernel_size=kernel_size, 
                stride=stride, 
                padding=0
            ) 
            for _ in range(num_capsules)
        ])
        
        self._out_channels = out_channels
        self._conv_out_size = conv_out_size
    
    def forward(self, x):
        u = [capsule(x) for capsule in self.capsules]
        u = torch.stack(u, dim=1)
        u = u.view(x.size(0), self._out_channels * self._conv_out_size , -1)
        return self.squash(u)
    
    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm + 1e-07))
#         print(f'PrimaryCaps {((1. + squared_norm) * torch.sqrt(squared_norm))}')
        return output_tensor

In [13]:
class SecondaryCaps(nn.Module):
    def __init__(self, num_capsules, num_routes, in_channels, out_channels):
        super(SecondaryCaps, self).__init__()

        self.in_channels = in_channels
        self.num_routes = num_routes
        self.num_capsules = num_capsules

        self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))

    def forward(self, x):
        batch_size = x.size(0)
        x = torch.stack([x] * self.num_capsules, dim=2).unsqueeze(4)

        W = torch.cat([self.W] * batch_size, dim=0)
        u_hat = torch.matmul(W, x)

        b_ij = Variable(torch.zeros(1, self.num_routes, self.num_capsules, 1)).to(args.device)

        num_iterations = 3
        for iteration in range(num_iterations):
            c_ij = F.softmax(b_ij, dim=2)

            c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)

            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            v_j = self.squash(s_j)
            
            if iteration < num_iterations - 1:
                a_ij = torch.matmul(u_hat.transpose(3, 4), torch.cat([v_j] * self.num_routes, dim=1))
                b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)

        return v_j.squeeze(1)
    
    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm + 1e-07))
#         print(f'SecondaryCaps {((1. + squared_norm) * torch.sqrt(squared_norm))}')
        return output_tensor

In [14]:
class CapsNet(nn.Module):
    def __init__(
        self,
        conv_in_ch,
        conv_out_ch,
        conv_kernel,
        conv_stride,
        prime_num_capsules,
        prime_out_ch,
        prime_kernel,
        prime_stride,
        secondary_num_capsules,
        secondary_out_channels,
        batch_size,
        input_len,
    ):
        super(CapsNet, self).__init__()
        
        self.conv_layer = ConvLayer(
            in_channels=conv_in_ch,
            out_channels=conv_out_ch,
            kernel_size=conv_kernel,
            stride=conv_stride,
        )
        conv_layer_output = calculate_conv_output(
            input_=input_len, 
            kernel=conv_kernel, 
            padding=0, 
            stride=conv_stride,
        )
        
        prime_caps_conv_output = calculate_conv_output(
            input_=conv_layer_output, 
            kernel=prime_kernel, 
            padding=0, 
            stride=prime_stride,
        )
        
        self.primary_caps = PrimaryCaps(
            num_capsules=prime_num_capsules, 
            in_channels=conv_out_ch, 
            out_channels=prime_out_ch, 
            kernel_size=prime_kernel, 
            stride=prime_stride,
            conv_out_size=prime_caps_conv_output,
        )
        
        self.secondary_caps = SecondaryCaps(
            num_capsules=secondary_num_capsules,
            num_routes=prime_caps_conv_output * prime_out_ch,
            in_channels=prime_num_capsules,
            out_channels=secondary_out_channels,

        )
        
        self.fc = nn.Linear(secondary_out_channels * secondary_num_capsules, input_len)
        self.capsule_softmax = torch.nn.Softmax()

        self._batch_size = batch_size
        self._secondary_out_size=secondary_out_channels * secondary_num_capsules

    def forward(self, data):
        output = self.secondary_caps(self.primary_caps(self.conv_layer(data)))
        output = output.reshape(-1, self._secondary_out_size)

        return self.capsule_softmax(self.fc(output))

In [15]:
class CBAE(torch.nn.Module):

    def __init__(
        self, 
        wv_dim, 
        asp_count,
        ortho_reg, 
        maxlen, 
        init_aspects_matrix,
        cn_conv_out_ch,
        cn_conv_kernel,
        cn_conv_stride,
        cn_prime_num_capsules,
        cn_prime_out_ch,
        cn_prime_kernel,
        cn_prime_stride,
        cn_secondary_num_capsules,
        cn_secondary_out_channels,
        batch_size,
        encoder_only=False,
    ):
        super(CBAE, self).__init__()
        self.wv_dim = wv_dim
        self.asp_count = asp_count
        self.ortho = ortho_reg
        self.maxlen = maxlen

        self.caps_net = CapsNet(
            conv_in_ch=wv_dim,
            conv_out_ch=cn_conv_out_ch,
            conv_kernel=cn_conv_kernel,
            conv_stride=cn_conv_stride,
            prime_num_capsules=cn_prime_num_capsules,
            prime_out_ch=cn_prime_out_ch,
            prime_kernel=cn_prime_kernel,
            prime_stride=cn_prime_stride,
            secondary_num_capsules=cn_secondary_num_capsules,
            secondary_out_channels=cn_secondary_out_channels,
            batch_size=batch_size,
            input_len=maxlen,
        )
        
        self.linear_transform = torch.nn.Linear(self.wv_dim, self.asp_count)
        self.softmax_aspects = torch.nn.Softmax()
        self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))

        if init_aspects_matrix is None:
            torch.nn.init.xavier_uniform(self.aspects_embeddings)
        else:
            self.aspects_embeddings.data = torch.from_numpy(init_aspects_matrix.T)
            
        self.encoder_only = encoder_only

    def get_aspects_importances(self, text_embeddings):
        """Takes embeddings of a sentence as input, returns attention weights

        """
        # compute attention scores, looking at text embeddings average
        caps_weights = self.caps_net(text_embeddings.permute(0, 2, 1))

        # multiplying text embeddings by attention scores -- and summing
        # (matmul: we sum every word embedding's coordinate with attention weights)
        weighted_text_emb = torch.matmul(caps_weights.unsqueeze(1),  # (batch, 1, sentence)
                                         text_embeddings  # (batch, sentence, wv_dim)
                                         ).squeeze()

        # encoding with a simple feed-forward layer (wv_dim) -> (aspects_count)
        raw_importances = self.linear_transform(weighted_text_emb)

        # computing 'aspects distribution in a sentence'
        aspects_importances = self.softmax_aspects(raw_importances)

        return caps_weights, aspects_importances, weighted_text_emb

    def forward(self, text_embeddings, negative_samples_texts):
        
        # encoding: words embeddings -> sentence embedding, aspects importances
        _, aspects_importances, weighted_text_emb = self.get_aspects_importances(text_embeddings)

        if self.encoder_only:
            return aspects_importances
        else:
            # negative samples are averaged
            averaged_negative_samples = torch.mean(negative_samples_texts, dim=2)

            # decoding: aspects embeddings matrix, aspects_importances -> recovered sentence embedding
            recovered_emb = torch.matmul(self.aspects_embeddings, aspects_importances.unsqueeze(2)).squeeze()

            # loss
            reconstruction_triplet_loss = CBAE._reconstruction_loss(
                weighted_text_emb,
                recovered_emb,
                averaged_negative_samples,
            )

            max_margin = torch.max(reconstruction_triplet_loss, torch.zeros_like(reconstruction_triplet_loss))

            return self.ortho * self._ortho_regularizer() + max_margin

    @staticmethod
    def _reconstruction_loss(text_emb, recovered_emb, averaged_negative_emb):

        positive_dot_products = torch.matmul(text_emb.unsqueeze(1), recovered_emb.unsqueeze(2)).squeeze()
        negative_dot_products = torch.matmul(averaged_negative_emb, recovered_emb.unsqueeze(2)).squeeze()
        reconstruction_triplet_loss = torch.sum(1 - positive_dot_products.unsqueeze(1) + negative_dot_products, dim=1)

        return reconstruction_triplet_loss

    def _ortho_regularizer(self):
        return torch.norm(
            torch.matmul(self.aspects_embeddings.t(), self.aspects_embeddings) \
            - torch.eye(self.asp_count).to(args.device))

    def get_aspect_words(self, w2v_model, topn=10):
        words = []

        # getting aspects embeddings
        aspects = self.aspects_embeddings.cpu().detach().numpy()

        # getting scalar products of word embeddings and aspect embeddings;
        # to obtain the ``probabilities'', one should also apply softmax
        words_scores = w2v_model.wv.vectors.dot(aspects)

        for row in range(aspects.shape[1]):
            argmax_scalar_products = np.argsort(- words_scores[:, row])[:topn]
            # print([w2v_model.wv.index2word[i] for i in argmax_scalar_products])
            # print([w for w, dist in w2v_model.similar_by_vector(aspects.T[row])[:topn]])
            words.append([w2v_model.wv.index2word[i] for i in argmax_scalar_products])

        return words

In [16]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
        

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

        
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}


def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

In [17]:
args = Namespace(
    data='preprocessed_data/restaurant/train.txt',
    test_data='preprocessed_data/restaurant/test.txt',
    test_labels='preprocessed_data/restaurant/test_label.txt',
    model_state_file="model.pth",
    save_dir="model_storage",
    
    perc_train=0.9,
    perc_val=0.1,

    w2v_file='preprocessed_data/restaurant/w2v_embedding',
    w2v_size=200,
    w2v_window=5,
    w2v_min_count=3,
    w2v_workers=7,
    w2v_sg=1,
    w2v_negative=5,
    w2v_iter=1,
    w2v_max_vocab_size=9000,

    batch_size=50,
    aspects_number=14,
    ortho_reg=0.1,
    epochs=5,
    optimizer='adam',
    neg_samples=20,
    maxlen=32,
    minsentlength=1,

    cn_conv_out_channels = 128,
    cn_conv_kernel = 9,
    cn_conv_stride = 1,
    cn_prime_num_capsules=6,
    cn_prime_kernel=3,
    cn_prime_out_channels=32,
    cn_prime_stride=2,
    cn_secondary_num_capsules=8,
    cn_secondary_out_channels=16,
    
    cuda=True,
    reload_from_files=False,
    seed=1234,
    learning_rate=1e-2,
    early_stopping_criteria=5,  
    catch_keyboard_interrupt=True
)

assert args.perc_val + args.perc_train == 1.0


args.device = torch.device("cuda" if args.cuda else "cpu")

set_seed_everywhere(args.seed, args.cuda)

handle_dirs(args.save_dir)
    
print("Using CUDA: {}".format(args.cuda))

Using CUDA: True


In [18]:
# if args.reload_from_files:
#     print("Loading vectorizer")
# else:
#     print("Loading dataset and creating vectorizer")
#     sentences = Sentences(args.data)
#     w2v = gensim.models.Word2Vec(
#         sentences, 
#         size=args.w2v_size, 
#         window=args.w2v_window, 
#         min_count=args.w2v_min_count, 
#         workers=args.w2v_workers, 
#         sg=args.w2v_sg,
#         negative=args.w2v_negative, 
#         iter=args.w2v_iter, 
#         max_vocab_size=args.w2v_max_vocab_size,
#     )
#     w2v.save(args.w2v_file)
#     print(f'{args.w2v_file} saved')
    
# vectorizer = gensim.models.Word2Vec.load(args.w2v_file)

In [19]:
sentence_len = 0

for s in Sentences(args.data):
    if len(s) >= sentence_len:
        sentence_len = len(s)
        
print(sentence_len)

data = [' '.join(s) for s in Sentences(args.data)]
split = []
np.random.shuffle(data)
n_train = int(len(data) * args.perc_train)

for _ in range(n_train):
    split.append('train')

for _ in range(n_train, len(data)):
    split.append('val')

df = pd.DataFrame(data={'sentences': data, 'split': split})
df.head()

HBox(children=(FloatProgress(value=0.0, description='preprocessed_data/restaurant/train.txt', max=279885.0, st…


157


HBox(children=(FloatProgress(value=0.0, description='preprocessed_data/restaurant/train.txt', max=279885.0, st…




Unnamed: 0,sentences,split
0,veal chop still make mouth water,train
1,well alot ignored,train
2,like glorified pizza joint sell slice,train
3,true traditional form southern germany history...,train
4,ambiance beat,train


In [20]:
dataset = SentenseDataset.load_dataset_and_make_vectorizer(df, args.maxlen)
vectorizer = dataset.get_vectorizer()

In [21]:
for word in ["bill", "waiter", "vodka", "meat"]:
    if word in vectorizer.wv.vocab:
        print(word, [w for w, c in vectorizer.wv.similar_by_word(word=word)])
    else:
        print(word, "not in vocab")

bill ['tip', 'tab', 'gratuity', 'charged', '300', 'fee', 'charge', 'minimum', 'paid', '400']
waiter ['waitress', 'server', 'manager', 'waitstaff', 'hostess', 'waitor', 'maitre', 'host', 'busboy', 'question']
vodka ['ginger', 'mango', 'watermelon', 'pomegranate', 'rum', 'pear', 'cranberry', 'infused', 'honey', 'pineapple']
meat ['fish', 'vegetable', 'beef', 'fat', 'cut', 'veggie', 'seafood', 'bone', 'patty', 'sliced']


In [22]:
wv_dim = vectorizer.vector_size

In [23]:
model = CBAE(
    wv_dim=wv_dim,
    asp_count=args.aspects_number,
    ortho_reg=args.ortho_reg, 
    maxlen=args.maxlen, 
    init_aspects_matrix=get_centroids(vectorizer, aspects_count=args.aspects_number),
    cn_conv_out_ch=args.cn_conv_out_channels,
    cn_conv_kernel=args.cn_conv_kernel,
    cn_conv_stride=args.cn_conv_stride,
    cn_prime_num_capsules=args.cn_prime_num_capsules,
    cn_prime_out_ch=args.cn_prime_out_channels,
    cn_prime_kernel=args.cn_prime_kernel,
    cn_prime_stride=args.cn_prime_stride,
    cn_secondary_num_capsules=args.cn_secondary_num_capsules,
    cn_secondary_out_channels=args.cn_secondary_out_channels,
    batch_size=args.batch_size,
)



In [24]:
model = model.to(args.device)
loss_func = torch.nn.MSELoss(reduction="sum")
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

In [25]:
epoch_bar = tqdm(
    desc='training routine', 
    total=args.epochs,
    position=1,
)

dataset.set_split('train')
train_bar = tqdm(
    desc='split=train',
    total=dataset.get_num_batches(args.batch_size), 
    position=1, 
)

dataset.set_split('val')
val_bar = tqdm(
    desc='split=val',
    total=dataset.get_num_batches(args.batch_size), 
    position=1, 
    leave=True
)

for epoch_index in range(args.epochs):
    
    train_state['epoch_index'] = epoch_index
    
    # Iterate over training dataset
    # setup: batch generator, set loss to 0, set train mode on
    dataset.set_split('train')

    running_loss = 0.0
    model.train()

    batch_generator = generate_batches(
        dataset, 
        batch_size=args.batch_size, 
        device=args.device
    )

    for batch_index, batch_dict in enumerate(batch_generator):
        
        optimizer.zero_grad()
        
        x = batch_dict['x_data']
        y = batch_dict['y_target'].float()

        # extracting bad samples from the very same batch; not sure if this is OK, so todo
        negative_samples = torch.stack(
            tuple([x[torch.randperm(x.shape[0])[:args.neg_samples]] 
                   for _ in range(args.batch_size)])
        ).to(args.device)

        # prediction
        y_pred = model(x, negative_samples)
        
        # error computation
        loss = loss_func(y_pred, y)
        loss_t = loss.item()
        if not isinstance(loss_t, float):
            raise ZeroDivisionError
        
        # compute the running loss
        running_loss += (loss_t - running_loss) / (batch_index + 1)
        
        # use loss to produce gradients
        loss.backward()

        # use optimizer to take gradient step
        optimizer.step()
        
        # update bar
        train_bar.set_postfix(loss=running_loss, epoch=epoch_index)
        train_bar.update()
           

    train_state['train_loss'].append(running_loss)
    
    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0; set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(
        dataset, 
        batch_size=args.batch_size, 
        device=args.device,
    )

    running_loss = 0.
    model.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        x = batch_dict['x_data']
        y = batch_dict['y_target'].float()

        negative_samples = torch.stack(
            tuple([x[torch.randperm(x.shape[0])[:args.neg_samples]] 
                   for _ in range(args.batch_size)])
        ).to(args.device)
        
        y_pred = model(x, negative_samples)
        
        # compute the loss
        loss = loss_func(y_pred, y)
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        val_bar.set_postfix(loss=running_loss, epoch=epoch_index)
        val_bar.update()
    
    train_state['val_loss'].append(running_loss)
    train_state = update_train_state(args=args, model=model, train_state=train_state)

    scheduler.step(train_state['train_loss'][-1])
        
    print(batch_index, "batches, and LR:", optimizer.param_groups[0]['lr'])
    for i, aspect in enumerate(model.get_aspect_words(vectorizer)):
        print(i, " ".join([a for a in aspect]))
    print("Loss:", loss.item())
    print()
    
    if train_state['stop_early']:
        break

    train_bar.n = 0
    val_bar.n = 0

    epoch_bar.set_postfix(best_val=train_state['early_stopping_best_val'])
    epoch_bar.update()

HBox(children=(FloatProgress(value=0.0, description='training routine', max=5.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=5037.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=559.0, style=ProgressStyle(description_wi…

  return self.capsule_softmax(self.fc(output))
  aspects_importances = self.softmax_aspects(raw_importances)


558 batches, and LR: 0.01
0 enjoyed recommend worth like better forward year wonderful excellent city
1 feel french cuisine italian italy bistro fare neighborhood indian much
2 didn manager away hostess attitude mouth card without never couldn
3 dish chef thing sauce item ingredient flavor vegetarian seafood roll
4 sauce tomato steak garlic cooked chicken potato tender tuna fry
5 give gave sit recommend spend seat cute loud conversation romantic
6 price sushi quality value service portion atmosphere lunch fast overall
7 going sure come coming go reservation last eating ago advance
8 wood wall ceiling brick lit white booth floor window chair
9 crust pork beef rib chicken thin cuisine gras foie bbq
10 spot district corner area scene location lounge dining near street
11 chocolate cake banana creme apple tart brulee pudding ice desert
12 rude slow customer reviewer review poor management attitude worse par
13 fixe ordered appetizer size prix sized shared main three lamb
Loss: 4.8067255020

In [26]:
model.eval()
model.encoder_only = True

predictions = []
with torch.no_grad():
    data_iterator = read_data_tensors(
        args.test_data,
        batch_size=50, 
        maxlen=args.maxlen,
        w2v_model=vectorizer,
        minsentlength=0
    )

    for batch_index, (x, texts) in enumerate(data_iterator):
        x = torch.from_numpy(x).to(args.device)

        y_pred = model(x, None)
        
        for pred in y_pred:
            predictions.append(pred.cpu().numpy())


  return self.capsule_softmax(self.fc(output))
  aspects_importances = self.softmax_aspects(raw_importances)


In [27]:
classes = []
for pred in predictions:
    classes.append(pred.argmax())

In [28]:
target = [v[0] for v in Sentences(args.test_labels)]

HBox(children=(FloatProgress(value=0.0, description='preprocessed_data/restaurant/test_label.txt', max=1490.0,…




In [29]:
print(len(classes))
print(len(target))
print(Counter(classes))
print(Counter(target))


1490
1490
Counter({9: 1170, 2: 320})
Counter({'Food': 887, 'Staff': 352, 'Ambience': 251})


In [30]:
for i, aspect in enumerate(model.get_aspect_words(vectorizer)):
    print(i, " ".join([a for a in aspect]))

0 else worth better wrong anywhere rare try chef somewhere recommend
1 feel italian italy french neighborhood indian bistro felt experience nyc
2 manager waiter waitress sauce asked chef hostess u owner server
3 dish vegetarian item seafood roll appetizer chef spicy meat menu
4 sauce tasted tomato cooked garlic crust mouth tender oil dry
5 give sit spend recommend go gave seat bring loud drink
6 didn dont money doesn attitude pay want won anything wouldn
7 sure going come reservation coming go try ll advance decided
8 wood ceiling wall lit brick floor booth window decorated white
9 crust pork beef rib chicken thin cuisine gras foie bbq
10 spot door dining restaurant walk reservation location park wall live
11 chocolate cake banana creme apple pudding tart brulee souffle desert
12 quality slow rude par expectation poor experience bit service overpriced
13 ordered fixe best size appetizer main half three sized shared


In [34]:
cluster_map = {
    0: 'Food', 
    1: 'Staff', 
    2: 'Staff', 
    3: 'Staff',
    4: 'Staff', 
    5: 'Food', 
    6: 'Ambience',  
    7: 'Staff', 
    8: 'Price', 
    9: 'Food', 
    10: 'Ambience', 
    11: 'Miscellaneous', 
    12: 'Food', 
    13: 'Price'
}

labels = ['Ambience', 'Food', 'Miscellaneous', 'Price', 'Staff']


In [35]:
y_pred = [cluster_map[pred] for pred in classes]
y_true = target

In [36]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

    Ambience       0.00      0.00      0.00       251
        Food       0.64      0.85      0.73       887
       Staff       0.42      0.38      0.40       352

    accuracy                           0.59      1490
   macro avg       0.35      0.41      0.38      1490
weighted avg       0.48      0.59      0.53      1490



In [37]:
predictions

[array([4.5290337e-13, 3.2881163e-12, 2.6668007e-10, 1.1638822e-13,
        1.5493496e-12, 2.9015074e-13, 1.0114886e-11, 5.6348502e-13,
        1.7330782e-13, 1.0000000e+00, 1.5819554e-12, 1.3869434e-12,
        5.2156179e-14, 7.8965785e-14], dtype=float32),
 array([7.2519737e-09, 8.4688367e-08, 1.8495081e-09, 4.2005963e-09,
        7.9652864e-08, 6.1905907e-09, 4.9510089e-07, 3.1094185e-09,
        3.7604955e-09, 9.9999893e-01, 1.2786622e-07, 1.7702983e-07,
        4.4937667e-09, 2.2653603e-09], dtype=float32),
 array([4.6049910e-13, 2.1810236e-12, 7.4342803e-11, 1.3902850e-13,
        1.1201323e-12, 3.5225355e-13, 1.0450609e-11, 6.1010523e-13,
        1.6624243e-13, 1.0000000e+00, 1.3469639e-12, 1.2263989e-12,
        8.1376766e-14, 6.2214645e-14], dtype=float32),
 array([3.8914631e-12, 1.4241724e-17, 1.0000000e+00, 2.5745797e-15,
        1.1448524e-17, 9.7927044e-14, 1.2659020e-18, 3.1080648e-12,
        1.8900239e-14, 3.0506421e-16, 1.6578708e-17, 9.6312874e-18,
        4.0517580e-