In [1]:
import gensim
import codecs
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm.notebook import tqdm
from argparse import Namespace
from sklearn.cluster.k_means_ import MiniBatchKMeans



In [2]:
class Sentences:
    def __init__(self, filename: str):
        self.filename = filename
        self.num_lines = sum(1 for line in open(filename))

    def __iter__(self):
        for line in tqdm(
            codecs.open(self.filename, "r", encoding="utf-8"), 
            self.filename, 
            self.num_lines
        ):
            yield line.strip().split()

In [3]:
def read_data_batches(path, batch_size=50, minlength=5):
    """
        Reading batched texts of given min. length
    :param path: path to the text file ``one line -- one normalized sentence''
    :return: batches iterator
    """
    batch = []

    for line in open(path, encoding="utf-8"):
        line = line.strip().split()

        # lines with less than `minlength` words are omitted
        if len(line) >= minlength:
            batch.append(line)
            if len(batch) >= batch_size:
                yield batch
                batch = []

    if len(batch) > 0:
        yield batch

In [4]:
def get_num_batches(path, batch_size=50, minlength=5):
    count = 0
    batch_count = 0
    
    for line in open(path, encoding="utf-8"):

        if len(line) >= minlength:
            batch_count += 1
            if batch_count >= batch_size:
                count += 1
                batch_count = 0
    
    return count

In [5]:
def text2vectors(text, w2v_model, maxlen, vocabulary):
    """
        Token sequence -- to a list of word vectors;
        if token not in vocabulary, it is skipped; the rest of
        the slots up to `maxlen` are replaced with zeroes
    :param text: list of tokens
    :param w2v_model: gensim w2v model
    :param maxlen: max. length of the sentence; the rest is just cut away
    :return:
    """

    acc_vecs = []

    for word in text:
        if word in w2v_model and (vocabulary is None or word in vocabulary):
            acc_vecs.append(w2v_model.wv[word])

    # padding for consistent length with ZERO vectors
    if len(acc_vecs) < maxlen:
        acc_vecs.extend([np.zeros(w2v_model.vector_size)] * (maxlen - len(acc_vecs)))

    return acc_vecs

In [6]:
def read_data_tensors(
    path, 
    batch_size=50, 
    vocabulary=None,
    maxlen=100, 
    pad_value=0, 
    minsentlength=5,
    w2v_model=None,
):
    """
        Data for training the NN -- from text file to word vectors sequences batches
    :param path:
    :param batch_size:
    :param vocabulary:
    :param maxlen:
    :param pad_value:
    :param minsentlength:
    :return:
    """
    for batch in read_data_batches(path, batch_size, minsentlength):
        batch_vecs = []
        batch_texts = []

        for text in batch:
            vectors_as_list = text2vectors(text, w2v_model, maxlen, vocabulary)
            batch_vecs.append(np.asarray(vectors_as_list[:maxlen], dtype=np.float32))
            batch_texts.append(text)

        yield np.stack(batch_vecs, axis=0), batch_texts

In [7]:
def get_centroids(w2v_model, aspects_count):
    """
        Clustering all word vectors with K-means and returning L2-normalizes
        cluster centroids; used for ABAE aspects matrix initialization
    """

    km = MiniBatchKMeans(n_clusters=aspects_count, verbose=0, n_init=100)
    m = []

    for k in w2v_model.wv.vocab:
        m.append(w2v_model.wv[k])

    m = np.matrix(m)

    km.fit(m)
    clusters = km.cluster_centers_

    # L2 normalization
    norm_aspect_matrix = clusters / np.linalg.norm(clusters, axis=-1, keepdims=True)

    return norm_aspect_matrix

In [8]:
class ConvLayer(nn.Module):
    def __init__(self, in_channels=200, out_channels=256, kernel_size=9, stride=1):
        super(ConvLayer, self).__init__()

        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )

    def forward(self, x):
        return F.relu(self.conv(x))

In [9]:
class PrimaryCaps(nn.Module):
    def __init__(self, num_capsules=8, in_channels=256, out_channels=32, kernel_size=3):
        super(PrimaryCaps, self).__init__()

        self.out_channels = out_channels
        self.capsules = nn.ModuleList([
            nn.Conv1d(
                in_channels=in_channels, 
                out_channels=out_channels, 
                kernel_size=kernel_size, 
                stride=2, 
                padding=0
            ) 
            for _ in range(num_capsules)
        ])
    
    def forward(self, x):
        u = [capsule(x) for capsule in self.capsules]
        u = torch.stack(u, dim=1)
        u = u.view(x.size(0), 32 * 96 , -1)
        return self.squash(u)
    
    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
        return output_tensor

In [10]:
class SecondaryCaps(nn.Module):
    def __init__(self, num_capsules=10, num_routes=32 * 96, in_channels=8, out_channels=16):
        super(SecondaryCaps, self).__init__()

        self.in_channels = in_channels
        self.num_routes = num_routes
        self.num_capsules = num_capsules
        self.fc = nn.Linear(16 * 10, 1)

        self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))

    def forward(self, x):
        batch_size = x.size(0)
        x = torch.stack([x] * self.num_capsules, dim=2).unsqueeze(4)

        W = torch.cat([self.W] * batch_size, dim=0)
        u_hat = torch.matmul(W, x)

        b_ij = Variable(torch.zeros(1, self.num_routes, self.num_capsules, 1)).to(args.device)

        num_iterations = 3
        for iteration in range(num_iterations):
            c_ij = F.softmax(b_ij)
            c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)

            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            v_j = self.squash(s_j)
            
            if iteration < num_iterations - 1:
                a_ij = torch.matmul(u_hat.transpose(3, 4), torch.cat([v_j] * self.num_routes, dim=1))
                b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)

        return v_j.squeeze(1)
    
    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
        return output_tensor

In [11]:
class CapsNet(nn.Module):
    def __init__(self):
        super(CapsNet, self).__init__()
        self.conv_layer = ConvLayer()
        self.primary_caps = PrimaryCaps()
        self.secondary_caps = SecondaryCaps()
        
        self.fc = nn.Linear(16 * 10, 201)

    def forward(self, data):
        output = self.secondary_caps(self.primary_caps(self.conv_layer(data)))
        output = output.reshape(50, 160)

        return self.fc(output)

In [12]:
class CBAE(torch.nn.Module):

    def __init__(
        self, 
        wv_dim: int = 200, 
        asp_count: int = 30,
        ortho_reg: float = 0.1, 
        maxlen: int = 201, 
        init_aspects_matrix=None
    ):
        super(CBAE, self).__init__()
        self.wv_dim = wv_dim
        self.asp_count = asp_count
        self.ortho = ortho_reg
        self.maxlen = maxlen

        self.caps_net = CapsNet()
        self.linear_transform = torch.nn.Linear(self.wv_dim, self.asp_count)
        self.softmax_aspects = torch.nn.Softmax()
        self.aspects_embeddings = Parameter(torch.empty(size=(wv_dim, asp_count)))

        if init_aspects_matrix is None:
            torch.nn.init.xavier_uniform(self.aspects_embeddings)
        else:
            self.aspects_embeddings.data = torch.from_numpy(init_aspects_matrix.T)

    def get_aspects_importances(self, text_embeddings):
        """Takes embeddings of a sentence as input, returns attention weights

        """

        # compute attention scores, looking at text embeddings average
        caps_weights = self.caps_net(text_embeddings.permute(0, 2, 1))

        # multiplying text embeddings by attention scores -- and summing
        # (matmul: we sum every word embedding's coordinate with attention weights)
        weighted_text_emb = torch.matmul(caps_weights.unsqueeze(1),  # (batch, 1, sentence)
                                         text_embeddings  # (batch, sentence, wv_dim)
                                         ).squeeze()

        # encoding with a simple feed-forward layer (wv_dim) -> (aspects_count)
        raw_importances = self.linear_transform(weighted_text_emb)

        # computing 'aspects distribution in a sentence'
        aspects_importances = self.softmax_aspects(raw_importances)

        return caps_weights, aspects_importances, weighted_text_emb

    def forward(self, text_embeddings, negative_samples_texts):

        # negative samples are averaged
        averaged_negative_samples = torch.mean(negative_samples_texts, dim=2)
        
        # encoding: words embeddings -> sentence embedding, aspects importances
        _, aspects_importances, weighted_text_emb = self.get_aspects_importances(text_embeddings)

        # decoding: aspects embeddings matrix, aspects_importances -> recovered sentence embedding
        recovered_emb = torch.matmul(self.aspects_embeddings, aspects_importances.unsqueeze(2)).squeeze()

        # loss
        reconstruction_triplet_loss = CBAE._reconstruction_loss(
            weighted_text_emb,
            recovered_emb,
            averaged_negative_samples,
        )
        
        max_margin = torch.max(reconstruction_triplet_loss, torch.zeros_like(reconstruction_triplet_loss))
        reconstruction_triplet_loss

        return self.ortho * self._ortho_regularizer() + max_margin

    @staticmethod
    def _reconstruction_loss(text_emb, recovered_emb, averaged_negative_emb):

        positive_dot_products = torch.matmul(text_emb.unsqueeze(1), recovered_emb.unsqueeze(2)).squeeze()
        negative_dot_products = torch.matmul(averaged_negative_emb, recovered_emb.unsqueeze(2)).squeeze()
        reconstruction_triplet_loss = torch.sum(1 - positive_dot_products.unsqueeze(1) + negative_dot_products, dim=1)

        return reconstruction_triplet_loss

    def _ortho_regularizer(self):
        return torch.norm(
            torch.matmul(self.aspects_embeddings.t(), self.aspects_embeddings) \
            - torch.eye(self.asp_count).to(args.device))

    def get_aspect_words(self, w2v_model, topn=15):
        words = []

        # getting aspects embeddings
        aspects = self.aspects_embeddings.cpu().detach().numpy()

        # getting scalar products of word embeddings and aspect embeddings;
        # to obtain the ``probabilities'', one should also apply softmax
        words_scores = w2v_model.wv.syn0.dot(aspects)

        for row in range(aspects.shape[1]):
            argmax_scalar_products = np.argsort(- words_scores[:, row])[:topn]
            # print([w2v_model.wv.index2word[i] for i in argmax_scalar_products])
            # print([w for w, dist in w2v_model.similar_by_vector(aspects.T[row])[:topn]])
            words.append([w2v_model.wv.index2word[i] for i in argmax_scalar_products])

        return words

In [13]:
args = Namespace(
    data_json='Electronics_5.json',
    
    w2v_file='Electronics_5.w2v',
    w2v_size=200,
    w2v_window=5,
    w2v_min_count=5,
    w2v_workers=7,
    w2v_sg=1,
    w2v_negative=5,
    w2v_iter=1,
    w2v_max_vocab_size=20000,
    
    batch_size=50,
    aspects_number=40,
    ortho_reg=0.1,
    epochs=1,
    optimizer='adam',
    neg_samples=5,
    maxlen=201,
    
    cuda=True,
    reload_from_files=True,
)

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))

Using CUDA: True


In [14]:
if args.reload_from_files:
    print("Loading vectorizer")
    pass
else:
    print("Loading dataset and creating vectorizer")
    sentences = Sentences(args.data_json)
    w2v = gensim.models.Word2Vec(
        sentences, 
        size=args.w2v_size, 
        window=args.w2v_window, 
        min_count=args.w2v_min_count, 
        workers=args.w2v_workers, 
        sg=args.w2v_sg,
        negative=args.w2v_negative, 
        iter=args.w2v_iter, 
        max_vocab_size=args.w2v_max_vocab_size,
    )
    w2v.save(args.w2v_file)
    print(f'{args.w2v_file} saved')
    
vectorizer = gensim.models.Word2Vec.load(args.w2v_file)

Loading vectorizer


In [15]:
for word in ["he", "love", "looks", "buy", "laptop"]:
    if word in vectorizer.wv.vocab:
        print(word, [w for w, c in vectorizer.wv.similar_by_word(word=word)])
    else:
        print(word, "not in vocab")

he ['she', 'He', 'his', 'She', 'son', 'husband', 'him', 'dad', 'daughter', 'wife']
love ['LOVE', 'Love', 'loved', '"Love', 'enjoy', 'hate', 'loves', 'appreciate', 'enjoyed', 'like']
looks ['feels', 'Looks', 'look', 'looked', 'sleek', 'sounds', 'matches', 'finish', 'appearance', 'look.']
buy ['purchase', 'buying', 'purchasing', 'sell', 'ordering', 'buy,', 'invest', 'try', 'buy.', 'order']
laptop ['notebook', 'netbook', 'computer', 'laptop,', 'machine', 'laptop.', 'PC', 'desktop', 'tablet', 'pc']


In [16]:
wv_dim = vectorizer.vector_size
y = torch.zeros(args.batch_size, 1).to(args.device)

In [17]:
model = CBAE(
    wv_dim=wv_dim,
    asp_count=args.aspects_number,
    init_aspects_matrix=get_centroids(vectorizer, aspects_count=args.aspects_number)
)
model.to(args.device)

CBAE(
  (caps_net): CapsNet(
    (conv_layer): ConvLayer(
      (conv): Conv1d(200, 256, kernel_size=(9,), stride=(1,))
    )
    (primary_caps): PrimaryCaps(
      (capsules): ModuleList(
        (0): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (1): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (2): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (3): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (4): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (5): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (6): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
        (7): Conv1d(256, 32, kernel_size=(3,), stride=(2,))
      )
    )
    (secondary_caps): SecondaryCaps(
      (fc): Linear(in_features=160, out_features=1, bias=True)
    )
    (fc): Linear(in_features=160, out_features=201, bias=True)
  )
  (linear_transform): Linear(in_features=200, out_features=40, bias=True)
  (softmax_aspects): Softmax(dim=None)
)

In [18]:
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.Adam(model.parameters())

In [None]:
epoch_bar = tqdm(
    desc='training routine', 
    total=args.epochs,
    position=0
)

train_bar = tqdm(
    desc='train',
    total=get_num_batches(args.data_json, args.batch_size, args.maxlen), 
    position=1, 
    leave=True
)


for t in range(args.epochs):

    print("Epoch %d/%d" % (t + 1, args.epochs))

    data_iterator = read_data_tensors(
        args.data_json,
        batch_size=args.batch_size, 
        maxlen=args.maxlen,
        w2v_model=vectorizer,
    )

    for item_number, (x, texts) in enumerate(data_iterator):
        if x.shape[0] < args.batch_size:  # pad with 0 if smaller than batch size
            x = np.pad(x, ((0, args.batch_size - x.shape[0]), (0, 0), (0, 0)))

        x = torch.from_numpy(x).to(args.device)

        # extracting bad samples from the very same batch; not sure if this is OK, so todo
        negative_samples = torch.stack(
            tuple([x[torch.randperm(x.shape[0])[:args.neg_samples]] 
                   for _ in range(args.batch_size)])
        ).to(args.device)

        # prediction
        y_pred = model(x, negative_samples)

        # error computation
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if item_number % 1000 == 0:

            print(item_number, "batches, and LR:", optimizer.param_groups[0]['lr'])

            for i, aspect in enumerate(model.get_aspect_words(vectorizer)):
                print(i + 1, " ".join([a for a in aspect]))

            print("Loss:", loss.item())
            print()

        train_bar.update()
    epoch_bar.update()

HBox(children=(FloatProgress(value=0.0, description='training routine', max=1.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='train', max=33782.0, style=ProgressStyle(description_widt…

Epoch 1/1


  if word in w2v_model and (vocabulary is None or word in vocabulary):
  c_ij = F.softmax(b_ij)
  aspects_importances = self.softmax_aspects(raw_importances)
  return F.mse_loss(input, target, reduction=self.reduction)


0 batches, and LR: 0.001
1 photography, portraits photography outdoors photography. trips situations photographer indoors amateur studio school yard casual gym
2 thousands kinds sorts sites variety countless various hundreds types books, (including numerous etc.) options, formats
3 Player Desktop GB Drive Music Portable Network Adobe downloaded Office Google Adapter Seagate Wireless Vista
4 for.", future.", again.", anyone.", to.", time.", these.", needs.", with.", in.", one.", years.", do.", around.", fine.",
5 appearance sleek aluminum feel, stylish leather elegant protects gray weight, color, padding matches bulk attractive
6 "Kenneth "Jonathan "Bruce "William "Kevin "Stephen "Peter "Thomas "Richard "Scott "Jerry "N. "Eric "Robert "Daniel
7 birthday ago Christmas contacted died "I've "Bought sister "B003ES5ZUU", ago, "Bruce daughter "Purchased weeks week
8 select restart press unplug delete enter icon reset activate manually turn pressing click automatically button,
9 highs mids tre

  words_scores = w2v_model.wv.syn0.dot(aspects)


1000 batches, and LR: 0.001
1 photography, photography photography. amateur photographer portraits studio casual outdoors school editing everyday situations daily outdoor
2 variety countless thousands tons numerous bunch hundreds various kinds many sorts ton types lots several
3 movies Music TiVo Google Player Portable games videos DVDs Tivo Desktop DVD's books Seagate HD
4 future.", to.", again.", for.", on.", in.", fine.", one.", these.", time.", stars.", me.", now.", that.", them.",
5 sleek appearance color, bulk color. color matches slim elegant protects gray stylish leather protection feel,
6 "William "Richard "Robert "Bruce "Kevin "Scott "Mike "Thomas "Brian "Paul "N. "Peter "Kenneth "John "Stephen
7 birthday Christmas lasted died sister contacted ago gift weeks week daughter month sent months dad
8 select press turn restart unplug adjust seconds delete activate change reset manually enter click skip
9 highs mids treble lows bass bass, bass. muddy midrange tinny classical Bass hi

3000 batches, and LR: 0.001
1 photography, photography photography. amateur photographer portraits studio casual outdoors school editing everyday situations daily outdoor
2 variety countless thousands tons numerous bunch hundreds various many kinds sorts ton types lots several
3 movies Music TiVo Google Player Portable games videos DVDs Tivo Desktop books DVD's Seagate HD
4 future.", to.", again.", for.", on.", in.", fine.", one.", these.", time.", stars.", me.", now.", that.", them.",
5 sleek appearance color, bulk color. color matches slim elegant protects gray stylish leather protection feel,
6 "William "Richard "Robert "Bruce "Kevin "Scott "Mike "Brian "Thomas "Paul "N. "Peter "John "Stephen "Daniel
7 birthday Christmas lasted died sister ago contacted gift weeks week daughter month sent months dad
8 select press turn restart unplug adjust seconds activate delete change reset manually enter skip click
9 highs mids treble lows bass bass, bass. muddy midrange tinny classical Bass his