In [None]:
/*
 *     THIS FILE BELONGS TO THE PROGRAM: TLCM 
 *
 *        File: results.evaluation.ipynb
 *
 *     Authors: Deleted for purposes of anonymity 
 *
 *     Proprietor: Deleted for purposes of anonymity --- PROPRIETARY INFORMATION
 * 
 * The software and its source code contain valuable trade secrets and shall be maintained in
 * confidence and treated as confidential information. The software may only be used for 
 * evaluation and/or testing purposes, unless otherwise explicitly stated in the terms of a
 * license agreement or nondisclosure agreement with the proprietor of the software. 
 * Any unauthorized publication, transfer to third parties, or duplication of the object or
 * source code---either totally or in part---is strictly prohibited.
 *
 *     Copyright (c) 2021 Proprietor: Deleted for purposes of anonymity
 *     All Rights Reserved.
 *
 * THE PROPRIETOR DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY 
 * AND FITNESS FOR A PARTICULAR PURPOSE AND THE WARRANTY AGAINST LATENT 
 * DEFECTS, WITH RESPECT TO THE PROGRAM AND ANY ACCOMPANYING DOCUMENTATION. 
 * 
 * NO LIABILITY FOR CONSEQUENTIAL DAMAGES:
 * IN NO EVENT SHALL THE PROPRIETOR OR ANY OF ITS SUBSIDIARIES BE 
 * LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES
 * FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF INFORMATION, OR
 * OTHER PECUNIARY LOSS AND INDIRECT, CONSEQUENTIAL, INCIDENTAL,
 * ECONOMIC OR PUNITIVE DAMAGES) ARISING OUT OF THE USE OF OR INABILITY
 * TO USE THIS PROGRAM, EVEN IF the proprietor HAS BEEN ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGES.
 * 
 * For purposes of anonymity, the identity of the proprietor is not given herewith. 
 * The identity of the proprietor will be given once the review of the 
 * conference submission is completed. 
 *
 * THIS HEADER MAY NOT BE EXTRACTED OR MODIFIED IN ANY WAY.
 */

In [None]:
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import defaultdict
import pandas as pd
plt.style.use('seaborn')
import seaborn as sns
%matplotlib inline
import utils

In [None]:
# --- Directory layout (all taken from the shared `utils` module) -------------
dir_preprocessed_data = utils.DIR_PREPROCESSED_DATA
dir_training_data = utils.DIR_TRAIN_TEST_DATA
dir_results = utils.DIR_RESULTS

# --- Model dimensions ---------------------------------------------------------
# K and L size the square user grid and the square product grid built below.
K, L = utils.K, utils.L
V = 2000  # NOTE(review): presumably the vocabulary size -- confirm against preprocessing

# --- Remaining settings read from `utils` -------------------------------------
sigma_u, sigma_p = utils.SIGMA_USERS, utils.SIGMA_PRODUCTS
categories = utils.CATEGORIES

def dist(xy, xy2):
    """Return the Euclidean distance between two 2-D points.

    Parameters
    ----------
    xy, xy2 : sequence of two numbers
        The ``(x, y)`` coordinates of each point.

    Returns
    -------
    The straight-line distance between ``xy`` and ``xy2``.
    """
    (x_a, y_a), (x_b, y_b) = xy, xy2
    delta_x = x_b - x_a
    delta_y = y_b - y_a
    return np.sqrt(delta_x**2 + delta_y**2)


def neighborhood_function(z1, z, sigma=0.5):
    """Return normalised exponential-decay weights of ``z1`` w.r.t. every point in ``z``.

    Each point ``z2`` in ``z`` contributes the term
    ``exp(-dist(z1, z2) / (2 * sigma))``; every term is divided by the sum of
    all terms, so the returned weights add up to one.

    Parameters
    ----------
    z1 : sequence of two numbers
        Reference point, passed to ``dist``.
    z : iterable of points
        Points for which a weight is computed.
    sigma : float, default 0.5
        Scale of the decay; larger values give flatter weights.

    Returns
    -------
    list
        One weight per point of ``z``, in the same order (empty if ``z`` is empty).
    """
    # Evaluate the unnormalised kernel once per point ...
    kernel = [np.exp(-dist(z1, point) / (2 * sigma)) for point in z]
    # ... then divide each term by their total.
    total = sum(kernel)
    return [value / total for value in kernel]

def latex_words(p_w_yx, inverse_kw_map, grid, num_words=10):
    """Collect the top words of every latent class, arranged per grid row.

    Parameters
    ----------
    p_w_yx : sequence of sequences
        ``p_w_yx[i][j]`` is the score of the word with the j-th smallest index
        for latent class ``i``.
    inverse_kw_map : dict
        Maps word index -> word.
    grid : sequence of tuples
        ``grid[i][0]`` is the grid row that latent class ``i`` belongs to.
    num_words : int, default 10
        Number of highest-scoring words kept per class.

    Returns
    -------
    list of numpy arrays
        For every grid row (in order of first appearance), ``num_words`` arrays
        are appended; the r-th of them holds the r-th ranked word of each class
        in that grid row.
    """
    # Pair score column j with the j-th smallest word index. Sorting the keys
    # makes the pairing independent of the dict's insertion order.
    word_indices = sorted(inverse_kw_map.keys())
    grid_words = defaultdict(list)

    for class_idx in range(len(grid)):
        ranked = sorted(zip(p_w_yx[class_idx], word_indices), reverse=True)[:num_words]
        # Look the words up in the map that was passed in, not in a notebook global.
        top_words = [inverse_kw_map[word_idx] for _, word_idx in ranked]
        grid_words[grid[class_idx][0]].append(top_words)

    output_words = []
    for row_words in grid_words.values():
        # Transpose so that each emitted array is one rank across the classes of the row.
        output_words.extend(np.array(row_words).T)

    return output_words

def _square_grid(num_cells):
    """List the (row, col) coordinates of a square grid with ``num_cells`` cells, row by row."""
    side = int(np.sqrt(num_cells))
    return [(int(row), int(col)) for row in range(side) for col in range(side)]


grid_user = _square_grid(K)
grid_prod = _square_grid(L)

# Index tables into a flat axis of length K*L laid out as k*L + l:
#   sum_l[k] -> the L consecutive indices k*L, ..., k*L + L - 1
#   sum_k[l] -> the K strided indices l, l + L, l + 2L, ...
sum_l = [np.arange(L) + L * block for block in range(K)]
sum_k = [np.arange(0, K * L, L) + offset for offset in range(L)]

In [None]:
# Product category analysed in the rest of the notebook.
category = 'automotive'

# Category-specific input and output locations, derived from the base directories.
dir_preprocessed_data_c = '{0}{1}/'.format(dir_preprocessed_data, category)
dir_results_c = '{0}{1}'.format(dir_results, category)
dir_results_CNN_c = '{0}/results_CNN_{1}_{2}/'.format(dir_results_c, K, L)
dir_results_EM_c = '{0}/results_EM_{1}_{2}/'.format(dir_results_c, K, L)
dir_figures_c = '{0}/figures/'.format(dir_results_c)

### Plot learning curves

In [None]:
statistics_dict = {}  # per-category dataset statistics, filled in once the maps are loaded

# Load the train/test negative log-likelihood curves.
# Files are opened with `with` so the handles are closed right after loading.
# NOTE: pickle can execute arbitrary code on load -- only open files you produced yourself.
with open(dir_results_EM_c + 'nll_train.pkl', 'rb') as pkl_file:
    nll_train = pickle.load(pkl_file)
with open(dir_results_EM_c + 'nll_test.pkl', 'rb') as pkl_file:
    nll_test = pickle.load(pkl_file)

ticks = np.arange(0, len(nll_train) + 1, 5)  # one x tick every 5 epochs
plt.plot(nll_train, label='Train')
plt.plot(nll_test, label='Test')
plt.legend()
plt.title(category, fontsize=14)
plt.xticks(ticks, labels=[i for i in ticks])
plt.xlabel('Epoch')
plt.ylabel('NLL')
plt.show()

In [None]:
# Load the preprocessed index maps and the held-out user reviews.
# Files are opened with `with` so the handles are closed right after loading.
# NOTE: pickle can execute arbitrary code on load -- only open files you produced yourself.
with open(dir_preprocessed_data_c + 'users_map.pkl', 'rb') as pkl_file:
    users_map = pickle.load(pkl_file)  # {user_ID: idx_u}
with open(dir_preprocessed_data_c + 'products_map.pkl', 'rb') as pkl_file:
    products_map = pickle.load(pkl_file)  # {prod_ID: idx_p}
with open(dir_preprocessed_data_c + 'keywords_map.pkl', 'rb') as pkl_file:
    keywords_map = pickle.load(pkl_file)  # {word_ID: idx_w}
with open(dir_preprocessed_data_c + 'users_test.pkl', 'rb') as pkl_file:
    users_test = pickle.load(pkl_file)

# The stored word indices start at 1 (index 0 is used for padding); shift them
# down by one so that they start at 0.
keywords_map = {k: v - 1 for k, v in keywords_map.items()}  # {word: idx_w}

# Inverse lookups. They are built from the already-shifted `keywords_map`, so
# no further "- 1" must be applied here.
inv_users_map = {v: k for k, v in users_map.items()}  # {idx_u: user_ID}
inv_products_map = {v: k for k, v in products_map.items()}  # {idx_p: prod_ID}
inv_keywords_map = {v: k for k, v in keywords_map.items()}  # {idx_w: word_ID}

statistics_dict[category] = {'# users':len(users_map), '# products':len(products_map)}
print(pd.DataFrame(statistics_dict))

In [None]:
if not os.path.exists(dir_figures_c):  # create directory if it does not exist
    print('\tCreate new output directory:', dir_figures_c)
    # exist_ok guards against the directory appearing between the check and the call
    os.makedirs(dir_figures_c, exist_ok=True)

# Load the fitted distributions written by the EM run.
# Files are opened with `with` so the handles are closed right after loading.
# NOTE: pickle can execute arbitrary code on load -- only open files you produced yourself.
with open(dir_results_EM_c + 'p_w_yuyp.pkl', 'rb') as pkl_file:
    p_w_yuyp = pickle.load(pkl_file)
p_w_yuyp_T = p_w_yuyp.T  # transposed view, indexed by word first
with open(dir_results_EM_c + 'p_zu_u.pkl', 'rb') as pkl_file:
    p_zu_u = pickle.load(pkl_file)
with open(dir_results_EM_c + 'p_zp_p.pkl', 'rb') as pkl_file:
    p_zp_p = pickle.load(pkl_file)

## Generative extension for unseen users

In [None]:
# Group the held-out reviews by user: {user_id: [[product_id, text], ...]}.
# Each entry of `users_test` is indexed as (user_id, product_id, text).
users_test_dict = {}

for review in users_test:
    user_id, product_id, text = review[0], review[1], review[2]
    # setdefault creates the user's list on first sight and appends in one step
    users_test_dict.setdefault(user_id, []).append([product_id, text])

In [None]:
# Pick the first user of the test dictionary and list all of their reviews,
# with the word indices translated back to words.
user_id = list(users_test_dict)[0]
print('User ID: {0}'.format(user_id))
for review in users_test_dict[user_id]:
    reviewed_product, word_indices = review[0], review[1]
    print(reviewed_product, [inv_keywords_map[w] for w in word_indices])

In [None]:
# Position (within the selected user's test reviews) of the review to analyse.
review_idx = 5
user_reviews = users_test_dict[user_id]
if review_idx >= len(user_reviews):
    # The user has fewer reviews than expected: fall back to the last one
    # instead of failing with an IndexError.
    print('User {0} has only {1} test review(s); using the last one instead of index {2}'.format(
        user_id, len(user_reviews), review_idx))
    review_idx = len(user_reviews) - 1

sampled_review = user_reviews[review_idx]
product_test = products_map[sampled_review[0]]  # product ID -> product index
review_test = sampled_review[1]                 # word indices of the review
print(sampled_review)
print([inv_keywords_map[w] for w in review_test])

In [None]:
# Entry of p_zp_p belonging to the product of the sampled review.
p_zp_prod = p_zp_p[product_test]
# Flat prior over the K user classes (we do not have information about this user).
p_zu_user = [1. / K for _ in range(K)]

### $P(w|\mathbf{{y}}_{u}^{k'},\bar p) = \sum_{\ell'} P(w|\mathbf{{y}}_{u}^{k'},\mathbf{{y}}_{p}^{\ell'}) P(\mathbf{{y}}_{p}^{\ell'} |\bar p) \qquad \forall w \in \bar r$

In [None]:
# For every word of the review (rows) and every user class k (columns):
# weight the L entries p_w_yuyp_T[word][sum_l[k]] by p_zp_prod and add them up.
p_w_yup = np.array([
    [np.sum(np.multiply(p_w_yuyp_T[word][sum_l[k]], p_zp_prod)) for k in range(K)]
    for word in review_test
]).reshape(len(review_test), K)
p_w_yup.shape

### $P(\bar{r}|\mathbf{{y}}_{u}^{k'},\bar p) = \prod_{w \in \bar{r}}P(w|\mathbf{{y}}_{u}^{k'},\bar p)$

In [None]:
# Multiply the per-word terms over the words of the review (axis 0),
# leaving one value per user class.
# NOTE(review): a plain product of many small probabilities can underflow for
# long reviews -- consider summing logs if that shows up.
p_r_yup = p_w_yup.prod(axis=0)
p_r_yup.shape

### $P(\mathbf{{y}}_{u}^{k'}|\bar{r}) = \frac{P(\bar{r}|\mathbf{{y}}_{u}^{k'},\bar p)P(\mathbf{y}_u^{k^{\prime}}|u^i)}
                                    {\sum_{k^{''}}P(\bar{r}|\mathbf{{y}}_{u}^{k^{''}},\bar p)P(\mathbf{y}_u^{k^{''}}|u^i)}$

In [None]:
# Posterior over user classes given the review: likelihood times prior,
# normalised over all classes. The prior is applied explicitly so the result
# stays correct if `p_zu_user` is ever replaced by a non-flat prior.
joint_r_yu = p_r_yup * np.asarray(p_zu_user)
den = np.sum(joint_r_yu)
p_yu_r = joint_r_yu / den

In [None]:
# Bar chart of the posterior, one bar per user class.
class_ids = np.arange(K)
plt.bar(class_ids, p_yu_r)
plt.show()
p_yu_r.sum()  # check

In [None]:
# Average the rows of p_w_yuyp over one of the two class axes:
#   p_w_yu[k] = mean over the L rows listed in sum_l[k]
#   p_w_yp[l] = mean over the K rows listed in sum_k[l]
p_w_yu = np.array([sum(p_w_yuyp[rows]) / L for rows in sum_l])
p_w_yp = np.array([sum(p_w_yuyp[rows]) / K for rows in sum_k])

In [None]:
# Build one text label per user class: its 10 highest-scoring words, one per line.
n_side_u = int(np.sqrt(K))
sorted_word_ids = sorted(inv_keywords_map.keys())

labels_u = []
for i in range(K):
    # Rank (score, word index) pairs from highest to lowest score.
    ranked = sorted(zip(p_w_yu[i], sorted_word_ids), reverse=True)
    top_k = [word_id for _, word_id in ranked[:10]]
    top_words = [inv_keywords_map[k] for k in top_k]
    labels_u.append('\n'.join(top_words))

# Arrange the labels on the square user grid.
labels_u = np.array(labels_u).reshape(n_side_u, n_side_u)

In [None]:
# Heat map of the posterior on the user grid, annotated with each class's top words.
n_side = int(np.sqrt(K))
posterior_grid = p_yu_r.reshape(n_side, n_side)

plt.figure(figsize=(12, 13))
sns.heatmap(
    posterior_grid,
    annot=labels_u,
    linewidths=0.3,
    fmt='',
    cmap="GnBu",
    annot_kws={"fontsize": 13},
    linecolor='black',
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig(
    dir_figures_c + 'generative_extension.pdf',
    format='pdf', dpi=800, transparent=True, bbox_inches='tight',
)
plt.show()
plt.close()
# Words of the analysed review, for comparison with the highlighted classes.
print([inv_keywords_map[w] for w in review_test])

### User latent class organization

In [None]:
# labels_u already computed
# A constant value per cell: the map only serves as a canvas for the word labels.
n_side = int(np.sqrt(K))
values_u = np.ones(K)

plt.figure(figsize=(10, 13))
sns.heatmap(
    values_u.reshape(n_side, n_side),
    annot=labels_u,
    linewidths=0.3,
    fmt='',
    cmap="rocket_r",
    annot_kws={"fontsize": 13},
    linecolor='black',
    cbar=False,
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig(
    dir_figures_c + 'map_users.pdf',
    format='pdf', dpi=800, transparent=True, bbox_inches='tight',
)
plt.show()
plt.close()

### Product latent class organization

In [None]:
# Build one text label per product class (its 10 highest-scoring words) and
# print the words next to the class's grid coordinates.
n_side_p = int(np.sqrt(L))
sorted_word_ids = sorted(inv_keywords_map.keys())

labels_p = []
for i in range(L):
    # Rank (score, word index) pairs from highest to lowest score.
    ranked = sorted(zip(p_w_yp[i], sorted_word_ids), reverse=True)
    top_k = [word_id for _, word_id in ranked[:10]]
    top_words = [inv_keywords_map[k] for k in top_k]
    labels_p.append('\n'.join(top_words))
    print(grid_prod[i], top_words)
    print()

# Arrange the labels on the square product grid; the constant cell values only
# provide a canvas for the annotations.
labels_p = np.array(labels_p).reshape(n_side_p, n_side_p)
values_p = np.ones(L)

plt.figure(figsize=(8, 11))
sns.heatmap(
    values_p.reshape(n_side_p, n_side_p),
    annot=labels_p,
    linewidths=0.3,
    fmt='',
    cmap="rocket_r",
    annot_kws={"fontsize": 13},
    linecolor='black',
    cbar=False,
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.savefig(
    dir_figures_c + 'map_products.pdf',
    format='pdf', dpi=800, transparent=True, bbox_inches='tight',
)
plt.show()
plt.close()