In [2]:
import gensim
import numpy as np
import pandas as pd
import re
import random
import tensorflow as tf
from tqdm import tqdm
import json
import codecs

In [3]:
# Load Word2Vec model (trained on an enormous Google corpus)
# `model` is indexed by word further down (model[word]) and queried with most_similar.
# NOTE(review): the path is a remote URL, so this presumably pulls the whole archive over the
# network each time the cell runs -- consider pointing at a local copy; confirm with gensim docs.
model = gensim.models.KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary = True) 

In [4]:
def preprocess(path,documents,reviewColumn):
    """Read review CSVs and return every sentence as a list of word tokens.

    Parameters:
        path: prefix that is concatenated directly with each document name
            (so a directory must end with its separator).
        documents: iterable of file names without the ".csv" extension.
        reviewColumn: name of the CSV column holding the review text.

    Returns:
        A list of sentences, each a non-empty list of non-empty string tokens.
    """
    # read review texts
    reviews = []
    for name in documents:
        raw_data = pd.read_csv(path+name+".csv")
        raw_data = raw_data.dropna(subset=[reviewColumn])
        # cast to str so a column parsed as numbers does not crash re.split below
        reviews += [str(text) for text in raw_data[reviewColumn]]
    tokenized = []
    for review in reviews:
        #split text into sentences (ended by a run of . ? or !, plus any trailing spaces/newlines)
        for sentence in re.split(r"[\.\?!]+[ \n]*",review):
            #split the sentence into words on spaces/newlines, swallowing adjacent commas;
            #drop the empty strings re.split yields for leading/trailing whitespace
            words = [word for word in re.split(r"[,]*[ \n]+[,]*",sentence) if word]
            #remove empty sentences
            if words:
                tokenized.append(words)
    return tokenized

In [5]:
def word2vec(word,vec_model):
    """Return vec_model[word], or an all-zero (300, 1) array when the lookup raises KeyError."""
    try:
        vector = vec_model[word]
    except KeyError:
        #unknown word: substitute a zero column vector
        vector = np.zeros((300,1))
    return vector

def vectorize(sentence_text,vec_model):
    """Turn a list of words into a list of float32 tensors of shape [300, 1].

    Each word is looked up with word2vec (zeros for unknown words) and reshaped
    into a column vector.
    """
    column_vectors = []
    for word in sentence_text:
        embedding = tf.convert_to_tensor(word2vec(word,vec_model),dtype=tf.float32)
        column_vectors.append(tf.reshape(embedding,[300,1]))
    return column_vectors

In [6]:
def sentence_embedding(sentence,M):
    """Combine the word vectors of `sentence` into one vector using attention weights.

    The weights come from embedding_focus(sentence, M); because they are normalised
    to sum to 1, the weighted sum below is a weighted average of the word vectors.
    """
    #how strongly the model should focus on each word
    attention = embedding_focus(sentence,M)
    #scale every word vector by its own weight, then add them up
    weighted_words = [attention[idx]*word_vec for idx,word_vec in enumerate(sentence)]
    return tf.reduce_sum(weighted_words,axis=0)

In [7]:
def embedding_focus(sentence,M):
    """Compute one attention weight per word of `sentence`.

    Each word is scored as word^T * M * mean, where mean is the average of the
    sentence's word vectors reshaped to [300, 1]; the scores are then normalised
    with a hand-written softmax.
    """
    mean_vec = tf.reshape(tf.reduce_mean(sentence,axis=0),[300,1])
    scores = []
    for word in sentence:
        projected = tf.matmul(tf.transpose(word),M)
        scores.append(tf.matmul(projected,mean_vec))
    logits = tf.convert_to_tensor(scores)

    #shift the scores so the largest one equals 70; keeps exp() from overflowing
    shifted = logits-(tf.reduce_max(logits)-70)
    #softmax: exponentiate and normalise to sum 1
    exponentials = tf.exp(shifted)
    return exponentials/tf.reduce_sum(exponentials)

In [8]:
def sentence_mean(sentence):
    """Return the unweighted mean of the word vectors in `sentence` (reduced over axis 0)."""
    return tf.reduce_mean(sentence,axis=0)

In [9]:
def reconstruction(embedding,W,b,T):
    """Rebuild a sentence vector as a mixture of the topic vectors in T.

    The mixing proportions are produced by topic_proportion(embedding, W, b, T).
    """
    #how much each topic contributes to this sentence
    proportions = topic_proportion(embedding,W,b,T)
    #topics become columns, so the product is their proportion-weighted sum
    topics_as_columns = tf.transpose(T)
    return tf.matmul(topics_as_columns,proportions)

In [10]:
def topic_proportion(embedding,W,b,T):
    """Map a sentence embedding to a distribution over topics.

    Computes W * embedding + b and normalises the result with a hand-written
    softmax. `T` is accepted but not used by this function.
    """
    #multiply by W and add bias b (both learned) to get prominence scores
    scores = tf.matmul(W,embedding)+b
    #shift the scores so the largest one equals 70; keeps exp() from overflowing
    shifted = scores-(tf.reduce_max(scores)-70)
    #softmax: exponentiate and normalise to sum 1
    exponentials = tf.exp(shifted)
    return exponentials/tf.reduce_sum(exponentials)

In [11]:
def J_s(embedding,re_embedding,negatives):
    """Hinge loss for one sentence.

    Sums max(0, 1 - <embedding, re_embedding> + <re_embedding, neg>) over all
    `negatives`. The similarities are plain dot products; nothing here normalises
    the vectors, so they equal cosine similarity only for unit-length inputs.
    """
    #similarity between the sentence embedding and its reconstruction
    positive = tf.reshape(tf.matmul(tf.transpose(embedding),re_embedding),[1])
    #similarity between the reconstruction and each negative sample
    negative_scores = []
    for neg in negatives:
        negative_scores.append(tf.reshape(tf.matmul(tf.transpose(re_embedding),neg),[1]))
    negative = tf.convert_to_tensor(negative_scores)
    #reward similarity to the true embedding, penalise similarity to
    #the other (generally unrelated) sentences
    hinge = tf.maximum(0,1-positive+negative)
    return tf.reduce_sum(hinge)

def J(embeddings,reconstructions,negs):
    """Total loss of a minibatch: the sum of J_s over all sentences.

    embeddings[i] is paired with reconstructions[i]; every sentence is compared
    against the same negative samples `negs`. Returns the integer 0 when
    `embeddings` is empty.
    """
    total_loss = 0
    #get loss for each sentence and sum them
    for i in range(len(embeddings)):
        total_loss += J_s(embeddings[i],reconstructions[i],negs)
    return total_loss

In [12]:
#no longer used
def regularization(T):
    """Penalty that pushes topic vectors towards being mutually orthogonal.

    T holds one topic vector per row. Rows are scaled to unit L2 norm, so their
    pairwise dot products are cosine similarities; the result is the sum of the
    squared off-diagonal similarities (the diagonal is cancelled by the identity).
    """
    #normalize topic vectors to length 1 (divide by the norm, not the squared norm)
    normed = tf.math.l2_normalize(T,axis=1)
    #get dot product (=cosine similarity) of all pairs of topics
    dots = tf.matmul(normed,tf.transpose(normed))
    #subtract identity matrix (since the cosine similarity of every topic with itself is 1)
    U = dots-tf.eye(T.shape[0])
    #squared norm of the entire matrix
    return tf.reduce_sum(tf.square(U))

In [13]:
def _sparsemax(raw_weights):
    """Sparsemax of one weight vector; the result has the same shape as the input.

    The weights are flattened to 1-D before sorting: sorting a [n, 1] tensor along
    its last axis (tf.sort's default) would leave it unchanged, and multiplying a
    [n] range by a [n, 1] column would broadcast to [n, n].
    """
    flat = tf.reshape(raw_weights,[-1])
    sorted_desc = tf.sort(flat,direction='DESCENDING')
    k = tf.cast(tf.range(1,tf.size(flat)+1),flat.dtype)
    cumsum = tf.cumsum(sorted_desc)
    #position k is in the support when 1 + k*z_(k) exceeds the sum of the k largest values
    in_support = 1+k*sorted_desc > cumsum
    #size of the support = largest such k (k=1 always qualifies)
    k_max = tf.reduce_max(tf.where(in_support))+1
    threshold = (tf.gather(cumsum,k_max-1)-1)/tf.cast(k_max,flat.dtype)
    return tf.maximum(raw_weights-threshold,0)

def build_topics(seed_lists,seed_weights,unseeded):
    """Assemble the topic matrix: seeded topics first, unseeded topics after.

    Parameters:
        seed_lists: one [n_words, 300] tensor of seed-word vectors per seeded topic.
        seed_weights: one [n_words, 1] tensor of raw weights per seeded topic.
        unseeded: tensor of free topic vectors, one per row.

    Returns:
        Tensor with one [1, 300] row per seeded topic followed by the rows of `unseeded`.
    """
    seed_rows = []
    for i,word_vectors in enumerate(seed_lists):
        #perform sparsemax of raw seed weights
        weights = _sparsemax(seed_weights[i])
        #seeded topics are just weighted averages of seed word vectors
        seed_rows.append(tf.reshape(tf.matmul(tf.transpose(word_vectors),weights),[1,300]))
    seed_topics = tf.concat(seed_rows,0)
    all_topics = tf.concat([seed_topics,unseeded],0)
    return all_topics

In [100]:
#files = ["dhl","fedex","ups","usps"]
#input files: sj_1 .. sj_6 followed by tp_1 .. tp_5
files = [prefix+str(number) for prefix,count in (("sj_",6),("tp_",5)) for number in range(1,count+1)]
#tokenized sentences from the 'reviews' column of every file
reviews = preprocess("input/shipping/",files,'reviews')

In [15]:
# Disabled alternative data source: the whole cell is a string literal, so none of it runs.
# The code inside would read review texts from shipping_google_2..5 JSON files and tokenize
# them with the same regexes as preprocess, replacing `reviews`.
"""reviews_google = []
for i in range(2,6):
    f = open("input/shipping_google_"+str(i)+".json", encoding='utf-8')
    raw = json.load(f)
    for macro in raw:
        for element in macro["reviews"]:
            if element["text"]!=None:
                reviews_google.append(element["text"])
sentences=[]
for review in reviews_google:
    sentences += re.split(r"[\.\?!]+[ \n]*",review)
tokenized = [re.split(r"[,]*[ \n]+[,]*",sentence) for sentence in sentences]
tokenized = [element for element in tokenized if element!=['']]
reviews=tokenized"""

In [101]:
# Convert every tokenized sentence into a list of [300, 1] word tensors;
# vectorized_reviews[k] is built from reviews[k], so both lists start out index-aligned.
vectorized_reviews = [vectorize(review,model) for review in tqdm(reviews)]

100%|██████████| 27054/27054 [00:29<00:00, 905.95it/s] 


In [102]:
# Number of free (unseeded) topic vectors learned from scratch.
NUMUNSEEDED = 1
# Number of seeded topics; the seed_words list defined in the initialization cell has 6 entries.
NUMSEEDED = 6

In [103]:
#initialize parameters with small random values
M = tf.random.uniform([300,300],minval=-1)
b = tf.random.uniform([NUMSEEDED+NUMUNSEEDED,1],minval=-1)
W = tf.random.uniform([NUMSEEDED+NUMUNSEEDED,300],minval=-1)
#seed words
seed_words = [["customs","international","country"],
             ["service","driver"],
             ["paid","worth","fee"],
             ["delivery","package","shipment"],
             ["speed","quick","late"],
             ["tracking","email","website"]]
seed_lists = [tf.concat([tf.reshape(model[word],[1,300]) for word in seed_words[i]],0) for i in range(len(seed_words))]
seed_weights = [tf.random.uniform([len(seed_lists[i]),1]) for i in range(len(seed_lists))]
seed_topics = tf.concat([tf.reshape(tf.matmul(tf.transpose(seed_lists[i]),seed_weights[i]),[1,300]) for i in range(len(seed_lists))],0)
unseeded = tf.random.uniform([NUMUNSEEDED,300],minval=-1)

In [24]:
def forward_pass(vectorized_reviews,M,W,b,seed_lists,z,uT,m,negative_pool):
    """Compute the minibatch loss and its gradients.

    Parameters:
        vectorized_reviews: the minibatch, a list of sentences (lists of word tensors).
        M, W, b: attention matrix, topic projection matrix and bias.
        seed_lists: seed-word vectors per seeded topic (not differentiated).
        z: list of raw seed-weight tensors, one per seeded topic.
        uT: unseeded topic vectors.
        m: number of negative samples drawn from `negative_pool`.
        negative_pool: list of sentence vectors to sample negatives from;
            random.sample raises ValueError if it holds fewer than `m` items.

    Returns:
        (total_loss, grads) where grads is a dict with keys 'M','W','b','z','uT'.
    """
    #a single gradient() call follows, so a non-persistent tape is enough and
    #releases its recorded operations as soon as the gradients are computed
    with tf.GradientTape() as g:
        #track gradients (the parameters are plain tensors, so they must be watched explicitly)
        g.watch(M)
        g.watch(W)
        g.watch(b)
        g.watch(z)
        g.watch(uT)
        T = build_topics(seed_lists,z,uT)
        #get sentence embeddings
        sentence_embeddings = [sentence_embedding(sentence,M) for sentence in vectorized_reviews]
        #get sentence reconstructions
        sentence_reconstructions = [reconstruction(embed,W,b,T) for embed in sentence_embeddings]
        #choose random negative sentences
        negs = random.sample(negative_pool,m)
        #calculate loss over minibatch
        total_loss = J(sentence_embeddings,sentence_reconstructions,negs)
    grads = g.gradient(total_loss,{'M':M,'W':W,'b':b,'z':z,'uT':uT})
    #return loss and gradients
    return total_loss,grads

In [25]:
def _adam_step(param,grad,m,v,t,learning_rate,beta_1,beta_2,epsilon):
    """Apply one Adam update to `param`; returns (new_param, new_m, new_v)."""
    m = beta_1*m+(1-beta_1)*grad
    v = beta_2*v+(1-beta_2)*tf.square(grad)
    #bias-corrected moment estimates
    m_hat = m/(1-beta_1**t)
    v_hat = v/(1-beta_2**t)
    param = param-m_hat/(tf.sqrt(v_hat)+epsilon)*learning_rate
    return param,m,v

def training_epoch(vectorized_reviews,M,W,b,seed_lists,z,uT,batch_size=50,learning_rate=0.001,m=20,beta_1=0.9,beta_2=0.999,epsilon=1e-8,num_epochs=1):
    """Train all parameters with minibatch Adam for `num_epochs` passes over the data.

    Parameters:
        vectorized_reviews: list of sentences (lists of word tensors); not modified.
        M, W, b, z, uT: initial parameter values (z is a list of tensors).
        seed_lists: seed-word vectors per seeded topic, passed through to forward_pass.
        batch_size: number of sentences per minibatch.
        learning_rate, beta_1, beta_2, epsilon: Adam hyper-parameters.
        m: number of negative samples per minibatch.
        num_epochs: number of passes over the data.

    Returns:
        The learned (M, W, b, z, uT).
    """
    t=0
    #initialize adam optimizer values to 0
    m_M,v_M = tf.zeros(M.shape),tf.zeros(M.shape)
    m_W,v_W = tf.zeros(W.shape),tf.zeros(W.shape)
    m_b,v_b = tf.zeros(b.shape),tf.zeros(b.shape)
    m_z = [tf.zeros(weights.shape) for weights in z]
    v_z = [tf.zeros(weights.shape) for weights in z]
    m_uT,v_uT = tf.zeros(uT.shape),tf.zeros(uT.shape)
    #calculate the pool of negative sentences
    #we use a straight mean rather than a full embedding
    negative_pool = [sentence_mean(sentence) for sentence in tqdm(vectorized_reviews)]
    #shuffle a private copy: shuffling the caller's list in place would break its
    #index alignment with the token lists it was built from
    shuffled = list(vectorized_reviews)
    adam = (t,learning_rate,beta_1,beta_2,epsilon)
    for _ in range(num_epochs):
        #randomize order to avoid overfitting
        random.shuffle(shuffled)
        for start in tqdm(range(0,len(shuffled),batch_size)):
            t+=1
            adam = (t,learning_rate,beta_1,beta_2,epsilon)

            #get gradients through a forward pass with a minibatch
            batch = shuffled[start:start+batch_size]
            _,grads = forward_pass(batch,M,W,b,seed_lists,z,uT,m,negative_pool)

            #update adam optimizer values and parameters
            M,m_M,v_M = _adam_step(M,grads['M'],m_M,v_M,*adam)
            W,m_W,v_W = _adam_step(W,grads['W'],m_W,v_W,*adam)
            b,m_b,v_b = _adam_step(b,grads['b'],m_b,v_b,*adam)
            #build new lists so the caller's list of seed weights is left untouched
            z_steps = [_adam_step(z[k],grads['z'][k],m_z[k],v_z[k],*adam) for k in range(len(z))]
            z = [step[0] for step in z_steps]
            m_z = [step[1] for step in z_steps]
            v_z = [step[2] for step in z_steps]
            uT,m_uT,v_uT = _adam_step(uT,grads['uT'],m_uT,v_uT,*adam)
    #return learned parameters
    return M,W,b,z,uT

In [104]:
# Train for 15 epochs. The parameter names are rebound to the learned values, so re-running
# this cell continues from the previous result instead of starting from the initial values.
M,W,b,seed_weights,unseeded = training_epoch(vectorized_reviews,M,W,b,seed_lists,seed_weights,unseeded,num_epochs=15)
# Final topic matrix built from the learned seed weights and unseeded topics.
T = build_topics(seed_lists,seed_weights,unseeded)

100%|██████████| 27054/27054 [00:02<00:00, 10699.62it/s]
100%|██████████| 542/542 [08:12<00:00,  1.10it/s]
100%|██████████| 542/542 [07:52<00:00,  1.15it/s]
100%|██████████| 542/542 [07:48<00:00,  1.16it/s]
100%|██████████| 542/542 [07:51<00:00,  1.15it/s]
100%|██████████| 542/542 [07:56<00:00,  1.14it/s]
100%|██████████| 542/542 [07:49<00:00,  1.15it/s]
100%|██████████| 542/542 [07:50<00:00,  1.15it/s]
100%|██████████| 542/542 [07:48<00:00,  1.16it/s]
100%|██████████| 542/542 [07:49<00:00,  1.15it/s]
100%|██████████| 542/542 [07:51<00:00,  1.15it/s]
100%|██████████| 542/542 [07:46<00:00,  1.16it/s]
100%|██████████| 542/542 [07:42<00:00,  1.17it/s]
100%|██████████| 542/542 [07:43<00:00,  1.17it/s]
100%|██████████| 542/542 [07:46<00:00,  1.16it/s]
100%|██████████| 542/542 [07:44<00:00,  1.17it/s]


In [43]:
# Show the tokens of one example sentence.
# NOTE(review): this is the same sentence as vectorized_reviews[7666] only while
# vectorized_reviews is still in its original order (i.e. has not been shuffled in place).
reviews[7666]

['With',
 'fast',
 'response',
 'and',
 'superior',
 'professionism',
 'I',
 'think',
 'Promoton',
 'is',
 'the',
 'best',
 'choice']

In [44]:
# Embed the example sentence with the learned attention matrix M.
temp_sentence = vectorized_reviews[7666]
temp=sentence_embedding(temp_sentence,M)
# (disabled) would replace the embedding with its topic-based reconstruction
#temp=reconstruction(temp,W,b,T)

In [45]:
# Attention weight of every word in the example sentence (one [1, 1] entry per word).
embedding_focus(temp_sentence,M)

<tf.Tensor: shape=(13, 1, 1), dtype=float32, numpy=
array([[[1.0841736e-04]],

       [[5.5329616e-05]],

       [[3.4649510e-04]],

       [[1.3073077e-06]],

       [[9.3742259e-05]],

       [[1.3073077e-06]],

       [[9.4188362e-01]],

       [[5.5953301e-02]],

       [[1.3073077e-06]],

       [[8.6980890e-07]],

       [[7.1883419e-06]],

       [[7.5633252e-05]],

       [[1.4715591e-03]]], dtype=float32)>

In [42]:
# Topic distribution of the example sentence.
# NOTE(review): the recorded output has 6 rows while NUMSEEDED+NUMUNSEEDED is 7 in this file,
# so it appears to come from an earlier configuration -- re-run to refresh.
topic_proportion(temp,W,b,T)

<tf.Tensor: shape=(6, 1), dtype=float32, numpy=
array([[9.6039370e-02],
       [1.0486507e-03],
       [8.9238565e-03],
       [1.7518915e-02],
       [2.9405186e-04],
       [8.7617517e-01]], dtype=float32)>

In [94]:
# Attention-weighted embedding of every sentence, in the current order of vectorized_reviews.
# NOTE(review): the recorded progress bar counts 3537 items, not the 27054 vectorized above,
# so this output looks stale -- re-run to refresh.
sentence_embeddings = [sentence_embedding(sentence,M) for sentence in tqdm(vectorized_reviews)]

100%|██████████| 3537/3537 [00:11<00:00, 315.98it/s]


In [95]:
# Despite the name, this holds the topic proportions (output of topic_proportion) of every
# sentence, not the reconstructed vectors; the export loop indexes it as [sentence][topic].
sentence_reconstructions = [topic_proportion(embed,W,b,T) for embed in tqdm(sentence_embeddings)]

100%|██████████| 3537/3537 [00:00<00:00, 6432.64it/s]


In [96]:
# Write one text file per topic, listing every sentence whose proportion for that topic is
# at least 0.4 as: "<sentence index> <tokens joined by spaces> <proportion>".
for i in range(NUMSEEDED+NUMUNSEEDED):
    # `with` closes the file even if a write raises part-way through
    with codecs.open('output/shipping_topic_'+str(i)+".txt",'w','utf-8') as file:
        for j in range(len(reviews)):
            if sentence_reconstructions[j][i]>=0.4:
                file.write(str(j)+' '+ ' '.join(reviews[j]) +' '+str(sentence_reconstructions[j][i].numpy())+'\n')

In [97]:
# Interpret topic 0 by listing the 5 vocabulary words closest to its vector.
print(model.most_similar(positive=[np.array(T[0])],topn=5))

[('customs', 0.9295828938484192), ('spokesman_Kees_Nanninga', 0.7020161747932434), ('Customs', 0.6367698311805725), ('customs_clearance', 0.5555130243301392), ('customs_formalities', 0.5413417220115662)]


In [None]:
# Debug cell: one forward/backward pass over ALL sentences at once (no minibatching),
# with 20 negatives sampled from the plain sentence means.
loss,grads=forward_pass(vectorized_reviews,M,W,b,seed_lists,seed_weights,unseeded,20,[sentence_mean(sentence) for sentence in vectorized_reviews])

In [None]:
# Display the total loss from the debug pass.
loss

In [None]:
# Display the current raw seed weights (one tensor per seeded topic).
seed_weights

In [None]:
# Display the gradient dict (keys 'M','W','b','z','uT') from the debug pass.
grads