In [1]:
import pandas as pd
import numpy as np
import scipy
import math
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import nltk

# English stop words from NLTK, stored as a set because the notebook only uses
# it for membership tests when filtering tokens (Sentence.tokens_without_stop).
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm on a fresh environment.
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    """A raw sentence together with its lower-cased NLTK tokens.

    Attributes set by the constructor:
        raw: the sentence exactly as it was passed in.
        tokens: lower-cased tokens from ``nltk.word_tokenize``, computed after
            curly single quotes were replaced by ASCII apostrophes.
        tokens_without_stop: ``tokens`` with every member of ``STOP`` removed.
    """

    def __init__(self, sentence):
        self.raw = sentence

        # Straighten typographic single quotes before tokenizing.
        straightened = sentence
        for curly_quote in ("‘", "’"):
            straightened = straightened.replace(curly_quote, "'")

        lowered_tokens = []
        for token in nltk.word_tokenize(straightened):
            lowered_tokens.append(token.lower())
        self.tokens = lowered_tokens

        self.tokens_without_stop = list(
            filter(lambda token: token not in STOP, lowered_tokens)
        )


In [5]:
import gensim

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

# Location of the pre-trained word2vec binary, relative to the working directory.
PATH_TO_WORD2VEC = "embeds/GoogleNews-vectors-negative300.bin"
# GloVe source path (currently disabled):
# PATH_TO_GLOVE = os.path.expanduser("~/data/glove/glove.840B.300d.txt")

# Load the vectors stored in word2vec binary format (binary=True).
# NOTE(review): the saved output of this cell is a FileNotFoundError for this
# path, so the file has to be placed under embeds/ before the cell can run.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

FileNotFoundError: [Errno 2] No such file or directory: 'embeds/GoogleNews-vectors-negative300.bin'

In [None]:

# Source GloVe text file. It is defined in this cell because the only other
# assignment visible in the notebook is commented out, which made this cell
# fail with a NameError on a fresh kernel.
PATH_TO_GLOVE = os.path.expanduser("~/data/glove/glove.840B.300d.txt")

# Convert the GloVe text file into word2vec text format, then load the
# converted copy as KeyedVectors.
tmp_file = "/tmp/glove.840B.300d.w2v.txt"
glove2word2vec(PATH_TO_GLOVE, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

In [11]:
import json
from collections import Counter,defaultdict
def read_json(path,f):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Directory that contains the file.
        f: Name of the file inside ``path``.

    Returns:
        A list holding one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    cur_path = os.path.join(path, f)
    records = []
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(cur_path, encoding="utf-8") as json_file:
        for line in json_file:
            # Blank lines (e.g. an extra newline at the end of the file) carry
            # no record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records
def save_json(path,data):
    """Serialize ``data`` as a single JSON document into the file at ``path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(path, mode='w') as out_file:
        json.dump(data, out_file)
def process_data(json_data):
    """Count, per search term, which titles were tuned to after the search.

    Only the first element of ``json_data`` is consumed; it is iterated as a
    list of sessions, each a dict with an ``"events"`` list. For every session
    the last ``"vrexSearch"`` event is located, then the first
    ``"mediaTuneMetrics"`` event at or after it. When both exist, the pair
    (search ``"term"``, tune ``"title"``) is counted once for that session.

    Args:
        json_data: List of parsed records as returned by ``read_json``. An
            empty list yields an empty result.

    Returns:
        A nested ``defaultdict``: ``result[term][title]`` is the number of
        sessions in which that term led to that title.

    Side effects:
        Prints the number of distinct search terms found.
    """
    query_to_titles = defaultdict(lambda: defaultdict(int))
    # Guard against empty input, which previously raised IndexError.
    sessions = json_data[0] if json_data else []
    for session in sessions:
        events = session["events"]

        # Index of the last search event in the session (-1 if there is none).
        last_vrex_search = -1
        for idx, event in enumerate(events):
            if event["eventType"] == "vrexSearch":
                last_vrex_search = idx
        if last_vrex_search < 0:
            continue

        # First tune event that follows that search (-1 if there is none).
        tune = -1
        for idx in range(last_vrex_search, len(events)):
            if events[idx]["eventType"] == "mediaTuneMetrics":
                tune = idx
                break
        if tune < 0:
            continue

        query = events[last_vrex_search]["term"]
        title = events[tune]["title"]
        query_to_titles[query][title] += 1
    print(len(query_to_titles))
    return query_to_titles

In [12]:
# Session log to analyse; read_json joins the directory and the file name.
FOLDER_PATH = "hailong_data/07_30"
FILE_NAME = "session-vrex-tune-2019-07-30-13.json"
json_data = read_json(FOLDER_PATH,FILE_NAME)
# Nested counts: search term -> tuned title -> number of sessions.
query2program = process_data(json_data)

18327


In [13]:
# Distinct search terms, in the dict's iteration order. Positions in this list
# are the indices used for the embeddings and distances computed from it.
queries = list(query2program)

In [8]:
import tensorflow_hub as hub

# Restrict TensorFlow logging to ERROR-level messages.
tf.logging.set_verbosity(tf.logging.ERROR)
# Universal Sentence Encoder, version 1, loaded from TF Hub as a hub.Module.
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")

In [9]:
# Graph input: a 1-D batch of strings whose length is left unspecified.
sentences = tf.placeholder(dtype=tf.string, shape=[None])
# Graph op that applies the hub module to whatever is fed into `sentences`.
embedding_fun = embed(sentences)

In [15]:
with tf.Session() as sess:
    # Initialise variables and lookup tables before evaluating the module.
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    # Embed every query in a single run; row i corresponds to queries[i].
    context_embed = sess.run(embedding_fun, feed_dict={sentences: queries})

In [18]:
context_embed.shape

(18327, 512)

In [20]:
def compute_euclidean_dist(embed):
    """Return the pairwise distance matrix between the rows of ``embed``.

    Uses ``scipy.spatial.distance_matrix`` with its default settings, so entry
    ``[i, j]`` is the Euclidean distance between row ``i`` and row ``j``.

    Note: the parameter name shadows the notebook-level ``embed`` hub module
    inside this function only.
    """
    from scipy.spatial import distance_matrix

    return distance_matrix(embed, embed)
# Distances between every pair of query embeddings (one row/column per query).
euclidean_dist_matrix = compute_euclidean_dist(context_embed)

In [28]:
def pick_most_similar(src_sent, dist_mat, ret_count=10, threshold=None):
    """Return the nearest neighbours of one item in a pairwise distance matrix.

    Args:
        src_sent: Row index of the query item in ``dist_mat``.
        dist_mat: Square matrix where ``dist_mat[i][j]`` is the distance
            between items ``i`` and ``j``.
        ret_count: Maximum number of neighbours to return.
        threshold: If given, only neighbours with distance strictly below this
            value are kept (applied after the ``ret_count`` cut).

    Returns:
        ``(indices, distances)``: two aligned arrays ordered by increasing
        distance. The query item itself is never included.
    """
    row = np.asarray(dist_mat[src_sent])
    # Resolve negative indices so the self-exclusion below matches the row read.
    self_idx = src_sent if src_sent >= 0 else src_sent + row.shape[0]

    # Stable sort keeps tied items in index order. The query is removed by
    # index rather than by dropping the first position, because another item
    # at distance 0 could otherwise push the query into the results.
    order = np.argsort(row, kind="stable")
    order = order[order != self_idx][:ret_count]
    dists = row[order]

    if threshold is not None:
        keep = dists < threshold
        return order[keep], dists[keep]
    return order, dists

In [30]:
queries[1]

'animal planet'

In [39]:
# Up to 20 nearest neighbours of query index 1: d1 = indices, d2 = distances.
d1,d2 = pick_most_similar(1,euclidean_dist_matrix,20)

In [40]:
# Show the query text behind each neighbour index.
for each in d1:
    print(queries[each])

animal planet animal planet
the animal planet
animal planet shows
animal animal planet
animal planet channel
nickelodeon animal planet
animal planet on demand
show me animal planet
free animal planet
animal planet hd
animal planet on demand shows
find animal planet
animal channel
find the animal planet
turn to animal planet
put on animal planet
let's watch animal planet
find animal planet channel
go to animal planet
show me the animal planet


In [41]:
from nltk.cluster import KMeansClusterer
import nltk

In [52]:
# Number of clusters for the NLTK run (NUM_CLUSTERS is reassigned by the
# scikit-learn cell further down).
NUM_CLUSTERS=900
# Alternative metric that could be passed as `distance`: cosine_distance
# NLTK k-means over the query embeddings with Euclidean distance, 25 repeats.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so
# `assigned_clusters` may not exist after a fresh top-to-bottom run.
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.euclidean_distance, repeats=25)
assigned_clusters = kclusterer.cluster(context_embed, assign_clusters=True)

KeyboardInterrupt: 

In [47]:
queries[1]

'animal planet'

In [51]:
queries[6]

'paw patrol'

In [None]:
assigned_clusters

In [73]:
from sklearn import cluster
from sklearn import metrics
NUM_CLUSTERS = 100
# Fixed seed so the centroid initialisation, and therefore the cluster ids
# inspected in later cells, is the same on every run.
KMEANS_SEED = 0
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, random_state=KMEANS_SEED)
kmeans.fit(context_embed)

# labels[i] is the cluster id of row i of context_embed; centroids holds one
# centre vector per cluster.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

In [83]:
centroids.shape

(100, 512)

In [81]:
# Inspect one cluster: choose a label and list the queries assigned to it.
cur_label=5
# Positions whose k-means label equals cur_label.
indexes = np.where(labels==cur_label)
np.array(queries)[indexes]

array(['battlebots', 'american ninja warrior junior', 'power rangers',
       'american ninja warrior', 'smurfs english',
       'mel mao mao: heroes of pure heart',
       'mao mao: heroes of pure heart', 'power rangers dino charge',
       'power rangers ninja steel', 'knight rider', 'heroes & icons',
       'my hero academy', 'hero 6', 'find heroes & icons',
       "ok k.o.! let's be heroes", 'ninja warrior', 'play power rangers',
       'find american ninja warrior', 'power ranger sabins',
       'the strongest man in history', 'american ninja junior',
       'american ninja warrior ninja warrior', 'ninja vs. tyler on',
       'sergeant stubby an american hero', 'dildo heroes',
       "world's deadliest", 'hero girls', 'ninjago masters of spinjitzu',
       'descendents marathon', 'sanban power rangers',
       'aries spears comic blueprint', 'power ranger shows',
       'watch american ninja warrior', 'beyblade: shogun steel',
       'alexa go told american ninja warrior',
       

In [82]:
centroids[cur_label]

array([-3.79818156e-02, -2.01500505e-02, -2.73027420e-02, -2.30461638e-03,
        1.54417623e-02, -1.45422099e-02,  2.74933176e-04, -1.56202996e-02,
       -4.78789173e-02, -1.04728490e-02,  2.39983723e-02, -1.68087352e-02,
        6.29486069e-02, -1.48629490e-02, -5.60750440e-02,  4.23770398e-03,
        3.96860465e-02, -4.02539298e-02,  2.36671045e-02, -6.15866482e-03,
       -1.57597363e-02, -3.66673097e-02,  1.98372398e-02, -3.62982787e-02,
        2.44265273e-02, -4.44332510e-02,  9.20078252e-03, -3.34316418e-02,
       -5.67909442e-02, -5.17232530e-03,  8.20795074e-04, -9.94462986e-03,
       -3.20194550e-02, -8.36161617e-03,  7.07787136e-03,  1.46035198e-03,
        3.01203150e-02,  2.35912763e-02, -2.92529166e-02, -2.70215180e-02,
       -5.36955707e-03,  5.25475852e-03,  1.49889234e-02,  3.81433740e-02,
        1.03313625e-02,  1.33065972e-02, -2.15592738e-02, -3.80303827e-03,
       -4.20448557e-03, -1.54106664e-02, -3.60529348e-02, -4.32662433e-03,
        4.99712043e-02,  