In [25]:
import struct 
import pickle
import numpy as np

# Integer type of the keys stored in the saved model: 'I32' (default) or 'I64'.
key_type = 'I32'

# For every supported key type: [struct format character, size in bytes].
key_type_map = {
    "I32": ["I", 4],
    "I64": ["q", 8],
}

# Number of float values stored per embedding vector.
embedding_vec_size = 64

# Bytes per record: two key-sized integers followed by
# embedding_vec_size values of 4 bytes each.
each_key_size = 2 * key_type_map[key_type][1] + 4 * embedding_vec_size

In [26]:
# Show the on-disk size (in bytes) of the saved sparse model file; the next
# cell computes the size expected from the record layout for comparison.
!ls -l ./hugeCTR_saved_model/0_sparse_20000.model

-rw-r--r-- 1 root root 43622568 Aug 10 13:32 ./hugeCTR_saved_model/0_sparse_20000.model


In [27]:
# Expected file size in bytes: one fixed-size record per key. 138493 and 26744
# are the key counts of slot 0 and slot 1 (the same numbers are reported by
# len(embedding_table[0]) and len(embedding_table[1]) once the file is loaded).
each_key_size * (138493+26744) # should equal the size printed by `ls -l` above

43622568

In [28]:
# embedding_table[slot_id] maps a key to its embedding vector (a tuple of floats).
embedding_table = [{}, {}]

# Keys that do not belong to slot 0 are stored shifted by this amount (slot 0
# holds 138493 keys); subtracting it makes those keys 0-based.
slot1_key_offset = 138493

# Pre-compile the two record parts once instead of rebuilding the format
# strings for every record.
key_format, key_bytes = key_type_map[key_type]
header_struct = struct.Struct("2" + key_format)                # (key, slot_id)
values_struct = struct.Struct(str(embedding_vec_size) + "f")   # embedding vector

with open('./hugeCTR_saved_model/0_sparse_20000.model', 'rb') as file:
    while True:
        buffer = file.read(each_key_size)
        if not buffer:
            break  # clean end of file

        # A short read means the file does not contain a whole number of
        # records. Fail loudly rather than printing the error and continuing
        # with a silently incomplete table.
        if len(buffer) != each_key_size:
            raise ValueError(
                "truncated record: expected {} bytes, got {}".format(
                    each_key_size, len(buffer)))

        key, slot_id = header_struct.unpack(buffer[:2 * key_bytes])
        values = values_struct.unpack(buffer[2 * key_bytes:])

        if slot_id == 0:
            embedding_table[slot_id][key] = values
        else:
            embedding_table[slot_id][key - slot1_key_offset] = values

In [29]:
# Number of distinct keys loaded into slot 0.
len(embedding_table[0])

138493

In [30]:
# Number of distinct keys loaded into slot 1.
len(embedding_table[1])

26744

In [31]:
# Dense float64 matrix of the slot-1 vectors: row r holds the vector stored
# under key r, so the keys must be exactly 0 .. len - 1.
item_vectors = embedding_table[1]
item_embedding = np.array(
    [item_vectors[row] for row in range(len(item_vectors))],
    dtype=np.float64,
).reshape(len(item_vectors), embedding_vec_size)

# Nearest neighbor query

In [32]:
from scipy.spatial.distance import cdist

def find_similar_movies(nn_movie_id, item_embedding, k=10, metric="cosine"):
    """Return the row indices of the ``k`` items most similar to one item.

    Similarity is computed as ``1 - distance``, where the distance between the
    query row and every row of ``item_embedding`` comes from
    ``scipy.spatial.distance.cdist``.

    Args:
        nn_movie_id: Row index into ``item_embedding`` of the query item.
        item_embedding: 2-D NumPy array with one embedding vector per row.
        k: Maximum number of indices to return. Values <= 0 give an empty
            result; values larger than the number of rows return all rows.
        metric: Distance metric name passed to ``cdist`` (e.g. "cosine" or
            "euclidean").

    Returns:
        1-D integer array of row indices ordered from most to least similar.
        The query row is one of the candidates, so it normally comes first.
    """
    query = item_embedding[nn_movie_id].reshape(1, -1)
    sim = 1 - cdist(item_embedding, query, metric=metric)

    # ravel() (not squeeze()) keeps the scores 1-D even for a one-row matrix.
    ranked = sim.ravel().argsort()[::-1]

    # Slicing the ascending order with [-k:] would return *every* row for
    # k == 0, so take the head of the descending order with k clamped at zero.
    return ranked[:max(k, 0)]

In [33]:
import pandas as pd

# NOTE: pickle.load can execute arbitrary code; only open a mappings file
# that comes from a trusted source.
with open('./mappings.pickle', 'rb') as handle:
    movies_mapping = pickle.load(handle)["items"]

# nn_to_movies: embedding row index -> movieId; movies_to_nn is its inverse.
nn_to_movies = movies_mapping
movies_to_nn = {movies_mapping[idx]: idx for idx in range(len(movies_mapping))}

movies = pd.read_csv("./data/ml-20m/movies.csv", index_col="movieId")


# Find the nearest neighbors of one movie and print them.
movie_ID = 1
query_row = movies.loc[movie_ID]
print("Query: ", query_row["title"], query_row["genres"])

print("Similar movies: ")
similar_movies = find_similar_movies(movies_to_nn[movie_ID], item_embedding)

for nn_idx in similar_movies:
    neighbor_id = nn_to_movies[nn_idx]
    neighbor_row = movies.loc[neighbor_id]
    print(neighbor_id, neighbor_row["title"], neighbor_row["genres"])

Query:  Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
Similar movies: 
1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
93988 North & South (2004) Drama|Romance
130089 Crazy Beautiful You (2015) (no genres listed)
31424 Alone in the Dark (2005) Action|Horror|Sci-Fi|Thriller
58783 Youth Without Youth (2007) Drama|Romance|Sci-Fi
118816 Public Sex (2009) Comedy|Drama
111993 Minuscule: Valley of the Lost Ants (Minuscule - La vallée des fourmis perdues) (2013) Adventure|Animation|Children
479 Judgment Night (1993) Action|Crime|Thriller
33168 Jiminy Glick in La La Wood (2004) Comedy|Mystery
80574 Saratoga Trunk (1945) Drama|Romance


In [34]:
# Repeat the nearest-neighbor lookup for another movieId.
movie_ID = 1196
query_row = movies.loc[movie_ID]
print("Query: ", query_row["title"], query_row["genres"])

print("Similar movies: ")
similar_movies = find_similar_movies(movies_to_nn[movie_ID], item_embedding)

for nn_idx in similar_movies:
    neighbor_id = nn_to_movies[nn_idx]
    neighbor_row = movies.loc[neighbor_id]
    print(neighbor_id, neighbor_row["title"], neighbor_row["genres"])

Query:  Star Wars: Episode V - The Empire Strikes Back (1980) Action|Adventure|Sci-Fi
Similar movies: 
1196 Star Wars: Episode V - The Empire Strikes Back (1980) Action|Adventure|Sci-Fi
4060 Love Field (1992) Drama
7024 Salo, or The 120 Days of Sodom (Salò o le 120 giornate di Sodoma) (1976) Drama
26603 Prince of Darkness (1987) Fantasy|Horror|Sci-Fi|Thriller
25977 Abbott and Costello Meet Dr. Jekyll and Mr. Hyde (1953) Comedy|Horror|Sci-Fi
8816 Speedway (1968) Action
110004 Next Man, The (1976) Action|Thriller
112749 And So It Goes (2014) Comedy|Drama|Romance
113976 Slow Southern Steel (2011) Documentary
1779 Sphere (1998) Sci-Fi|Thriller


In [35]:
# Sanity check: (number of slot-1 keys, embedding_vec_size).
item_embedding.shape

(26744, 64)

In [36]:
# Number of distinct keys in slot 0 (same value as len(embedding_table[0])).
len(embedding_table[0].keys())

138493