In [None]:

!python -m torch.distributed.launch --nproc_per_node=1 --use_env ./ncf.py --epochs 50 --data /data/cache/ml-20m --checkpoint_dir ./saved_model

In [9]:
!rm -rf ./saved_model
!python -m torch.distributed.launch --nproc_per_node=1 --use_env ../ncf.py --epochs 50 --data /data/cache/ml-20m --checkpoint_dir ./saved_model

DLL 2020-08-16 07:08:55.406353 - PARAMETER data : /data/cache/ml-20m  epochs : 50  batch_size : 1048576  valid_batch_size : 1048576  factors : 64  layers : [256, 256, 128, 64]  negative_samples : 4  learning_rate : 0.0045  topk : 10  seed : None  threshold : 1.0  beta1 : 0.25  beta2 : 0.5  eps : 1e-08  dropout : 0.5  checkpoint_dir : ./saved_model  load_checkpoint_path : None  mode : train  grads_accumulated : 1  amp : False  log_path : log.json  world_size : 1  distributed : False  local_rank : 0 
Saving results to ./saved_model
NeuMF(
  (mf_user_embed): Embedding(138493, 64)
  (mf_item_embed): Embedding(26744, 64)
  (mlp_user_embed): Embedding(138493, 128)
  (mlp_item_embed): Embedding(26744, 128)
  (mlp): ModuleList(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=128, bias=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
  )
  (final): Linear(in_features=128, out_features=1, bias=True)
)
31832577 paramete

In [2]:
!ls -l saved_model

total 124356
-rw-r--r-- 1 root root 127332376 Aug 16 06:09 model.pth


In [3]:
import sys
sys.path.insert(0, "/workspace/recommendation")

import torch.jit
import time
from argparse import ArgumentParser
import numpy as np
import torch

from neumf import NeuMF

In [4]:
def parse_args():
    parser = ArgumentParser(description="Benchmark inference performance of the NCF model")
    parser.add_argument('-f', '--file', help='Path for input file. First line should contain number of lines to search in')
    parser.add_argument('--load_checkpoint_path', default=None, type=str,
                        help='Path to the checkpoint file to be loaded before training/evaluation')
    parser.add_argument('--n_users', default=138493, type=int,
                        help='Number of users. Defaults to the number of users in the ml-20m dataset after preprocessing')
    parser.add_argument('--n_items', default=26744, type=int,
                        help='Number of items. Defaults to the number of users in the ml-20m dataset after preprocessing')
    parser.add_argument('-fac', '--factors', type=int, default=64,
                        help='Number of predictive factors')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='Dropout probability, if equal to 0 will not use dropout at all')
    parser.add_argument('--layers', nargs='+', type=int,
                        default=[256, 256, 128, 64],
                        help='Sizes of hidden layers for MLP')
    parser.add_argument('--batch_sizes', default='1,4,16,64,256,1024,4096,16384,65536,262144,1048576', type=str,
                        help='A list of comma-separated batch size values to benchmark')
    parser.add_argument('--num_batches', default=200, type=int,
                        help='Number of batches for which to measure latency and throughput')
    parser.add_argument('--fp16', action='store_true', help='Cast the model to FP16 precision', default=False)
    parser.add_argument('--log_path', default='log.json', type=str,
                        help='Path for the JSON training log')

    return parser.parse_args()

In [18]:
def find_similar_movies(nn_movie_id, item_embedding, item_embedding_norm=None, k=10):
    #if not item_embedding_norm:
    #    item_embedding_norm = np.linalg.norm(item_embedding, axis=1)
    #sim = np.dot(item_embedding, item_embedding[nn_movie_id].reshape(64, 1)).squeeze()/item_embedding_norm
    
    sim = 1-cdist(item_embedding, item_embedding[nn_movie_id].reshape(1, -1), metric="cosine")

    #sim = -cdist(item_embedding, item_embedding[nn_movie_id].reshape(1, -1), metric="euclidean")
    
    return sim.squeeze().argsort()[-k:][::-1]

In [5]:
args = parse_args()

In [15]:
!ls ../saved_model/

model.pth


In [16]:
model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
              mlp_layer_sizes=args.layers, dropout=args.dropout)

model = model.cuda()

state_dict = torch.load("../saved_model/model.pth")
model.load_state_dict(state_dict)

#find nearest neighbor
item_embedding = model.mf_item_embed.weight
item_embedding = item_embedding.detach().cpu().numpy()



# Item-item similarity

In [19]:
import pickle

with open('./mappings.pickle', 'rb') as handle:
    movies_mapping = pickle.load(handle)["items"]

nn_to_movies = movies_mapping
movies_to_nn = {}
for i in range(len(movies_mapping)):
    movies_to_nn[movies_mapping[i]] = i

import pandas as pd
movies = pd.read_csv("/data/ml-20m/movies.csv", index_col="movieId")

movie_ID = 2
print("Query: ", movies.ix[movie_ID]["title"], movies.ix[movie_ID]["genres"])

print("Similar movies: ")
similar_movies = find_similar_movies(movies_to_nn[movie_ID], item_embedding)

for i in similar_movies:
    print(movies.ix[nn_to_movies[i]]["title"], movies.ix[nn_to_movies[i]]["genres"])

Query:  Jumanji (1995) Adventure|Children|Fantasy
Similar movies: 


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


NameError: name 'cdist' is not defined

In [None]:
movie_ID = 1
print("Query: ", movies.ix[movie_ID]["title"], movies.ix[movie_ID]["genres"])

print("Similar movies: ")
similar_movies = find_similar_movies(movies_to_nn[movie_ID], item_embedding)

for i in similar_movies:
    print(movies.ix[nn_to_movies[i]]["title"], movies.ix[nn_to_movies[i]]["genres"])

In [None]:
user_embedding = model.mf_user_embed.weight

In [None]:
user_embedding.shape

In [None]:
#item_embedding = model.mf_item_embed.weight

item_embedding = model.mlp_item_embed.weight

In [None]:
torch.dot(item_embedding[0], item_embedding[1])

In [None]:
#numpy
item_embedding = item_embedding.detach().cpu().numpy()

In [None]:
item_embedding.shape

In [None]:
item_embedding_norm = np.linalg.norm(item_embedding, axis=1)

In [None]:
len(item_embedding_norm)

In [None]:
sim = np.dot(item_embedding, item_embedding[0].reshape(-1, 1)).squeeze()

In [None]:
sim

In [None]:
len(sim/item_embedding_norm)

In [None]:
sim.squeeze().argsort()[-10:][::-1]

In [None]:
from scipy.spatial.distance import cdist

cosine_sim = 1-cdist(item_embedding, item_embedding[0].reshape(1, -1), metric="cosine")


In [None]:
cosine_sim

In [None]:
cosine_sim.shape

In [None]:
cosine_sim.squeeze().argsort()[-10:][::-1]

In [None]:
euclidean_sim = -cdist(item_embedding, item_embedding[0].reshape(1, -1), metric="euclidean")

In [None]:
euclidean_sim

In [None]:
!ls /data/ml-20m

In [None]:
import pandas as pd
movies = pd.read_csv("/data/ml-20m/movies.csv", index_col="movieId")

In [None]:
movies

In [None]:
import pickle

with open('./mappings.pickle', 'rb') as handle:
    movies_mapping = pickle.load(handle)["items"]

In [None]:
movies_mapping

In [None]:
nn_to_movies = movies_mapping

In [None]:
movies_to_nn = {}
for i in range(len(movies_mapping)):
    movies_to_nn[movies_mapping[i]] = i

In [None]:
movies_to_nn

In [None]:
import numpy as np
max(movies_mapping)

In [None]:
sorted(movies_mapping)

In [None]:
movies.ix[2]

In [None]:
def find_similar_movies(nn_movie_id, item_embedding, item_embedding_norm=None, k=10):
    #if not item_embedding_norm:
    #    item_embedding_norm = np.linalg.norm(item_embedding, axis=1)
    #sim = np.dot(item_embedding, item_embedding[nn_movie_id].reshape(64, 1)).squeeze()/item_embedding_norm
    
    sim = 1-cdist(item_embedding, item_embedding[nn_movie_id].reshape(1, -1), metric="cosine")

    #sim = -cdist(item_embedding, item_embedding[nn_movie_id].reshape(1, -1), metric="euclidean")
    
    return sim.squeeze().argsort()[-k:][::-1]

In [None]:
find_similar_movies(1, item_embedding)

In [None]:
movie_ID = 1
print("Query: ", movies.ix[movie_ID]["title"], movies.ix[movie_ID]["genres"])

print("Similar movies: ")
similar_movies = find_similar_movies(movies_to_nn[movie_ID], item_embedding)

for i in similar_movies:
    print(movies.ix[nn_to_movies[i]]["title"], movies.ix[nn_to_movies[i]]["genres"])