In [1]:
import pandas as pd
import numpy as np

# Read the movies and ratings tables from the ml-20m data directory
df_movies = pd.read_csv('data/ml-20m/movies.csv')
df_ratings = pd.read_csv('data/ml-20m/ratings.csv')

# Lookup tables in both directions between a movie's id and its title
movieId_to_name = df_movies.set_index('movieId')['title'].to_dict()
name_to_movieId = df_movies.set_index('title')['movieId'].to_dict()

# Preview each dataframe: 5 rows picked at random (without replacement),
# followed by a line reporting the dataframe's total row count
for df in (df_movies, df_ratings):
    rand_idx = np.random.choice(len(df), 5, replace=False)
    display(df.iloc[rand_idx])
    print("Displaying 5 of the total " + str(len(df)) + " data points")

Unnamed: 0,movieId,title,genres
12382,57439,"Sensation of Sight, The (2006)",Drama
16793,84871,At Long Last Love (1975),Comedy|Musical|Romance
6363,6473,Half Moon Street (1986),Drama|Thriller
5396,5493,In Like Flint (1967),Action|Adventure|Comedy
6454,6564,Lara Croft Tomb Raider: The Cradle of Life (2003),Action|Adventure|Comedy|Romance|Thriller


Displaying 5 of the total 27278 data points


Unnamed: 0,userId,movieId,rating,timestamp
4386958,29942,17,2.0,832132207
14894972,102907,27772,4.0,1186478620
7385822,50943,5945,4.0,1059948684
1722316,11638,1223,4.5,1354576095
3927935,26821,930,2.0,974784344


Displaying 5 of the total 20000263 data points


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the ratings as a test set; the split is stratified on userId
# and uses a fixed random_state so it is repeatable
df_ratings_train, df_ratings_test = train_test_split(
    df_ratings,
    test_size=0.30,
    random_state=15688,
    stratify=df_ratings['userId'],
)

In [3]:
# Report how many ratings ended up on each side of the split
print("Number of training data: " + str(len(df_ratings_train)),
      "Number of test data: " + str(len(df_ratings_test)),
      sep="\n")

Number of training data: 14000184
Number of test data: 6000079


In [10]:
# Every user has two lists, one for liked list one for unliked list
def rating_splitter(df, like_threshold=4):
    """Split each user's rated movies into a "liked" and an "unliked" list.

    A rating counts as liked when it is ``>= like_threshold``. Rows are grouped
    by the pair (liked flag, userId) and one list of movie ids is produced for
    every pair that actually occurs, so a user contributes at most two lists.

    The input frame is left untouched: no ``liked`` column is added to it and
    its ``movieId`` column keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``userId``, ``movieId`` and ``rating``.
    like_threshold : number, default 4
        Minimum rating for a movie to be placed in the user's liked list.

    Returns
    -------
    list of list of str
        Movie ids converted to strings. Groups come out in sorted
        (liked, userId) order, i.e. every unliked list first and then every
        liked list; inside a list the ids keep the row order of ``df``.
    """
    # Work on derived arrays/Series instead of assigning columns on `df`, so the
    # caller's frame is not modified as a side effect.
    liked = np.where(df['rating'] >= like_threshold, 1, 0)
    movie_ids = df['movieId'].astype('str')

    # Plain arrays are used as grouping keys so they are matched to the rows by
    # position; iterating the groupby visits each group exactly once.
    grouped = movie_ids.groupby([liked, df['userId'].values])
    return [ids.tolist() for _, ids in grouped]

In [11]:
# Turn off pandas' chained-assignment warning (a global option, so it stays off
# for later cells too), then build the per-user movie lists from the training ratings
pd.set_option('mode.chained_assignment', None)
splitted_movies = rating_splitter(df_ratings_train)

In [12]:
import warnings
# Ignore UserWarning messages attributed to gensim modules. The filter is
# installed before the import below so it is already active while gensim loads.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
# Stop the notebook here unless gensim reports a FAST_VERSION greater than -1.
# NOTE(review): presumably -1 means gensim's compiled (fast) training routines
# are unavailable — confirm against the gensim documentation.
assert gensim.models.word2vec.FAST_VERSION > -1

In [26]:
import random
# Draw 2700 of the per-user movie lists and shuffle the movies inside each one.
# A dedicated, seeded generator keeps the sampled corpus identical on every
# Restart & Run All and leaves the global `random` state alone.
sampling_rng = random.Random(15688)
sampled_list = [
    # sample(x, len(x)) returns a shuffled *copy*, so the lists that are still
    # referenced from splitted_movies are not reordered in place
    sampling_rng.sample(movie_list, len(movie_list))
    for movie_list in sampling_rng.sample(splitted_movies, 2700)
]
print(len(splitted_movies))
print(len(sampled_list))

276252
2700


In [28]:
from gensim.models import Word2Vec
import datetime
start = datetime.datetime.now()

# Train the embeddings: every sampled movie list is fed to Word2Vec as one "sentence"
model = Word2Vec(
    sentences=sampled_list,  # the pre-processed list of movie lists
    sg=1,                    # training algorithm: 1 selects skip-gram
    hs=0,                    # set to 0 because negative sampling is used instead
    negative=5,              # > 0 turns negative sampling on; a value of 5 is used
    size=200,                # size of the hidden layer
    window=9999999,          # NOTE(review): presumably chosen to exceed any list length so a whole list is one context — confirm
    min_count=10,            # movies occurring fewer than 10 times in the corpus are dropped
    iter=5,                  # number of epochs
    workers=4,               # number of threads used for training
)

print("Time passed: " + str(datetime.datetime.now() - start))
# Saving is left disabled:
#Word2Vec.save('item2vec_20180327')
# NOTE(review): save presumably has to be called on the trained instance (model.save(...)) — confirm before enabling

Time passed: 0:00:24.769863


In [33]:
# Keep a handle on the trained model's vector store (`model.wv`) for the lookups below
item_embeddings = model.wv

In [41]:
# Use the built-in lookup methods of the embeddings for movie id '2991'
# NOTE(review): only the last expression of a cell is displayed, so the
# most_similar() result below is computed but never shown.
item_embeddings.most_similar(['2991'])
# Embedding vector for movie id '2991' (this is the cell's displayed output)
item_embeddings.get_vector('2991')

array([-0.16706412,  0.1514451 ,  0.06940889,  0.07019334, -0.13757634,
        0.09387428,  0.00123613,  0.09886783, -0.07118744,  0.12420377,
        0.14399597,  0.10212857,  0.13656957, -0.28205237, -0.02345779,
        0.11527677, -0.09361482,  0.0119391 ,  0.09717289, -0.04909069,
       -0.1251151 ,  0.10618363,  0.0265832 ,  0.00477338, -0.05524788,
        0.19301237, -0.11246625, -0.21516363,  0.08366182, -0.05144181,
        0.04617369, -0.02190979, -0.26778272,  0.1034494 , -0.01681721,
       -0.02255436,  0.04810009,  0.08444884,  0.0529722 ,  0.18078963,
        0.14825454,  0.02690773,  0.04579838, -0.01374364, -0.08007074,
       -0.00365069,  0.12839739,  0.10422312,  0.15239675,  0.00302883,
        0.14977047, -0.09625427,  0.00238035, -0.08291376, -0.14186817,
        0.16200133,  0.14228036,  0.01923411, -0.08890486,  0.03199152,
       -0.12115943,  0.17326094, -0.10315508, -0.08473594, -0.17716485,
       -0.02510586,  0.06271847, -0.14564277,  0.04157463,  0.34

In [42]:
# Map every movie id in the model's vocabulary to its embedding vector
my_dict = {key: model.wv[key] for key in item_embeddings.vocab}

In [49]:
# Show the movie id -> embedding vector dictionary
# NOTE(review): this dumps every vector in full; consider previewing a few entries instead
my_dict

{'1379': array([-2.35922504e-02, -1.60996899e-01,  1.03055686e-01, -4.92235050e-02,
        -2.75021762e-01, -3.32598612e-02,  3.06628849e-02, -1.02417879e-01,
        -5.31893037e-02,  3.73870209e-02,  9.26905051e-02,  9.45780519e-03,
         7.19769374e-02, -1.20022632e-01,  1.59848511e-01,  1.06880523e-01,
        -1.22125201e-01,  9.08388346e-02,  2.17020437e-01,  6.67973682e-02,
         5.09610251e-02,  2.51259387e-01,  1.86100472e-02,  7.01979622e-02,
        -3.61365862e-02, -1.39900938e-01, -1.46408230e-01,  3.95298414e-02,
        -1.86425094e-02,  1.15931809e-01, -2.46268585e-01,  7.30087310e-02,
        -1.27055421e-01, -1.54250100e-01, -1.34079233e-01, -1.20222598e-01,
        -2.09941357e-01, -1.22467704e-01,  1.40473455e-01,  1.42526403e-01,
         2.55288690e-01, -5.10383882e-02, -2.66858667e-01, -4.21452612e-01,
         1.05125941e-01,  2.32708827e-01, -1.55945225e-02,  1.47428349e-01,
        -1.43745512e-01,  1.32046431e-01, -2.33201191e-01,  4.79006097e-02,
    

In [47]:
# Build a table with one row per dictionary key (orient='index') and one column per vector component
# NOTE(review): `df` is also the loop variable of the preview loop in the first cell; this rebinds it
df = pd.DataFrame.from_dict(my_dict,  orient='index')

In [48]:
# Display the embedding table (rows are indexed by movie id)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
1379,-0.023592,-0.160997,0.103056,-0.049224,-0.275022,-0.033260,0.030663,-0.102418,-0.053189,0.037387,...,-0.286790,-0.116246,0.188010,-0.088002,0.106681,0.016855,-0.039595,0.019194,0.099640,0.067198
2991,-0.167064,0.151445,0.069409,0.070193,-0.137576,0.093874,0.001236,0.098868,-0.071187,0.124204,...,-0.089231,0.126388,0.029537,-0.110578,0.031564,0.118090,-0.063606,0.044292,0.005581,0.033703
329,-0.004326,0.033412,0.128539,0.157540,-0.069287,-0.412511,-0.108517,0.095686,-0.159460,-0.023369,...,-0.080103,-0.006913,-0.160744,0.010921,-0.043623,0.115135,-0.036965,0.080481,-0.391659,-0.088664
2722,-0.050445,-0.113272,0.043725,-0.042658,-0.007289,-0.024678,0.020870,0.167174,-0.140193,-0.032033,...,0.011392,-0.012428,-0.162258,-0.065964,-0.082638,0.042300,-0.055497,-0.058397,-0.195030,-0.001469
1240,-0.107508,-0.012275,-0.002032,-0.019153,-0.181822,-0.072768,-0.147076,0.048370,-0.003338,0.025225,...,-0.129167,0.065244,-0.000609,-0.117576,0.029564,0.042286,-0.202108,-0.089422,0.057128,-0.023457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5296,-0.071316,-0.260221,0.249639,0.035986,-0.256356,0.030883,-0.077751,-0.054574,-0.147421,-0.107786,...,0.082112,0.018038,-0.252532,-0.317374,-0.052163,-0.178625,0.137790,0.190010,0.039525,0.187531
6595,-0.352974,-0.009408,0.309246,0.019282,-0.304302,-0.163986,-0.028036,0.064279,-0.053205,-0.138728,...,0.203658,-0.071585,-0.201682,-0.251072,-0.345726,-0.050247,-0.012828,0.148783,-0.023225,0.011725
1268,-0.040126,0.034807,0.004430,-0.021378,-0.137350,-0.007811,-0.041850,-0.019635,-0.002238,0.050778,...,-0.226073,-0.110431,0.274995,0.051850,0.131792,-0.068844,0.019978,0.121521,0.171526,0.084821
2579,0.064013,0.437011,-0.080528,0.026785,-0.246758,-0.100551,-0.038227,0.151160,0.068889,-0.094297,...,-0.058310,-0.028667,0.081755,-0.140365,0.166512,-0.296749,-0.037051,0.207333,0.085112,0.257403
