In [43]:
import pandas as pd
import numpy as np
import math
from math import sqrt
from sklearn.model_selection import train_test_split
import random
import time
from operator import itemgetter
from gensim.models import word2vec
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import correlation, cosine
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from keras.layers import Dense,Embedding,Input,Flatten,dot,BatchNormalization,LeakyReLU,Dropout,concatenate,multiply
from keras.losses import mean_squared_error
from keras.models import Model,load_model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.utils import plot_model

## load data


In [37]:
data = pd.read_csv('ml-1m/ratings.dat',sep='::').sample(500000)
data.columns=['user_id','movie_id','rating','timestamp']
data = data.sort_values(by=['user_id','timestamp'])
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,movie_id,rating,timestamp
30,1,3186,4,978300019
26,1,1721,4,978300055
36,1,1022,5,978300055
23,1,2340,3,978300103
35,1,1836,5,978300172


## train/test splitting
uses the most recent top 5 item as target data

In [9]:
train_df = pd.DataFrame()
test_df = pd.DataFrame()
for user, df in data.groupby('user_id'):
    train_df = pd.concat([train_df,df.head(len(df)-5)],ignore_index=True)
    test_df = pd.concat([test_df,df.tail(5)], ignore_index=True)   
train_df.head()
test_df.head()

Unnamed: 0,user_id,movie_id,rating,datetime,month,day,hour,minute,second
0,1,1566,4,1970-01-12 02:53:44,1,12,2,53,44
1,1,588,4,1970-01-12 02:53:44,1,12,2,53,44
2,1,1907,4,1970-01-12 02:53:44,1,12,2,53,44
3,1,783,4,1970-01-12 02:53:44,1,12,2,53,44
4,1,1,5,1970-01-12 02:53:44,1,12,2,53,44


In [10]:
train_df["movie_id"] = train_df["movie_id"].astype('str')
test_df["movie_id"] = test_df["movie_id"].astype('str')

In [11]:
def df2_item_dict(df):
    df_group = df.groupby("user_id").agg({"movie_id": lambda x: list(x.unique())})
    df_group.columns = ["movie_list"]
    data_dict = df_group.to_dict(orient='index')
    data_dict = {k: v["movie_list"] for k, v in data_dict.items()}
    return data_dict

In [12]:
train_dict = df2_item_dict(train_df)
test_dict = df2_item_dict(test_df)

## training

In [14]:
texts = list(train_dict.values())
model = word2vec.Word2Vec(texts,min_count=1, workers=4,size = 64)
#model.most_similar('1',topn=5)

In [16]:
from operator import itemgetter
def GetRecommendation(user,n):
    seen = train_dict[user]
    recom_list = []
    for item in seen:
        if item not in model:
            continue
        recom_list.extend(model.most_similar(item, topn=n))
    recom_list = [item for item in recom_list if item[0] not in seen]
    recom_list = sorted(recom_list,key=itemgetter(1), reverse=True)
    recom = recom_list[:5]
    return recom

## evaluation metrix
1. Recall
2. Precision

In [21]:
class Eval():
    def __init__(self, train, test, GetRecommendation,N):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        self.N =N

    def Recall(self):
        hit = 0
        all = 0
        for user,items in self.test.items():
            tu = items
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += len(tu)
        return round(hit / (all * 1.0), 2)

    def Precision(self):
        hit = 0
        all = 0
        for user in self.test.keys():
            tu = set(self.test[user])
            rank = self.GetRecommendation(user, self.N)
            for item,pui in rank:
                if item in tu:
                    hit += 1
            all += self.N
        return round(hit / (all * 1.0), 2)
    

    def eval(self):
        metric = {'Recall': self.Recall(),'Precision': self.Precision()}
        print('Metric:', metric)
        return metric

## result 

### embedding

In [22]:
N=5
eval_ = Eval(train_dict,test_dict,GetRecommendation,N)
metric = eval_.eval()

  
  


Metric: {'Recall': 0.01, 'Precision': 0.01}


### collabrative filtering

In [39]:
table = pd.pivot_table(data, index = 'user_id',columns = 'movie_id',values = 'rating')
table = table.fillna(0)
table.head(5)

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
cosine_sim = 1 - pairwise_distances(table, metric="cosine")
cosine_sim_df = pd.DataFrame(cosine_sim)
cosine_sim_df *= np.tri(*cosine_sim_df.values.shape, k=-1).T
cosine_sim_df = cosine_sim_df.stack()
cosine_sim_df = cosine_sim_df.reindex(cosine_sim_df.abs().sort_values(ascending=False).index).reset_index()
cosine_sim_df.columns = ["User_A", "User_B", "Cosine_Sim"]

In [61]:
def Recall():
    hit = 0
    all = 0
    for user,items in test_dict.items():
        tu = items
        rank = cosine_sim_df.iloc[sorted(np.concatenate((np.where(cosine_sim_df['User_A'] == which_user)[0][0:number_of_similar_users],np.where(cosine_sim_df['User_B'] == which_user)[0][0:number_of_similar_users])))][0:number_of_similar_users].User_B
        for item in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return round(hit / (all * 1.0), 2)

In [54]:
# model.most_similar('1',topn=5)

# which_user = 1
# number_of_similar_users = 5
# cosine_sim_df.iloc[sorted(np.concatenate((np.where(cosine_sim_df['User_A'] == which_user)[0][0:number_of_similar_users],np.where(cosine_sim_df['User_B'] == which_user)[0][0:number_of_similar_users])))][0:number_of_similar_users]