In [11]:
import os
import sys 
sys.path.insert(0, '/Users/Jackie/Work/RecommendationSystem')
from main.collaborative_filtering.itemcf import ItemCF
from main.collaborative_filtering.lfm import LFM
from main.util.data import get_data, train_test_split
from main.util.debug import LogUtil
import numpy as np

In [5]:
# Load the three MovieLens-1M tables (movies, ratings, users) from base_dir.
# Each column list is obtained by splitting a "::"-separated header string.
# NOTE(review): base_dir is an absolute, machine-specific path.
base_dir = "/Users/Jackie/Work/RecommendationSystem/data/ml-1m"
movies = get_data(
    os.path.join(base_dir, "movies.dat"),
    'MovieID::Title::Genres'.split("::"),
)
ratings = get_data(
    os.path.join(base_dir, "ratings.dat"),
    "UserID::MovieID::Rating::Timestamp".split("::"),
)
users = get_data(
    os.path.join(base_dir, "users.dat"),
    "UserID::Gender::Age::Occupation::Zip-code".split("::"),
)

2020-07-20 22:40:08,736 - get_data - INFO - loading data from /Users/Jackie/Work/RecommendationSystem/data/ml-1m/movies.dat with columns ['MovieID', 'Title', 'Genres'] takes 0.022  
2020-07-20 22:40:11,519 - get_data - INFO - loading data from /Users/Jackie/Work/RecommendationSystem/data/ml-1m/ratings.dat with columns ['UserID', 'MovieID', 'Rating', 'Timestamp'] takes 2.779  
2020-07-20 22:40:11,567 - get_data - INFO - loading data from /Users/Jackie/Work/RecommendationSystem/data/ml-1m/users.dat with columns ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'] takes 0.015  


In [20]:
# Split the (UserID, MovieID, Rating) triples into train and test parts.
# NOTE(review): frac=0.2 presumably is the test share (train has 800193 rows
# in the output below) - confirm against train_test_split.
train_data, test_data = train_test_split(
    ratings[['UserID', 'MovieID', 'Rating']],
    frac=0.2,
)

# movieLen_df is a second name for the full ratings table (same object, no copy).
movieLen_df = ratings

2020-07-20 22:48:26,251 - train_test_split - INFO - splitting test and train data takes 70.402


In [7]:
# Preview the first rows of the ratings table (movieLen_df is assigned from `ratings`).
movieLen_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [17]:
# Item-popularity distribution:
#   1. count ratings per movie            -> "Item Popularity"
#   2. count movies per popularity value  -> "counts"
#   3. add the natural log of the popularity value for log-scale inspection
sum_movie_rating = (
    movieLen_df[["UserID", "MovieID"]]
    .groupby(["MovieID"], as_index=False)
    .count()
    .rename(columns={"UserID": "Item Popularity"})
    .groupby("Item Popularity", as_index=False)
    .count()
    .rename(columns={"MovieID": "counts"})
    .assign(log_Item_Popularity=lambda frame: np.log(frame["Item Popularity"]))
)
sum_movie_rating.head()

Unnamed: 0,Item Popularity,counts,log_Item_Popularity
0,1,114,0.0
1,2,89,0.693147
2,3,42,1.098612
3,4,45,1.386294
4,5,39,1.609438


In [21]:
# (rows, columns) of the training split produced by train_test_split.
train_data.shape

(800193, 3)

In [22]:
# Build an ItemCF model backed by a cache file and train it on the training split.
# NOTE(review): the cache is named "LFM.pickle" although the model is ItemCF; the
# log below shows an item_sim_matrix being loaded from it. Consider renaming the
# file, and confirm the cached matrix was computed from this same train split.
cached_filename = "/Users/Jackie/Work/RecommendationSystem/data/LFM.pickle"
model = ItemCF(
    discount_popularity=True,
    filename=cached_filename,
)
model.train(train_data)

2020-07-20 22:49:40,356 - ItemCF - INFO - loaded item_sim_matrix from /Users/Jackie/Work/RecommendationSystem/data/LFM.pickle
2020-07-20 22:49:40,385 - ItemCF - INFO - load data from existing /Users/Jackie/Work/RecommendationSystem/data/LFM.pickle


In [55]:
# Ask the model for recommendations for user '4169'.
# Per the model's log line: N is the number of items to recommend and
# K is the number of similar items consulted.
N, K = 3000, 10
res = model.recommend('4169', N, K)

2020-07-20 23:02:24,849 - ItemCF - INFO - start recommend 4169 with 3000 items via 10 similar items


In [27]:
# Number of test-split ratings per user, listed from fewest to most.
(
    test_data
    .groupby('UserID')['MovieID']
    .count()
    .sort_values()
)

UserID
4332      4
5207      4
4056      4
5215      4
4558      4
       ... 
1181    304
1941    319
4277    349
1680    370
4169    463
Name: MovieID, Length: 6040, dtype: int64

In [73]:
# Display the full result of model.recommend: a dict of string ids -> float scores.
# NOTE(review): this dumps every entry (len(res) is 1851 further down); consider
# showing only a slice.
res

{'3212': 8.611970199972426,
 '2821': 8.611970199972426,
 '1510': 8.611970199972426,
 '3380': 8.611970199972426,
 '660': 8.611970199972426,
 '884': 7.908892769667693,
 '1820': 7.908892769667693,
 '584': 7.908892769667693,
 '3517': 7.908892769667693,
 '701': 7.908892769667693,
 '796': 6.964400678003869,
 '607': 6.859464782416334,
 '1040': 5.870468313881625,
 '1160': 5.859249199465043,
 '3542': 5.758059245498102,
 '3377': 5.618404397087639,
 '712': 5.41174634619074,
 '1548': 5.383172332057536,
 '3413': 5.181018588356526,
 '827': 5.104583352439954,
 '1375': 5.049384451276555,
 '1196': 5.038049852274363,
 '589': 5.035736700536835,
 '2571': 5.019409066470438,
 '1372': 5.001590398086758,
 '260': 4.999088276255265,
 '2811': 4.975568463520773,
 '1240': 4.970208954449478,
 '2916': 4.958030358514288,
 '1070': 4.925134229805946,
 '1022': 4.881580702353534,
 '1527': 4.818635382431637,
 '1200': 4.777046185652466,
 '1291': 4.765972246353574,
 '608': 4.739789013603682,
 '1376': 4.726846496183557,
 '20

In [60]:
# Recommended ids as a sorted list (ids are strings, so the order is lexicographic).
b = sorted(set(res.keys()))

In [64]:
# First ten recommended ids in sorted (lexicographic string) order.
b[:10]

['10', '100', '1003', '1006', '1008', '1009', '1010', '1013', '1016', '1019']

In [62]:
# `a` was not defined by any earlier cell left in this notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All).
# It is rebuilt here as the sorted MovieIDs of user '4169' in the test split,
# which matches the values this notebook displays for a[:10].
# NOTE(review): reconstructed from the saved output - confirm this was the
# original definition of `a`.
a = sorted(test_data.loc[test_data['UserID'] == '4169', 'MovieID'])

In [63]:
# First ten entries of the sorted list `a`.
a[:10]

['1012',
 '1014',
 '1017',
 '1018',
 '1024',
 '1026',
 '1027',
 '1028',
 '1035',
 '1043']

In [65]:
# Number of entries (distinct ids) in the recommendation dict.
len(res)

1851

In [67]:
# Sorted list of the entries the model stores under user '4169' in user2items.
c = sorted(model.user2items.get('4169'))

In [71]:
# Training-split rows of user '4169', ordered by MovieID.
train_data.loc[train_data['UserID'] == '4169'].sort_values(by='MovieID')

Unnamed: 0,UserID,MovieID,Rating
696649,4169,10,4
697532,4169,100,3
697708,4169,1003,3
697711,4169,1006,2
697714,4169,1008,4
...,...,...,...
695937,4169,988,4
695952,4169,991,3
695953,4169,994,4
695954,4169,996,3


In [72]:
# Test-split rows of user '4169', ordered by MovieID.
test_data.loc[test_data['UserID'] == '4169'].sort_values(by='MovieID')

Unnamed: 0,UserID,MovieID,Rating
697728,4169,1012,5
697732,4169,1014,4
697736,4169,1017,4
697737,4169,1018,4
697752,4169,1024,4
...,...,...,...
695910,4169,969,5
695922,4169,972,3
695924,4169,974,4
695934,4169,984,2


In [91]:
from collections import defaultdict

# Re-derive the recommendation scores by hand for one user, to cross-check
# model.recommend: for every item stored for the user, take that item's K most
# similar items and add each similarity to the candidate's running score,
# skipping candidates the user already has.
user = '4169'

# The default must be an empty set *instance*. The original passed the `set`
# type itself, which is not iterable, so an unknown user crashed the loop below
# with TypeError instead of producing no recommendations.
related_items = model.user2items.get(user, set())

recommends = defaultdict(float)
for item in related_items:
    sim_map = model.item_sim_matrix.get(item)
    if not sim_map:
        # No similarity row for this item: it contributes nothing
        # (previously a None here raised AttributeError on .items()).
        continue

    # K nearest neighbours of `item`, highest similarity first.
    top_k = sorted(sim_map.items(), key=lambda x: x[1], reverse=True)[:K]
    for e, sim in top_k:
        if e in related_items:
            continue
        recommends[e] += sim

True
True
True
True
True
True
True
True
True
True
True
True
True


In [88]:
# Item '10' is one of the user's related items, so the loop above must skip it as a candidate.
'10' in related_items

True

In [92]:
# An item the user already has must not show up among the hand-computed recommendations.
'10' in recommends

False

In [99]:
# Scratch experiment on clear(): start from a small list.
# NOTE(review): this rebinds `a`, which earlier cells use for a different list.
a = [1, 2, 3]

In [96]:
# Remove every element from `a` in place.
a.clear()

In [100]:

# Build a new set from the current elements of `a`.
s = set(a)

In [102]:

# Remove every element from the set in place.
s.clear()

In [103]:
# Display the set; after clear() it is empty.
s

set()