# 소비자 구매 데이터를 추천시스템에 적용

In [46]:
import pandas as pd
import numpy as np
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
import heapq
from collections import defaultdict
from operator import itemgetter
from six import iteritems

In [4]:
friday = pd.read_csv('BlackFriday.csv')
friday.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [7]:
friday = friday[['User_ID', 'Product_ID']]
friday['rating'] = 1
friday.head()

Unnamed: 0,User_ID,Product_ID,rating
0,1000001,P00069042,1
1,1000001,P00248942,1
2,1000001,P00087842,1
3,1000001,P00085442,1
4,1000002,P00285442,1


In [8]:
friday.isnull().sum()

User_ID       0
Product_ID    0
rating        0
dtype: int64

In [14]:
friday[friday[['User_ID', 'Product_ID']].duplicated()]

Unnamed: 0,User_ID,Product_ID,rating


In [32]:
reader = Reader(line_format='user item rating')
data = Dataset.load_from_df(df=friday, reader=reader)

In [33]:
trainset = data.build_full_trainset()

In [40]:
len(friday['User_ID'].unique())

5891

In [43]:
len(friday['Product_ID'].unique())

3623

In [51]:
def jaccard_sim(n_x, yr):
    sim = np.zeros((n_x, n_x), np.double)
    for y, y_ratings in iteritems(yr):
        for xi, ri in y_ratings:
            for xj, rj in y_ratings:
                sim[xi, xj] += 1
    np.fill_diagonal(sim, 1)
    return sim

In [52]:
n_x, yr = trainset.n_items, trainset.ur
simsMatrix = jaccard_sim(n_x, yr)
simsMatrix

array([[ 1., 52.,  7., ...,  1.,  0.,  0.],
       [52.,  1., 15., ...,  0.,  0.,  0.],
       [ 7., 15.,  1., ...,  0.,  0.,  0.],
       ...,
       [ 1.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [68]:
testSubject = 1000001
testUserInnerID = trainset.to_inner_uid(testSubject)
testUserRatings = trainset.ur[testUserInnerID]

recommend_row = np.zeros((1, n_x), np.double)
for item, rating in testUserRatings:
    recommend_row += simsMatrix[item]

sorted_item = np.argsort(recommend_row).reshape(-1,)
for i in sorted_item[:5]:
    print(trainset.to_raw_iid(i))

P00091542
P00066342
P00144942
P00299042
P00228842
