# 라이브러리 로드

In [None]:
!pip install implicit
!pip install fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit

# 데이터 로드

추천에 필요한 컬럼만 가져오겠습니다.

In [None]:
# Load the October 2019 event log: try the local Windows directory first and
# fall back to the Google Drive path used on Colab.
try:
  path = 'C:/Users/User/Desktop/AIB_13/CP2/data/'
  df = pd.read_parquet(path + 'light_2019-Oct.parquet', engine='fastparquet')
except OSError:
  # Fall back only when the local file cannot be opened. A bare `except`
  # would also swallow KeyboardInterrupt and hide unrelated failures.
  path = '/content/drive/MyDrive/CP2/data/'
  df = pd.read_parquet(path + 'light_2019-Oct.parquet', engine='fastparquet')


In [None]:
# Keep only the three columns used to build the recommender.
data = df.loc[:, ['user_id', 'product_id', 'event_type']]

# 데이터 전처리

In [None]:
data.head()

Unnamed: 0,user_id,product_id,event_type
0,541312140,44600062,view
1,554748717,3900821,view
2,519107250,17200506,view
3,550050854,1307067,view
4,535871217,1004237,view


In [None]:
data['user_id'].nunique()

3022290

약 3백만 명의 유저 중 view 이력만 있는 유저를 가져오겠습니다.

먼저 cart, purchase 이력이 있는 유저들의 id값을 받아옵니다.

In [None]:
# user_id of every row whose event is not a plain view (one entry per such
# row, so ids may repeat).
drop_user_id = data['user_id'][data['event_type'] != 'view']

user_id를 기준으로 조회할 때 위에서 받아온 cart, purchase 이력이 있는 유저들은 제외하고 받아옵니다.

In [None]:
# Keep rows only for users that never appear in drop_user_id. The previous
# row labels are kept as an 'index' column (drop=False).
data = data[~data['user_id'].isin(drop_user_id)].reset_index(drop=False)

data.head()

Unnamed: 0,index,user_id,product_id,event_type
0,1,554748717,3900821,view
1,2,519107250,17200506,view
2,3,550050854,1307067,view
3,4,535871217,1004237,view
4,6,555447699,17300353,view


In [None]:
data['event_type'].unique()

['view']
Categories (3, object): ['cart', 'purchase', 'view']

event_type을 확인해보니 view만 존재하는 것을 확인할 수 있습니다.

In [None]:
data['event_type'].dtype

CategoricalDtype(categories=['cart', 'purchase', 'view'], ordered=False)

In [None]:
# Replace the categorical event_type column with a plain object column.
data['event_type'] = data['event_type'].astype(object)

event_type을 object 형태로 변경합니다.

유저가 제품을 몇번 보았는지 확인합니다.

In [None]:
# Group the event_type column by (user, product) pair.
grouped = data.groupby(by=['user_id', 'product_id'])['event_type']

In [None]:
# Count events per (user, product) pair and turn the group keys back into
# ordinary columns.
data = grouped.count().reset_index()

In [None]:
# Order the rows by user id, ascending.
data = data.sort_values(by='user_id', ascending=True)

In [None]:
# The counted event_type column now holds the number of views.
data = data.rename({'event_type': 'view_counts'}, axis='columns')

In [None]:
data.head()

Unnamed: 0,user_id,product_id,view_counts
0,33869381,7002639,1
1,64078358,10600284,1
2,183503497,22200103,1
3,184265397,6902133,2
4,184265397,6902303,2


# Item Table(product lookup) 테이블 만들기

In [None]:
# One row per product_id (first occurrence in df), ordered by product_id.
# The index is renumbered before sorting, so it is not in sorted order.
product_lookup = (
  df[['product_id', 'category_code', 'brand']]
  .drop_duplicates(subset='product_id')
  .reset_index(drop=True)
  .sort_values(by='product_id')
)
product_lookup.head()

Unnamed: 0,product_id,category_code,brand
151915,1000978,electronics.smartphone,
8437,1001588,electronics.smartphone,meizu
85152,1001606,electronics.smartphone,apple
32556,1002042,electronics.smartphone,samsung
9400,1002062,electronics.smartphone,samsung


# Rating Matrix 만들기

In [None]:
# Number of distinct users and distinct products; these become the
# dimensions of the rating matrix.
num_user, num_item = (data[column].nunique() for column in ('user_id', 'product_id'))
num_user, num_item

(2540832, 159298)

약 254만 명의 유저와 약 16만 개의 제품이 있습니다.

In [None]:
# Sorted unique ids. pandas assigns category codes in sorted order of the
# values, so position i in each list corresponds to category code i.
# (products was previously left unsorted and did not line up with `cols`.)
users = list(np.sort(data['user_id'].unique()))
products = list(np.sort(data['product_id'].unique()))
counts = list(data['view_counts'])

# Row/column positions for the sparse matrix. Each column is converted to
# category once and the codes are reused for the lookup columns.
rows = data['user_id'].astype('category').cat.codes
data['user_id_codes'] = rows

cols = data['product_id'].astype('category').cat.codes
data['product_id_codes'] = cols

len(users), len(products), len(counts)

(2540832, 159298, 15364927)

In [None]:
# Users are rows, products are columns, and each stored value is a view count.
user_item_matrix = sparse.csr_matrix(
  (counts, (rows, cols)),
  shape=(num_user, num_item),
)
user_item_matrix

<2540832x159298 sparse matrix of type '<class 'numpy.int64'>'
	with 15364927 stored elements in Compressed Sparse Row format>

희소성 확인

In [None]:
# Sparsity = percentage of user/product cells that hold no view count.
num = user_item_matrix.nonzero()[0].size
matrix_size = user_item_matrix.shape[0] * user_item_matrix.shape[1]

sparsity = 100 * (1 - num / matrix_size)
print(sparsity)

99.99620384245743


약 99.996%의 희소성을 보이고 있습니다. 즉, 가능한 유저-제품 조합 중 실제 조회 기록이 있는 경우는 약 0.004%에 불과합니다.

In [None]:
from implicit.als import AlternatingLeastSquares
# Build an ALS model; no arguments are passed, so the library defaults apply.
model = AlternatingLeastSquares()

# Train on the user-item view-count matrix.
# NOTE(review): no random_state is passed, so the learned factors (and the
# recommendations shown below) presumably vary between runs; confirm whether
# reproducibility matters here.
model.fit(user_item_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

# 추천하기

In [None]:
def get_recom_product(user_id, n=10):
  """
  Recommend n products for a user and return them as a DataFrame.

  Parameter

  user_id : user_id as it appears in the original data
  n : number of items to recommend

  Return

  DataFrame of product_lookup rows (product_id, category_code, brand), in the
  order the model returned them. Empty when the model returns no items.

  Raise

  IndexError : user_id does not appear in data
  """
  # Translate the original user_id into the row position used by the matrix.
  user_codes = data.loc[data['user_id'] == user_id, 'user_id_codes'].unique()
  if len(user_codes) == 0:
    raise IndexError(f'user_id {user_id} not found in data')
  user_id_code = user_codes[0]

  # recommend() returns a tuple whose first element holds the item codes;
  # only the codes are needed here.
  recommended_codes = model.recommend(user_id_code, user_item_matrix[user_id_code], N=n)[0]
  if len(recommended_codes) == 0:
    # pd.concat([]) would raise, so return an empty frame with the same columns.
    return product_lookup.iloc[0:0]

  # Map item codes back to product ids with a single pass over data, instead
  # of one full scan per recommended item.
  code_rows = data.loc[
    data['product_id_codes'].isin(recommended_codes),
    ['product_id_codes', 'product_id'],
  ].drop_duplicates('product_id_codes')
  code_to_product_id = dict(zip(code_rows['product_id_codes'], code_rows['product_id']))

  results = [
    product_lookup.loc[product_lookup['product_id'] == code_to_product_id[code]]
    for code in recommended_codes
  ]
  return pd.concat(results)

In [None]:
data.sort_values('view_counts', ascending=False)[:10]

Unnamed: 0,user_id,product_id,view_counts,user_id_codes,product_id_codes
243939,512475445,4700419,228,33887,15476
243944,512475445,4700478,225,33887,15505
243946,512475445,4700557,211,33887,15540
243949,512475445,4700590,206,33887,15561
244014,512475445,5800802,197,33887,21004
243965,512475445,5700788,193,33887,20355
5085523,519607186,1004887,192,559255,938
243987,512475445,5701086,184,33887,20475
243980,512475445,5701062,184,33887,20459
11531379,554252708,25600088,129,1562686,105533


위의 데이터프레임은 view_counts가 높은 순으로 정렬한 데이터입니다.  
user_id가 512475445, 519607186인 유저가 높은 순위를 기록하고 있습니다.

가장 많은 view_counts를 기록한 512475445 유저에게 추천하는 상품을 확인하겠습니다.

In [None]:
get_recom_product(512475445)

Unnamed: 0,product_id,category_code,brand
890,1003316,electronics.smartphone,apple
1675,1005098,electronics.smartphone,samsung
1104,1004777,electronics.smartphone,xiaomi
563,1004768,electronics.smartphone,samsung
261,3600661,appliances.kitchen.washer,samsung
1487,1005160,electronics.smartphone,xiaomi
252,1004246,electronics.smartphone,apple
2267,5801218,electronics.audio.subwoofer,
317,1004957,electronics.smartphone,xiaomi
449,15800006,,karcher


해당 유저(512475445)에게는 주로 smartphone을 추천하며, 그 외에 washer와 subwoofer 등을 추천합니다.

512475445 유저가 실제로 많이 보았던 제품들을 확인해보겠습니다.

In [None]:
data.loc[data['user_id'] == 512475445].sort_values('view_counts', ascending=False)[:20]

Unnamed: 0,user_id,product_id,view_counts,user_id_codes,product_id_codes
243939,512475445,4700419,228,33887,15476
243944,512475445,4700478,225,33887,15505
243946,512475445,4700557,211,33887,15540
243949,512475445,4700590,206,33887,15561
244014,512475445,5800802,197,33887,21004
243965,512475445,5700788,193,33887,20355
243980,512475445,5701062,184,33887,20459
243987,512475445,5701086,184,33887,20475
243955,512475445,5700384,127,33887,20244
243966,512475445,5700791,123,33887,20356


In [None]:
def user_view_product_topN(user_id, n=20):
  """
  Return the n products a user actually viewed most often.

  Parameter

  user_id : user ID as it appears in the original data
  n : number of items to return

  Return

  DataFrame of product_lookup rows (most viewed first) with an added
  'view_count' column. Empty when the user has no rows in data or n is 0.
  """
  # Filter and sort once; product ids and view counts come from the same rows.
  top_views = data.loc[data['user_id'] == user_id].sort_values('view_counts', ascending=False)[:n]
  view_counts = top_views['view_counts'].values

  if top_views.empty:
    # pd.concat([]) would raise, so return an empty frame with the same columns.
    frame = product_lookup.iloc[0:0].copy()
    frame['view_count'] = view_counts
    return frame

  results = [
    product_lookup.loc[product_lookup['product_id'] == product_id]
    for product_id in top_views['product_id'].values
  ]

  frame = pd.concat(results)
  frame['view_count'] = view_counts
  return frame

In [None]:
user_view_product_topN(512475445)

Unnamed: 0,product_id,category_code,brand,view_count
868,4700419,auto.accessories.videoregister,sho-me,228
763,4700478,auto.accessories.videoregister,sho-me,225
4726,4700557,auto.accessories.videoregister,sho-me,211
4864,4700590,auto.accessories.videoregister,sho-me,206
14595,5800802,electronics.audio.subwoofer,kenwood,197
2019,5700788,auto.accessories.player,kenwood,193
2814,5701062,auto.accessories.player,pioneer,184
4495,5701086,auto.accessories.player,pioneer,184
487,5700384,auto.accessories.player,pioneer,127
4658,5700791,auto.accessories.player,sony,123


해당 유저(512475445)에게는 스마트폰을 추천했지만, 이 유저가 많이 본 제품은 videoregister, subwoofer, player 입니다.

모델이 유저가 아직 보지 않은 제품 중에서 추천하다 보니 이런 결과가 나온 것으로 보입니다.

다른 유저를 확인해보겠습니다.

519607186 유저를 확인해보겠습니다.

In [None]:
get_recom_product(519607186)

Unnamed: 0,product_id,category_code,brand
141,1004838,electronics.smartphone,oppo
585,1004839,electronics.smartphone,oppo
1113,1004886,electronics.smartphone,oppo
1055,1004961,electronics.smartphone,oppo
2392,1004990,electronics.smartphone,oppo
120205,1005205,electronics.smartphone,
73,15100337,,
524,1005021,electronics.smartphone,oppo
97,1004720,electronics.smartphone,huawei
303,15100370,,


이 유저에게 oppo 브랜드의 smartphone을 추천합니다.

실제로 이 유저(519607186)가 보았던 제품을 확인해보겠습니다.

In [None]:
user_view_product_topN(519607186)

Unnamed: 0,product_id,category_code,brand,view_count
5969,1004887,electronics.smartphone,oppo,192
50,1002544,electronics.smartphone,apple,3
4187,14701391,furniture.living_room.cabinet,,3
171,15100367,,,3
1166,1005159,electronics.smartphone,xiaomi,3
1259,14701558,furniture.living_room.cabinet,brw,3
19848,14701705,furniture.living_room.cabinet,brw,2
2286,15100147,,lider,2
9008,3700737,appliances.environment.vacuum,philips,2
40247,14700394,furniture.living_room.cabinet,,2


이 유저는 smartphone을 많이 본 것으로 확인됩니다.

알맞게 추천한 것을 확인할 수 있습니다.

als 모델에 explain이라는 함수가 있습니다.

이 함수는 사용자에게 제품이 추천된 이유를 제공합니다.

In [None]:
def get_explain(user_id, user_item, item_id):
  """
  Return why a product was recommended to a user (view_counts, scores, category).

  Parameter

  user_id : user ID as it appears in the original data
  user_item : User-Item Matrix (Sparse Matrix); assumed to be the matrix built
              from data, so every contributing item code is one of this
              user's own rows
  item_id : product ID as it appears in the original data

  Return

  (frame, total_score) : frame has one row per contributing product with the
  columns user_id, product_id, view_counts, scores and category; total_score
  is the first value returned by the model's explain().

  Raise

  IndexError : user_id or item_id does not appear in data
  """
  # Select this user's rows once; they are reused for every contribution
  # below instead of rescanning the whole table inside the loop.
  user_rows = data.loc[data['user_id'] == user_id]

  # Convert the given user_id into its user_id_code.
  user_id_code = user_rows['user_id_codes'].unique()[0]

  # Convert the given item_id into its product_id_code.
  product_id_code = data.loc[data['product_id'] == item_id, 'product_id_codes'].unique()[0]

  # A model that offers to_cpu() is converted first; a model without that
  # method (presumably already a CPU model) is used as it is.
  cpu_model = model.to_cpu() if hasattr(model, 'to_cpu') else model
  total_score, top_contributions, _ = cpu_model.explain(user_id_code, user_item, product_id_code)

  results = []
  category = []
  scores = []
  for id_, score_ in top_contributions:
    # The user's own row for the contributing item code.
    viewed = user_rows.loc[user_rows['product_id_codes'] == id_, ['user_id', 'product_id', 'view_counts']]
    product_id = viewed['product_id'].unique()[0]
    results.append(viewed)

    category.append(product_lookup.loc[product_lookup['product_id'] == product_id, 'category_code'].unique()[0])
    scores.append(score_)

  frame = pd.concat(results)
  frame['scores'] = scores
  frame['category'] = category
  return frame, total_score

In [None]:
frame, total_score = get_explain(519607186, user_item_matrix, 1004838)
display(frame)
print(total_score)

Unnamed: 0,user_id,product_id,view_counts,scores,category
5085523,519607186,1004887,192,1.691613,electronics.smartphone
5085525,519607186,1005015,1,0.025177,electronics.smartphone
5085514,519607186,1002099,1,0.020664,electronics.smartphone
5085565,519607186,15100367,3,0.018051,
5085537,519607186,4804055,1,0.011295,electronics.audio.headphone
5085533,519607186,1005239,1,0.010338,electronics.smartphone
5085535,519607186,3701141,1,0.010221,appliances.environment.vacuum
5085534,519607186,3700737,2,0.003006,appliances.environment.vacuum
5085561,519607186,15100110,1,0.00286,
5085536,519607186,3701349,1,0.002257,appliances.environment.vacuum


1.6087074664760541


In [None]:
frame, total_score = get_explain(512475445, user_item_matrix, 1003316)
display(frame)
print(total_score)

Unnamed: 0,user_id,product_id,view_counts,scores,category
243949,512475445,4700590,206,0.28758,auto.accessories.videoregister
243946,512475445,4700557,211,0.249531,auto.accessories.videoregister
244014,512475445,5800802,197,0.221426,electronics.audio.subwoofer
243965,512475445,5700788,193,0.175815,auto.accessories.player
244042,512475445,6100194,123,0.152096,auto.accessories.radar
243939,512475445,4700419,228,0.146628,auto.accessories.videoregister
243966,512475445,5700791,123,0.142473,auto.accessories.player
243984,512475445,5701079,100,0.141342,auto.accessories.player
243978,512475445,5701058,121,0.138827,auto.accessories.player
244046,512475445,6100261,94,0.135606,auto.accessories.radar


4.8185920894434675
