# 전환이 일어난 유저(cart or purchase)에게 특정 기준으로 추천
특정 기준 : 유저가 해당 제품에 대해서 전환이 일어났다면(cart or purchase) 1, 아니면(only view) 0

# 라이브러리 로드

In [None]:
!pip install implicit
!pip install fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import random
import implicit

from tqdm import tqdm

# 데이터 로드

In [None]:
import os

# The same parquet file is looked up in the local Windows data directory first
# and in the Google Drive (Colab) directory second; the first directory that
# actually contains the file is used.
file_name = 'light_2019-Oct.parquet'
data_dirs = (
  'C:/Users/User/Desktop/AIB_13/CP2/data/',
  '/content/drive/MyDrive/CP2/data/',
)

for path in data_dirs:
  if os.path.isfile(path + file_name):
    break
else:
  raise FileNotFoundError(f'{file_name} not found in any of: {data_dirs}')

# Read once, outside any try/except, so genuine read errors (missing engine,
# corrupt file, ...) surface instead of being masked by a silent fallback.
df = pd.read_parquet(path + file_name, engine='fastparquet')


In [None]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,-251657396,,shiseido,35.790001,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,-780140327,appliances.environment.water_heater,aqua,33.200001,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,-1904213353,furniture.living_room.sofa,,543.099976,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,1518338663,computers.notebook,lenovo,251.740005,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,-1769995873,electronics.smartphone,apple,1081.97998,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [None]:
data = df[['user_id','product_id','event_type']]

# 데이터 전처리

## cart, purchase가 존재하는 유저 데이터 추출

1. event_type 컬럼에 cart가 존재하는 user_id 추출
2. event_type 컬럼에 purchase가 존재하는 user_id 추출
3. 위에서 구한 id 합치기

In [None]:
# Boolean masks for the two conversion event types.
is_cart = data['event_type'] == 'cart'
is_purchase = data['event_type'] == 'purchase'

# Users with at least one cart event / at least one purchase event.
cart_user_id = set(data.loc[is_cart, 'user_id'].unique())
purchase_user_id = set(data.loc[is_purchase, 'user_id'].unique())

# Users who converted in either way.
all_id = cart_user_id | purchase_user_id

In [None]:
data = data.loc[data['user_id'].isin(all_id)].reset_index(drop=True)

In [None]:
data['event_type'] = data['event_type'].astype('object')

## 제품을 view만 했다면 0, 전환이 일어났다면 1로 변환

In [None]:
# Number of events per (user, product, event type), flattened into a regular
# frame with the columns user_id, product_id, event_type, count.
event_counts = data.groupby(['user_id','product_id','event_type'])['event_type'].count()
grouped = event_counts.reset_index(name='count')

In [None]:
# One row per (user, product) with one count column per event type.
table = grouped.pivot_table(index=['user_id','product_id'], columns='event_type', values='count')
# Select the three expected columns by label rather than renaming by position,
# so a missing event type yields an all-NaN column instead of mislabelled data.
table = table.reindex(columns=['cart','purchase','view'])
table.columns.name = None
table = table.reset_index()
# A (user, product) pair without a given event type has a count of 0.
table = table.fillna(0)

In [None]:
table.head()

Unnamed: 0,user_id,product_id,cart,purchase,view
0,264649825,8500081,0.0,0.0,1.0
1,264649825,8500083,0.0,1.0,12.0
2,264649825,8500084,0.0,1.0,4.0
3,264649825,8500086,0.0,0.0,2.0
4,264649825,27700136,0.0,0.0,1.0


In [None]:
# value = 1 when the user carted or purchased the product at least once,
# value = 0 otherwise (the product was only viewed).
# Deriving the column from one boolean mask keeps it an integer column instead
# of the object column produced by initialising it with None.
converted = (table['cart'] >= 1) | (table['purchase'] >= 1)
table['value'] = converted.astype(int)

In [None]:
# Take an explicit copy: columns are added to `data` later on, and adding them
# to a slice of `table` triggers pandas' SettingWithCopyWarning.
data = table[['user_id','product_id','value']].copy()

In [None]:
data.head()

Unnamed: 0,user_id,product_id,value
0,264649825,8500081,0
1,264649825,8500083,1
2,264649825,8500084,1
3,264649825,8500086,0
4,264649825,27700136,0


# Item Lookup(product lookup) 테이블 만들기

In [None]:
# Product information table: the first row seen for each product_id is kept,
# then the rows are ordered by product_id.
lookup_columns = ['product_id','category_code','brand']
unique_products = df[lookup_columns].drop_duplicates('product_id').reset_index(drop=True)
product_lookup = unique_products.sort_values('product_id')
product_lookup.head()

Unnamed: 0,product_id,category_code,brand
151915,1000978,electronics.smartphone,
8437,1001588,electronics.smartphone,meizu
85152,1001606,electronics.smartphone,apple
32556,1002042,electronics.smartphone,samsung
9400,1002062,electronics.smartphone,samsung


# Rating Matrix 만들기

In [None]:
num_user = data['user_id'].nunique()
num_item = data['product_id'].nunique()

num_user, num_item

(481458, 148488)

In [None]:
# Convert each id column to a categorical once; the category codes are the
# row / column positions used for the sparse matrix.
user_categories = data['user_id'].astype('category')
product_categories = data['product_id'].astype('category')

# Both id lists are sorted so that they share one ordering convention.
users = list(np.sort(data['user_id'].unique()))
products = list(np.sort(data['product_id'].unique()))
value = list(data['value'])

rows = user_categories.cat.codes
cols = product_categories.cat.codes

# assign() returns a new frame, so nothing is written into a slice of another
# DataFrame (avoids SettingWithCopyWarning).
data = data.assign(user_id_code=rows, product_id_code=cols)

len(users), len(products), len(value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


(481458, 148488, 7942703)

In [None]:
# Build the user x item matrix: each entry of `value` is placed at
# (rows[i], cols[i]) = (user code, product code).
# NOTE(review): view-only pairs have value 0 and are passed in as explicit
# entries, so they are counted among the stored elements — confirm that this is
# what the downstream model is expected to receive.
user_item_matrix = sparse.csr_matrix((value, (rows, cols)), shape=(num_user, num_item))
user_item_matrix

<481458x148488 sparse matrix of type '<class 'numpy.int64'>'
	with 7942703 stored elements in Compressed Sparse Row format>

# 추천 시스템 class 구현

In [None]:
from implicit.als import AlternatingLeastSquares
from sklearn import metrics
class MyALS:
  """
  Wrapper that bundles the helpers needed around an implicit ALS model.

  Parameters
  ----------
  model : instance created with implicit's AlternatingLeastSquares()
  user_item_matrix : user-defined User-Item Matrix (sparse matrix)
  item_lookup : table with item (product) information; the methods read its
      'product_id', 'category_code' and 'brand' columns
  data : source data the User-Item Matrix was built from; the methods read its
      'user_id', 'product_id', 'user_id_code' and 'product_id_code' columns
  """
  def __init__(self, model, user_item_matrix, item_lookup, data):
    import copy
    import warnings

    self.model = model
    # get_score() fits metrics_model on the train split only. Keeping it a
    # separate object prevents an evaluation run from overwriting the model
    # that serves recommendations.
    try:
      self.metrics_model = copy.deepcopy(model)
    except Exception as err:  # best effort: not every model object can be copied
      warnings.warn(
        f'could not copy the model for evaluation ({err!r}); '
        'get_score() will refit the recommendation model itself'
      )
      self.metrics_model = model
    self.user_item_matrix = user_item_matrix
    self.item_lookup = item_lookup
    self.data = data

  def test_lookup(self):
    """Display the first rows of the item lookup table."""
    display(self.item_lookup.head())

  def test_check_user_ind(self):
    """
    Print the first ten user indices masked out of the train set.

    Only available after get_train_test() (or get_score()) has been called.
    """
    print(self.zero_user_idxs[:10])

  def fit(self, user_item_matrix):
    """
    Train the model with ALS matrix factorization.

    Parameters
    ----------
    user_item_matrix : user-defined User-Item Matrix (sparse matrix)
    """
    self.model.fit(user_item_matrix)

  def _first_match(self, key_column, key, value_column):
    """Return value_column of the first row of data whose key_column equals key."""
    matches = self.data.loc[self.data[key_column] == key, value_column]
    if matches.empty:
      # Same exception type as an out-of-range lookup, with a readable message.
      raise IndexError(f'{key_column}={key!r} not found in data')
    return matches.iloc[0]

  def user_id_2_code(self, user_id):
    """
    Convert a user_id into the user_id_code used in the User-Item Matrix.

    Parameters
    ----------
    user_id : user ID

    Raises
    ------
    IndexError : when user_id does not occur in data
    """
    return self._first_match('user_id', user_id, 'user_id_code')

  def product_id_2_code(self, product_id):
    """
    Convert a product_id into the product_id_code used in the User-Item Matrix.

    Parameters
    ----------
    product_id : product ID

    Raises
    ------
    IndexError : when product_id does not occur in data
    """
    return self._first_match('product_id', product_id, 'product_id_code')

  def code_2_product_id(self, product_id_code):
    """
    Convert a product_id_code of the User-Item Matrix back into its product_id.

    Parameters
    ----------
    product_id_code : product ID code

    Raises
    ------
    IndexError : when product_id_code does not occur in data
    """
    return self._first_match('product_id_code', product_id_code, 'product_id')

  def get_recom_product(self, user_id, n = 10):
    """
    Recommend n products for user_id and return them as a DataFrame.

    Parameters
    ----------
    user_id : user ID
    n : number of items to recommend
    """
    user_id_code = self.user_id_2_code(user_id)

    # recommend() works on matrix codes: the ids it returns are
    # product_id_code values, not product_id values.
    recommended = self.model.recommend(user_id_code, self.user_item_matrix[user_id_code], N=n)[0]

    # Collect the lookup rows in recommendation order.
    results = []
    for product_id_code in recommended:
      recommended_product_id = self.code_2_product_id(product_id_code)
      results.append(self.item_lookup.loc[self.item_lookup['product_id'] == recommended_product_id])

    if not results:
      return self.item_lookup.iloc[0:0]
    return pd.concat(results)

  def get_user_topN_product(self, user_id, column, n = 20):
    """
    Return the user's n products with the highest value in `column`.

    Products that are missing from item_lookup are skipped. An unknown user
    yields an empty frame with the same columns.

    Parameters
    ----------
    user_id : user ID
    column : column of data used for ranking
    n : number of items to return
    """
    # Filter and sort once; ids and values are read from the same rows.
    user_rows = self.data.loc[self.data['user_id'] == user_id]
    top_rows = user_rows.sort_values(column, ascending=False)[:n]

    results = []
    for product_id, product_value in zip(top_rows['product_id'].values, top_rows[column].values):
      info = self.item_lookup.loc[self.item_lookup['product_id'] == product_id].copy()
      if info.empty:
        continue
      # Attach the value to its own product row so the two cannot get misaligned.
      info[column] = product_value
      results.append(info)

    if not results:
      return self.item_lookup.iloc[0:0].assign(**{column: []})
    return pd.concat(results)

  def get_explain(self, user_id, item_id, column):
    """
    Explain why a product was recommended to a user.

    Parameters
    ----------
    user_id : user ID
    item_id : item (product) ID
    column : column of data to show next to each contributing product

    Returns
    -------
    (frame, total_score) : frame has one row per contributing product with the
        columns user_id, product_id, category, brand, <column>, score
    """
    user_id_code = self.user_id_2_code(user_id)
    product_id_code = self.product_id_2_code(item_id)

    # Move the model to the CPU only when it offers to_cpu(); a model without
    # that method is used as it is.
    explain_model = self.model.to_cpu() if hasattr(self.model, 'to_cpu') else self.model
    total_score, top_contributions, user_weights = explain_model.explain(user_id_code, self.user_item_matrix, product_id_code)

    output_columns = ['user_id', 'product_id', 'category', 'brand', column, 'score']
    results = []
    for id_, score_ in top_contributions:
      product_id = self.code_2_product_id(id_)
      is_pair = (self.data['product_id'] == product_id) & (self.data['user_id'] == user_id)
      result = self.data.loc[is_pair, ['user_id', 'product_id', column]].copy()
      info = self.item_lookup.loc[self.item_lookup['product_id'] == product_id]

      # Set the extra columns per product so each score stays with its own row.
      result['category'] = info['category_code'].iloc[0]
      result['brand'] = info['brand'].iloc[0]
      result['score'] = score_
      results.append(result)

    if not results:
      return pd.DataFrame(columns=output_columns), total_score

    frame = pd.concat(results)[output_columns]
    return frame, total_score

  def get_train_test(self, percentage=.2, seed=42):
    """
    Split the User-Item Matrix into a train and a test matrix.

    The test set is the full matrix with every non-zero entry set to 1. The
    train set is the full matrix with a random `percentage` of its non-zero
    entries removed. Results are stored on the instance as train_set, test_set,
    zero_user_idxs and zero_item_idxs.

    Parameters
    ----------
    percentage : fraction of non-zero entries removed from the train set
    seed : seed of the random sampling
    """
    test_set = self.user_item_matrix.copy()
    train_set = self.user_item_matrix.copy()

    test_set[test_set != 0] = 1

    nonzero_idxs = train_set.nonzero()
    nonzero_pairs = list(zip(nonzero_idxs[0], nonzero_idxs[1]))

    # A private generator keeps the split reproducible without reseeding the
    # global `random` module.
    rng = random.Random(seed)
    n_samples = int(np.ceil(percentage * len(nonzero_pairs)))
    samples = rng.sample(nonzero_pairs, n_samples)

    user_idxs = [index[0] for index in samples]
    item_idxs = [index[1] for index in samples]

    if samples:
      train_set[user_idxs, item_idxs] = 0
    train_set.eliminate_zeros()

    self.zero_user_idxs = user_idxs
    self.zero_item_idxs = item_idxs
    self.train_set = train_set
    self.test_set = test_set

  def get_score(self, test_size =.2, seed=42, k = 10, method='hit_at_k', n_samples=10000):
    """
    Evaluate the recommender on a train/test split.

    Parameters
    ----------
    test_size : fraction of interactions masked out of the train set
    seed : seed for the split and for sampling users
    k : number of recommendations per user
    method : evaluation method; only 'hit_at_k' is supported
    n_samples : number of user codes drawn (with replacement) for evaluation

    Raises
    ------
    ValueError : when method is not supported
    """
    # Validate before doing any expensive work.
    if method != 'hit_at_k':
      raise ValueError(f"unsupported method {method!r}; expected 'hit_at_k'")

    # Split the data into train and test.
    self.get_train_test(test_size, seed)
    # Train the evaluation model on the train split.
    self.metrics_model.fit(self.train_set)
    # Randomly draw the sample users.
    random_state = np.random.RandomState(seed)
    user_id_code_samples = random_state.choice(self.data['user_id_code'], n_samples)

    return self.hit_at_k(user_id_code_samples, k)

  def hit_at_k(self, user_id_code_samples, k):
    """
    Share of sampled users with at least one relevant item in their top k.

    A user scores 1 when any of the k recommended items has a 1 in the test
    set (a conversion exists), otherwise 0; the scores are averaged over all
    sampled users.

    Parameters
    ----------
    user_id_code_samples : codes of the sampled users
    k : number of items to recommend
    """
    scores = []
    for user_id_code_sample in tqdm(user_id_code_samples):
      recommendation_ids = self.metrics_model.recommend(user_id_code_sample, self.train_set[user_id_code_sample], N=k)[0]
      hit = any(self.test_set[user_id_code_sample, id_] == 1 for id_ in recommendation_ids)
      scores.append(1 if hit else 0)
    return np.mean(scores)

  def get_precision_score(self, y_true, y_pred):
    """Return sklearn's precision score for y_true and y_pred."""
    return metrics.precision_score(y_true, y_pred)

# 추천하기

In [None]:
data.groupby('user_id')['value'].sum()

user_id
264649825    2
284344819    1
293957954    1
303160429    1
304325717    1
            ..
566272904    1
566274637    1
566276996    1
566278294    1
566280291    1
Name: value, Length: 481458, dtype: object

In [None]:
data.loc[data['user_id']==264649825]

Unnamed: 0,user_id,product_id,value,user_id_code,product_id_code
0,264649825,8500081,0,0,29201
1,264649825,8500083,1,0,29203
2,264649825,8500084,1,0,29204
3,264649825,8500086,0,0,29206
4,264649825,27700136,0,0,114969
5,264649825,45601506,0,0,143569


In [None]:
model = MyALS(AlternatingLeastSquares(), user_item_matrix, product_lookup, data)

In [None]:
model.fit(user_item_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
user = 264649825
column = 'value'

model.get_user_topN_product(user, column)

Unnamed: 0,product_id,category_code,brand,value
7164,8500083,,kiturami,1
8398,8500084,,kiturami,1
35249,8500081,,kiturami,0
497,8500086,,hubert,0
26747,27700136,construction.tools.pump,,0
97808,45601506,apparel.shoes,,0


In [None]:
model.get_recom_product(user)

Unnamed: 0,product_id,category_code,brand
147,1002532,electronics.smartphone,apple
890,1003316,electronics.smartphone,apple
401,1004872,electronics.smartphone,samsung
1048,1003310,electronics.smartphone,apple
56,1005105,electronics.smartphone,apple
19,1003306,electronics.smartphone,apple
117,1002528,electronics.smartphone,apple
637,1801881,electronics.video.tv,samsung
270,4804055,electronics.audio.headphone,apple
58,1004659,electronics.smartphone,samsung


In [None]:
explain_frame, total_score = model.get_explain(user, 1002532, column)
explain_frame

  "OpenBLAS detected. Its highly recommend to set the environment variable "


Unnamed: 0,user_id,product_id,category,brand,value,score
1,264649825,8500083,,kiturami,1,0.000444225
2,264649825,8500084,,kiturami,1,1.165747e-07
5,264649825,45601506,apparel.shoes,,0,0.0
4,264649825,27700136,construction.tools.pump,,0,0.0
3,264649825,8500086,,hubert,0,-0.0
0,264649825,8500081,,kiturami,0,-0.0


# 성능 평가하기

In [None]:
scores = model.get_score()

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:12<00:00, 823.32it/s]


In [None]:
print('hit at k(10) : ', round(scores, 2))

hit at k(10) :  0.02
