In [21]:
from IPython.display import display

In [1]:
import pandas as pd
df=pd.read_csv('../dataset/2019-Dec.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3533286 entries, 0 to 3533285
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 242.6+ MB


In [2]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5


In [17]:
df.isnull().sum()

event_time             0
event_type             0
product_id             0
category_id            0
category_code    3474821
brand            1510289
price                  0
user_id                0
user_session         779
score             664655
dtype: int64

## 단순한 추천 로직 (1)
- best5~10은 저번 시간에 이미 코드로 증명
- event_type -> 명시적이진 않고 암묵적인 형태의 데이터인데 -> 그래도 view,  < (cart, purchase) 구매에 대한 확률이 높은 행동
- 가중치를 통한 추천리스트 만들 수 있다. 
- e.g. event { view :1점, cart :3점, purchase : 5점 가중치 }
    - product_id 기준으로 가중치 벡터가 만들어진다.
        - 단순한 가중치에 의한 추천을 호출할 수 있다.
        - 선형결합형태로 벡터랑 가중치랑 결합해서 추천스코어가 나올 수 있다.
- 이 형태로 스코어를 계산해 보자!

In [3]:
weights = {
    'view':1,
    'cart':3,
    'purchase':5    
}

df['score']=df['event_type'].map(weights)
product_scores = df.groupby('product_id')['score'].sum().reset_index()

## 추천지수가 높은 제품 상위 10개 출력
product_scores=product_scores.sort_values(by='score', ascending=False)
print(product_scores)

       product_id    score
17703     5809910  52253.0
16260     5802432  20129.0
7206      5700037  20017.0
29295     5854897  17311.0
17705     5809912  16946.0
...           ...      ...
41767     5905666      0.0
41775     5905677      0.0
4714      5668993      0.0
4716      5668995      0.0
15801     5800696      0.0

[44624 rows x 2 columns]


- 필수과제1
    - 스코어링의 분포를 살펴보고 해당 스코어 점수를 정규화할 수 있는 방법은 무엇이 있을까!?
    - 1.1
        - 스코어에 대한 분포도 확인하여, 한쪽에 치우친 분포인지도 체크하기!
    - 1.2
        - softmax함수를 통해스 스코어링을 재정의하고 다시 점수를 출력해 주세요!

In [4]:
print(product_scores)

       product_id    score
17703     5809910  52253.0
16260     5802432  20129.0
7206      5700037  20017.0
29295     5854897  17311.0
17705     5809912  16946.0
...           ...      ...
41767     5905666      0.0
41775     5905677      0.0
4714      5668993      0.0
4716      5668995      0.0
15801     5800696      0.0

[44624 rows x 2 columns]


## 단순추천로직 (2)
- KNN 거리반으로 단순하게 추천 가능하다.
    - 추천에 필요한 메트릭스를 생각하면 행과 열에 따라 다르게 나타날 수 있다.
    - 행이 제품이고, 열이 제품일 수 있고 nxn 메트릭스
    - 행이 제품이고, 열이 유저 
    - 행이 유정이고, 열이 제품일 수 있다.
    - product_id기준으로 pivot_table 통해 메트릭스 만들기!

In [5]:
### 구매를 한 경우에만 일단 메트릭스를 만들려고 한다.
### 원래는 purchase로 하셔야 하는데 -> 데이터 좀 부족해서 이번만 cart 
# df_cart =df[df['event_type']=='cart'].copy()
df_purchase =df[df['event_type']=='purchase'].copy()

In [6]:
#주문한 데이터만 가지고 만들어 보자!
product_user_matrix =df_purchase.pivot_table(index='product_id',columns = 'user_id', aggfunc='size', fill_value=0)

In [7]:
product_user_matrix

user_id,12055855,15400971,25392526,28129653,31647175,32701458,34080306,37189384,40725049,41738010,...,595285565,595308157,595316689,595323204,595363945,595367593,595372293,595373694,595376179,595413503
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3762,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5916489,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5916498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5916499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5916561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
## KNN 알고리즘 사용
from sklearn.neighbors import NearestNeighbors

## Cosine 유사도
#product_user_matrix

knn=NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(product_user_matrix)

## KNN 기반 추천 시스템이 구현이 되어야 한다.
## 추천제품을  출력하는 코드 -> input에 따라 다를 것 
## A라는 제품을 구매했을 때 -> 같이 구매하면 좋을 장바구니에 담았을 때 같이 추천할 제품을 찾기 

def knn_recommendation(selected_proudct, product_user_matrix, knn_model, n_neighbors = 5):
    ## 제품이 추천이 되지 않는 경우 예외조건 설정
    
    if selected_product not in product_user_matrix.index:
        print(f'제품이 {selected_product}에 대한 데이터가 없습니다.')
        return []
    
    ## KNN 계산을 하기 위해 선택된 벡터가 필요하고 , knn으로 계산을 진행해야 한다.
    product_vector = product_user_matrix.loc[selected_product].values.reshape(1,-1) # 행,열 맞춤
    distances, indices =knn_model.kneighbors(product_vector, n_neighbors=n_neighbors+1)
    
    recommendation = []
    # 코사인유사도는 가장 높은 값은 자기자신일 것 그것은 제외하는 조건으로 기준을 나누기도 한다.
    for i in range(1, len(indices[0])):
        recommend_product = product_user_matrix.index[indices[0][i]] 
        recommendation.append((recommend_product, 1-distances[0][i]))
        
    return sorted(recommendation, key = lambda x:x[1], reverse =True)
    
selected_product = 5916489
recommendation_products =knn_recommendation(selected_product,product_user_matrix,knn)

In [11]:
print(f'제품 {selected_product}과 함께 장바구니에 담을 추천 제품:')
for prod, similarity in recommendation_products:
    print(f'제품 ID{prod}, 유사도: {similarity:.4f}')

제품 5916489과 함께 장바구니에 담을 추천 제품:
제품 ID5916385, 유사도: 0.7071
제품 ID5916486, 유사도: 0.7071
제품 ID5916401, 유사도: 0.7071
제품 ID5916478, 유사도: 0.5000
제품 ID5910724, 유사도: 0.5000


In [12]:
recommendation_products

[(5916385, 0.7071067811865475),
 (5916486, 0.7071067811865475),
 (5916401, 0.7071067811865475),
 (5916478, 0.4999999999999999),
 (5910724, 0.4999999999999999)]

In [24]:
product_info = df[['product_id','category_id','brand']].drop_duplicates()
product_info.head()

Unnamed: 0,product_id,category_id,brand
0,5712790,1487580005268456287,f.o.x
1,5764655,1487580005411062629,cnd
2,4958,1487580009471148064,runail
3,5848413,1487580007675986893,freedecor
4,5824148,1487580005511725929,


In [25]:
product_info.loc[product_info['product_id']==selected_product]

Unnamed: 0,product_id,category_id,brand
3208624,5916621,1542195323827388674,
3234464,5916621,1542195323827388674,relouis


In [26]:
rec_ids = [r[0] for r in recommendation_products]
product_info.loc[product_info['product_id'].isin(rec_ids)].sort_values(by='product_id')

Unnamed: 0,product_id,category_id,brand
75233,5878938,1487580013145358517,
375309,5895429,2068966806634103136,levrana
202586,5895430,2068966806634103136,levrana
2045104,5914286,1921723491720102387,
3208624,5916621,1542195323827388674,
3234464,5916621,1542195323827388674,relouis


In [None]:
# 같은 브랜드의 제품들이 대부분 추천되는 경우
selected_product = 3928
recommendation_products =knn_recommendation(selected_product,product_user_matrix,knn)
print('selected product')
display(product_info.loc[product_info['product_id']==selected_product])
rec_ids = [r[0] for r in recommendation_products]
print('recommended product')
display(product_info.loc[product_info['product_id'].isin(rec_ids)].sort_values(by='product_id'))

selected product


Unnamed: 0,product_id,category_id,brand
13058,3928,1487580005411062629,cnd


recommended product


Unnamed: 0,product_id,category_id,brand
129,3978,1487580005411062629,cnd
94353,8084,1487580005411062629,cnd
1358,5805751,1487580005411062629,cnd
503002,5809172,1487580005411062629,cnd
760274,5911858,1487580007675986893,dartnails


In [None]:
selected_product = 5916621
recommendation_products =knn_recommendation(selected_product,product_user_matrix,knn)
print('selected product')
display(product_info.loc[product_info['product_id']==selected_product])
rec_ids = [r[0] for r in recommendation_products]
print('recommended product')
display(product_info.loc[product_info['product_id'].isin(rec_ids)].sort_values(by='product_id'))

selected product


Unnamed: 0,product_id,category_id,brand
3208624,5916621,1542195323827388674,
3234464,5916621,1542195323827388674,relouis


recommended product


Unnamed: 0,product_id,category_id,brand
75233,5878938,1487580013145358517,
375309,5895429,2068966806634103136,levrana
202586,5895430,2068966806634103136,levrana
2045104,5914286,1921723491720102387,
3208624,5916621,1542195323827388674,
3234464,5916621,1542195323827388674,relouis


## 단순추천로직(3)
- 동시구매 형태를 살펴보자!
- 동시에 함께 담는 제품들
- 5814516
- 5877456
- (5814516,5877456) 같이 구매한 경우 
- 동시에 함께 들어간다 -> 확률, P(A) , P(B) , P(AnB)
- 베이지안으로 생각을 해서 -> 베이지안 확률 모델로 랭킹을 스코어링할 수 있다.

In [29]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,score
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc,
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18,1.0
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554,3.0
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4,1.0
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5,1.0
