## 필수과제 1-2
* 실제 데이터셋을 통해 베이지안 기반의 제품 추천 코드 구현
* 동일 상품에 대해 조건부 확률 추천과 베이지안 추천의 결과를 비교 & why?
    * 만약 12월의 데이터만 활용하면 둘 다 P(B|A)를 구하는 것이기에 결론적으로 같은 값이 도출된다.
    * 단, 상황을 아래와 같이 setting 해보자.
        * 12월의 거래 이력을 바탕으로 추천을 하는 상황에서
        * 조건부 확률 기반은 12월의 이력을 바탕으로 계산하고
        * 베이지안 업데이트 기반은 11월의 이력을 prior 로 설정하고 evidence 를 12월의 이력으로 설정해 posterior 를 update 하는 방식으로 계산하면 차이가 발생한다.

In [12]:
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format
import numpy as np
from itertools import combinations
from collections import Counter


In [2]:
# Load the event log stored in 2019-Dec.csv and print a summary of its
# columns, dtypes and memory usage.
df = pd.read_csv('../dataset/2019-Dec.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3533286 entries, 0 to 3533285
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 242.6+ MB


In [3]:
# Keep only purchase events, reduced to one row per distinct (user_id, product_id).
purchase_events = df[df['event_type'] == 'purchase'][['user_id', 'product_id']].drop_duplicates()

# Co-purchase analysis needs at least two items per user, so users who bought
# exactly one distinct product are dropped.
user_item_cnt = purchase_events.groupby('user_id')['product_id'].count()
idx_ = user_item_cnt[user_item_cnt.eq(1)].index.tolist()
multi_purchases = purchase_events[~purchase_events['user_id'].isin(idx_)]
# One list of purchased product ids per remaining user.
user_products = multi_purchases.groupby(['user_id'])['product_id'].agg(list)

In [4]:
user_products.head()

user_id
12055855                 [5769907, 5803082, 5651702, 5885811]
15400971                          [5692888, 5694176, 5889691]
25392526                          [5700037, 5605922, 5565812]
28129653    [5565820, 5565822, 5697541, 5725886, 5804464, ...
31647175                                   [5895422, 5810673]
Name: product_id, dtype: object

In [5]:
# Enumerate every unordered product pair inside each user's product list.
# Sorting first guarantees a pair is always keyed as (smaller_id, larger_id).
all_combinations = []
for basket in user_products:
    all_combinations.extend(combinations(sorted(basket), 2))

# Number of users whose product list contains each pair.
pair_counts = dict(Counter(all_combinations))

In [6]:
pair_counts[(5751383, 5751422)]

164

In [7]:
# Row count per product_id in multi_purchases. Because (user_id, product_id)
# rows were de-duplicated upstream, this is the number of distinct users
# (among multi-item buyers) who purchased each product.
product_counts = multi_purchases['product_id'].value_counts()

In [8]:
product_counts.head(15)

product_id
5809910    1568
5854897     763
5802432     695
5700037     603
5809912     603
5833330     587
5304        536
5751422     520
5815662     505
5751383     408
5809911     375
5849033     371
5792800     368
5843836     348
5833326     338
Name: count, dtype: int64

In [11]:
product_counts.tail(5)

product_id
5814737    1
5789615    1
5722991    1
5911333    1
5876948    1
Name: count, dtype: int64

In [13]:
# Denominator passed to the recommenders for the marginals P(A) and P(B).
# NOTE(review): this is the number of de-duplicated (user_id, product_id) rows
# in multi_purchases, not the number of distinct users/baskets -- confirm that
# this is the intended definition of "total carts".
total_carts = len(multi_purchases)
total_carts

204656

In [14]:
def bayesian_recommendation(selected_product, products_pairs, products_counts , total_carts, threshold = 0.01, k=5):
    """Rank items co-purchased with ``selected_product`` via Bayes' rule.

    For every pair in ``products_pairs`` that contains ``selected_product``
    (A), the partner item (B) is scored with
    ``P(B|A) = P(A|B) * P(B) / P(A)``, where all three terms are estimated
    from ``products_counts`` and ``total_carts``.

    Args:
        selected_product: id of the purchased item (A).
        products_pairs: mapping ``(prod1, prod2) -> co-purchase count``.
        products_counts: per-product purchase counts, indexable by product id.
        total_carts: denominator used for the marginals P(A) and P(B).
        threshold: minimum score a candidate must reach to be kept.
        k: maximum number of recommendations returned.

    Returns:
        A list of ``(product_id, score)`` tuples, highest score first,
        containing at most ``k`` entries.  The selected item and the
        recommended ids are also printed.
    """
    scores = {}
    for (first, second), together in products_pairs.items():
        # Pick the partner of the selected item; skip unrelated pairs.
        if first == selected_product:
            candidate = second
        elif second == selected_product:
            candidate = first
        else:
            continue

        candidate_count = products_counts[candidate]
        likelihood = together / candidate_count                      # P(A|B)
        prior = candidate_count / total_carts                        # P(B)
        evidence = products_counts[selected_product] / total_carts   # P(A)
        posterior = (likelihood * prior) / evidence                  # P(B|A)

        if posterior >= threshold:
            scores[candidate] = posterior

    print('>>>>>> Purchase Item Id : {}'.format(selected_product))

    ranked = list(scores.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    top_k = ranked[:k]
    print('>>>>>> Recommend Item Ids : {}'.format([entry[0] for entry in top_k]))
    return top_k

In [21]:
item_id = 5833326

In [22]:
bayesian_recommendation(item_id, pair_counts, product_counts, total_carts,k=10)

>>>>>> Purchase Item Id : 5833326
>>>>>> Recommend Item Ids : [5833325, 5833330, 5833335, 5906119, 5906098, 5906122, 5809910, 5877490, 5833323, 5809912]


[(5833325, 0.28402366863905326),
 (5833330, 0.23076923076923073),
 (5833335, 0.22485207100591714),
 (5906119, 0.14201183431952663),
 (5906098, 0.13905325443786984),
 (5906122, 0.1242603550295858),
 (5809910, 0.09171597633136094),
 (5877490, 0.07100591715976332),
 (5833323, 0.0650887573964497),
 (5809912, 0.059171597633136105)]

In [18]:
# Conditional-probability based recommendation table
data = []

for (left, right), together in pair_counts.items():
    # Emit both directions of each pair: P(right | left), then P(left | right).
    for given, suggested in ((left, right), (right, left)):
        given_count = product_counts[given]
        if given_count > 0:
            data.append([given, suggested, together / given_count])

cond_prob_df = pd.DataFrame(data, columns=['item_A', 'recommend_item', 'Confidence'])

def conditional_prob_recommendation(item_id, k = 5):
    """Return the ``k`` rows of ``cond_prob_df`` with the highest confidence for ``item_id``.

    Reads the module-level ``cond_prob_df`` table, keeps the rows whose
    ``item_A`` equals ``item_id``, orders them by ``Confidence`` descending
    and returns the first ``k`` rows as a DataFrame.  The purchased item and
    the recommended ids are also printed.
    """
    print('>>>>>> Purchase Item Id : {}'.format(item_id))
    matches_item = cond_prob_df['item_A'] == item_id
    ranked = cond_prob_df[matches_item].sort_values(by='Confidence', ascending=False)
    rec_item_list = ranked.head(k)
    recommended_ids = list(rec_item_list['recommend_item'].values)
    print('>>>>>> Recommend Item Ids : {}'.format(recommended_ids))
    return rec_item_list

In [23]:
conditional_prob_recommendation(item_id, k=10)

>>>>>> Purchase Item Id : 5833326
>>>>>> Recommend Item Ids : [5833325, 5833330, 5833335, 5906119, 5906098, 5906122, 5809910, 5877490, 5833323, 5802432]


Unnamed: 0,item_A,recommend_item,Confidence
68679,5833326,5833325,0.284
68700,5833326,5833330,0.231
68702,5833326,5833335,0.225
51164,5833326,5906119,0.142
68726,5833326,5906098,0.139
145430,5833326,5906122,0.124
4335,5833326,5809910,0.092
68712,5833326,5877490,0.071
4587,5833326,5833323,0.065
802565,5833326,5802432,0.059


----

In [30]:
# Load the event log stored in 2019-Nov.csv and print a summary of its
# columns, dtypes and memory usage.
df2 = pd.read_csv('../dataset/2019-Nov.csv')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4635837 entries, 0 to 4635836
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 318.3+ MB


In [31]:
# Build the same co-purchase statistics from df2 (the 2019-Nov.csv log).
# NOTE: this rebinds purchase_events, user_item_cnt, idx_, multi_purchases,
# user_products and all_combinations, which the December cells also assign.
purchase_events = df2[df2['event_type'] == 'purchase'][['user_id', 'product_id']].drop_duplicates()

# Co-purchase analysis needs at least two items per user, so users who bought
# exactly one distinct product are dropped.
user_item_cnt = purchase_events.groupby('user_id')['product_id'].count()
idx_ = user_item_cnt[user_item_cnt.eq(1)].index.tolist()
multi_purchases = purchase_events[~purchase_events['user_id'].isin(idx_)]
user_products = multi_purchases.groupby(['user_id'])['product_id'].agg(list)
# Every unordered product pair per user, keyed as (smaller_id, larger_id).
all_combinations = []
for basket in user_products:
    all_combinations.extend(combinations(sorted(basket), 2))

pair_counts_nov = dict(Counter(all_combinations))
product_counts_nov = multi_purchases['product_id'].value_counts()
total_carts_nov = len(multi_purchases)

In [32]:
def bayesian_update_recommendation(selected_product, products_pairs_pr, products_counts_pr , total_carts_pr, product_counts_ev, total_carts_ev, threshold = 0.01, k=5):
    """Recommend co-purchased items from prior-period pairs and evidence-period popularity.

    P(A|B) and P(B) are estimated from the prior period (e.g. the previous
    month), while P(A) is estimated from the evidence period (e.g. the most
    recent month).  Each candidate B is scored with
    ``P(A|B) * P(B) / P(A)``.

    NOTE: because the numerator and the denominator come from different
    periods, the score is not guaranteed to lie in [0, 1]; it is used here
    as a ranking score.

    Args:
        selected_product: id of the purchased item (A).
        products_pairs_pr: mapping ``(prod1, prod2) -> co-purchase count``
            for the prior period.
        products_counts_pr: per-product purchase counts for the prior
            period, indexable by product id.
        total_carts_pr: total transaction count for the prior period.
        product_counts_ev: per-product purchase counts for the evidence
            period; must provide ``.get(key, default)`` (dict, Counter and
            pandas Series all do).
        total_carts_ev: total transaction count for the evidence period.
        threshold (float, optional): minimum score to keep a candidate.
            Defaults to 0.01.
        k (int, optional): maximum number of results. Defaults to 5.

    Returns:
        (list): ``(product_id, score)`` tuples sorted by descending score,
        at most ``k`` long.  Empty when ``selected_product`` has no
        purchases in the evidence period (P(A) is then undefined) or no
        pairs in the prior period.
    """
    recommendation = {}

    # P(A) depends only on the evidence period, so compute it once.  A missing
    # or zero count (or an empty evidence period) makes P(A) undefined; in that
    # case no candidate can be scored and the result is empty instead of a
    # KeyError / ZeroDivisionError.
    selected_count_ev = product_counts_ev.get(selected_product, 0)
    if selected_count_ev and total_carts_ev:
        # P(A)
        P_A = selected_count_ev / total_carts_ev

        for (prod1, prod2), count in products_pairs_pr.items():
            # Only handle pairs that contain selected_product.
            if selected_product not in (prod1, prod2):
                continue
            other_prod = prod2 if prod1 == selected_product else prod1

            # P(A|B)
            P_A_given_B = count / products_counts_pr[other_prod]

            # P(B)
            P_B = products_counts_pr[other_prod] / total_carts_pr

            # P(B|A)
            P_B_given_A = (P_A_given_B * P_B) / P_A

            if P_B_given_A >= threshold:
                recommendation[other_prod] = P_B_given_A

    print('>>>>>> Purchase Item Id : {}'.format(selected_product))

    rec_item_list = sorted(recommendation.items(), key = lambda x: x[1], reverse=True)[:k]
    print('>>>>>> Recommend Item Ids : {}'.format([l[0] for l in rec_item_list]))
    return rec_item_list

In [33]:
item_id = 5833326

In [36]:
conditional_prob_recommendation(item_id, k=10)

>>>>>> Purchase Item Id : 5833326
>>>>>> Recommend Item Ids : [5833325, 5833330, 5833335, 5906119, 5906098, 5906122, 5809910, 5877490, 5833323, 5802432]


Unnamed: 0,item_A,recommend_item,Confidence
68679,5833326,5833325,0.284
68700,5833326,5833330,0.231
68702,5833326,5833335,0.225
51164,5833326,5906119,0.142
68726,5833326,5906098,0.139
145430,5833326,5906122,0.124
4335,5833326,5809910,0.092
68712,5833326,5877490,0.071
4587,5833326,5833323,0.065
802565,5833326,5802432,0.059


In [37]:
bayesian_update_recommendation(item_id, pair_counts_nov, product_counts_nov , total_carts_nov, product_counts, total_carts, threshold = 0.01, k=10)

>>>>>> Purchase Item Id : 5833326
>>>>>> Recommend Item Ids : [5833325, 5833330, 5833335, 5809910, 5833323, 5833327, 5877490, 5833334, 5862943, 5906098]


[(5833325, 0.4079657596179191),
 (5833330, 0.3535703250021965),
 (5833335, 0.16318630384716762),
 (5809910, 0.14181666881956234),
 (5833323, 0.12627511607221306),
 (5833327, 0.12433242197879438),
 (5877490, 0.10879086923144508),
 (5833334, 0.08936392829725846),
 (5862943, 0.08547854011042115),
 (5906098, 0.08547854011042114)]

* item id : 5833326 일 때
    * 조건부 확률 추천 결과
        > [5833325, 5833330, 5833335, 5906119, 5906098, 5906122, 5809910, 5877490, 5833323, 5802432]
    * 베이지안 업데이트 추천 결과
        > [5833325, 5833330, 5833335, 5809910, 5833323, 5833327, 5877490, 5833334, 5862943, 5906098]