# Brazilian-Ecommerce 데이터를 활용한 추천 시스템

# 고객별 제품에 대한 Rating

* 구매했다면 1, 아니면 0.

* 고객별 제품별 주문 수 - 얼마나 자주 구매하였는가.

* 고객이 주문마다 매긴 Rating을 통한 개별 상품의 평균 Rating.

이 세 가지 DataFrame을 만들고 Collaborative Filtering을 적용해 본다.

* import

In [1]:
import pandas as pd
import numpy as np

* DataFrame not used.

In [2]:
# Loaded for completeness only: none of these three tables is referenced by
# the cells shown in this notebook.
geolocation_dataset = pd.read_csv('brazilian-ecommerce/olist_geolocation_dataset.csv')
sellers_dataset = pd.read_csv('brazilian-ecommerce/olist_sellers_dataset.csv')
order_payments_dataset = pd.read_csv('brazilian-ecommerce/olist_order_payments_dataset.csv')

* DataFrame used

In [3]:
# Source tables for the three rating DataFrames and the category lookup
# built in the cells below. Paths are relative to the working directory.
customers_dataset = pd.read_csv('brazilian-ecommerce/olist_customers_dataset.csv')
order_item_dataset = pd.read_csv('brazilian-ecommerce/olist_order_items_dataset.csv')
order_reviews_dataset = pd.read_csv('brazilian-ecommerce/olist_order_reviews_dataset.csv')
orders_dataset = pd.read_csv('brazilian-ecommerce/olist_orders_dataset.csv')
products_dataset = pd.read_csv('brazilian-ecommerce/olist_products_dataset.csv')
product_category_dataset = pd.read_csv('brazilian-ecommerce/product_category_name_translation.csv')

* 필요한 데이터셋 정리

In [4]:
# Narrow every source table down to the join keys and the one payload
# column that the rating / category steps need.
order_score = order_reviews_dataset.loc[:, ['order_id', 'review_score']]
customer = customers_dataset.loc[:, ['customer_id', 'customer_unique_id']]
order_customer = orders_dataset.loc[:, ['order_id', 'customer_id']]
order_product = order_item_dataset.loc[:, ['order_id', 'product_id']]
product_category = products_dataset.loc[:, ['product_id', 'product_category_name']]
# The translation table is used as-is (Portuguese name -> English name).
product_category_english = product_category_dataset

In [5]:
# Compare the cardinality of customer_id vs. customer_unique_id: in the
# recorded output customer_id is unique per row while customer_unique_id
# repeats (99441 vs. 96096 distinct values).
customer.describe()

Unnamed: 0,customer_id,customer_unique_id
count,99441,99441
unique,99441,96096
top,2664a440338cee1a8dae9a8417ba2433,8d50f5eadf50201ccdcedfb9e2ac8455
freq,1,17


In [6]:
# In the recorded output both order_id and customer_id are unique per row,
# i.e. each order carries its own customer_id.
order_customer.describe()

Unnamed: 0,order_id,customer_id
count,99441,99441
unique,99441,99441
top,8d3882299a9f08603a9617dad43be00c,2664a440338cee1a8dae9a8417ba2433
freq,1,1


> order_product에 중복 데이터가 존재하여 제거해 주었다.

In [7]:
# One row per distinct (order_id, product_id) pair; drop_duplicates()
# returns a new frame, so order_product itself is left untouched.
order_product2 = order_product.drop_duplicates()
order_product2.describe()

Unnamed: 0,order_id,product_id
count,102425,102425
unique,98666,32951
top,ca3625898fbd48669d50701aba51cd5f,99a4788cb24856965c36a24e339b6058
freq,8,467


* __order_customer__와 __order_product__ DataFrame을 __order_id__를 기준으로 inner join하여 order_id, customer_id, product_id를 하나의 DataFrame으로 합친다.  
* __customer_unique_id__가 있는 것으로 보아 같은 사람이 여러 customer_id를 가지고 있는 경우가 있어 보인다.  
* 첫 번째 결과 DataFrame을 customer DataFrame과 left join으로 합친 다음, 고객 식별자로는 __customer_unique_id__만 남기도록 한다.

In [8]:
# Attach the purchased products to each order's customer_id. The inner
# join keeps only orders that appear in both tables.
order_customer_product = order_customer.merge(order_product2, on='order_id', how='inner')
order_customer_product.describe()

Unnamed: 0,order_id,customer_id,product_id
count,102425,102425,102425
unique,98666,98666,32951
top,ca3625898fbd48669d50701aba51cd5f,0d861a5e4dd6a9079d89e1330848f0ab,99a4788cb24856965c36a24e339b6058
freq,8,8,467


In [9]:
# Look up the person-level identifier (customer_unique_id) for every row.
# The left join keeps every order/product row.
order_customer_product_unique = order_customer_product.merge(customer, on='customer_id', how='left')
order_customer_product_unique.describe()

Unnamed: 0,order_id,customer_id,product_id,customer_unique_id
count,102425,102425,102425,102425
unique,98666,98666,32951,95420
top,ca3625898fbd48669d50701aba51cd5f,0d861a5e4dd6a9079d89e1330848f0ab,99a4788cb24856965c36a24e339b6058,8d50f5eadf50201ccdcedfb9e2ac8455
freq,8,8,467,16


In [10]:
# Base (user, item, order) table shared by all three rating variants.
# customer_unique_id becomes user_id and product_id becomes item_id.
cpo_data = order_customer_product_unique[['customer_unique_id', 'product_id', 'order_id']].rename(
    columns={'customer_unique_id': 'user_id', 'product_id': 'item_id'}
)
cpo_data.describe()

Unnamed: 0,user_id,item_id,order_id
count,102425,102425,102425
unique,95420,32951,98666
top,8d50f5eadf50201ccdcedfb9e2ac8455,99a4788cb24856965c36a24e339b6058,ca3625898fbd48669d50701aba51cd5f
freq,16,467,8


# 1. Binary Rating

* 한 유저가 같은 상품을 여러 번 주문했을 수 있다. 이 점은 뒤에서 frequency rating을 구할 때 이용한다.
* 여기서는 rating을 binary하게 주기 위해 order_id 값을 1로 바꾸어 주었다. 그 결과 (user_id, item_id)가 같은 행이 중복으로 남으므로 이를 제거해 준다.

In [11]:
# Binary rating: every observed (user, item) purchase gets rating 1.
# The order_id column is replaced by the constant rating column.
binary_rating = cpo_data[['user_id', 'item_id']].assign(rating=1)
binary_rating.describe()

Unnamed: 0,rating
count,102425.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [12]:
# Repeat orders of the same item collapse to a single (user, item, 1) row.
binary_rating = binary_rating.drop_duplicates()

In [13]:
# Row count and number of distinct users after de-duplication
# (101987 rows for 95420 users in the recorded output).
binary_rating['user_id'].describe()

count                               101987
unique                               95420
top       8d50f5eadf50201ccdcedfb9e2ac8455
freq                                    15
Name: user_id, dtype: object

# 2. Frequency Rating

In [14]:
# Frequency rating: for every row, the number of cpo_data rows (orders)
# that share its (user_id, item_id) pair.
freq_rating = cpo_data.assign(
    rating=cpo_data.groupby(['user_id', 'item_id'])['order_id'].transform('size')
)
freq_rating.head()

Unnamed: 0,user_id,item_id,order_id,rating
0,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,e481f51cbdc54678b7cc49136f2d6af7,1
1,af07308b275d755c9edb36a90c618231,595fac2a385ac33a80bd5114aec74eb8,53cdb2fc8bc7dce0b6741e2150273451,1
2,3a653a41f6f9fc3d2a113cf8398680e8,aa4383b373c6aca5d8797843e5594415,47770eb9100c2d0c44946d9cf07ec65d,1
3,7c142cf63193a1473d2e66489a9ae977,d0b61bfb1de832b15ba9d266ca96e5b0,949d5b44dbf5de918fe9c16f97b45f8a,1
4,72632f0f9dd73dfee390c9b22eb56dd6,65266b2da20d04dbe00c5c2d3bb7859e,ad21c59c0840e6cb83a9ceb5573f8159,1


In [15]:
# Distribution of the frequency rating over rows; in the recorded output
# almost every row has rating 1.
freq_rating['rating'].value_counts()

1    101568
2       804
3        45
4         8
Name: rating, dtype: int64

* rating이 1, 즉 한 제품을 한 번만 주문한 경우가 너무 많다. Binary case와 어떤 차이점이 있을지 의문이다.
* rating을 계산한 다음에는 앞에서와 마찬가지로 중복을 제거하였다.

In [16]:
# Keep one row per (user_id, item_id, rating). The selection and the
# de-duplication are chained so that no in-place mutation is performed on a
# frame that was just derived from another one (the pattern that can
# trigger SettingWithCopyWarning on older pandas versions).
freq_rating = freq_rating[['user_id', 'item_id', 'rating']].drop_duplicates()

In [17]:
# Sanity check: the recorded output shows the same row and user counts as
# the binary rating table (101987 rows, 95420 users).
freq_rating['user_id'].describe()

count                               101987
unique                               95420
top       8d50f5eadf50201ccdcedfb9e2ac8455
freq                                    15
Name: user_id, dtype: object

# 3. Review Rating

In [18]:
# Start the review-based rating from an independent copy of the shared
# (user_id, item_id, order_id) table.
rev_rating = cpo_data.copy()
rev_rating.describe()

Unnamed: 0,user_id,item_id,order_id
count,102425,102425,102425
unique,95420,32951,98666
top,8d50f5eadf50201ccdcedfb9e2ac8455,99a4788cb24856965c36a24e339b6058,ca3625898fbd48669d50701aba51cd5f
freq,16,467,8


In [19]:
# Remove exact (order_id, review_score) repeats. In the recorded output
# 99650 rows remain for 99441 distinct orders, so some orders still carry
# more than one (different) review score.
order_score2 = order_score.drop_duplicates()
order_score2['order_id'].describe()

count                                99650
unique                               99441
top       4f68ed0836136db8064bf34ebb1965e2
freq                                     2
Name: order_id, dtype: object

In [20]:
# Attach each order's review score(s) to its (user, item) rows. An order
# with several scores yields several rows. Then count missing values.
result = rev_rating.merge(order_score2, on='order_id', how='left')
result.isnull().sum()

user_id         0
item_id         0
order_id        0
review_score    0
dtype: int64

In [21]:
# Preview the merged table (review_score is still per order at this point).
result.head()

Unnamed: 0,user_id,item_id,order_id,review_score
0,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,e481f51cbdc54678b7cc49136f2d6af7,4
1,af07308b275d755c9edb36a90c618231,595fac2a385ac33a80bd5114aec74eb8,53cdb2fc8bc7dce0b6741e2150273451,4
2,3a653a41f6f9fc3d2a113cf8398680e8,aa4383b373c6aca5d8797843e5594415,47770eb9100c2d0c44946d9cf07ec65d,5
3,7c142cf63193a1473d2e66489a9ae977,d0b61bfb1de832b15ba9d266ca96e5b0,949d5b44dbf5de918fe9c16f97b45f8a,5
4,72632f0f9dd73dfee390c9b22eb56dd6,65266b2da20d04dbe00c5c2d3bb7859e,ad21c59c0840e6cb83a9ceb5573f8159,5


In [22]:
# Rating of a (user, item) pair = mean of all review scores attached to it.
# The string alias 'mean' is used instead of np.mean: recent pandas versions
# emit a FutureWarning for NumPy callables passed to transform() and will
# eventually stop mapping them to the built-in group mean.
result['rating'] = result.groupby(['user_id', 'item_id'])['review_score'].transform('mean')

In [23]:
# Preview again, now with the averaged rating column next to review_score.
result.head()

Unnamed: 0,user_id,item_id,order_id,review_score,rating
0,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,e481f51cbdc54678b7cc49136f2d6af7,4,4.0
1,af07308b275d755c9edb36a90c618231,595fac2a385ac33a80bd5114aec74eb8,53cdb2fc8bc7dce0b6741e2150273451,4,4.0
2,3a653a41f6f9fc3d2a113cf8398680e8,aa4383b373c6aca5d8797843e5594415,47770eb9100c2d0c44946d9cf07ec65d,5,5.0
3,7c142cf63193a1473d2e66489a9ae977,d0b61bfb1de832b15ba9d266ca96e5b0,949d5b44dbf5de918fe9c16f97b45f8a,5,5.0
4,72632f0f9dd73dfee390c9b22eb56dd6,65266b2da20d04dbe00c5c2d3bb7859e,ad21c59c0840e6cb83a9ceb5573f8159,5,5.0


In [24]:
# Rows whose own review_score differs from the (user, item) mean, i.e.
# pairs that received more than one distinct score.
result[result['review_score'] != result['rating']].head(10)

Unnamed: 0,user_id,item_id,order_id,review_score,rating
846,3702265e16e5246cff7462ea675f6d90,c403e106353e1aa40efe783ecb39ed7a,c761a8b74f1e876bc5efc4186f720e27,4,2.5
847,3702265e16e5246cff7462ea675f6d90,c403e106353e1aa40efe783ecb39ed7a,c761a8b74f1e876bc5efc4186f720e27,1,2.5
1196,76ffb5b3f1186c6c8022f6047ae9ead5,6ed0ed10d62b45f3de46933b0b2989a6,3ff220e93d03aad9bb6767d90ac0e846,3,3.5
1197,76ffb5b3f1186c6c8022f6047ae9ead5,6ed0ed10d62b45f3de46933b0b2989a6,3ff220e93d03aad9bb6767d90ac0e846,4,3.5
1574,b08b68bac767c3269e827f822d308182,727a46f0b58a9ef165af9f7ebcfebfa9,c53b15af645d91a057ddba9d0e1f21c7,1,2.0
1600,3aa71d539bb760f054b280604391602a,4b6b3a7072d549354c3743dedbf4170a,bf9a3a8d9db6be5e4dfcfe2f071ef405,5,4.0
1601,3aa71d539bb760f054b280604391602a,4b6b3a7072d549354c3743dedbf4170a,bf9a3a8d9db6be5e4dfcfe2f071ef405,3,4.0
1781,bb58670190dba4e9b320f84cb98317a3,50d77758742285ff49f340ebdf0df5a7,700bf35cd16de6e4d5d1e6e023b0221d,3,4.0
1782,bb58670190dba4e9b320f84cb98317a3,50d77758742285ff49f340ebdf0df5a7,700bf35cd16de6e4d5d1e6e023b0221d,5,4.0
2014,4badd97df3e4ad6d64508b5c25d4dca9,ca33dbc1342572c1aa64e74fb42bab4a,b0ee8cb0ccd868a2b5fd4b0f66bea08a,5,4.0


In [25]:
# Example pair: the same user bought the same item in two different orders
# and scored them 1 and 5 in the recorded output (mean rating 3.0).
result[(result['user_id'] == '59d8d1ea35c02a6a070ee29944b0ea86') & (result['item_id'] == '5e21d5cab5d33e770d8150a4ee6117db')]

Unnamed: 0,user_id,item_id,order_id,review_score,rating
2817,59d8d1ea35c02a6a070ee29944b0ea86,5e21d5cab5d33e770d8150a4ee6117db,e86c9bfc5e5cce878c7d0c2419927cd7,1,3.0
64254,59d8d1ea35c02a6a070ee29944b0ea86,5e21d5cab5d33e770d8150a4ee6117db,a8324c7ec6b72c7978896a221606c93e,5,3.0


* 같은 주문에 대해 여러 번 평가한 사람도 있고, 같은 상품을 여러 번 주문했으나 다르게 평가한 사람도 있다.

* 모두 평균을 내주어 고객별 아이템에 대한 Rating을 구했다.

In [26]:
# Final review-based table: one row per (user_id, item_id) with its mean
# rating. Rows of the same pair share the same rating, so they collapse.
rev_rating = result.loc[:, ['user_id', 'item_id', 'rating']].drop_duplicates()

In [27]:
# Sanity check: the recorded output shows the same row and user counts as
# the binary and frequency tables (101987 rows, 95420 users).
rev_rating['user_id'].describe()

count                               101987
unique                               95420
top       8d50f5eadf50201ccdcedfb9e2ac8455
freq                                    15
Name: user_id, dtype: object

# Category Dict

* Item의 카테고리를 딕셔너리 형태로 저장시켜둔 다음 Top N을 뽑았을 때 어떤 항목에 속하는지 알아본다.

In [28]:
# Preview product_id -> category name (Portuguese in the recorded output).
product_category.head()

Unnamed: 0,product_id,product_category_name
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria
1,3aa071139cb16b67ca9e5dea641aaa2f,artes
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer
3,cef67bcfe19066a932b7673e239eb23d,bebes
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas


In [29]:
# Preview the category name -> English name translation table.
product_category_english.head()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [30]:
# Check that product_id is unique (32951 rows, 32951 distinct values in the
# recorded output), so it can serve as a dictionary key.
product_category['product_id'].describe()

count                                32951
unique                               32951
top       eda67c42c2e05d1ba88119d626ec921f
freq                                     1
Name: product_id, dtype: object

In [31]:
# item_id -> English category name. The left join keeps every product, so
# products without a translated category end up with a missing value.
category = (
    product_category
    .merge(product_category_english, on='product_category_name', how='left')
    .loc[:, ['product_id', 'product_category_name_english']]
    .rename(columns={'product_id': 'item_id', 'product_category_name_english': 'category'})
)
category.describe()

Unnamed: 0,item_id,category
count,32951,32328
unique,32951,71
top,eda67c42c2e05d1ba88119d626ec921f,bed_bath_table
freq,1,3029


In [32]:
# Count products without an English category (623 in the recorded output).
category.isnull().sum()

item_id       0
category    623
dtype: int64

In [33]:
# Lookup table used by item_to_category(): item_id -> category. Items
# without a category keep their missing value as the dict value.
category_dict = category.set_index('item_id')['category'].to_dict()
len(category_dict)

32951

# Collaborative Filtering

In [34]:
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
import heapq
from collections import defaultdict
from operator import itemgetter
from six import iteritems

def item_to_category(top_N):
    """Return the category of every item id in ``top_N``, in the same order.

    Categories are looked up in the module-level ``category_dict``; an item
    id that is not a key of that dictionary raises ``KeyError``.
    """
    return [category_dict[item_id] for item_id in top_N]

def item_based_cf(df, test_subject='3702265e16e5246cff7462ea675f6d90', k=10, top_n=10):
    """Recommend unseen items for one user with item-based collaborative filtering.

    Args:
        df: DataFrame with exactly three columns in the order
            (user id, item id, rating), as expected by
            ``Dataset.load_from_df``.
        test_subject: Raw user id to build recommendations for.
        k: Number of the user's highest-rated items used as seeds.
        top_n: Maximum number of recommended items to return.

    Returns:
        A tuple ``(item_ids, categories)``: the raw ids of the recommended
        items, best first, and their categories as returned by
        ``item_to_category``.

    Raises:
        ValueError: Raised by ``to_inner_uid`` when ``test_subject`` is not
            part of the training set (per the Surprise documentation).
    """
    reader = Reader(line_format='user item rating')
    data = Dataset.load_from_df(df=df, reader=reader)
    train_set = data.build_full_trainset()

    model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    model.fit(train_set)
    # fit() already computes the item-item similarity matrix and stores it on
    # the model. Calling compute_similarities() again would build a second
    # n_items x n_items matrix and double the time and peak memory needed.
    sims_matrix = model.sim

    user_inner_id = train_set.to_inner_uid(test_subject)
    user_ratings = train_set.ur[user_inner_id]  # list of (inner item id, rating)
    seed_items = heapq.nlargest(k, user_ratings, key=itemgetter(1))

    # Score every item by its similarity to the seed items, weighting each
    # seed by the user's rating (5.0 is presumably the top of the rating
    # scale; adjust if the ratings use another range).
    candidates = defaultdict(float)
    for seed_inner_id, rating in seed_items:
        for other_inner_id, score in enumerate(sims_matrix[seed_inner_id]):
            candidates[other_inner_id] += score * (rating / 5.0)

    # Items the user has already rated must not be recommended again.
    seen = {inner_id for inner_id, _ in user_ratings}
    unseen = (
        (inner_id, score)
        for inner_id, score in candidates.items()
        if inner_id not in seen
    )

    # nlargest is equivalent to sorted(..., reverse=True)[:top_n], so exactly
    # top_n items are returned (the previous counter let an 11th item through).
    best = heapq.nlargest(top_n, unseen, key=itemgetter(1))
    result = [train_set.to_raw_iid(inner_id) for inner_id, _ in best]
    return result, item_to_category(result)

In [35]:
# NOTE: in the recorded run this call failed with MemoryError while the
# cosine similarity matrix was being computed. The ratings cover 32951
# distinct items, and the item-item matrix is presumably dense
# (n_items x n_items) - confirm before retrying on the full data set.
item_based_cf(rev_rating)

Computing the cosine similarity matrix...


MemoryError: 