# Sample Code

## 基礎建設

In [None]:
import time
import datetime
import pandas as pd
import gzip, json
import numpy as np
from itertools import combinations
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib 
matplotlib.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore")

from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

In [None]:
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: location of a gzip file containing one JSON document per line.

    Yields:
        The object decoded from each line, in file order.
    """
    # The context manager closes the file when the generator is exhausted,
    # closed, or garbage-collected, instead of leaking the handle.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Load a gzip-compressed JSON-lines file into a DataFrame.

    Every record produced by ``parse(path)`` becomes one row; rows are
    labelled 0, 1, 2, ... in file order.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

## 載入資料

In [None]:
# Download the All_Beauty ratings CSV and the gzipped product metadata
# into the current working directory.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

In [None]:
# Load the gzipped product metadata and the header-less ratings CSV.
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv(
    '/content/All_Beauty.csv',
    header=None,
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)

In [None]:
# Alternative loader: read uncompressed JSON-lines files from the working directory.
# NOTE(review): running this cell overwrites the frames loaded by the previous
# cell; presumably only one of the two loading cells is meant to be run - confirm.
metadata = pd.read_json("meta_All_Beauty.json", lines=True)
ratings = pd.read_json('All_Beauty.json', lines=True)

In [None]:
# Preview the first rows of the product metadata.
metadata.head()

In [None]:
# Keep only the four columns used downstream, then preview the result.
ratings = ratings[
    ['asin', 'reviewerID', 'overall', 'unixReviewTime']
]
ratings.head()

## 去空值、重複&清洗rank、price欄位

In [None]:
def empty_na(cell):
    """Replace an empty sized value with NaN and pass anything else through.

    Args:
        cell: any value taken from a DataFrame cell.

    Returns:
        ``np.nan`` when ``cell`` supports ``len()`` and its length is 0
        (empty string, list, dict, ...); otherwise ``cell`` unchanged.
    """
    try:
        return np.nan if len(cell) == 0 else cell
    except TypeError:
        # Values without a length (numbers, None, NaN, ...) are kept as-is.
        # Only TypeError is caught so unrelated failures are not hidden.
        return cell
# Turn every empty string/list cell of the metadata into NaN.
metadata_na = metadata.applymap(empty_na)

In [None]:
# Restrict the metadata to the columns analysed below and preview them.
metadata_clean = metadata_na[
    ['asin', 'title', 'rank', 'brand', 'description', 'price']
]
metadata_clean.head()

In [None]:
# Count how many metadata rows repeat an already-seen asin (True) versus not (False).
metadata_clean.duplicated(subset='asin').value_counts()

In [None]:
# Keep the first row of every asin, then renumber the rows from 0.
metadata_clean = metadata_clean.drop_duplicates(subset='asin')
metadata_clean = metadata_clean.reset_index(drop=True)

In [None]:
# Keep only the text in front of the first "in" of the rank description.
metadata_clean['rank'] = metadata_clean['rank'].str.split('in', expand=True)[0]
# regex=False makes both replacements literal on every pandas version; the
# default of `regex` changed between releases and '$' is a regex anchor.
metadata_clean['rank'] = pd.to_numeric(metadata_clean['rank'].str.replace(',', '', regex=False))
# Prices that are not a plain number after removing '$' become NaN.
metadata_clean['price'] = pd.to_numeric(metadata_clean['price'].str.replace('$', '', regex=False), errors='coerce')

In [None]:
# Convert the Unix timestamps (interpreted as seconds) into pandas datetimes.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [None]:
# Chronological split: ratings before 2018-09-01 form the training set,
# ratings dated 2018-09-01 .. 2018-09-30 form the test set.
ratings_trainings = ratings[ratings['DATE'] < '2018-09-01']
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & (ratings['DATE'] <= '2018-09-30')
]
# Map every test-window reviewer to the list of items they rated.
ratings_testings_by_user = (
    ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
)
users = list(ratings_testings_by_user)

## 資料整理

In [None]:
# Inspect the column dtypes of the cleaned metadata.
metadata_clean.dtypes

In [None]:
# Attach each item's rank and price to its training ratings; ratings whose
# asin is missing from the metadata are kept (left join).
ratings_trainings = ratings_trainings.merge(
    metadata_clean[['asin', 'rank', 'price']],
    how='left',
    on='asin',
)
ratings_trainings

In [None]:
# Summary statistics of the training columns.
ratings_trainings.describe()

In [None]:
# Number of missing values per column, largest first.
ratings_trainings.isna().sum().sort_values(ascending=False)

In [None]:
# Drop the raw Unix timestamp; the DATE column derived from it is kept.
ratings_trainings.drop(columns='unixReviewTime', inplace=True)

In [None]:
# Per-item rating count ("sales") and mean rating ("overall_mean").
overall_group = (
    ratings_trainings
    .groupby('asin')['overall']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'sales', 'mean': 'overall_mean'})
    .reset_index()
)

In [None]:
# Add the per-item sales and mean rating to every training row.
ratings_trainings = ratings_trainings.merge(
    overall_group,
    how='left',
    on='asin',
)
ratings_trainings

In [None]:
# Inspect the column dtypes of the enriched training frame.
ratings_trainings.dtypes

In [None]:
# Keep a single row per (reviewerID, asin) pair: after sorting by DATE in
# descending order, head(1) retains the first, i.e. most recent, row of each pair.
# NOTE(review): rows of a pair that share the same DATE have no guaranteed
# relative order with the default sort, so which one survives is presumably
# arbitrary - confirm if that matters.
ratings_trainings = (
        ratings_trainings
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )

In [None]:
# Renumber the rows 0..n-1 and discard the old index.
ratings_trainings.reset_index(drop=True, inplace=True)

In [None]:
# One row per item: the most recently dated training row of every asin,
# restricted to the item-level columns.
asin_trainings = (
    ratings_trainings[['asin', 'rank', 'price', 'sales', 'overall_mean', 'DATE']]
    .sort_values(by='DATE', ascending=False)
    .drop_duplicates(subset='asin', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Item ids ordered from the most to the least rated item.
asin_ratings = (
    asin_trainings
    .sort_values(by='sales', ascending=False)['asin']
    .tolist()
)
asin_ratings

## EDA

In [None]:
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind.
sns.pairplot(asin_trainings)
# Title the whole grid (plt.title would only label the last subplot); y > 1
# places it above the top row of panels.
plt.suptitle("Looking for Insight in Data", y=1.02)
plt.show()

In [None]:
# Histogram of the raw star ratings; text_auto labels every bar with its count.
fig = px.histogram(
    ratings_trainings,
    x="overall",
    title="Distribution of overall",
    barmode="group",
    text_auto=True,
    hover_data=ratings_trainings.columns,
)
fig.show()

In [None]:
# Share of each star rating among all training ratings.
overall_counts = ratings_trainings['overall'].value_counts()
fig = px.pie(
    ratings_trainings,
    values=overall_counts,
    names=overall_counts.index,
    title='Distribution of overall',
)
fig.show()

In [None]:
# How many reviewers rated exactly 1, 2, 3, ... items: the inner value_counts
# gives ratings per reviewer, the outer one counts reviewers per total.
buy_times_distribution = ratings_trainings['reviewerID'].value_counts().value_counts()
fig = px.pie(
    ratings_trainings,
    values=buy_times_distribution,
    names=buy_times_distribution.index,
    title='Distribution of buy_times',
)
fig.show()

In [None]:
# Number of training ratings over time.
fig = px.histogram(
    ratings_trainings,
    x="DATE",
    title="Distribution of date",
    barmode="group",
    hover_data=ratings_trainings.columns,
)
fig.show()

In [None]:
# Ratings over time, stacked by star rating.
fig = px.histogram(
    ratings_trainings,
    x="DATE",
    color="overall",
    title="Distribution of date & overall",
    barmode="relative",
    hover_data=ratings_trainings.columns,
)
fig.show()

In [None]:
# Distribution of the per-item mean rating.
fig = px.histogram(
    asin_trainings,
    x="overall_mean",
    title="Distribution of item average rating",
    barmode="group",
    hover_data=asin_trainings.columns,
)
fig.show()

In [None]:
# Sales against rank; colour encodes price and marker size the mean rating.
# (A logarithmic x axis, log_x=True, is left disabled.)
fig = px.scatter(
    asin_trainings,
    x="sales",
    y="rank",
    color="price",
    size="overall_mean",
    title='Distribution of sales & rank',
    hover_data=asin_trainings.columns,
)
fig.show()

In [None]:
# Sales against mean rating; colour encodes price and marker size the mean rating.
# (A logarithmic x axis, log_x=True, is left disabled.)
fig = px.scatter(
    asin_trainings,
    x="sales",
    y="overall_mean",
    color="price",
    size="overall_mean",
    title='Distribution of sales & overall_mean',
    hover_data=asin_trainings.columns,
)
fig.show()

In [None]:
# Sales against price; marker size encodes the mean rating.
# (log_x=True and color="rank" are left disabled.)
fig = px.scatter(
    asin_trainings,
    x="sales",
    y="price",
    size="overall_mean",
    title='Distribution of sales & price',
    hover_data=asin_trainings.columns,
)
fig.show()

## 產生推薦

### User-based

In [None]:
# training_data columns used below: reviewerID, asin, overall, DATE

def user_based_recommender(training_data, users, k, days):
    """Recommend up to ``k`` items per user with user-based collaborative filtering.

    Similar users are found with a cosine similarity computed over co-rated
    items.  Their items (highest rated first) that the target user has not
    rated are recommended, and the list is topped up with the most-rated
    items overall when the neighbours do not provide ``k`` items.

    Args:
        training_data: DataFrame with ``reviewerID``, ``asin``, ``overall``
            and ``DATE`` columns.
        users: iterable of reviewer ids to build recommendations for.
        k: maximum number of recommended items per user.
        days: when an ``int``, only ratings whose DATE plus ``days`` days is
            later than 2018-09-01 are used; any other value (e.g. ``'All'``)
            keeps every rating.

    Returns:
        dict mapping every user in ``users`` to a list of at most ``k`` asins.
    """
    if isinstance(days, int):
        training_data = training_data[(training_data['DATE'] + datetime.timedelta(days=days)) > '2018-09-01']

    # Popularity ranking (most-rated first) used for cold-start users and for
    # topping up short lists.  A stable sort keeps ties in asin order so the
    # result is reproducible.
    asin_ratings = (
        training_data.groupby('asin').size()
        .sort_values(ascending=False, kind='mergesort')
        .index.tolist()
    )

    # user -> {item: rating}
    user_to_items = defaultdict(dict)
    for user, item, rating in zip(training_data['reviewerID'],
                                  training_data['asin'],
                                  training_data['overall']):
        user_to_items[user][item] = float(rating)

    # Remember what every user rated *before* sparse users are dropped, so
    # already-rated items can be excluded for cold-start users as well.
    rated_by_user = {user: set(items) for user, items in user_to_items.items()}

    # Users with very few ratings are dropped from the similarity
    # computation to keep the pairwise work small.
    user_rating_threshold = 3
    user_to_items = {
        user: items for user, items in user_to_items.items()
        if len(items) >= user_rating_threshold
    }

    # item -> {user: rating}
    item_to_users = defaultdict(dict)
    for user, items in user_to_items.items():
        for item, rating in items.items():
            item_to_users[item][user] = rating

    # Accumulate [sum(x*y), sum(x*x), sum(y*y)] over co-rated items for every
    # pair of users; both directions store the same three sums.
    pre_user_similarity = defaultdict(lambda: defaultdict(lambda: [0, 0, 0]))
    for user_ratings in item_to_users.values():
        if len(user_ratings) < 2:
            continue
        for user1, user2 in combinations(user_ratings.keys(), 2):
            xy = user_ratings[user1] * user_ratings[user2]
            xx = user_ratings[user1] ** 2
            yy = user_ratings[user2] ** 2
            for sums in (pre_user_similarity[user1][user2],
                         pre_user_similarity[user2][user1]):
                sums[0] += xy
                sums[1] += xx
                sums[2] += yy

    # user -> [(other_user, similarity), ...], most similar first.
    user_similarity = {}
    for src_user, candidates in pre_user_similarity.items():
        scored = []
        for dst_user, (xy, xx, yy) in candidates.items():
            div = (xx * yy) ** 0.5
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            scored.append((dst_user, similarity))
        # Stable sort: equally similar users keep their discovery order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        user_similarity[src_user] = scored

    recommendation = {}
    for user in users:
        # Always use the target user's own history (empty for unknown users).
        user_have_rated = rated_by_user.get(user, set())
        recommended_items = []
        recommended_items_set = set()

        for sim_user, _ in user_similarity.get(user, ()):
            # Offer the neighbour's best-rated items first.
            items_from_sim_user = sorted(
                user_to_items[sim_user].items(),
                key=lambda item: item[1],
                reverse=True,
            )
            for item, _ in items_from_sim_user:
                if item not in user_have_rated and item not in recommended_items_set:
                    recommended_items.append(item)
                    recommended_items_set.add(item)
                    if len(recommended_items) >= k:
                        break
            if len(recommended_items) >= k:
                break

        # Top up with popular items only after every neighbour was consulted.
        if len(recommended_items) < k:
            recommended_items.extend(
                asin for asin in asin_ratings
                if asin not in user_have_rated and asin not in recommended_items_set
            )
        recommendation[user] = recommended_items[:k]

    return recommendation

### Item-based

In [None]:
def item_based_recommender(training_data, users, k, days):
    """Recommend up to ``k`` items per user with item-based collaborative filtering.

    Item-item cosine similarities are computed over the users who rated both
    items.  For every item a user has rated (in the order the items appear
    in the training data) the most similar unrated items are recommended,
    and the list is topped up with the most-rated items overall when fewer
    than ``k`` items were found.

    Args:
        training_data: DataFrame with ``reviewerID``, ``asin``, ``overall``
            and ``DATE`` columns.
        users: iterable of reviewer ids to build recommendations for.
        k: maximum number of recommended items per user.
        days: when an ``int``, only ratings whose DATE plus ``days`` days is
            later than 2018-09-01 are used; any other value (e.g. ``'All'``)
            keeps every rating.

    Returns:
        dict mapping every user in ``users`` to a list of at most ``k`` asins.
    """
    if isinstance(days, int):
        training_data = training_data[(training_data['DATE'] + datetime.timedelta(days=days)) > '2018-09-01']

    # Popularity ranking (most-rated first); the stable sort keeps ties in
    # asin order so the result is reproducible.
    asin_ratings = (
        training_data.groupby('asin').size()
        .sort_values(ascending=False, kind='mergesort')
        .index.tolist()
    )

    # item -> {user: rating}
    item_to_users = defaultdict(dict)
    for user, item, rating in zip(training_data['reviewerID'],
                                  training_data['asin'],
                                  training_data['overall']):
        item_to_users[item][user] = float(rating)

    # user -> {item: rating}
    user_to_items = defaultdict(dict)
    for item, rating_users in item_to_users.items():
        for user, rating in rating_users.items():
            user_to_items[user][item] = rating

    # Accumulate [sum(x*y), sum(x*x), sum(y*y)] over co-rating users for
    # every pair of items; both directions store the same three sums.
    pre_item_similarity = defaultdict(lambda: defaultdict(lambda: [0, 0, 0]))
    for items in user_to_items.values():
        if len(items) < 2:
            continue
        for i1, i2 in combinations(items.keys(), 2):
            xy = items[i1] * items[i2]
            xx = items[i1] ** 2
            yy = items[i2] ** 2
            for sums in (pre_item_similarity[i1][i2],
                         pre_item_similarity[i2][i1]):
                sums[0] += xy
                sums[1] += xx
                sums[2] += yy

    # item -> [(other_item, similarity), ...], most similar first.
    item_similarity = {}
    for src_item, candidates in pre_item_similarity.items():
        scored = []
        for dst_item, (xy, xx, yy) in candidates.items():
            div = (xx * yy) ** 0.5
            if div == 0:
                continue
            similarity = xy / div
            if similarity < 0:
                continue
            scored.append((dst_item, similarity))
        # Stable sort: equally similar items keep their discovery order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        item_similarity[src_item] = scored

    recommendation = {}
    for user in users:
        # .get avoids creating empty defaultdict entries for unknown users.
        rated_items = user_to_items.get(user, {})
        items = []
        items_set = set()

        # Walk the rated items in dict (insertion) order rather than set
        # order, so the outcome does not depend on string hashing.
        for item in rated_items:
            for sim_item, _ in item_similarity.get(item, ()):
                if sim_item not in rated_items and sim_item not in items_set:
                    items.append(sim_item)
                    items_set.add(sim_item)
                    if len(items) >= k:
                        break
            if len(items) >= k:
                break

        # Top up with popular items only after every rated item was used.
        if len(items) < k:
            items.extend(
                asin for asin in asin_ratings
                if asin not in rated_items and asin not in items_set
            )
        recommendation[user] = items[:k]

    return recommendation

### Surprise

In [None]:
def surprise_recommender(training_data, users, k, days, user_based=False, algo=KNNBasic):
    """Recommend up to ``k`` items per user from a Surprise KNN model.

    A Surprise algorithm is fitted on all (filtered) training ratings with a
    cosine similarity.  For every item a user has rated, its nearest
    neighbours that the user has not rated are recommended; the list is
    topped up with the most-rated items overall when fewer than ``k`` items
    were found.

    Args:
        training_data: DataFrame with ``reviewerID``, ``asin``, ``overall``
            and ``DATE`` columns.
        users: iterable of reviewer ids to build recommendations for.
        k: maximum number of recommended items per user (also the number of
            neighbours requested per rated item).
        days: when an ``int``, only ratings whose DATE plus ``days`` days is
            later than 2018-09-01 are used; any other value keeps every rating.
        user_based: forwarded to Surprise's ``sim_options``.
        algo: Surprise algorithm class accepting ``sim_options``.

    Returns:
        dict mapping every user in ``users`` to a list of at most ``k`` asins.
    """
    if isinstance(days, int):
        training_data = training_data[(training_data['DATE'] + datetime.timedelta(days=days)) > '2018-09-01']

    # Popularity ranking (most-rated first); the stable sort keeps ties in
    # asin order so the result is reproducible.
    asin_ratings = (
        training_data.groupby('asin').size()
        .sort_values(ascending=False, kind='mergesort')
        .index.tolist()
    )

    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    # NOTE(review): the neighbour lookup below passes *item* inner ids, which
    # presumably only matches the similarity matrix when user_based is False -
    # confirm before calling this with user_based=True.
    sim_options = {
        'name': 'cosine',
        'user_based': user_based
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    # Group the rated items once instead of scanning the frame per user.
    items_by_user = training_data.groupby('reviewerID')['asin'].agg(list).to_dict()

    recommendation = {}
    for user in users:
        # De-duplicate while keeping data order, so iteration is deterministic.
        rated_in_order = list(dict.fromkeys(items_by_user.get(user, ())))
        items_user_rated = set(rated_in_order)
        recommend_item_list = []
        recommend_item_set = set()

        for item in rated_in_order:
            iid = algo_impl.trainset.to_inner_iid(item)
            for sim_item_iid in algo_impl.get_neighbors(iid, k):
                item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                    recommend_item_list.append(item_raw_id)
                    recommend_item_set.add(item_raw_id)
            if len(recommend_item_list) >= k:
                break

        # Top up with popular items only after every rated item was used.
        if len(recommend_item_list) < k:
            recommend_item_list.extend(
                asin for asin in asin_ratings
                if asin not in items_user_rated and asin not in recommend_item_set
            )
        recommendation[user] = recommend_item_list[:k]

    return recommendation

## 結果評估

In [None]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    '''
    Score recommendations against the items users actually rated.

    * ratings_testings_by_user: dict mapping a user to the list of items
      they really bought (data from 2018-09-01 onwards)
    * ratings_by_user: dict mapping a user to the items recommended from
      the training data
    * method: str, currently unused
    * returns score: float, the number of recommended items that were
      really bought divided by the total number of test purchases
      (0.0 when there are no test purchases)
    '''
    # None defaults avoid sharing one mutable dict between calls.
    ratings_testings_by_user = ratings_testings_by_user or {}
    ratings_by_user = ratings_by_user or {}

    total = 0
    n_testings = 0
    for user, bought in ratings_testings_by_user.items():
        # Count the test purchases from the argument itself rather than from
        # a notebook-level global, so the function is self-contained.
        n_testings += len(bought)
        total += len(set(ratings_by_user.get(user, ())) & set(bought))

    if n_testings == 0:
        return 0.0
    score = total / n_testings
    return score

In [None]:
# user-based: score every (K, Days) combination
score_list = []
k_list = [5, 10, 30]
days_list = [30, 60, 90, 180, 'All']
for k in k_list:
    for days in days_list:
        ratings_by_user = user_based_recommender(ratings_trainings, users, k, days)
        score = evaluate(ratings_testings_by_user, ratings_by_user)
        score_list.append(score)

# Scores were appended K-major, so reshape to (len(k_list), len(days_list))
# and transpose to a Days x K table; the shape follows the lists instead of
# being hard-coded.
user_result_df = pd.DataFrame(
    np.reshape(score_list, (len(k_list), len(days_list))).T,
    index=pd.Index(days_list, name='Days'),
    columns=pd.Index(k_list, name='K'),
)
user_result_df

In [None]:
# item-based: score every (K, Days) combination
score_list = []
k_list = [5, 10, 30]
days_list = [30, 60, 90, 180, 'All']
for k in k_list:
    for days in days_list:
        ratings_by_user = item_based_recommender(ratings_trainings, users, k, days)
        score = evaluate(ratings_testings_by_user, ratings_by_user)
        score_list.append(score)

# Scores were appended K-major, so reshape to (len(k_list), len(days_list))
# and transpose to a Days x K table; the shape follows the lists instead of
# being hard-coded.
item_result_df = pd.DataFrame(
    np.reshape(score_list, (len(k_list), len(days_list))).T,
    index=pd.Index(days_list, name='Days'),
    columns=pd.Index(k_list, name='K'),
)
item_result_df

In [None]:
# surprise: score every (K, Days) combination
score_list = []
k_list = [5, 10, 30]
days_list = [30, 60, 90, 180, 360]
for k in k_list:
    for days in days_list:
        ratings_by_user = surprise_recommender(ratings_trainings, users, k, days)
        score = evaluate(ratings_testings_by_user, ratings_by_user)
        score_list.append(score)

# Scores were appended K-major, so reshape to (len(k_list), len(days_list))
# and transpose to a Days x K table; the shape follows the lists instead of
# being hard-coded.
surprise_result_df = pd.DataFrame(
    np.reshape(score_list, (len(k_list), len(days_list))).T,
    index=pd.Index(days_list, name='Days'),
    columns=pd.Index(k_list, name='K'),
)
surprise_result_df

## 檢查訓練集與測試集的使用者交集購買次數狀況

In [None]:
# Number of distinct test-window users that also have ratings in the training set.
len(set(ratings_trainings[ratings_trainings['reviewerID'].isin(users)]['reviewerID']))

In [None]:
# Training rows written by users who also appear in the test window.
dup_user = ratings_trainings[ratings_trainings['reviewerID'].isin(users)]

In [None]:
# Count the training ratings of every overlapping user ("buy_times").
dup_user = (
    dup_user
    .groupby('reviewerID')['asin']
    .count()
    .reset_index()
    .rename(columns={'asin': 'buy_times'})
)
dup_user

In [None]:
# Share of overlapping users per number of training ratings.  The user count
# in the title is derived from the data (one row per user in dup_user)
# instead of being hard-coded, so it stays correct when the data changes.
dup_buy_times_counts = dup_user['buy_times'].value_counts()
fig = px.pie(dup_user,
             values=dup_buy_times_counts,
             names=dup_buy_times_counts.index,
             title=f'Distribution of {len(dup_user)} user buy_times')
fig.show()