# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-30 11:15:00--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2021-12-30 11:15:02 (7.64 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2021-12-30 11:15:02--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2021-12-30 11:15:04 (6.58 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [4]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [5]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [6]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [21]:
# 用最近三個月評論超過四星的商品來篩選 metadata
ratings_trainings = ratings[
    (ratings['DATE'] >= '2018-08-01') & (ratings['DATE'] < '2018-09-01') & (ratings['overall'] >= 4)
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

# 計算平均分
ratings_training_3 = ratings_trainings.groupby('asin', as_index = False)['overall'].mean()

# 計算評論數
rating_comment = pd.DataFrame(ratings_trainings.asin.value_counts())
rating_comment.reset_index(inplace=True)
rating_comment.columns = ['asin', 'count']

# merge
rating_comment = rating_comment.merge(ratings_training_3, on='asin')
rating_comment

Unnamed: 0,asin,count,overall
0,B01DKQAXC0,51,4.862745
1,B00W259T7G,37,4.810811
2,B01DLR9IDI,26,5.000000
3,B013XKHA4M,25,4.920000
4,B0195R1FT8,20,4.800000
...,...,...,...
636,B01C3K03VK,1,5.000000
637,B01DYP0WRS,1,5.000000
638,B01CT5SNQQ,1,5.000000
639,B01BHFBOYM,1,5.000000


## Metadata 文字處理

In [22]:
metadata.drop(['category','tech1','fit','also_buy','tech2','brand','also_view','details','main_cat','similar_item','date','price','imageURL','imageURLHighRes','feature'], axis=1, inplace = True)

KeyError: ignored

In [23]:
metadata = metadata.merge(rating_comment, on = 'asin')
metadata.head()

Unnamed: 0,description,title,rank,asin,count_x,overall_x,rank_no,all_text,count_y,overall_y
0,"Infused with kukui nut lipids, this is a desig...",Paul Brown Hawaii Gelatine Goo Firm Holding Ge...,in Beauty & Personal Care (,B000MAJD4W,1,5.0,479011,infused kukui nut lipid designed sculpt mold d...,1,5.0
1,WHY WE LOVE IT Win the fight against PORES! In...,Benefit Cosmetics The Porefessional Pores Away...,in Beauty & Personal Care (,B0047NFF4C,1,5.0,862335,love win fight pore instantly minimize appeara...,1,5.0
2,Derma Sciences Surgilast Tubular Elastic Dress...,Derma Sciences Surgilast Tubular Elastic Dress...,in Beauty & Personal Care (,B00AN382P4,1,5.0,1795033,derma science surgilast tubular elastic dressi...,1,5.0
3,Now you can be in total control of your job. N...,"Vectra Furniture, Carpet, Fabric and Wall Cove...",in Beauty & Personal Care (,B00BPM41MA,1,5.0,91984,total control job hassle scheduling fabric pro...,1,5.0
4,Polo Blue by Ralph Lauren is a Aromatic Fouger...,Polo Blue by Ralph Lauren for Men 0.5 oz EDT T...,in Beauty & Personal Care (,B00GCRTTHA,2,5.0,59383,polo blue ralph lauren aromatic fougere fragra...,1,5.0


In [24]:
# 處理掉description 的中括號
metadata['description'] = metadata['description'].apply(lambda x: ' '.join(x))

# 處理rank
rankdata = metadata['rank'].str.split(' ', n=1, expand=True)
metadata['rank_no'], metadata['rank'] = rankdata.iloc[:,0], rankdata.iloc[:,1]

# 合併三個文字欄位
metadata['all_text'] = metadata['description'] + ' ' + metadata['title'] + ' ' + metadata['rank']

# 去除特殊符號，除了數字之間的
import re
regex = r"(?<!\d)[\W](?!\d)"
metadata['all_text'] = metadata['all_text'].apply(lambda x: re.sub(regex, ' ', str(x)))

# 轉小寫
metadata['all_text'] = metadata['all_text'].str.lower()

## TFIDF

In [25]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# set lemmatizer
lemmatizer = WordNetLemmatizer()

# remove stopword
def remove_stopword(sentence):
    tokens = nltk.word_tokenize(sentence)
    result = [lemmatizer.lemmatize(voca) for voca in tokens if not voca in stopwords.words('english')]
    return ' '.join(result)

metadata['all_text'] = metadata['all_text'].apply(lambda x: remove_stopword(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# TDIDF
from sklearn.feature_extraction.text import TfidfVectorizer
metadata = metadata.drop_duplicates('all_text')
tf = TfidfVectorizer(max_df = 0.80)
tfidf_matrix = tf.fit_transform(metadata['all_text'])

In [27]:
# 相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(metadata.index,index = metadata['asin'])

In [28]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]

## 產生推薦

In [29]:
# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (metadata['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res


In [30]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    ratings_trainings = training_data
    for user in users:

        # content based
        recom_list = recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['asin'].tolist(), k)
        if recom_list:
            recommendations[user] = recom_list
        else:
            # base-ruled
            recommendations[user] = rating_comment.asin[:k].tolist()

    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': ['B01DKQAXC0',
  'B00W259T7G',
  'B01DLR9IDI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B01CJNZKZK',
  'B012Z7IHHI',
  'B01C39X6TW',
  'B0168SXRR0'],
 'A103T1QOGFCSEH': ['B01DKQAXC0',
  'B00W259T7G',
  'B01DLR9IDI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B01CJNZKZK',
  'B012Z7IHHI',
  'B01C39X6TW',
  'B0168SXRR0'],
 'A106UKKSJ2KXPF': ['B01DKQAXC0',
  'B00W259T7G',
  'B01DLR9IDI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B01CJNZKZK',
  'B012Z7IHHI',
  'B01C39X6TW',
  'B0168SXRR0'],
 'A10A7GV4D5A11V': ['B01DKQAXC0',
  'B00W259T7G',
  'B01DLR9IDI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B01CJNZKZK',
  'B012Z7IHHI',
  'B01C39X6TW',
  'B0168SXRR0'],
 'A1119JJ37ZLB8R': ['B01DKQAXC0',
  'B00W259T7G',
  'B01DLR9IDI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01AVJCDYA',
  'B01CJNZKZK',
  'B012Z7IHHI',
  'B01C39X6TW',
  'B0168SXRR0'],
 'A113UOOLBSZN52': ['B01DKQAXC0',
  'B00W259T7G',
  'B01DLR9IDI',
  'B013XKHA4M',
  'B0195R1FT8',
  'B01A

## 結果評估

In [31]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.1440677966101695