# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-25 13:38:25--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-25 13:38:25 (69.2 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-25 13:38:25--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-25 13:38:26 (61.9 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [4]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [5]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


In [6]:
ratings.count()

asin              371345
reviewerID        371345
overall           371345
unixReviewTime    371345
dtype: int64

* **reviewerID** - ID of the reviewer, e.g. A2SUAM1J3GNN3B ← 用戶 ID
* **asin** - ID of the product, e.g. 0000013714 ← 商品 ID
* **overall** - rating of the product ← 用戶對商品的評分
* **unixReviewTime** - time of the review (unix time) ← 留下評論的時間戳記


## 資料整理

In [7]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [66]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [9]:
# 共 584 test users
len(users)

584

In [10]:
# train data只含有53筆 test users 購買紀錄
len(ratings_trainings.loc[ratings_trainings["reviewerID"].isin(users),:])

53

# rule based 思路
1. 檢查測試的使用者名單中有無在訓練資料裡有其過去的購買紀錄，如果有就可以分析其過去的購買行為來進行推薦，最後發現共有53筆過去有購買紀錄的使用者。
2. 對於53筆過去有購買紀錄的使用者，根據他們以前購買過的商品(asin) 做推薦。
  *   從他們的購買過的商品去 join 查找 metadata 中 also view & aslo buy item 資訊。
      並針對 overall 排序取其 k 個 item 推薦。
  *   若取不滿 k 個則加入 overall 隨機推薦。

3. 對於新的使用者，因為沒有過去的購買行為資料，所以依照商品本身的 overall 隨機推薦。

4. overall推薦從加總overall分數高低優先推薦最近評價高的新品。



In [42]:
ratings_trainings[['asin', 'overall']].groupby('asin').sum().reset_index().sort_values("overall", ascending=False, ignore_index=True)

Unnamed: 0,asin,overall
0,B000FOI48G,38100.0
1,B000GLRREU,36743.0
2,1620213982,22994.0
3,B001QY8QXM,20703.0
4,B01DKQAXC0,17679.0
...,...,...
32577,B01AVG6704,1.0
32578,B017CWLDZO,1.0
32579,B00KAUN66U,1.0
32580,B01AVJ16R0,1.0


In [82]:
from typing import List, Dict

def recommender(
    training_data: pd.DataFrame,
    users: List[str] = [],
    k: int = 10,
    start_time: str = "1970-01-01"
) -> Dict[str, List[str]]:
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    join_df = ratings_trainings.join(metadata[['asin', 'also_buy', 'also_view']].set_index('asin'), on='asin')
    # 針對過去有購買紀錄的 user 推薦
    temp_df = join_df.loc[join_df["reviewerID"].isin(users),:].reset_index().drop_duplicates(subset=['index'])
    # 檢查時間範圍是否會影響
    overall_df = ratings_trainings.loc[ratings_trainings['DATE'] > start_time]
    # 找出最熱門的商品來推薦
    overall_df = overall_df[['asin', 'overall']].groupby('asin').sum().reset_index().sort_values("overall", ascending=False, ignore_index=True)

    def make_recommend(row) -> list:
      # 將過去瀏覽或購買的紀錄都加入其中去考慮
      items = list(set((row['also_buy'] + row['also_view'])))
      # 如果沒有過去瀏覽或購買的紀錄或不足 k 則推薦 top k items
      if len(items) < k:
        items = items + overall_df['asin'][:k].tolist()
      return items[:k]

    temp_df["recommend"] = temp_df.apply(make_recommend, axis=1)
    temp_df = temp_df[['reviewerID', 'recommend']]
    past_users_recommand = temp_df.to_dict("records")

    # TODO 針對沒有瀏覽歷史紀錄的新客戶
    new_users = list(set(users) - set(temp_df['reviewerID']))
    new_users_recommand = [
      {
        "reviewerID": user,
        "recommend": overall_df['asin'][:k].tolist()
      } for user in new_users
    ]
    recommend = new_users_recommand + past_users_recommand
    recommendations = {
        recom["reviewerID"]: recom["recommend"] for recom in recommend
    }
    return recommendations

## 結果評估

In [54]:
from typing import Optional

def evaluate(
    ratings_testings_by_user: Dict[str, List[str]] = {},
    ratings_by_user: Dict[str, List[str]] = {},
    method :Optional[str] = None
) -> float:
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for receiveID in ratings_testings_by_user:
        if receiveID in ratings_by_user:
            # 測試與訓練共同購買商品 id list 的交集
            total += len(
                set(ratings_by_user[receiveID]) & set(ratings_testings_by_user[receiveID])
            )

    score = total / len(ratings_testings)
    return score

In [89]:
start_times = ["2017-01-01", "2018-01-01", "2018-06-01", "2018-08-01"]
for start_time in start_times:
  ratings_by_user = recommender(ratings_trainings, users, 10, start_time)
  score = evaluate(ratings_testings_by_user, ratings_by_user)
  print(f"start_time: {start_time}, score: {score}")

start_time: 2017-01-01, score: 0.09491525423728814
start_time: 2018-01-01, score: 0.09661016949152543
start_time: 2018-06-01, score: 0.13220338983050847
start_time: 2018-08-01, score: 0.14067796610169492


# 結論
1. 針對5星評等中物品中用隨機推薦的方法變動幅度很大

2. 利用過去瀏覽或購買的紀錄來推薦看起來命中幅度不高

3. 用平均得到的評分推薦不佳，用評分加總的方式分數比較好

4. 最近時間對於推薦影響非常大，只用最近一個月的資料推薦(8-9)月效果最好 score: 0.14