In [7]:
import sys
sys.path.append("../..")

%load_ext autoreload
%autoreload 2
import product_refine
import recommend
import common
from common import FILE_PATH
from common import MODEL_PATH

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
import pickle
import locale

# Use the locale configured in the environment for every locale category.
locale.setlocale(locale.LC_ALL, '')

# Capture the parameters of the "whitegrid" style, then make "ticks" the
# active seaborn style with enlarged major tick marks on both axes.
st = sns.axes_style("whitegrid")
sns.set_style(
    "ticks",
    rc={
        "xtick.major.size": 8,
        "ytick.major.size": 8,
    },
)

# Font used for all figure text.
# NOTE(review): presumably chosen so Korean labels render; the font must be
# installed on the machine — confirm.
plt.rcParams['font.family'] = 'NanumGothic'

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## DeepFM Architecture

<img src ="images/architecture.png">

### feather 파일로 저장되어있는 각각의 dataframe을 로드

In [8]:
%%time
# Load each DataFrame that was previously saved as a feather file.

def _read_ftr(file_name):
    """Read the feather file named `file_name` located under FILE_PATH."""
    return pd.read_feather(FILE_PATH + file_name, use_threads=True)


reviews = _read_ftr('reviews.ftr')
users = _read_ftr('users.ftr')
products = _read_ftr('products.ftr')
products_brand_rank = _read_ftr('products_brand_rank.ftr')
# product_categories = _read_ftr('product_categories.ftr')  # currently disabled

glowpick_before_labeling = _read_ftr('glowpick_before_labeling.ftr')
glowpick = _read_ftr('glowpick.ftr')

# DataFrame produced by merging products with products_brand_rank
refined_products = _read_ftr('refined_products.ftr')

CPU times: user 7.36 s, sys: 2.08 s, total: 9.43 s
Wall time: 8.62 s


In [9]:
# Report the row count of each table, formatted with thousands separators.
print('number of reviews: \t', f'{len(reviews):,}')
print('number of users: \t', f'{len(users):,}')
print('number of products: \t', f'{len(products):,}')

number of reviews: 	 1,574,817
number of users: 	 76,490
number of products: 	 87,538


In [10]:
# Preview the first three rows of `glowpick`.
glowpick.head(n=3)

Unnamed: 0,contents,created_at,rating,origin_user_id,origin_product_id,origin_age,origin_gender,is_blinded,is_closed,is_inactivated,...,volume,price,brandName,origin_idThirdCategory,product_id,user_id,gender,age,skin_type,idThirdCategory
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,3,119763,100000,36.0,f,0.0,False,False,...,175g,11000.0,해피바스 (HAPPY BATH),112.0,0,7230,0,32,2,108
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,4,338669,100000,38.0,f,0.0,False,False,...,175g,11000.0,해피바스 (HAPPY BATH),112.0,0,22864,0,34,3,108
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,3,24862,100000,28.0,f,0.0,False,False,...,175g,11000.0,해피바스 (HAPPY BATH),112.0,0,19317,0,24,2,108


### 데이터 준비

In [11]:
# Categorical columns of `glowpick`; each one gets a SparseFeat entry below.
sparse_features = [
    "product_id",
    "user_id",
    "gender",
    "age",
    "skin_type",
    "idThirdCategory",
]
# Column used as the prediction target.
target = ['rating']

# 2. Count the unique values of each sparse field and build its feature config.
# NOTE(review): using nunique() as the vocabulary size presumably relies on
# every column being label-encoded as 0..n-1 — confirm against the labeling step.
fixlen_feature_columns = [
    SparseFeat(column, vocabulary_size=glowpick[column].nunique(), embedding_dim=4)
    for column in sparse_features
]
fixlen_feature_columns

[SparseFeat(name='product_id', vocabulary_size=50761, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='product_id', group_name='default_group'),
 SparseFeat(name='user_id', vocabulary_size=74773, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'),
 SparseFeat(name='age', vocabulary_size=67, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='age', group_name='default_group'),
 SparseFeat(name='skin_type', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='skin_type', group_name='default_group'),
 SparseFeat(name='idThirdCategory', vocabulary_size=286, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='idThirdCategory', group_name='default_group')]

In [12]:
# The linear part and the DNN part share one and the same SparseFeat list
# (both names are bound to the `fixlen_feature_columns` object).
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

feature_names

['product_id', 'user_id', 'gender', 'age', 'skin_type', 'idThirdCategory']

In [13]:
# Display the linear feature columns (the same list object as fixlen_feature_columns).
linear_feature_columns

[SparseFeat(name='product_id', vocabulary_size=50761, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='product_id', group_name='default_group'),
 SparseFeat(name='user_id', vocabulary_size=74773, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'),
 SparseFeat(name='age', vocabulary_size=67, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='age', group_name='default_group'),
 SparseFeat(name='skin_type', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='skin_type', group_name='default_group'),
 SparseFeat(name='idThirdCategory', vocabulary_size=286, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='idThirdCategory', group_name='default_group')]

In [16]:
# Load the pickled linear_feature_columns and dnn_feature_columns lists,
# replacing the values built earlier in this notebook.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by a trusted run of this project.

def _load_pickle(file_name):
    """Unpickle and return the object stored at FILE_PATH + file_name."""
    with open(FILE_PATH + file_name, 'rb') as fp:
        return pickle.load(fp)


linear_feature_columns = _load_pickle('linear_feature_columns_list.pickle')
dnn_feature_columns = _load_pickle('dnn_feature_columns_list.pickle')

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
linear_feature_columns

[SparseFeat(name='product_id', vocabulary_size=50761, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='product_id', group_name='default_group'),
 SparseFeat(name='user_id', vocabulary_size=74773, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'),
 SparseFeat(name='age', vocabulary_size=67, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='age', group_name='default_group'),
 SparseFeat(name='skin_type', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='skin_type', group_name='default_group'),
 SparseFeat(name='idThirdCategory', vocabulary_size=286, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='idThirdCategory', group_name='default_group')]

In [17]:
# 3. Generate input data for the model.
# A fixed seed makes the split — and therefore every metric computed on `test` —
# identical on each Restart & Run All; without it the numbers change per run.
# NOTE(review): the model weights are loaded from MODEL_PATH rather than trained
# here, so `test` is a true hold-out only if this split matches the one used at
# training time — confirm the seed/split against the training notebook.
SPLIT_SEED = 42
train, test = train_test_split(glowpick, test_size=0.2, random_state=SPLIT_SEED)

# DeepFM takes a dict mapping each feature name to its column of values.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

### Load Model

In [18]:
# Run on the first CUDA device when requested and available, otherwise on CPU.
use_cuda = True
device = 'cpu'
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# Rebuild the network with the same feature columns, then restore its weights.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)

# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU machine can still be loaded when only the CPU is available.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

cuda ready...


DeepFM(
  (embedding_dict): ModuleDict(
    (age): Embedding(67, 4)
    (gender): Embedding(2, 4)
    (idThirdCategory): Embedding(286, 4)
    (product_id): Embedding(50761, 4)
    (skin_type): Embedding(5, 4)
    (user_id): Embedding(74773, 4)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict(
      (age): Embedding(67, 1)
      (gender): Embedding(2, 1)
      (idThirdCategory): Embedding(286, 1)
      (product_id): Embedding(50761, 1)
      (skin_type): Embedding(5, 1)
      (user_id): Embedding(74773, 1)
    )
  )
  (out): PredictionLayer()
  (fm): FM()
  (dnn): DNN(
    (dropout): Dropout(p=0, inplace=False)
    (linears): ModuleList(
      (0): Linear(in_features=24, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
    (activation_layers): ModuleList(
      (0): ReLU(inplace=True)
      (1): ReLU(inplace=True)
    )
  )
  (dnn_linear): Linear(in_features=128, out_features=1, bias=False)
)

### 평가

In [19]:
# epoch 6
# NOTE(review): presumably the loaded checkpoint was trained for 6 epochs — confirm.
from math import sqrt

# Predict ratings for the held-out rows.
pred_ans = model.predict(test_model_input, batch_size=256)

# Compute the MSE once and derive the RMSE from it instead of recomputing it.
test_mse = mean_squared_error(test[target].values, pred_ans)

print("test MSE", round(test_mse, 4))

print("\ntest RMSE", round(sqrt(test_mse), 4))

test MSE 0.8693

test RMSE 0.9324


### 실제 유저용 추천

#### 평점 예측기에 입력할 데이터 생성

In [20]:
# Columns to preview: the rating, timestamp, price and brand plus the
# `origin_*` columns.
use_col = [
    'created_at',
    'rating',
    'origin_user_id',
    'origin_product_id',
    'origin_age',
    'origin_gender',
    'price',
    'brandName',
    'origin_idThirdCategory',
    'origin_skin_type',
]
glowpick[use_col].head(n=10)

Unnamed: 0,created_at,rating,origin_user_id,origin_product_id,origin_age,origin_gender,price,brandName,origin_idThirdCategory,origin_skin_type
0,2020-04-30T02:12:36Z,3,119763,100000,36.0,f,11000.0,해피바스 (HAPPY BATH),112.0,복합성
1,2020-03-15T09:08:20Z,4,338669,100000,38.0,f,11000.0,해피바스 (HAPPY BATH),112.0,중성
2,2020-01-21T01:29:44Z,3,24862,100000,28.0,f,11000.0,해피바스 (HAPPY BATH),112.0,복합성
3,2020-01-18T08:03:59Z,4,1331797,100000,20.0,f,11000.0,해피바스 (HAPPY BATH),112.0,복합성
4,2020-01-11T07:15:15Z,4,888968,100000,33.0,f,11000.0,해피바스 (HAPPY BATH),112.0,복합성
5,2019-10-30T11:10:53Z,4,674978,100000,22.0,f,11000.0,해피바스 (HAPPY BATH),112.0,복합성
6,2019-08-15T07:46:56Z,3,1019028,100000,22.0,f,11000.0,해피바스 (HAPPY BATH),112.0,지성
7,2019-05-18T09:47:57Z,4,1262271,100000,19.0,f,11000.0,해피바스 (HAPPY BATH),112.0,건성
8,2019-03-13T08:45:42Z,3,1017755,100000,27.0,f,11000.0,해피바스 (HAPPY BATH),112.0,지성
9,2019-01-31T07:12:17Z,4,1150641,100000,30.0,f,11000.0,해피바스 (HAPPY BATH),112.0,복합성


In [21]:
%%time
# User to generate recommendations for.
# NOTE(review): this id is a string, while origin_user_id looks numeric in the
# preview of `glowpick` — confirm the recommend helpers handle the conversion.
REAL_USER_ID = '1150641'
# Presumably the number of items to return — TODO confirm against
# recommend.recommendation.
top_n = 20

# Build the model input for this user. The contents of both return values are
# defined by recommend.generate_user_input, which is not visible here.
real_model_input, new_glowpick =  recommend.generate_user_input (REAL_USER_ID, glowpick, refined_products)

# Run the loaded model for this user; returns two frames (see
# recommend.recommendation for their exact meaning).
top_n_recommend, top_n_real_reviews = recommend.recommendation (REAL_USER_ID, model, real_model_input, glowpick, new_glowpick, top_n)

# Display the first returned frame.
top_n_recommend

평가 상품수:  120
CPU times: user 1.33 s, sys: 180 ms, total: 1.51 s
Wall time: 1.51 s


Unnamed: 0,productTitle,rating
19456,매트 아이 컬러_267,4.19
2606,M5001A 플러스,4.1
44068,팬텀 MS6001A,4.09
6491,헤븐스 듀 올 오버 글리머,4.06
17190,클래식 크림 얼티미트,4.03
34877,러브리 치크칼라,4.03
37092,오 도랑쥬 베르트 오 드 코롱,4.03
32212,블루 스카이스 앤드 플러피 화이트 클라우즈,4.03
5528,미르토 디 파나레아 오 드 뚜왈렛,4.02
5360,맥스 MF5002B,4.02


In [22]:
# Display the second frame returned by recommend.recommendation.
top_n_real_reviews

Unnamed: 0,productTitle,rating
9,티컬렉션 그린티 미셀라 클렌징폼,4
176876,마이 컨실러 [다크서클 커버],4
203530,레드 에너지 리커버리 세럼,4
224394,베러 댄 아이즈,4
231331,스무스 매트 립틴트,4
233052,수드 쿨링 풋 스프레이,4
241280,안티에이징 바이오셀룰로오스 마스크,4
282943,DIY 플럼핑 패치,4
301575,엑소메가 크렘 에몰리앙뜨 D.E.F.I,4
5018,아쿠아 필링 젤 모이스처,4
