In [2]:
import sys
sys.path.append("../..")

import common
from common import FILE_PATH
from common import MODEL_PATH

import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
import pickle

%load_ext autoreload
%autoreload 2
import product_refine

st = sns.axes_style("whitegrid")
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})

plt.rcParams ['font.family'] = 'NanumGothic'

import warnings
warnings.filterwarnings("ignore")

### feather 파일로 저장되어있는 각각의 dataframe을 로드

In [3]:
%%time
# feather 파일로 저장되어있는 각각의 dataframe을 읽어옴

reviews = pd.read_feather (FILE_PATH + 'reviews.ftr', use_threads = True)
users = pd.read_feather (FILE_PATH + 'users.ftr', use_threads = True)
products = pd.read_feather (FILE_PATH + 'products.ftr', use_threads = True)
products_brand_rank = pd.read_feather (FILE_PATH + 'products_brand_rank.ftr', use_threads = True)
product_categories = pd.read_feather (FILE_PATH + 'product_categories.ftr', use_threads = True)

CPU times: user 3.26 s, sys: 727 ms, total: 3.99 s
Wall time: 3.77 s


### products  전처리

In [3]:
%%time
# df_products = product_refine.refine_products (products, products_brand_rank)

len del_id_list:  1317 

CPU times: user 5.73 s, sys: 98 ms, total: 5.82 s
Wall time: 5.75 s


#### df_products dataframe을 저장

In [4]:
# df_products Dataframe을 저장하기 위해 index를 reset 시킴. 
# df_products Dataframe의 결측치 rows 를 삭제 했으므로, index가 연속되지 않음. 그러면 저장 안됨
# df_products.reset_index (drop = True, inplace = True)

# df_products.to_feather (FILE_PATH + 'refined_products.ftr')

#### df_products dataframe을 파일로 부터 불러옴

In [4]:
# products와 products_brand_rank를 merge한 dataframe
df_products = pd.read_feather (FILE_PATH + 'refined_products.ftr', use_threads = True)

In [5]:
df_products.head (3)

Unnamed: 0,product_id,productTitle,price,volume,description,ratingAvg,wishCount,reviewCount,firstCategoryText,idFirstCategory,idSecondCategory,idThirdCategory,secondCategoryText,thirdCategoryText,rank,brand,brandName
0,108711,코스메티 LED 마스크,219000.0,1ea,"안면리프팅, 눈가탄력개선, 전피치밀도 수분개선 등에 도움을 주는 제품",3.75,2,4,디바이스,16,85,255,뷰티디바이스,LED마스크,5,"{'idBrand': None, 'brandTitle': '아름다운연구소', 'br...",아름다운연구소
1,118971,LED 마스크,149000.0,1ea,가장 적정거리의 96개의 LED로 피부 침투율을 높여주는 LED 마스크\n\n- 둥...,3.75,0,4,디바이스,16,85,255,뷰티디바이스,LED마스크,6,"{'idBrand': None, 'brandTitle': '에끌레어 (eclair)...",에끌레어 (eclair)
2,118819,디쎄 1.0 LED 마스크 [업무용],1320000.0,1ea,하루 한 번 자기전 15분 사용으로 전문적인 케어\r\n\r\n- 3가지 파장으로 ...,5.0,2,1,디바이스,16,85,255,뷰티디바이스,LED마스크,7,"{'idBrand': None, 'brandTitle': '닥터슈라클 (Dr.Ceu...",닥터슈라클 (Dr.Ceuracle)


In [6]:
df_products.isnull ().sum ()

product_id            0
productTitle          0
price                 0
volume                0
description           0
ratingAvg             0
wishCount             0
reviewCount           0
firstCategoryText     0
idFirstCategory       0
idSecondCategory      0
idThirdCategory       0
secondCategoryText    0
thirdCategoryText     0
rank                  0
brand                 0
brandName             0
dtype: int64

In [7]:
df_products.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86184 entries, 0 to 86183
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          86184 non-null  object 
 1   productTitle        86184 non-null  object 
 2   price               86184 non-null  float64
 3   volume              86184 non-null  object 
 4   description         86184 non-null  object 
 5   ratingAvg           86184 non-null  float64
 6   wishCount           86184 non-null  int64  
 7   reviewCount         86184 non-null  int64  
 8   firstCategoryText   86184 non-null  object 
 9   idFirstCategory     86184 non-null  int64  
 10  idSecondCategory    86184 non-null  int64  
 11  idThirdCategory     86184 non-null  int64  
 12  secondCategoryText  86184 non-null  object 
 13  thirdCategoryText   86184 non-null  object 
 14  rank                86184 non-null  int64  
 15  brand               86184 non-null  object 
 16  bran

In [8]:
users.head ()

Unnamed: 0,age,birth_year,email,gender,is_blinded,is_closed,is_inactivated,nickname,profile_image,rank,review_count,skin_type,user_id
0,31.0,1990.0,yhcu88@naver.com,f,0,False,False,희뷰리,https://d9vmi5fxk1gsw.cloudfront.net/prod/regi...,27347,44,복합성,1281918
1,33.0,1988.0,winwinanna@hanmail.net,f,0,False,False,소극적인뷰터,,41582,15,건성,1255686
2,37.0,1984.0,dmswjddlskfk@hanmail.net,f,0,False,False,greengables,https://d9vmi5fxk1gsw.cloudfront.net/home/glow...,946,375,지성,631689
3,32.0,1989.0,deer402@naver.com,f,0,False,True,백비송,https://d9vmi5fxk1gsw.cloudfront.net/prod/regi...,43404,29,건성,1188087
4,26.0,1995.0,kkr3348@naver.com,f,0,False,False,초코감귤,https://d9vmi5fxk1gsw.cloudfront.net/home/glow...,9386,97,복합성,632220


In [10]:
len (users ['age'].unique ())

79

In [11]:
# 100세 이상, 음수나이, 연도를 나이로 입력한듯한 나이(2016, 2018, 1995)등이 보임
users ['age'].value_counts ().tail (32)

 57.0      14
 121.0     11
 58.0      11
 7.0       11
 12.0      10
 3.0        9
 2.0        9
 60.0       8
 104.0      6
 62.0       5
 61.0       5
 11.0       5
 101.0      5
 59.0       4
 72.0       2
 103.0      2
 71.0       2
 102.0      2
 64.0       2
 8.0        2
 63.0       2
 105.0      2
-518.0      1
 67.0       1
 70.0       1
 9.0        1
 89.0       1
 98.0       1
 2016.0     1
 68.0       1
 2018.0     1
 1995.0     1
Name: age, dtype: int64

In [9]:
users.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76490 entries, 0 to 76489
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             76489 non-null  float64
 1   birth_year      76489 non-null  float64
 2   email           76490 non-null  object 
 3   gender          76490 non-null  object 
 4   is_blinded      76490 non-null  int64  
 5   is_closed       76490 non-null  bool   
 6   is_inactivated  76490 non-null  bool   
 7   nickname        76489 non-null  object 
 8   profile_image   44893 non-null  object 
 9   rank            76490 non-null  int64  
 10  review_count    76490 non-null  int64  
 11  skin_type       76490 non-null  object 
 12  user_id         76490 non-null  object 
dtypes: bool(2), float64(2), int64(3), object(6)
memory usage: 6.6+ MB


In [10]:
# 0세 ~ 90세 회원 정보만 뽑아냄.  음수나이와 90세 이상은 제외
users = users [(users ['age'] < 90) & (users ['age'] > 0)]
users.info ()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76456 entries, 0 to 76489
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             76456 non-null  float64
 1   birth_year      76456 non-null  float64
 2   email           76456 non-null  object 
 3   gender          76456 non-null  object 
 4   is_blinded      76456 non-null  int64  
 5   is_closed       76456 non-null  bool   
 6   is_inactivated  76456 non-null  bool   
 7   nickname        76455 non-null  object 
 8   profile_image   44871 non-null  object 
 9   rank            76456 non-null  int64  
 10  review_count    76456 non-null  int64  
 11  skin_type       76456 non-null  object 
 12  user_id         76456 non-null  object 
dtypes: bool(2), float64(2), int64(3), object(6)
memory usage: 7.1+ MB


In [11]:
users.describe ()

Unnamed: 0,age,birth_year,is_blinded,rank,review_count
count,76456.0,76456.0,76456.0,76456.0,76456.0
mean,25.250118,1995.749882,0.0,69090.6,41.476823
std,5.984109,5.984109,0.0,504406.0,56.878904
min,2.0,1932.0,0.0,1.0,-46.0
25%,21.0,1993.0,0.0,18536.0,10.0
50%,25.0,1996.0,0.0,38404.0,26.0
75%,28.0,2000.0,0.0,62796.0,51.0
max,89.0,2019.0,0.0,9999999.0,2003.0


### Merge reviews and  users

In [12]:
review_user_df = pd.merge (reviews, users, how = 'left', on = 'user_id')
review_user_df.head (3)

Unnamed: 0,contents,created_at,is_evaluation,like_count,rating,review_id,state,user_id,product_id,age,...,email,gender,is_blinded,is_closed,is_inactivated,nickname,profile_image,rank,review_count,skin_type
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,False,0,3,5416271,N,119763,100000,36.0,...,intears23@naver.com,f,0.0,False,False,kyo,https://d9vmi5fxk1gsw.cloudfront.net/home/glow...,2914.0,205.0,복합성
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,False,0,4,5340616,N,338669,100000,38.0,...,hihearyeong@naver.com,f,0.0,False,False,뭐라,https://d9vmi5fxk1gsw.cloudfront.net/prod/regi...,1023.0,285.0,중성
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,False,0,3,5228598,N,24862,100000,28.0,...,leemj1993@naver.com,f,0.0,False,False,민졍,,2292.0,230.0,복합성


In [16]:
len (reviews)

1574817

In [17]:
len (review_user_df)

1574817

### Merge review_user_df and  df_products

In [13]:
review_user_product_df = pd.merge (review_user_df, df_products, how = 'left', on = 'product_id')
review_user_product_df.head (3)

Unnamed: 0,contents,created_at,is_evaluation,like_count,rating,review_id,state,user_id,product_id,age,...,reviewCount,firstCategoryText,idFirstCategory,idSecondCategory,idThirdCategory,secondCategoryText,thirdCategoryText,rank_y,brand,brandName
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,False,0,3,5416271,N,119763,100000,36.0,...,12.0,클렌징,7.0,32.0,112.0,페이셜클렌저,클렌징폼,386.0,"{'idBrand': None, 'brandTitle': '해피바스 (HAPPY B...",해피바스 (HAPPY BATH)
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,False,0,4,5340616,N,338669,100000,38.0,...,12.0,클렌징,7.0,32.0,112.0,페이셜클렌저,클렌징폼,386.0,"{'idBrand': None, 'brandTitle': '해피바스 (HAPPY B...",해피바스 (HAPPY BATH)
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,False,0,3,5228598,N,24862,100000,28.0,...,12.0,클렌징,7.0,32.0,112.0,페이셜클렌저,클렌징폼,386.0,"{'idBrand': None, 'brandTitle': '해피바스 (HAPPY B...",해피바스 (HAPPY BATH)


In [19]:
len (review_user_product_df)

1574817

In [14]:
review_user_product_df.isnull ().sum ()

contents                   0
created_at                 0
is_evaluation              0
like_count                 0
rating                     0
review_id                  0
state                      0
user_id                    0
product_id                 0
age                   102673
birth_year            102673
email                 102673
gender                102673
is_blinded            102673
is_closed             102673
is_inactivated        102673
nickname              102684
profile_image         549143
rank_x                102673
review_count          102673
skin_type             102673
productTitle           43805
price                  43805
volume                 43805
description            43805
ratingAvg              43805
wishCount              43805
reviewCount            43805
firstCategoryText      43805
idFirstCategory        43805
idSecondCategory       43805
idThirdCategory        43805
secondCategoryText     43805
thirdCategoryText      43805
rank_y        

### product_categoreis

새로운 상품 raw 데이터에 카테고리 정보가 포함되 있음

그래서 지금은 필요없음

In [21]:
# product_categories.info()

In [22]:
# product_categories.isnull ().sum ()

In [23]:
# product_categories ['idThirdCategory'].nunique ()

In [24]:
# 하나의 상품마다 2 이상의 카테고리를 가질수 있음
# product_categories ['product_id'].value_counts ().head (50)

In [25]:
# products [products ['product_id'] == '1030']

In [26]:
# product_categories [product_categories ['product_id'] == '1030']

In [27]:
# product_categories [product_categories ['product_id'] == '1030']

In [28]:
# 상품별 중복 row를 삭제. 하나의 상품당 하나의 카테고리 (first > second > third)만
# categories_drop_dupli = product_categories.drop_duplicates ('product_id')
# categories_drop_dupli

In [29]:
# print (categories_drop_dupli ['idFirstCategory'].nunique ())
# print (categories_drop_dupli ['idSecondCategory'].nunique ())
# print (categories_drop_dupli ['idThirdCategory'].nunique ())

In [30]:
# categories_drop_dupli.isnull ().sum ()

In [31]:
# categories_drop_dupli.info ()

In [32]:
# categories_drop_dupli ['product_id'].value_counts ()

In [33]:
# categories_drop_dupli [categories_drop_dupli ['product_id'] == '2851']

### merge

review_user_product_df, categories_drop_dupli

In [34]:
# total_merged_df = pd.merge (review_user_product_df, categories_drop_dupli, how = 'left', on = 'product_id')
# total_merged_df.head (3)

In [35]:
# print (len (review_user_product_df))
# print (len (total_merged_df))

In [36]:
# print (total_merged_df.columns)
# total_merged_df.head (1)

In [15]:
review_user_product_df.columns 

Index(['contents', 'created_at', 'is_evaluation', 'like_count', 'rating',
       'review_id', 'state', 'user_id', 'product_id', 'age', 'birth_year',
       'email', 'gender', 'is_blinded', 'is_closed', 'is_inactivated',
       'nickname', 'profile_image', 'rank_x', 'review_count', 'skin_type',
       'productTitle', 'price', 'volume', 'description', 'ratingAvg',
       'wishCount', 'reviewCount', 'firstCategoryText', 'idFirstCategory',
       'idSecondCategory', 'idThirdCategory', 'secondCategoryText',
       'thirdCategoryText', 'rank_y', 'brand', 'brandName'],
      dtype='object')

In [16]:
review_user_product_df.head (3)

Unnamed: 0,contents,created_at,is_evaluation,like_count,rating,review_id,state,user_id,product_id,age,...,reviewCount,firstCategoryText,idFirstCategory,idSecondCategory,idThirdCategory,secondCategoryText,thirdCategoryText,rank_y,brand,brandName
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,False,0,3,5416271,N,119763,100000,36.0,...,12.0,클렌징,7.0,32.0,112.0,페이셜클렌저,클렌징폼,386.0,"{'idBrand': None, 'brandTitle': '해피바스 (HAPPY B...",해피바스 (HAPPY BATH)
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,False,0,4,5340616,N,338669,100000,38.0,...,12.0,클렌징,7.0,32.0,112.0,페이셜클렌저,클렌징폼,386.0,"{'idBrand': None, 'brandTitle': '해피바스 (HAPPY B...",해피바스 (HAPPY BATH)
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,False,0,3,5228598,N,24862,100000,28.0,...,12.0,클렌징,7.0,32.0,112.0,페이셜클렌저,클렌징폼,386.0,"{'idBrand': None, 'brandTitle': '해피바스 (HAPPY B...",해피바스 (HAPPY BATH)


In [18]:
get_cols = ['contents', 'created_at', 'rating', 'user_id', 'product_id', 'age', 'gender', 'is_blinded', 'is_closed', 'is_inactivated', 'skin_type', 'productTitle', 'volume', 'price', 'brandName', 'idThirdCategory']

total_df = review_user_product_df [get_cols]
total_df.head (3)

Unnamed: 0,contents,created_at,rating,user_id,product_id,age,gender,is_blinded,is_closed,is_inactivated,skin_type,productTitle,volume,price,brandName,idThirdCategory
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,3,119763,100000,36.0,f,0.0,False,False,복합성,티컬렉션 그린티 미셀라 클렌징폼,175g,11000.0,해피바스 (HAPPY BATH),112.0
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,4,338669,100000,38.0,f,0.0,False,False,중성,티컬렉션 그린티 미셀라 클렌징폼,175g,11000.0,해피바스 (HAPPY BATH),112.0
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,3,24862,100000,28.0,f,0.0,False,False,복합성,티컬렉션 그린티 미셀라 클렌징폼,175g,11000.0,해피바스 (HAPPY BATH),112.0


In [19]:
total_df.info ()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1574817 entries, 0 to 1574816
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   contents         1574817 non-null  object 
 1   created_at       1574817 non-null  object 
 2   rating           1574817 non-null  int64  
 3   user_id          1574817 non-null  object 
 4   product_id       1574817 non-null  object 
 5   age              1472144 non-null  float64
 6   gender           1472144 non-null  object 
 7   is_blinded       1472144 non-null  float64
 8   is_closed        1472144 non-null  object 
 9   is_inactivated   1472144 non-null  object 
 10  skin_type        1472144 non-null  object 
 11  productTitle     1531012 non-null  object 
 12  volume           1531012 non-null  object 
 13  price            1531012 non-null  float64
 14  brandName        1531012 non-null  object 
 15  idThirdCategory  1531012 non-null  float64
dtypes: float64(4), int

### 결측치 삭제

1. 단순 삭제 모드

In [20]:
total_df.isnull ().sum ()

contents                0
created_at              0
rating                  0
user_id                 0
product_id              0
age                102673
gender             102673
is_blinded         102673
is_closed          102673
is_inactivated     102673
skin_type          102673
productTitle        43805
volume              43805
price               43805
brandName           43805
idThirdCategory     43805
dtype: int64

In [21]:
total_df ['gender'].value_counts (dropna = False)

f      1457621
NaN     102673
m        14523
Name: gender, dtype: int64

In [22]:
# 'age', 'productTitle', 'price', 'idThirdCategory' 컬럼에 결측치가 있으면 해당 row를 삭제

glowpick = total_df.dropna (subset = ['age', 'productTitle', 'price', 'idThirdCategory'], how = 'any', axis = 0)
glowpick.info ()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1431471 entries, 0 to 1574816
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   contents         1431471 non-null  object 
 1   created_at       1431471 non-null  object 
 2   rating           1431471 non-null  int64  
 3   user_id          1431471 non-null  object 
 4   product_id       1431471 non-null  object 
 5   age              1431471 non-null  float64
 6   gender           1431471 non-null  object 
 7   is_blinded       1431471 non-null  float64
 8   is_closed        1431471 non-null  object 
 9   is_inactivated   1431471 non-null  object 
 10  skin_type        1431471 non-null  object 
 11  productTitle     1431471 non-null  object 
 12  volume           1431471 non-null  object 
 13  price            1431471 non-null  float64
 14  brandName        1431471 non-null  object 
 15  idThirdCategory  1431471 non-null  float64
dtypes: float64(4), int

In [23]:
# 14만건 정도가 삭제됨
len (total_df) - len (glowpick)

143346

In [24]:
# glowpick [glowpick ['product_id'] == '19043']

Unnamed: 0,contents,created_at,rating,user_id,product_id,age,gender,is_blinded,is_closed,is_inactivated,skin_type,productTitle,volume,price,brandName,idThirdCategory
412450,너무너무 촉촉해서 좋아요 역시 최고입니당 ㅜㅜ 가격도 착해요 !!,2020-05-03T13:01:44Z,5,1366339,19043,26.0,m,0.0,False,False,복합성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
412451,엇? 바이오더마의 숨은 꿀템이!!??\r\n보습력까지 키워주구 거기다가 발림성까지 ...,2020-02-16T08:49:11Z,5,1333766,19043,22.0,m,0.0,False,False,민감성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
412452,온갖 립밤 다 써봤는데 이걸로 정착함\r\n말이 필요가 없다 써보세요,2018-10-11T06:54:28Z,5,1034229,19043,26.0,m,0.0,False,False,지성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
412453,몽쥬약국에서 2+1해서 샀는데 발라보니깐 예쁘게 기름지다라는 말이 어울리는 거 같아...,2017-08-20T15:06:19Z,5,664808,19043,25.0,m,0.0,False,False,지성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
412454,향도 좋고 부드럽게 잘 발린다. 보습도 괜찮은듯.. 바르다가 안 바르면 입술이 건조...,2017-07-21T18:23:44Z,5,722148,19043,24.0,m,0.0,False,True,중성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415321,파리가서........샀는데 거기직원이 프랑스인들은 유리아주보다 이거더쓴다하고 가격...,2015-08-28T09:26:02Z,1,60380,19043,28.0,f,0.0,False,True,건성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
415322,냄새만좋고 저한텐 효과 없었어요,2015-06-21T13:14:25Z,1,86223,19043,26.0,f,0.0,False,True,건성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
1392170,일단 나는 향이 거북했음\r\n그때가 아마 머리가 어지러웠서 더 향이 거북했던 거 ...,2020-05-07T16:08:54Z,3,1359528,19043,19.0,f,0.0,False,False,지성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0
1392171,이거슨 나의 인생템\r\n바ㅅ린 만큼은 아니지만 스틱형 립밤 치고 꽤 지속력 있고 ...,2020-05-07T08:16:05Z,5,1349460,19043,21.0,f,0.0,False,False,복합성,아토덤 립스틱,4g,15000.0,바이오더마 (BIODERMA),59.0


In [25]:
glowpick ['product_id'].unique ()

array(['100000', '1030', '100002', ..., '78226', '4', '5'], dtype=object)

In [26]:
glowpick [glowpick ['product_id'] == '46429']

Unnamed: 0,contents,created_at,rating,user_id,product_id,age,gender,is_blinded,is_closed,is_inactivated,skin_type,productTitle,volume,price,brandName,idThirdCategory


In [None]:
dfasfas

#### labeling하기 전 glowpick을 저장

In [49]:
# glowpick Dataframe을 저장하기 위해 index를 reset 시킴. 
# glowpick Dataframe의 결측치 rows 를 삭제 했으므로, index가 연속되지 않음. 그러면 저장 안됨
# glowpick.reset_index (drop = True, inplace = True)
# glowpick Dataframe을 저장
# glowpick.to_feather (FILE_PATH + 'glowpick_before_labeling.ftr')

In [30]:
# sparse_features로 사용할 features 설정
# glowpick의 sparse_feature 컬럼을 rename 하기 위한 dictionary (origin_dict) 생성 
origin_dict = {}
sparse_features = ["product_id", "user_id", "gender", "age", "skin_type", "idThirdCategory", ]
target = ['rating']

for feature in sparse_features:
    origin_dict [feature] = 'origin_' + feature
    
origin_dict

{'product_id': 'origin_product_id',
 'user_id': 'origin_user_id',
 'gender': 'origin_gender',
 'age': 'origin_age',
 'skin_type': 'origin_skin_type',
 'idThirdCategory': 'origin_idThirdCategory'}

In [31]:
# glowpick의 sparse_feature 컬럼을 rename 
glowpick.rename (origin_dict, axis = 'columns', inplace = True)
glowpick.head (3)

Unnamed: 0,contents,created_at,rating,origin_user_id,origin_product_id,origin_age,origin_gender,is_blinded,is_closed,is_inactivated,origin_skin_type,productTitle,volume,price,brandName,origin_idThirdCategory
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,3,119763,100000,36.0,f,0.0,False,False,복합성,티컬렉션 그린티 미셀라 클렌징폼,175g,11000.0,해피바스 (HAPPY BATH),112.0
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,4,338669,100000,38.0,f,0.0,False,False,중성,티컬렉션 그린티 미셀라 클렌징폼,175g,11000.0,해피바스 (HAPPY BATH),112.0
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,3,24862,100000,28.0,f,0.0,False,False,복합성,티컬렉션 그린티 미셀라 클렌징폼,175g,11000.0,해피바스 (HAPPY BATH),112.0


In [28]:
glowpick ['skin_type'].nunique ()

5

In [29]:
# 이전 labelEncoding
# from sklearn.preprocessing import LabelEncoder

# # 1.Label Encoding for sparse features,and process sequence features
# for feat in sparse_features:
#     lbe = LabelEncoder()
#     glowpick ['labeled' + feat] = lbe.fit_transform (glowpick [feat])
    
# glowpick.head (3)

In [32]:
from sklearn.preprocessing import LabelEncoder

# 1.Label Encoding for sparse features,and process sequence features
# sparse_feature 는 origin_dict 의 원래 컬럼명. ex) origin_user_id (origin_이 붙은 컬럼명) : user_id (원래 컬럼 명)
for feat, origin_feat in zip (sparse_features, origin_dict):
    lbe = LabelEncoder()
    glowpick [feat] = lbe.fit_transform (glowpick [origin_dict [origin_feat]])    
    
glowpick.head (3)

Unnamed: 0,contents,created_at,rating,origin_user_id,origin_product_id,origin_age,origin_gender,is_blinded,is_closed,is_inactivated,...,volume,price,brandName,origin_idThirdCategory,product_id,user_id,gender,age,skin_type,idThirdCategory
0,"티 컬렉션으로 출시되었던 제품으로, 가벼운 녹차향이 납니다. 향 자체는 좀 날리는 ...",2020-04-30T02:12:36Z,3,119763,100000,36.0,f,0.0,False,False,...,175g,11000.0,해피바스 (HAPPY BATH),112.0,0,7230,0,32,2,108
1,살짝 로션같이 짜지고 묽음.\r\n향은 독하지 않고 적절히 향긋함.\r\n거품잘남\...,2020-03-15T09:08:20Z,4,338669,100000,38.0,f,0.0,False,False,...,175g,11000.0,해피바스 (HAPPY BATH),112.0,0,22864,0,34,3,108
2,"해피바스는 무난하고 순한 매력이 있음! 다른것들도 잘 썼지만 정말 무난함,, 그치만...",2020-01-21T01:29:44Z,3,24862,100000,28.0,f,0.0,False,False,...,175g,11000.0,해피바스 (HAPPY BATH),112.0,0,19317,0,24,2,108


### glowpick Dataframe을 저장

In [53]:
# glowpick Dataframe을 저장하기 위해 index를 reset 시킴. 
# glowpick Dataframe의 결측치 rows 를 삭제 했으므로, index가 연속되지 않음. 그러면 저장 안됨
glowpick.reset_index (drop = True, inplace = True)
# glowpick Dataframe을 저장
# glowpick.to_feather (FILE_PATH + 'glowpick.ftr')

In [54]:
glowpick.isnull ().sum ()

contents           0
created_at         0
rating             0
user_id            0
product_id         0
age                0
gender             0
is_blinded         0
is_closed          0
is_inactivated     0
skin_type          0
productTitle       0
volume             0
price              0
brandName          0
idThirdCategory    0
dtype: int64

### 2. sequence_feature에 대한 SparseFeat 생성

In [55]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat (feat, glowpick [feat].nunique(), embedding_dim=4)
                          for feat in sparse_features]
fixlen_feature_columns

[SparseFeat(name='product_id', vocabulary_size=50761, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='product_id', group_name='default_group'),
 SparseFeat(name='user_id', vocabulary_size=74773, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_id', group_name='default_group'),
 SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='gender', group_name='default_group'),
 SparseFeat(name='age', vocabulary_size=67, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='age', group_name='default_group'),
 SparseFeat(name='skin_type', vocabulary_size=5, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='skin_type', group_name='default_group'),
 SparseFeat(name='idThirdCategory', vocabulary_size=286, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='idThirdCategory', group_name='default_group')]

In [56]:
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names (linear_feature_columns + dnn_feature_columns)

feature_names

['product_id', 'user_id', 'gender', 'age', 'skin_type', 'idThirdCategory']

### linear_feature_columns list 와 dnn_feature_columns list 를 저장

In [None]:
# import pickle

# with open (FILE_PATH + 'linear_feature_columns_list.pickle', 'wb') as fp:
#     pickle.dump (linear_feature_columns, fp)
    
# with open (FILE_PATH + 'dnn_feature_columns_list.pickle', 'wb') as fp:
#     pickle.dump (dnn_feature_columns, fp)

In [None]:
# with open (FILE_PATH + 'linear_feature_columns_list.pickle', 'rb') as fp:
#     linear_feature_columns = pickle.load (fp)
    
# with open (FILE_PATH + 'dnn_feature_columns_list.pickle', 'rb') as fp:
#     dnn_feature_columns = pickle.load (fp)

In [None]:
type (dnn_feature_columns)

### 3. input data 생성

In [58]:
# 3.generate input data for model
train, test = train_test_split (glowpick, test_size = 0.2)
train_model_input = {name: train [name] for name in feature_names}
test_model_input = {name: test [name] for name in feature_names}

In [None]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

### 4. Training

In [None]:
model = DeepFM (linear_feature_columns, dnn_feature_columns, task = 'regression', device = device)
model.compile ("adam", "mse", metrics=['mse'], )

In [None]:
history = model.fit (train_model_input, train [target].values,
                    batch_size = 256, epochs = 6, verbose = 2, validation_split = 0.2, )

### model 저장

In [None]:
# torch.save (model.state_dict (), MODEL_PATH)

In [None]:
model = DeepFM (linear_feature_columns, dnn_feature_columns, task = 'regression', device = device)
model.load_state_dict (torch.load (MODEL_PATH))
model.eval ()

### 05.25일 categories 데이터 에러였을때 ----------

In [None]:
history = model.fit (train_model_input, train[target].values,
                    batch_size = 256, epochs = 5, verbose = 2, validation_split = 0.2, )

In [None]:
# epoch 10
pred_ans = model.predict (test_model_input, batch_size = 256)
print("test MSE", round (mean_squared_error (test [target].values, pred_ans), 4))

In [None]:
from math import sqrt

rms = sqrt (mean_squared_error (test [target].values, pred_ans))
print (rms)

# 1.03

In [None]:
# epoch 5

pred_ans = model.predict (test_model_input, batch_size = 256)
print("test MSE", round(mean_squared_error(
    test [target].values, pred_ans), 4))

In [None]:
print ("test RMSE", round (sqrt (mean_squared_error (test [target].values, pred_ans)), 4))
# 0.98

### ---------- 05.25일 categories 데이터 에러였을때

### 5. 평가

In [None]:
# epoch 10
from math import sqrt

pred_ans = model.predict (test_model_input, batch_size = 256)

print("test MSE", round (mean_squared_error (test [target].values, pred_ans), 4))

print ("\ntest RMSE", sqrt (mean_squared_error (test [target].values, pred_ans)))

In [None]:
# epoch 6
from math import sqrt

pred_ans = model.predict (test_model_input, batch_size = 256)

print("test MSE", round (mean_squared_error (test [target].values, pred_ans), 4))

print ("\ntest RMSE", sqrt (mean_squared_error (test [target].values, pred_ans)))

### 실제 유저용 메서드

In [None]:
show_cols = ['created_at', 'rating', 'user_id', 'product_id', 'age', 'gender', 'is_closed', 'skin_type', 'productTitle', 'volume', 'price', 'brandName', 'idThirdCategory']

real_method_df = glowpick [show_cols]
real_method_df.head (10)

In [None]:
real_user_id = 7230

real_user_df = real_method_df [real_method_df ['user_id'] == real_user_id]
real_user_df

In [None]:
real_fixlen_feature_columns = [SparseFeat (feat, real_user_df [feat].nunique(), embedding_dim=4)
                          for feat in sparse_features]
real_fixlen_feature_columns

In [None]:
real_model_input = {name: real_user_df [name] for name in feature_names}

In [None]:
# epoch 10
from math import sqrt

pred_ans = model.predict (real_model_input, batch_size = 256)

print("test MSE", round (mean_squared_error (real_user_df [target].values, pred_ans), 4))

print ("\ntest RMSE", sqrt (mean_squared_error (real_user_df [target].values, pred_ans)))

In [None]:
pred_list = list (map (lambda x : round (x, 2), pred_ans.flatten ().tolist ()))   # 예측한 점수 리스트
target_list = real_user_df [target].values.flatten ().tolist ()  # 실제 점수 리스트

result_list = list (zip (pred_list, target_list))
result_list

In [None]:
type (real_method_df [target].values)

In [None]:
pred_ans.flatten ().tolist ()

In [None]:
real_user_df

In [None]:
user_review_indexs = list (real_method_df [real_method_df ['user_id'] == real_user_id].index)
print (user_review_indexs, '\n')
print (len (user_review_indexs))

In [None]:
glowpick.loc [5734]

In [None]:
glowpick.head ()

In [None]:
user_review_indexs

In [None]:
user_not_reviews_df = glowpick.loc [~glowpick.index.isin (user_review_indexs)]

user_not_reviews_df

In [None]:
len (glowpick) - len (user_review_indexs)

In [None]:
len (user_review_indexs)

In [None]:
all (user_index in user_not_reviews_list for user_index in user_review_indexs)    

In [None]:
user_not_reviews_list = list (user_not_reviews_df.index)
len (user_not_reviews_list)

In [None]:
user_not_reviews_list [: 10]

In [None]:
1 in user_not_reviews_list

In [None]:
user_review_indexs in user_not_reviews_list

In [None]:
len (real_user_df)

In [None]:
len (user_not_reviews_df ['product_id'])

In [None]:
print (real_user_df ['product_id'].unique ().tolist ())

In [None]:
unique_review_products = real_user_df ['product_id'].unique ().tolist ()
unique_review_products.sort ()
print (unique_review_products)

In [None]:
unique_not_review_products = user_not_reviews_df ['product_id'].unique ().tolist ()
unique_not_review_products.sort ()
print (unique_not_review_products)

In [None]:
print (user_not_reviews_df ['product_id'].unique ().tolist ())

In [None]:
len (glowpick.loc [~glowpick.index.isin (user_review_indexs)] ['product_id'].unique ())

In [None]:
user_not_reviews_df ['product_id'].value_counts ()

In [None]:
glowpick ['product_id'].nunique ()

In [None]:
len (real_method_df ['product_id'].unique ())

In [None]:

len (real_user_df ['product_id'].unique ())

In [None]:
real_method_df