# 1. 데이터 & 패키지 불러오기

## 1-1. 패키지

In [None]:
import pandas as pd
import numpy as np
import datetime
import konlpy
from ast import literal_eval
import time
import networkx as nx
import matplotlib.pyplot as plt
import re
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime
import copy
from tqdm import tqdm

## 1-2. 데이터
- (파일) `df_for_cbs_final` : 크롤링 데이터(날짜+제품정보+리뷰) & AMTL 결과 (감정 예측값 & attention 값)
- (파일) `product_dictionary` : 제품 & 제품군 매칭 정보
- (칼럼) `nva` : Noun & Verb & Adjective ( 리뷰 내의 명사/동사/형용사를 추출한 list )

In [None]:
# 1) Crawled data (date + product info + reviews) combined with the AMTL
#    results, plus the product <-> product-group lookup table.
df_for_cbs = pd.read_csv('df_for_cbs_final.csv')
product_dictionary = pd.read_csv('product_dictionary.csv')

# 2) Restore the column types that were flattened to text in the CSV.
df_for_cbs['date'] = pd.to_datetime(df_for_cbs['date'])
# `review_split` is left as the raw string; the parsed value is stored
# separately in `review_split2`.
df_for_cbs['review_split2'] = df_for_cbs['review_split'].map(literal_eval)
df_for_cbs['nva'] = df_for_cbs['nva'].map(literal_eval)
df_for_cbs['NVA_with_duplicate'] = df_for_cbs['NVA_with_duplicate'].map(literal_eval)

# 2. 전처리

## 2-1. `count_df` : 제품/일자 별 리뷰 개수

In [None]:
# Number of reviews per (product, date). The unstack/stack round trip
# materialises every product x date combination, filling absent ones with 0.
count_df = (
    df_for_cbs.groupby(['prod_nm', 'date'])
    .count()
    .unstack(fill_value=0)
    .stack()['rating']
    .to_frame()
    .reset_index()
    .set_index('prod_nm')
)
count_df.columns = ['date', 'count']

## 2-2. Functions

### `get_current_k (df, Y, M, D, K)`
- 데이터 프레임 : df
- 특정 날짜 (Y-M-D)로부터, 과거 K window만큼의 기간에만 해당하는 데이터 필터링

In [None]:
def get_current_k(df, Y, M, D, K=90):
    """Select the rows of ``df`` dated within the K days before Y-M-D.

    The window is half-open: rows with ``from_date <= date < eval_date`` are
    kept, so the evaluation day itself is excluded.

    Args:
        df: DataFrame with a ``date`` column comparable to ``datetime``.
        Y, M, D: Year, month and day of the evaluation date.
        K: Window length in days.

    Returns:
        Tuple ``(eval_date, filtered_df)`` where ``eval_date`` is a
        ``datetime.datetime`` at midnight of Y-M-D.
    """
    # Imported here under an alias so the function works regardless of what
    # the name `datetime` is bound to at file level.
    import datetime as dt

    window_end = dt.datetime(Y, M, D)
    window_start = window_end - dt.timedelta(days=K)
    in_window = (df['date'] >= window_start) & (df['date'] < window_end)
    return window_end, df[in_window]

### `sentiment_cooc`
- 감정 weight가 반영된 co-occurrence matrix 생성
- `aspects` : 15개의 속성
- `cat_detected` : 9개의 리스트를 담은 리스트
  - 9개의 리스트 = 9개의 (소) 카테고리 ... cat1~cat9
  - (대) 카테고리 : (cat1,cat2,cat3), (cat4), (cat5), (cat6,cat7), (cat8,cat9)


In [None]:
def sentiment_cooc(emotion, prod_name, sent=True):
    """Build the sentiment-weighted co-occurrence matrix of one product.

    The matrix is square with labels ``[prod_name] + aspects of the product's
    small category``. The product row/column holds the per-aspect sums over
    all reviews of the product; each aspect-aspect cell holds the sum of both
    aspect values over the reviews in which both values are non-zero. The
    diagonal is zero.

    Args:
        emotion: DataFrame with ``prod_nm``, ``small_category`` and one
            numeric column per aspect name listed in ``aspects`` below.
        prod_name: Product whose reviews are used.
        sent: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(sent_cooc, n, num_review)``: the matrix as a DataFrame, its
        number of columns, and the number of reviews of the product.

    Raises:
        KeyError: If ``prod_name`` has no rows or an aspect column is missing.
    """
    aspects = ['용량', '민감성', '향기', '커버력',
               '지속력', '피부톤', '보습감', '향', '사용감',
               '발림성', '세정력', '촉촉함', '유분기', '발색감', '제형']

    # One aspect list per small category, in small-category order; blank
    # lines separate the groups as laid out by the original author.
    cat_detected = [['민감성', '세정력', '촉촉함'],
                    ['민감성', '보습감', '향'],
                    ['민감성', '사용감', '발림성'],

                    ['커버력', '지속력', '피부톤'],

                    ['지속력', '피부톤', '촉촉함', '유분기', '발색감', '제형'],

                    ['지속력', '사용감', '발색감'],
                    ['지속력', '사용감', '발색감'],

                    ['향기', '민감성', '용량'],
                    ['향기', '민감성', '용량']]

    # 1) Detect which aspects apply to this product.
    prod_name_group = emotion.groupby('prod_nm').get_group(prod_name)
    # The 4th character of the label is read as the 1-based category number
    # (e.g. 'cat3' -> 3), so only single-digit categories are supported.
    small_cat_idx = int(prod_name_group['small_category'].values[0][3])
    small_cat_detected = cat_detected[small_cat_idx - 1]
    # Sum only the aspect columns: summing the whole frame would also try to
    # add up the text columns (brand, category, product name).
    detected_aspect_sent = prod_name_group[aspects].sum()[small_cat_detected]

    # 2) Co-occurrence matrix: product <-> aspect weights, zero diagonal.
    size = len(small_cat_detected) + 1
    cooc_values = np.ones((size, size))
    cooc_values[0, 1:] = np.array(detected_aspect_sent)
    cooc_values[1:, 0] = np.array(detected_aspect_sent)
    np.fill_diagonal(cooc_values, 0)

    labels = [prod_name] + small_cat_detected
    sent_cooc = pd.DataFrame(cooc_values, columns=labels, index=labels)

    # 3) Aspect <-> aspect weights for every unordered pair.
    aspect_values = prod_name_group[small_cat_detected]
    for i, aspect1 in enumerate(small_cat_detected):
        for aspect2 in small_cat_detected[i + 1:]:
            pair = aspect_values[[aspect1, aspect2]]
            # Count a review only when both aspects are non-zero in it.
            both_present = ((pair[aspect1] * pair[aspect2]) != 0).astype(int)
            val = (pair.sum(axis=1) * both_present).sum()

            sent_cooc.loc[aspect1, aspect2] = val
            sent_cooc.loc[aspect2, aspect1] = val

    n = len(sent_cooc.columns)
    num_review = prod_name_group.shape[0]

    return sent_cooc, n, num_review

### `connectivity_from_cooc`
- co-occurrence matrix로부터 connectivity 계산하기
- 사용한 connectivity 지표 : **betweenness centrality**

In [None]:
def connectivity_from_cooc(cooc_mat):
    """Compute betweenness centrality for every node of a co-occurrence matrix.

    The matrix is flattened into a (source, target, weight) edge list and
    loaded into a networkx graph. Each edge gets an ``inverse`` attribute
    (``1 / weight``, or ``1`` when the weight is missing or zero) which is
    used as the path length, so larger weights mean shorter distances.

    Args:
        cooc_mat: Square DataFrame whose index and columns are node labels.

    Returns:
        Dict mapping node label to normalized betweenness centrality
        (endpoints included).
    """
    edge_table = (
        cooc_mat.stack()
        .rename_axis(('source', 'target'))
        .reset_index(name='weight')
    )
    graph = nx.from_pandas_edgelist(edge_table, edge_attr=True)

    for _, _, attrs in graph.edges(data=True):
        has_usable_weight = 'weight' in attrs and attrs['weight'] != 0
        attrs['inverse'] = 1 / attrs['weight'] if has_usable_weight else 1

    return nx.betweenness_centrality(
        graph, normalized=True, weight='inverse', endpoints=True
    )

# 3. CBS & CBSx 계산
함수명 : `get_CBS` <br>
인자
- `df_filled` : 감정점수 & attention값이 모두 채워진 dataframe
- `start_date` ~ `end_date` : 점수 계산 대상 날짜 (`end_date` 당일은 포함되지 않음)
  - ex) 2020.01.01 ~ 2020.12.31 : 2020.01.01 ~ 2020.12.30의 총 365개 일별 점수가 계산됨
- `window` : 특정 날짜기준, 과거 window일 만큼의 리뷰를 점수 계산 대상으로 산정
  - ex) 계산날짜 : 2020.05.31 & window = 15 $\rightarrow$ 2020.05.16~2020.05.30 리뷰(계산날짜 당일 제외)를 사용하여 2020.05.31의 CBS 점수 계산
- `emotion_I` : CBS의 1번째 요소 (Impression)를 계산하는 데 사용할 가중치 (부정/중립/긍정 순서)
- `emotion_V` : CBS의 2번째 요소 (Variety)를 계산하는 데 사용할 가중치 (부정/중립/긍정 순서)

In [None]:
def _association_scores(emotion_frame, products):
    """Return one association score per product, or 0 where it cannot be computed."""
    scores = []
    for prod_nm in products:
        try:
            cooc_sent, n, num_review = sentiment_cooc(emotion_frame, prod_nm)
            conn = connectivity_from_cooc(cooc_sent)[prod_nm]
            scores.append(conn / ((n - 1) * (n - 2) / 2) * num_review)
        except Exception:
            # Deliberate best effort: a product whose graph cannot be built
            # scores 0. `Exception` (not a bare except) so that Ctrl-C still
            # interrupts a long run.
            scores.append(0)
    return scores


def get_CBS(df_filled, start_date, end_date, window, emotion_I=(-1, 0.5, 1), emotion_V=(-1, 0, 1)):
    """Compute the daily CBS / CBSx components for every product.

    For each day in ``[start_date, end_date)`` the reviews of the preceding
    ``window`` days are selected with ``get_current_k`` and three components
    are computed per product, each in a plain (``_cbs``) and a
    quality-weighted (``_cbsx``) variant: Impression, Variety, Association.

    Args:
        df_filled: Review DataFrame with sentiment and attention columns
            filled in. NOTE: the columns ``effortness``, ``validity`` and
            ``expressivity`` are added to this frame in place.
        start_date: First evaluation day (inclusive).
        end_date: Last evaluation day (exclusive).
        window: Number of past days of reviews used for each evaluation day.
        emotion_I: Weights applied to (negative, neutral, positive) overall
            sentiment for Impression.
        emotion_V: Weights applied to (negative, neutral, positive) aspect
            sentiment for Variety.

    Returns:
        DataFrame indexed by ``prod_nm`` with one row per product and day:
        the six component columns, ``count_window`` and ``date``.

    Raises:
        ValueError: From ``pd.concat`` when the date range yields no days.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported `timedelta` into the global namespace.
    from datetime import timedelta
    from tqdm import tqdm as progress_bar

    def daterange(start_date, end_date):
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)

    e1_i, e2_i, e3_i = emotion_I
    e1_v, e2_v, e3_v = emotion_V

    # Per-review quality ingredients. `review_split` is measured as a raw
    # string; the constant offsets (2 and 19) come from the original code.
    # NOTE(review): presumably they strip brackets/boilerplate - confirm.
    df_filled['effortness'] = df_filled['review_split'].apply(lambda x: len(x) - 2 - 19)
    df_filled['validity'] = df_filled['review_split2'].apply(lambda x: len(set(x)) / len(x))
    df_filled['expressivity'] = df_filled['NVA_with_duplicate'].apply(lambda x: len(set(x)) / len(x))

    cbs_per_day = []
    for single_date in progress_bar(daterange(start_date, end_date)):
        eval_date, df = get_current_k(
            df_filled, single_date.year, single_date.month, single_date.day, K=window
        )
        # Work on an explicit copy so the column assignments below never
        # target a slice of the caller's frame.
        df = df.copy()

        ############################################################################################
        ##### 1) Quality ( of every review )
        ############################################################################################
        # sigmoid(log(log(effortness))) scaled by validity and expressivity.
        df['Q_score'] = (1 / (1 + np.exp(-np.log(np.log(df['effortness']))))) * df['validity'] * df['expressivity']
        q_score = df['Q_score']

        ############################################################################################
        ##### 2-1) IMPRESSION
        ############################################################################################
        S1 = df[['overall_neg', 'overall_neu', 'overall_pos', 'prod_nm']].copy()
        S1['score'] = ((S1['overall_neg'] * e1_i) + (S1['overall_neu'] * e2_i) + (S1['overall_pos'] * e3_i)).astype('float')
        S1['score2'] = S1['score'] * q_score

        # Impression ( cbs & cbsx )
        score1_1 = S1.groupby('prod_nm')['score'].sum()
        score1_2 = S1.groupby('prod_nm')['score2'].sum()

        ############################################################################################
        ##### 2-2) VARIETY
        ############################################################################################
        cols = df.columns
        not_overall = ~cols.str.contains('overall')
        emotion_neg = df[cols[cols.str.contains('_neg') & not_overall]] * e1_v
        emotion_neu = df[cols[cols.str.contains('_neu') & not_overall]] * e2_v
        emotion_pos = df[cols[cols.str.contains('_pos') & not_overall]] * e3_v

        # Number of aspects with an attention value, counted before the NaNs
        # are replaced, divided by a per-big-category constant.
        weight_ATT = df[cols[cols.str.contains('ATT_')]].notna().sum(axis=1)
        deweight_ATT = df['big_category'].replace({'catA': 3, 'catB': 3, 'catC': 6, 'catD': 3, 'catE': 3})
        emotion_ATT = df[cols[cols.str.contains('ATT')]].fillna(0)

        emotion_weight = emotion_ATT.multiply(np.array(weight_ATT / deweight_ATT), axis=0)
        # Combined positionally, so the pos/neu/neg column blocks must be in
        # the same aspect order as the attention columns.
        emotion_value = emotion_pos.fillna(0).values + emotion_neu.fillna(0).values + emotion_neg.fillna(0).values
        emotion = pd.DataFrame(emotion_weight.values * emotion_value,
                               columns=emotion_weight.columns, index=emotion_weight.index)
        # Drop the 4-character column prefix so columns are bare aspect names.
        emotion.columns = [x[4:] for x in emotion.columns]
        emotion = pd.concat([df[['brand', 'small_category', 'prod_nm']], emotion], axis=1)

        # Quality-weighted copy: every float column is scaled by Q_score.
        emotion2 = emotion.copy()
        float_cols = emotion2.dtypes[emotion2.dtypes == 'float64'].keys()
        emotion2[float_cols] = emotion2[float_cols].multiply(np.asarray(q_score), axis=0)

        # VARIETY ( cbs & cbsx )
        # numeric_only=True keeps the text columns (brand, small_category)
        # out of the sums explicitly instead of relying on implicit dropping.
        score2_1 = emotion.groupby('prod_nm').sum(numeric_only=True).sum(axis=1)
        score2_2 = emotion2.groupby('prod_nm').sum(numeric_only=True).sum(axis=1)

        ############################################################################################
        ##### 2-3) ASSOCIATION
        ############################################################################################
        score3_1 = _association_scores(emotion, score2_1.index)    # cbs
        score3_2 = _association_scores(emotion2, score2_2.index)   # cbsx

        ############################################################################################
        ##### 3) COUNT ( in all window )
        ############################################################################################
        count_windows = df.groupby('prod_nm')['rating'].count()

        ############################################################################################
        ##### 4) COLLECT THE DAY'S RESULT
        ############################################################################################
        final = pd.DataFrame(dict(Impression_cbs=score1_1, Impression_cbsx=score1_2,
                                  Variety_cbs=score2_1, Variety_cbsx=score2_2,
                                  Association_cbs=score3_1, Association_cbsx=score3_2,
                                  count_window=count_windows))
        final['date'] = eval_date
        cbs_per_day.append(final)

    CBS = pd.concat(cbs_per_day)

    return CBS

# 4. 점수 계산 & 파일 저장

In [None]:
from datetime import date, timedelta

# Scoring period: one score per day in [start_date, end_date), each computed
# from the reviews of the preceding `window_size` days.
window_size = 90
start_date = date(2020, 8, 1)
end_date = date(2021, 8, 1)

# Daily component scores, joined with the per-day review counts.
cbs = get_CBS(df_for_cbs, start_date, end_date, window_size)
cbs = (
    cbs.reset_index()
    .merge(count_df.reset_index(), on=['prod_nm', 'date'])
    .set_index('prod_nm')
)

# Attach brand / category information and fix the output column order.
cbs_final = cbs.merge(product_dictionary, left_index=True, right_on='prod_nm')
cbs_final = cbs_final[[
    'date', 'brand', 'big_category', 'small_category', 'prod_nm',
    'count', 'count_window',
    'Impression_cbs', 'Impression_cbsx',
    'Variety_cbs', 'Variety_cbsx',
    'Association_cbs', 'Association_cbsx',
]]

# CBS / CBSx are the plain sums of their three components.
cbs_final['CBS'] = cbs_final['Impression_cbs'] + cbs_final['Variety_cbs'] + cbs_final['Association_cbs']
cbs_final['CBSx'] = cbs_final['Impression_cbsx'] + cbs_final['Variety_cbsx'] + cbs_final['Association_cbsx']

cbs_final.to_csv('final_cbs_{}.csv'.format(window_size), index=False)

365it [36:47,  6.05s/it]


In [None]:
# Display the final per-product, per-day score table.
cbs_final

Unnamed: 0,date,brand,big_category,small_category,prod_nm,count,count_window,Impression_cbs,Impression_cbsx,Variety_cbs,Variety_cbsx,Association_cbs,Association_cbsx,CBS,CBSx
250,2020-08-01,iope,catA,cat3,UV쉴드 선 스틱 SPF+PA++++,0,10,-1.5015,-1.073708,0.398417,0.284975,1.666667,1.666667,0.563584,0.877934
250,2020-08-02,iope,catA,cat3,UV쉴드 선 스틱 SPF+PA++++,0,10,-1.5015,-1.073708,0.398417,0.284975,1.666667,1.666667,0.563584,0.877934
250,2020-08-03,iope,catA,cat3,UV쉴드 선 스틱 SPF+PA++++,1,10,-1.5015,-1.073708,0.398417,0.284975,1.666667,1.666667,0.563584,0.877934
250,2020-08-04,iope,catA,cat3,UV쉴드 선 스틱 SPF+PA++++,0,11,-1.8290,-1.307289,0.517078,0.369607,1.833333,1.833333,0.521412,0.895651
250,2020-08-05,iope,catA,cat3,UV쉴드 선 스틱 SPF+PA++++,0,11,-1.8290,-1.307289,0.517078,0.369607,1.833333,1.833333,0.521412,0.895651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,2021-07-29,mamonde,catA,cat2,로즈 기획 구성,1,1,0.7595,0.549557,0.115667,0.083694,0.166667,0.166667,1.041833,0.799918
305,2021-07-30,mamonde,catA,cat2,로즈 기획 구성,0,2,1.4125,0.982925,0.217814,0.151484,0.333333,0.333333,1.963647,1.467743
305,2021-07-31,mamonde,catA,cat2,로즈 기획 구성,0,2,1.4125,0.982925,0.217814,0.151484,0.333333,0.333333,1.963647,1.467743
276,2021-07-30,laneige,catC,cat5,립 슬리핑 마스크 EX,0,1,0.0895,0.069742,-0.078164,-0.060909,0.066667,0.066667,0.078003,0.075500
