## 공통 전처리

In [None]:
import re
import pandas as pd
from soynlp.normalizer import repeat_normalize

# https://mr-doosun.tistory.com/24 Stopwords BASE & Custom

# NOTE(review): both files are opened with the platform default encoding;
# confirm that this matches the encoding the stopword files were saved in.
with open('./data/stopwords_post_position.txt', 'r') as f:
    josa_lst = f.readlines()

with open('./data/stopword_conjunction.txt', 'r') as f:
    # The conjunctions live on the first line, separated by ", ".
    # readline() keeps the trailing newline, which would otherwise stay glued
    # to the last conjunction and keep it from ever matching, so every token
    # is stripped and empty tokens are dropped.
    conjunction_lst = [
        word.strip() for word in f.readline().split(', ') if word.strip()
    ]

# Flatten the post-position (particle) file into one list of suffixes.
# A line may hold several alternatives separated by '/'.
stopwords_pPosition = []
for josa in josa_lst:
    josa = re.sub('\n|\t', '', josa)
    # split('/') yields [josa] when there is no '/', so no special case is
    # needed; blank entries (e.g. from empty lines) are skipped.
    stopwords_pPosition.extend(word for word in josa.split('/') if word)

def pp_stopwords_pposition(txt, stopwords=None):
    """Strip a trailing post-position (particle) from every word of *txt*.

    The text is split on whitespace.  For each word the longest suffix that
    is contained in *stopwords* is cut off; words without such a suffix are
    kept unchanged.  A word that consists only of a stopword becomes an empty
    string, so the joined result can contain consecutive spaces.

    Args:
        txt: Text to clean.
        stopwords: Iterable of suffix strings.  Defaults to the module-level
            ``stopwords_pPosition`` list.

    Returns:
        The processed words joined with single spaces.
    """
    if stopwords is None:
        # Resolved at call time so the function always sees the current list.
        stopwords = stopwords_pPosition

    # A set gives O(1) membership tests; empty entries can never match a
    # suffix and would only distort the maximum length.
    suffixes = {stopword for stopword in stopwords if stopword}
    split_words = txt.split()
    if not suffixes:
        # Nothing to strip (also avoids max() on an empty sequence).
        return ' '.join(split_words)

    longest = max(map(len, suffixes))

    result = []
    for word in split_words:
        # Try the longest possible suffix first.  Lengths above len(word)
        # would just test the whole word again, so they are skipped.
        for length in range(min(longest, len(word)), 0, -1):
            if word[-length:] in suffixes:
                result.append(word[:-length])
                break
        else:
            # No suffix matched: keep the word as it is.
            result.append(word)

    return ' '.join(result)



def pp_stopwords_conjunction(txt, stopwords=None):
    """Remove conjunctions that appear as standalone words in *txt*.

    An occurrence counts as standalone when it is preceded by a space or the
    start of the text and followed by a space or the end of the text.  Every
    standalone occurrence is replaced by a single space; occurrences embedded
    in a longer word are left untouched.  The text is stripped only when at
    least one replacement was made.

    Args:
        txt: Text to clean.
        stopwords: Iterable of conjunction strings, matched literally.
            Defaults to the module-level ``conjunction_lst``.

    Returns:
        The text with standalone conjunctions removed.
    """
    if stopwords is None:
        # Resolved at call time so the function always sees the current list.
        stopwords = conjunction_lst

    for stopword in stopwords:
        # Cheap substring test first; empty entries would match everywhere.
        if not stopword or stopword not in txt:
            continue

        # "(?<![^ ])" = not preceded by a non-space character (start or space),
        # "(?![^ ])"  = not followed by a non-space character (end or space).
        # re.escape keeps regex metacharacters in the stopword literal.
        pattern = '(?<![^ ])' + re.escape(stopword) + '(?![^ ])'
        replaced, count = re.subn(pattern, ' ', txt)
        if count:
            txt = replaced.strip()

    return txt

def del_stopwords(txt):
    """Run the full cleaning pipeline on one review text.

    Steps, in order: remove standalone conjunctions, strip trailing
    post-positions from each word, replace every character outside the
    Hangul syllable range 가-힣 with a space (then strip the ends), and
    finally pass the text through soynlp's ``repeat_normalize`` with
    ``num_repeats=3``.

    Args:
        txt: Raw review text.

    Returns:
        The cleaned text.
    """
    without_conjunctions = pp_stopwords_conjunction(txt)
    without_particles = pp_stopwords_pposition(without_conjunctions)
    hangul_only = re.sub('[^가-힣]', ' ', without_particles).strip()
    return repeat_normalize(hangul_only, num_repeats=3)


In [None]:
# Load the review table and clean the text of every review.
data = pd.read_csv('./data/reivews_df_preprocssing_ver.csv')
data['content'] = data['content'].map(del_stopwords)

# KeyBERT

In [None]:
import numpy as np
import itertools

from konlpy.tag import Okt, Mecab
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Draw 6000 random reviews per game.
# NOTE(review): sample() is called without random_state, so every run draws a
# different subset; it raises if a game has fewer than 6000 reviews.
b_a = data[data['app_name'] == '블루아카이브'].sample(n=6000).reset_index()
n_k = data[data['app_name'] == '니케'].sample(n=6000).reset_index()
o_g = data[data['app_name'] == '원신'].sample(n=6000).reset_index()
d_s = data[data['app_name'] == '붕괴:스타레일'].sample(n=6000).reset_index()

# All sampled review texts per game.
b_a_all = b_a['content']
n_k_all = n_k['content']
o_g_all = o_g['content']
d_s_all = d_s['content']

# Sentiment subsets: score > 3 is treated as positive, score < 3 as negative;
# reviews with score == 3 belong to neither subset.
# Each negative subset is taken from the same game's frame as its positive
# counterpart, so the b_a / n_k negatives are not mixed up.
b_a_pos = b_a[b_a['score'] > 3]['content']
b_a_na = b_a[b_a['score'] < 3]['content']
n_k_pos = n_k[n_k['score'] > 3]['content']
n_k_na = n_k[n_k['score'] < 3]['content']
o_g_pos = o_g[o_g['score'] > 3]['content']
o_g_na = o_g[o_g['score'] < 3]['content']
d_s_pos = d_s[d_s['score'] > 3]['content']
d_s_na = d_s[d_s['score'] < 3]['content']

In [None]:
class CustomKeyBERT():
    """KeyBERT-style keyword extraction for a single document.

    Noun trigrams are extracted from the document twice, once from Okt
    part-of-speech tags and once from Mecab nouns.  The document and both
    candidate sets are embedded with a SentenceTransformer model, and
    keywords are picked by cosine similarity to the document, optionally
    diversified with Max Sum Similarity or Maximal Marginal Relevance.
    """

    def __init__(self, reviews, model):
        """Load the model and taggers, then embed document and candidates.

        Args:
            reviews: The document text to extract keywords from.
            model: Model name/path handed to ``SentenceTransformer``, or an
                already loaded ``SentenceTransformer`` instance, which is
                reused as-is so several extractors can share one model.
        """
        # The attribute keeps its original spelling so that code reading
        # ``obj.reivews`` keeps working.
        self.reivews = reviews
        if isinstance(model, SentenceTransformer):
            self.model = model
        else:
            self.model = SentenceTransformer(model)

        self.okt = Okt()
        self.mecab = Mecab()

        print('Preprocessing Start')
        self.preprocessing()
        print('Preprocessing Fin')

    def preprocessing(self):
        """Build the trigram candidates and compute all embeddings.

        Sets ``candidates1`` / ``candidates2`` (Okt / Mecab trigrams),
        ``doc_embedding`` and ``candidate_embeddings_okt`` /
        ``candidate_embeddings_mecab``.
        """
        # Keep only the tokens Okt tags as 'Noun'; Mecab returns nouns directly.
        okt_nouns = [word for word, tag in self.okt.pos(self.reivews) if tag == 'Noun']
        tokenized_okt = ' '.join(okt_nouns)
        tokenized_mecab = ' '.join(self.mecab.nouns(self.reivews))

        # Candidates are sequences of exactly three consecutive tokens.
        count_okt = CountVectorizer(ngram_range=(3, 3)).fit([tokenized_okt])
        count_mecab = CountVectorizer(ngram_range=(3, 3)).fit([tokenized_mecab])
        self.candidates1 = count_okt.get_feature_names_out()
        self.candidates2 = count_mecab.get_feature_names_out()

        # NOTE(review): the whole document is encoded as one input; if the
        # model truncates long inputs, the embedding only reflects the
        # beginning of the text - confirm against the model's settings.
        self.doc_embedding = self.model.encode([self.reivews])
        self.candidate_embeddings_okt = self.model.encode(self.candidates1)
        self.candidate_embeddings_mecab = self.model.encode(self.candidates2)

    def _candidates_for(self, tagger):
        """Return ``(embeddings, words)`` for *tagger*, or None if unknown."""
        if tagger == 'Okt':
            return self.candidate_embeddings_okt, self.candidates1
        if tagger == 'Mecab':
            return self.candidate_embeddings_mecab, self.candidates2
        return None

    def keyBert_Noraml_result(self, top_n):
        """Return the *top_n* candidates most similar to the document.

        Args:
            top_n: Number of keywords per tagger.

        Returns:
            Dict with keys ``'Okt'`` and ``'Mecab'``; each value lists the
            keywords in ascending order of similarity (best match last).
        """
        distances_okt = cosine_similarity(self.doc_embedding, self.candidate_embeddings_okt)
        distances_mecab = cosine_similarity(self.doc_embedding, self.candidate_embeddings_mecab)

        keywords_okt = [self.candidates1[index] for index in distances_okt.argsort()[0][-top_n:]]
        keywords_mecab = [self.candidates2[index] for index in distances_mecab.argsort()[0][-top_n:]]

        return {'Okt': keywords_okt, 'Mecab': keywords_mecab}

    def max_sum_sim(self, tagger, top_n, nr_candidates):
        """Pick *top_n* mutually dissimilar keywords (Max Sum Similarity).

        The *nr_candidates* candidates closest to the document are
        shortlisted, then every combination of *top_n* of them is scored and
        the one with the smallest summed pairwise similarity is returned.

        Args:
            tagger: ``'Okt'`` or ``'Mecab'``.
            top_n: Number of keywords to return; capped at the shortlist size.
            nr_candidates: Size of the shortlist.

        Returns:
            List of keywords, or an error message string when *tagger* is
            not recognised (kept for backward compatibility).
        """
        selected = self._candidates_for(tagger)
        if selected is None:
            return '올바른 형태소 분석기를 선택하세요.(Okt, Mecab)'
        candidate_embeddings, words = selected
        candidate_embeddings = np.asarray(candidate_embeddings)

        # Similarity between the document and every candidate.
        distances = cosine_similarity(self.doc_embedding, candidate_embeddings)

        # Shortlist: the nr_candidates candidates closest to the document.
        words_idx = list(distances.argsort()[0][-nr_candidates:])
        words_vals = [words[index] for index in words_idx]

        # Pairwise similarity among the shortlisted candidates only; the full
        # candidate x candidate matrix is never needed and can be very large.
        distances_candidates = cosine_similarity(candidate_embeddings[words_idx])

        # Asking for more keywords than are shortlisted would leave no
        # combination to evaluate.
        top_n = min(top_n, len(words_idx))

        # Exhaustive search for the least similar combination.
        min_sim = np.inf
        candidate = ()
        for combination in itertools.combinations(range(len(words_idx)), top_n):
            sim = sum(
                distances_candidates[i][j]
                for i in combination
                for j in combination
                if i != j
            )
            if sim < min_sim:
                candidate = combination
                min_sim = sim

        return [words_vals[idx] for idx in candidate]

    def mmr(self, tagger, top_n, diversity):
        """Pick *top_n* keywords with Maximal Marginal Relevance.

        Starts with the candidate closest to the document and then repeatedly
        adds the candidate maximising
        ``(1 - diversity) * sim(candidate, document)
        - diversity * max(sim(candidate, already chosen keyword))``.

        Args:
            tagger: ``'Okt'`` or ``'Mecab'``.
            top_n: Number of keywords to return; at least one keyword is
                always returned and at most as many as there are candidates.
            diversity: Weight of the redundancy penalty in the formula above.

        Returns:
            List of keywords in selection order, or an error message string
            when *tagger* is not recognised (kept for backward compatibility).
        """
        selected = self._candidates_for(tagger)
        if selected is None:
            return '올바른 형태소 분석기를 선택하세요.(Okt, Mecab)'
        candidate_embeddings, words = selected
        candidate_embeddings = np.asarray(candidate_embeddings)
        n_words = len(words)

        # Similarity of every candidate to the document, as a flat vector.
        word_doc_similarity = cosine_similarity(candidate_embeddings, self.doc_embedding)[:, 0]

        # The first keyword is the candidate closest to the document.
        first_idx = int(np.argmax(word_doc_similarity))
        keywords_idx = [first_idx]
        available = np.ones(n_words, dtype=bool)
        available[first_idx] = False

        # Highest similarity of each candidate to any chosen keyword.  It is
        # updated one keyword at a time, so the full candidate x candidate
        # matrix is never built.
        target_similarities = cosine_similarity(
            candidate_embeddings, candidate_embeddings[[first_idx]])[:, 0]

        for _ in range(min(top_n - 1, n_words - 1)):
            mmr = (1 - diversity) * word_doc_similarity - diversity * target_similarities
            # Already chosen keywords must not be picked again.
            mmr[~available] = -np.inf
            mmr_idx = int(np.argmax(mmr))

            keywords_idx.append(mmr_idx)
            available[mmr_idx] = False
            target_similarities = np.maximum(
                target_similarities,
                cosine_similarity(candidate_embeddings, candidate_embeddings[[mmr_idx]])[:, 0],
            )

        return [words[idx] for idx in keywords_idx]



In [None]:
# CustomKeyBERT(review document, model name)
MODEL_NAME = 'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens'

# Blue Archive (b_a_*): join each review subset into one document.
b_a_content_doc = ' '.join(b_a_all.tolist())
b_a_content_pos_doc = ' '.join(b_a_pos.tolist())
b_a_content_na_doc = ' '.join(b_a_na.tolist())

# One extractor per subset: all / positive / negative reviews.
b_a_KeyBERT = CustomKeyBERT(b_a_content_doc, MODEL_NAME)
b_a_KeyBERT_pos = CustomKeyBERT(b_a_content_pos_doc, MODEL_NAME)
b_a_KeyBERT_na = CustomKeyBERT(b_a_content_na_doc, MODEL_NAME)

# Print the same report for every subset, separated by one blank line.
report_sections = (
    ('코사인 유사도 - 전체', b_a_KeyBERT),
    ('코사인 유사도 - 긍정', b_a_KeyBERT_pos),
    ('코사인 유사도 - 부정', b_a_KeyBERT_na),
)
for section_no, (title, extractor) in enumerate(report_sections):
    if section_no:
        print()
    print(title)
    print(extractor.keyBert_Noraml_result(5))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MSS({tagger})')
        print(extractor.max_sum_sim(tagger, 5, 10))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MMR({tagger})')
        print(extractor.mmr(tagger, 5, 0.7))


In [None]:
# CustomKeyBERT(review document, model name)
MODEL_NAME = 'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens'

# NIKKE (n_k_*): join each review subset into one document.
content_doc = ' '.join(n_k_all.tolist())
content_pos_doc = ' '.join(n_k_pos.tolist())
content_na_doc = ' '.join(n_k_na.tolist())

# One extractor per subset: all / positive / negative reviews.
KeyBERT = CustomKeyBERT(content_doc, MODEL_NAME)
KeyBERT_pos = CustomKeyBERT(content_pos_doc, MODEL_NAME)
KeyBERT_na = CustomKeyBERT(content_na_doc, MODEL_NAME)

# Print the same report for every subset, separated by one blank line.
report_sections = (
    ('코사인 유사도 - 전체', KeyBERT),
    ('코사인 유사도 - 긍정', KeyBERT_pos),
    ('코사인 유사도 - 부정', KeyBERT_na),
)
for section_no, (title, extractor) in enumerate(report_sections):
    if section_no:
        print()
    print(title)
    print(extractor.keyBert_Noraml_result(5))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MSS({tagger})')
        print(extractor.max_sum_sim(tagger, 5, 10))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MMR({tagger})')
        print(extractor.mmr(tagger, 5, 0.7))


In [None]:
# CustomKeyBERT(review document, model name)
MODEL_NAME = 'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens'

# Genshin Impact (o_g_*): join each review subset into one document.
content_doc = ' '.join(o_g_all.tolist())
content_pos_doc = ' '.join(o_g_pos.tolist())
content_na_doc = ' '.join(o_g_na.tolist())

# One extractor per subset: all / positive / negative reviews.
KeyBERT = CustomKeyBERT(content_doc, MODEL_NAME)
KeyBERT_pos = CustomKeyBERT(content_pos_doc, MODEL_NAME)
KeyBERT_na = CustomKeyBERT(content_na_doc, MODEL_NAME)

# Print the same report for every subset, separated by one blank line.
report_sections = (
    ('코사인 유사도 - 전체', KeyBERT),
    ('코사인 유사도 - 긍정', KeyBERT_pos),
    ('코사인 유사도 - 부정', KeyBERT_na),
)
for section_no, (title, extractor) in enumerate(report_sections):
    if section_no:
        print()
    print(title)
    print(extractor.keyBert_Noraml_result(5))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MSS({tagger})')
        print(extractor.max_sum_sim(tagger, 5, 10))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MMR({tagger})')
        print(extractor.mmr(tagger, 5, 0.7))


In [None]:
# CustomKeyBERT(review document, model name)
MODEL_NAME = 'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens'

# Honkai: Star Rail (d_s_*): join each review subset into one document.
content_doc = ' '.join(d_s_all.tolist())
content_pos_doc = ' '.join(d_s_pos.tolist())
content_na_doc = ' '.join(d_s_na.tolist())

# One extractor per subset: all / positive / negative reviews.
KeyBERT = CustomKeyBERT(content_doc, MODEL_NAME)
KeyBERT_pos = CustomKeyBERT(content_pos_doc, MODEL_NAME)
KeyBERT_na = CustomKeyBERT(content_na_doc, MODEL_NAME)

# Print the same report for every subset, separated by one blank line.
report_sections = (
    ('코사인 유사도 - 전체', KeyBERT),
    ('코사인 유사도 - 긍정', KeyBERT_pos),
    ('코사인 유사도 - 부정', KeyBERT_na),
)
for section_no, (title, extractor) in enumerate(report_sections):
    if section_no:
        print()
    print(title)
    print(extractor.keyBert_Noraml_result(5))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MSS({tagger})')
        print(extractor.max_sum_sim(tagger, 5, 10))
    print()
    for tagger in ('Mecab', 'Okt'):
        print(f'MMR({tagger})')
        print(extractor.mmr(tagger, 5, 0.7))
