In [None]:
!pip install contextualized-topic-models==2.2.0
!pip install pyldavis
 

# 공통 전처리

In [4]:
import warnings
# Globally ignore every warning raised after this point.
warnings.filterwarnings('ignore')

In [1]:
import re
import pandas as pd
from soynlp.normalizer import repeat_normalize

# https://mr-doosun.tistory.com/24 Stopwords BASE & Custom

# Load the raw stopword resources.
#   - post-position (josa) file: every line is read; a line may pack several
#     variants separated by '/'.
#   - conjunction file: only the first line is read; entries are separated by ', '.
with open('./data/stopwords_post_position.txt', 'r') as f:
    josa_lst = f.readlines()

with open('./data/stopword_conjunction.txt', 'r') as f:
    # Strip each entry so the line's trailing newline does not stay attached to
    # the last conjunction (it could then never match), and drop empty entries
    # (an empty pattern matches at every position of a text).
    conjunction_lst = [
        entry.strip() for entry in f.readline().split(', ') if entry.strip()
    ]

# Stopword handling: flatten the post-position lines into one list of suffixes.
stopwords_pPosition = []
for josa in josa_lst:
    josa = re.sub('\n|\t', '', josa)
    # split('/') returns [josa] when there is no '/', so no branch is needed.
    # Blank lines produce '' which can never match a suffix, so skip them.
    stopwords_pPosition.extend(word for word in josa.split('/') if word)

def pp_stopwords_pposition(txt, stopwords = stopwords_pPosition):
    """Strip a trailing post-position (josa) from every whitespace-separated word.

    For each word, the longest suffix found in ``stopwords`` is removed; words
    without a matching suffix are kept unchanged. A word that is itself a
    stopword becomes an empty string, so the joined result may contain
    consecutive spaces.

    Args:
        txt: Text to clean.
        stopwords: Iterable of suffix strings to strip. Defaults to the
            module-level ``stopwords_pPosition`` list.

    Returns:
        The processed words joined by single spaces.
    """
    # Computed once per call instead of once per word / per candidate suffix.
    suffixes = {suffix for suffix in stopwords if suffix}
    # default=0 keeps an empty stopword collection from raising ValueError.
    longest = max(map(len, suffixes), default=0)

    result = []
    for word in txt.split():
        # Candidate lengths above len(word) all slice to the whole word, so
        # capping at len(word) gives the same outcome with fewer lookups.
        for length in range(min(longest, len(word)), 0, -1):
            if word[-length:] in suffixes:
                word = word[:-length]
                break
        # Appended unconditionally so a word is never dropped, even when no
        # suffix length was tried at all.
        result.append(word)

    return ' '.join(result)



def pp_stopwords_conjunction(txt, stopwords = conjunction_lst):
    """Remove conjunction stopwords that appear as standalone words.

    An occurrence is removed only when it is delimited on both sides by a
    space character or by the start/end of the text. Occurrences embedded in a
    longer word are left untouched. Every removed occurrence is replaced by a
    single space, and the text is stripped after a removal.

    Args:
        txt: Text to clean.
        stopwords: Iterable of conjunction strings, treated as literal text.
            Defaults to the module-level ``conjunction_lst``.

    Returns:
        The text with standalone conjunctions removed.
    """
    for stopword in stopwords:
        # Skip empty entries: an empty pattern would match at every position.
        if not stopword or stopword not in txt:
            continue

        # Check the boundaries of *each* occurrence. Inspecting only the first
        # occurrence and then substituting all of them would also cut the
        # stopword out of longer words, and would miss a standalone occurrence
        # that follows an embedded one.
        # (?<![^ ]) = preceded by a space or the start of the text,
        # (?![^ ])  = followed by a space or the end of the text.
        # re.escape keeps regex metacharacters in a stopword literal.
        pattern = '(?<![^ ])' + re.escape(stopword) + '(?![^ ])'
        replaced, n_removed = re.subn(pattern, ' ', txt)
        if n_removed:
            txt = replaced.strip()

    return txt

def del_stopwords(txt):
    """Run the whole cleaning pipeline on one review text.

    Steps, in order:
        1. remove standalone conjunctions (``pp_stopwords_conjunction``),
        2. strip trailing post-positions (``pp_stopwords_pposition``),
        3. replace every character outside the Hangul-syllable range with a
           space and strip the ends,
        4. pass the result through soynlp's ``repeat_normalize`` with
           ``num_repeats=3``.

    Returns:
        The cleaned text.
    """
    without_conjunctions = pp_stopwords_conjunction(txt)
    without_josa = pp_stopwords_pposition(without_conjunctions)
    hangul_only = re.sub('[^가-힣]', ' ', without_josa).strip()
    return repeat_normalize(hangul_only, num_repeats=3)


In [2]:
# Load the review table, then replace its 'content' column with the cleaned text.
data = pd.read_csv('./data/reivews_df_preprocssing_ver.csv')
data = data.assign(content=data['content'].apply(del_stopwords))

## CTM(BERT-base)

CTM - ZeroShot & Mecab

In [7]:
from contextualized_topic_models.models.ctm import CombinedTM, ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Mecab, Okt
from tqdm import tqdm
import pandas as pd


class CumstomCTM():
    """Zero-shot Contextualized Topic Model pipeline over a collection of reviews.

    Intended call order: ``preprocessing`` -> ``modeling`` -> ``getTopics``.

    Attributes created along the way:
        preprocessed_documents: reviews kept by the filter in ``preprocessing``.
        train_bow_embeddings: bag-of-words matrix returned by ``CountVectorizer``.
        vocab: feature names of the fitted vectorizer.
        id2token: dict mapping vocabulary index -> token.
        ctm_zeroshot: the fitted ``ZeroShotTM`` (set by ``modeling``).
    """

    def __init__(self, reviews, tagger):
        """Store the inputs and build the tokenizer / data-preparation helpers.

        Args:
            reviews: Iterable of review texts.
            tagger: Object exposing ``nouns(text)``; it is wrapped in a
                ``CustomTokenizer``.
        """
        self.reviews = reviews
        self.custom_tokenizer = CustomTokenizer(tagger)
        self.qt = TopicModelDataPreparation()

    @staticmethod
    def _is_usable_document(line):
        """Return True when ``line`` is a string holding more than whitespace or digits."""
        # Missing values (NaN / None) are not strings and carry no text.
        if not isinstance(line, str):
            return False
        compact = ''.join(line.split())
        # Whitespace-only and number-only reviews contribute no vocabulary.
        return bool(compact) and not compact.isdecimal()

    def preprocessing(self, max_features):
        """Filter the reviews and build the bag-of-words representation.

        Args:
            max_features: Passed to ``CountVectorizer(max_features=...)``.
        """
        self.preprocessed_documents = [
            line for line in tqdm(self.reviews) if self._is_usable_document(line)
        ]

        vectorizer = CountVectorizer(tokenizer=self.custom_tokenizer, max_features=max_features)
        self.train_bow_embeddings = vectorizer.fit_transform(self.preprocessed_documents)
        self.vocab = vectorizer.get_feature_names_out()
        self.id2token = dict(enumerate(self.vocab))


    def modeling(self, MODEL_NAME, n_components, n_epochs):
        """Embed the kept documents and fit a ``ZeroShotTM``.

        Args:
            MODEL_NAME: Sentence-embedding model name handed to
                ``bert_embeddings_from_list``.
            n_components: Number of topics.
            n_epochs: Number of training epochs.
        """
        train_contextualized_embeddings = bert_embeddings_from_list(self.preprocessed_documents, MODEL_NAME)
        training_dataset = self.qt.load(train_contextualized_embeddings, self.train_bow_embeddings, self.id2token)

        self.ctm_zeroshot = ZeroShotTM(
            bow_size=len(self.vocab),
            # Read the embedding width from the data instead of hard-coding
            # 768, so models with another output size work as well.
            contextual_size=train_contextualized_embeddings.shape[1],
            n_components=n_components,
            num_epochs=n_epochs,
        )
        self.ctm_zeroshot.fit(training_dataset)


    def getTopics(self, n_top):
        """Return the fitted model's ``get_topics(n_top)`` result."""
        return self.ctm_zeroshot.get_topics(n_top)

class CustomTokenizer:
    """Callable tokenizer that keeps only the nouns longer than one character."""

    def __init__(self, tagger):
        """Remember the tagger; it must provide a ``nouns(text)`` method."""
        self.tagger = tagger

    def __call__(self, sent):
        """Return the nouns of ``sent`` whose length exceeds one character."""
        return list(filter(lambda noun: len(noun) > 1, self.tagger.nouns(sent)))

In [10]:
# One frame per game. reset_index() is called without drop=True, so the old
# row labels are kept in an extra 'index' column.
ba = data[data['app_name'] == '블루아카이브'].reset_index()
nk = data[data['app_name'] == '니케'].reset_index()
og = data[data['app_name'] == '원신'].reset_index()
ds = data[data['app_name'] == '붕괴:스타레일'].reset_index()

# All cleaned review texts of each game.
b_a, n_k, o_g, d_s = ba['content'], nk['content'], og['content'], ds['content']

# Split by rating: score > 3 goes to *_pos, score < 3 to *_na.
# Reviews with score == 3 belong to neither subset.
b_a_pos, b_a_na = ba.loc[ba['score'] > 3, 'content'], ba.loc[ba['score'] < 3, 'content']
n_k_pos, n_k_na = nk.loc[nk['score'] > 3, 'content'], nk.loc[nk['score'] < 3, 'content']
o_g_pos, o_g_na = og.loc[og['score'] > 3, 'content'], og.loc[og['score'] < 3, 'content']
d_s_pos, d_s_na = ds.loc[ds['score'] > 3, 'content'], ds.loc[ds['score'] < 3, 'content']

In [12]:
def getCTMResult(total_data, pos_data, na_data,
                 model_name='sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
                 n_components=20, n_epochs=50, max_features=2000, n_top=5):
    """Fit zero-shot CTMs for one game and print the top words of every topic.

    Six models are trained: {Mecab, Okt} tokenization x {all, positive,
    negative} reviews. Nothing is returned; the results are printed.

    Args:
        total_data: Series with all review texts of the game.
        pos_data: Series with the positive review texts.
        na_data: Series with the negative review texts.
        model_name: Sentence-embedding model passed to ``CumstomCTM.modeling``.
        n_components: Number of topics per model.
        n_epochs: Training epochs per model.
        max_features: Vocabulary size passed to ``CumstomCTM.preprocessing``.
        n_top: Number of words printed per topic.
    """
    taggers = {'Mecab': Mecab(), 'Okt': Okt()}
    subsets = {'TOTAL': total_data, 'POS': pos_data, 'NA': na_data}

    print('Create Class')
    models = {
        (tagger_name, subset_name): CumstomCTM(subset.values, tagger)
        for tagger_name, tagger in taggers.items()
        for subset_name, subset in subsets.items()
    }

    # Order in which the models are preprocessed and trained; it fixes the
    # order of the progress output.
    run_order = [
        ('Mecab', 'TOTAL'), ('Okt', 'TOTAL'),
        ('Mecab', 'POS'), ('Mecab', 'NA'),
        ('Okt', 'POS'), ('Okt', 'NA'),
    ]

    print('Preprocessing Class')
    for key in run_order:
        models[key].preprocessing(max_features)

    print('Modeling Class')
    for key in run_order:
        models[key].modeling(model_name, n_components, n_epochs)

    for tagger_name in taggers:
        print('::::::' + tagger_name + '::::::')
        for position, subset_name in enumerate(subsets):
            # A blank line separates the subsets inside one tagger section.
            if position:
                print()
            print(subset_name)
            print('TOPICS : ', models[(tagger_name, subset_name)].getTopics(n_top))



In [13]:
# Blue Archive (app_name '블루아카이브')
getCTMResult(b_a, b_a_pos, b_a_na)

Create Class
Preprocessing Class


100%|██████████| 9515/9515 [00:00<00:00, 121071.51it/s]
100%|██████████| 9515/9515 [00:00<00:00, 623448.40it/s]
100%|██████████| 6341/6341 [00:00<00:00, 725242.19it/s]
100%|██████████| 2673/2673 [00:00<00:00, 877740.12it/s]
100%|██████████| 6341/6341 [00:00<00:00, 876427.92it/s]
100%|██████████| 2673/2673 [00:00<00:00, 898203.38it/s]


Modeling Class


Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [470250/470250]	Train Loss: 38.48781980994152	Time: 0:00:02.778940: : 50it [02:56,  3.53s/it] 


Batches:   0%|          | 0/48 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [470250/470250]	Train Loss: 42.98229615750432	Time: 0:00:02.377092: : 50it [02:32,  3.04s/it] 


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [311900/311900]	Train Loss: 33.44626436664646	Time: 0:00:01.765880: : 50it [01:35,  1.92s/it] 


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [133300/133300]	Train Loss: 47.99953067490505	Time: 0:00:00.776477: : 50it [00:45,  1.10it/s] 


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [311900/311900]	Train Loss: 37.12793992199398	Time: 0:00:01.914696: : 50it [01:42,  2.05s/it] 


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [133300/133300]	Train Loss: 54.15951682317552	Time: 0:00:01.133211: : 50it [00:58,  1.18s/it] 


::::::Mecab::::::
TOTAL
TOPICS :  defaultdict(<class 'list'>, {0: ['수월', '소총', '주력', '야전', '색깔'], 1: ['최적화', '화면', '플레이', '출력', '업데이트'], 2: ['오류', '유저', '운영', '개발', '제재'], 3: ['스토리', '캐릭터', '개성', '보이스', '디자인'], 4: ['최고', '대한민국', '희망', '한국어', '시대'], 5: ['캐릭터', '과금', '확률', '캐릭', '픽업'], 6: ['업데이트', '다운로드', '화면', '애플리케이션', '접속'], 7: ['리뷰', '삭제', '호우', '세션', '지급'], 8: ['확률', '픽업', '히후', '천장', '리세'], 9: ['야전', '수월', '비나', '색깔', '서포터'], 10: ['게임', '아카이브', '블루', '최고', '인생'], 11: ['동일', '경험', '의도', '제발', '그것'], 12: ['개발', '김용하', '포항항', '발사', '운영'], 13: ['발열', '배터리', '엔진', '최적', '로딩'], 14: ['게임', '과금', '확률', '컨텐츠', '픽업'], 15: ['게임', '캐릭터', '스토리', '상성', '컨텐츠'], 16: ['서포터', '지향', '사라지', '수고', '우롱'], 17: ['등급', '최악', '최소한', '수고', '시도'], 18: ['사랑', '용하', '블루아', '블루', '우리'], 19: ['게임', '캐릭터', '스토리', '컨텐츠', '과금']})

POS
TOPICS :  defaultdict(<class 'list'>, {0: ['최고', '인생', '세상', '세기', '시대'], 1: ['관위', '친절', '선생', '테러', '천장'], 2: ['호우', '오류', '진행', '세션', '업데이트'], 3: ['하지', '구경', '열광', '소재', '행사'], 4: 

In [14]:
# NIKKE (app_name '니케')
getCTMResult(n_k, n_k_pos, n_k_na)

Create Class
Preprocessing Class


100%|██████████| 11339/11339 [00:00<00:00, 241353.62it/s]
100%|██████████| 11339/11339 [00:00<00:00, 332891.52it/s]
100%|██████████| 5033/5033 [00:00<00:00, 398210.44it/s]
100%|██████████| 5255/5255 [00:00<00:00, 379175.07it/s]
100%|██████████| 5033/5033 [00:00<00:00, 458722.09it/s]
100%|██████████| 5255/5255 [00:00<00:00, 757086.78it/s]


Modeling Class


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [560450/560450]	Train Loss: 48.20578443445714	Time: 0:00:04.181280: : 50it [02:50,  3.40s/it] 


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [560450/560450]	Train Loss: 53.61916445043725	Time: 0:00:03.500286: : 50it [04:00,  4.82s/it] 


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [246600/246600]	Train Loss: 33.26172103943713	Time: 0:00:01.477121: : 50it [01:27,  1.75s/it] 


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [261500/261500]	Train Loss: 61.40741258551924	Time: 0:00:04.507754: : 50it [02:47,  3.34s/it] 


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [246600/246600]	Train Loss: 36.811764306395595	Time: 0:00:01.481404: : 50it [01:46,  2.14s/it]


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [261500/261500]	Train Loss: 68.4885944782206	Time: 0:00:01.299762: : 50it [01:07,  1.35s/it] 

::::::Mecab::::::
TOTAL
TOPICS :  defaultdict(<class 'list'>, {0: ['게임', '운영', '유저', '중국', '개발'], 1: ['확률', '조작', '해명', '무료', '선착순'], 2: ['스토리', '캐릭터', '과금', '재미', '연기'], 3: ['영역', '소린', '모름', '버스터', '편의'], 4: ['과금', '게임', '재화', '컨텐츠', '확률'], 5: ['최고', '그래픽', '인생', '감사', '타격'], 6: ['업데이트', '접속', '파일', '다운로드', '로딩'], 7: ['편의', '방향', '영역', '버스터', '기회'], 8: ['에피소드', '상담', '오류', '캐릭터', '보상'], 9: ['오류', '운영', '유저', '확률', '조작'], 10: ['홍련', '사랑', '모름', '쥐똥', '버스터'], 11: ['천장', '가격', '사료', '보석', '현질'], 12: ['게임', '캐릭터', '스토리', '과금', '생각'], 13: ['게임', '과금', '유저', '확률', '개꿀'], 14: ['게임', '오류', '문제', '로딩', '유저'], 15: ['편의', '모름', '버스터', '독창', '소린'], 16: ['계정', '신청', '게스트', '아이디', '예약'], 17: ['최악', '쓰레기', '검열', '적대', '실망'], 18: ['편의', '버스터', '소린', '모름', '자폭'], 19: ['로딩', '화면', '전투', '터치', '스테이지']})

POS
TOPICS :  defaultdict(<class 'list'>, {0: ['게임', '로딩', '캐릭터', '오류', '생각'], 1: ['게임', '스토리', '캐릭터', '과금', '생각'], 2: ['오류', '캐릭터', '에피소드', '상담', '진행'], 3: ['길드', '올리', '평균점', '나머지', '정해주'], 4: ['올리',




In [15]:
# Honkai: Star Rail (app_name '붕괴:스타레일')
getCTMResult(d_s, d_s_pos, d_s_na)

Create Class
Preprocessing Class


100%|██████████| 6257/6257 [00:00<00:00, 133642.41it/s]
100%|██████████| 6257/6257 [00:00<00:00, 570219.02it/s]
100%|██████████| 3232/3232 [00:00<00:00, 305942.15it/s]
100%|██████████| 2699/2699 [00:00<00:00, 766551.09it/s]
100%|██████████| 3232/3232 [00:00<00:00, 776402.66it/s]
100%|██████████| 2699/2699 [00:00<00:00, 575341.86it/s]


Modeling Class


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [308350/308350]	Train Loss: 46.67097098293462	Time: 0:00:01.627893: : 50it [01:47,  2.16s/it] 


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [308350/308350]	Train Loss: 50.61202507581938	Time: 0:00:01.355133: : 50it [01:16,  1.53s/it] 


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [158200/158200]	Train Loss: 38.103279567096386	Time: 0:00:00.776660: : 50it [00:46,  1.08it/s]


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [133950/133950]	Train Loss: 57.252156894480215	Time: 0:00:01.006106: : 50it [00:39,  1.26it/s]


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [158200/158200]	Train Loss: 41.459155906166046	Time: 0:00:01.114482: : 50it [00:59,  1.19s/it]


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: [50/50]	 Seen Samples: [133950/133950]	Train Loss: 61.493081862110394	Time: 0:00:01.508462: : 50it [00:43,  1.15it/s]


::::::Mecab::::::
TOTAL
TOPICS :  defaultdict(<class 'list'>, {0: ['사료', '티켓', '보급', '고민', '무료'], 1: ['스토리', '캐릭터', '게임', '오역', '번역'], 2: ['한국어', '번역기', '제대', '엉망', '조선족'], 3: ['번역', '오역', '업체', '문제', '제대'], 4: ['밀면', '연관', '고비', '향상', '무조건'], 5: ['형상', '자막', '밀면', '한정판', '연관'], 6: ['연관', '형상', '작품', '각안', '밀면'], 7: ['연관', '형상', '이하', '차원', '일부'], 8: ['오류', '코드', '로그인', '기차', '화면'], 9: ['제발', '쓰레기', '연관', '밀면', '이해수'], 10: ['게임', '캐릭터', '스토리', '원신', '컨텐츠'], 11: ['원신', '확률', '붕괴', '재미', '스타'], 12: ['스토리', '스킵', '컨텐츠', '업데이트', '캐릭터'], 13: ['연관', '지금', '형상', '밀면', '역체'], 14: ['스토리', '오역', '번역', '캐릭터', '게임'], 15: ['원신', '붕괴', '레일', '스타', '캐릭터'], 16: ['스마트폰', '업데이트', '오류', '접속', '로그인'], 17: ['번역', '문제', '오역', '한국어', '정도'], 18: ['인생', '최고', '중국', '사랑', '버스'], 19: ['캐릭터', '원신', '게임', '전투', '붕괴']})

POS
TOPICS :  defaultdict(<class 'list'>, {0: ['스토리', '캐릭터', '컨텐츠', '과금', '그래픽'], 1: ['오류', '해결', '로그인', '부탁', '오역'], 2: ['신화', '심정', '융합', '정석', '안구'], 3: ['부담', '신화', '실장', '장신구', '지옥'], 4: ['오역'

In [None]:
# Shared settings for the manual per-game runs.
mecab, okt = Mecab(), Okt()
MODEL_NAME = 'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens'
n_components, n_epochs = 20, 50

# Blue Archive: one pipeline per (tokenizer, review subset) pair.
b_a_CTM_mecab, b_a_CTM_okt = CumstomCTM(b_a.values, mecab), CumstomCTM(b_a.values, okt)
b_a_CTM_pos_mecab, b_a_CTM_pos_okt = CumstomCTM(b_a_pos.values, mecab), CumstomCTM(b_a_pos.values, okt)
b_a_CTM_na_mecab, b_a_CTM_na_okt = CumstomCTM(b_a_na.values, mecab), CumstomCTM(b_a_na.values, okt)

# Build the bag-of-words input for every pipeline (2000 features each).
for _ctm in (b_a_CTM_mecab, b_a_CTM_okt,
             b_a_CTM_pos_mecab, b_a_CTM_na_mecab,
             b_a_CTM_pos_okt, b_a_CTM_na_okt):
    _ctm.preprocessing(2000)

# Only the two all-review models are trained in this cell.
for _ctm in (b_a_CTM_mecab, b_a_CTM_okt):
    _ctm.modeling(MODEL_NAME, n_components, n_epochs)

for _label, _ctm in (('Mecab', b_a_CTM_mecab), ('Okt', b_a_CTM_okt)):
    print(_label)
    print('TOPICS : ', _ctm.getTopics(5))


In [None]:
# NIKKE (app_name '니케'): fit a Mecab-tokenized zero-shot CTM on all reviews
# and append its top-5 topic words to the result log.
n_k_CTM = CumstomCTM(n_k.values, mecab)
n_k_CTM.preprocessing(2000)
n_k_CTM.modeling(MODEL_NAME, n_components, n_epochs)
n_k_topics = n_k_CTM.getTopics(5)
print('TOPICS : ', n_k_topics)

# One tab-terminated "Topic<id> : w1,w2,..." entry per topic.
top5Topics = ''.join(
    'Topic' + str(key) + ' : ' + ','.join(value) + '\t'
    for key, value in n_k_topics.items()
)
write_txt = MODEL_NAME + '+n_components=' + str(n_components) + '+topics5_' + top5Topics

# Actually persist the line: the summary string used to be built inside the
# `with` block but was never written to the opened file.
with open('./reuslt/CTM_n_k.txt', 'a', encoding='utf-8') as f:
    f.write(write_txt + '\n')

In [None]:
# Genshin Impact (app_name '원신'): fit a Mecab-tokenized zero-shot CTM on all
# reviews and append its top-5 topic words to the result log.
o_g_CTM = CumstomCTM(o_g.values, mecab)
o_g_CTM.preprocessing(2000)
o_g_CTM.modeling(MODEL_NAME, n_components, n_epochs)
o_g_topics = o_g_CTM.getTopics(5)
print('TOPICS : ', o_g_topics)

# One tab-terminated "Topic<id> : w1,w2,..." entry per topic.
top5Topics = ''.join(
    'Topic' + str(key) + ' : ' + ','.join(value) + '\t'
    for key, value in o_g_topics.items()
)
write_txt = MODEL_NAME + '+n_components=' + str(n_components) + '+topics5_' + top5Topics

# Actually persist the line: the summary string used to be built inside the
# `with` block but was never written to the opened file.
with open('./reuslt/CTM_o_g.txt', 'a', encoding='utf-8') as f:
    f.write(write_txt + '\n')

In [None]:
# Honkai: Star Rail (app_name '붕괴:스타레일'): fit a Mecab-tokenized zero-shot
# CTM on all reviews and append its top-5 topic words to the result log.
d_s_CTM = CumstomCTM(d_s.values, mecab)
d_s_CTM.preprocessing(2000)
d_s_CTM.modeling(MODEL_NAME, n_components, n_epochs)
d_s_topics = d_s_CTM.getTopics(5)
print('TOPICS : ', d_s_topics)

# One tab-terminated "Topic<id> : w1,w2,..." entry per topic.
top5Topics = ''.join(
    'Topic' + str(key) + ' : ' + ','.join(value) + '\t'
    for key, value in d_s_topics.items()
)
write_txt = MODEL_NAME + '+n_components=' + str(n_components) + '+topics5_' + top5Topics

# Actually persist the line: the summary string used to be built inside the
# `with` block but was never written to the opened file.
with open('./reuslt/CTM_d_s.txt', 'a', encoding='utf-8') as f:
    f.write(write_txt + '\n')