# 전처리 이용하지 않는, perplexity 최적화 토픽 수 찾기
# 리턴값 = [[토픽1의 토큰들], [토픽2의 토큰들], ... ]
# 작동 확인 시 순서대로 실행!

In [4]:
def sklda(plaintext, n_top_words=5, n_iter=30):
    from time import time
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    import pickle as pkl
    from kiwipiepy import Kiwi
    kiwi = Kiwi()
    kiwi.prepare()
      
    n_features = 1000
    doc = plaintext.replace('#','').split('HOTKEY123!@')

    print("\nExtracting kiwi features for LDA...")
    t0 = time()
    t1 = time()

    def tokenize_ko(doc):
        tokens = kiwi.tokenize(doc)
        tagset = {'VA-I',  'MAG', 'XR', 'NNP', 'NNG'}
        results = []
        for token in tokens:
            if token.tag in tagset:
                results.append(token.form)
        return results
    kiwi_vectorizer = CountVectorizer(min_df=2, max_features=n_features, tokenizer=tokenize_ko)
    kiwivoca = kiwi_vectorizer.fit_transform(doc)
    print("done in %0.3fs." % (time() - t0))

    print("\nFinding the optimal number of topics...")
    t0 = time()
    perplexity = []
    for i in range(2,6):
        lda = LatentDirichletAllocation(
            n_components=i,
            max_iter=n_iter,
            learning_method="online",
            learning_offset=50.0,
            random_state=0,
        )
        lda.fit(kiwivoca)
        perplexity.append(lda.perplexity(kiwivoca))
    n_topics=perplexity.index(min(perplexity))+2
    print("done in %0.3fs." % (time() - t0), f"the optimal number of topics is {n_topics}")

    print("\nFitting LDA models with KIWI features, n_features=%d, number of topics=%d, max_iter=%d" % (n_features, n_topics, n_iter))
    t0 = time()
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=n_iter,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    lda.fit(kiwivoca)

    kiwi_feature_names = kiwi_vectorizer.get_feature_names_out()
    topic_list = []
    for topic_idx, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [kiwi_feature_names[i] for i in top_features_ind]
        topic_list.append(top_features)
        print('Topic {}: {}'.format(topic_idx+1, ' '.join(top_features))) 
        
    print("done in %0.3fs." % (time() - t0))

    print("in total, %0.3fs." % (time() - t1))
    return topic_list

In [5]:
with open('월드컵_1203.txt','r',encoding='utf-8') as file:
    plaintext = file.read()

In [6]:
sklda(plaintext)


Extracting kiwi features for LDA...
done in 1.095s.

Finding the optimal number of topics...
done in 4.247s. the optimal number of topics is 3

Fitting LDA models with KIWI features, n_features=1000, number of topics=3, max_iter=30
Topic 1: 다시 붉은악마 기분 날 꿈
Topic 2: 월드컵 대한민국 강 진출 축구
Topic 3: 월드컵 스타 그램 유머 축구
done in 1.039s.
in total, 6.380s.


[['다시', '붉은악마', '기분', '날', '꿈'],
 ['월드컵', '대한민국', '강', '진출', '축구'],
 ['월드컵', '스타', '그램', '유머', '축구']]

# 전처리함수 이용하기, 원문반환 추가 필요

In [None]:
def sklda(plaintext, n_top_words=5, n_iter=30):
    from time import time
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    import pickle as pkl
    from kiwipiepy import Kiwi
    kiwi = Kiwi()
    kiwi.prepare()
      
    n_features = 1000
#   원문반환하는 전처리함수 처리 필요
    doc = preprocess(plaintext, sep='HOTKEY123!@#', 원문반환)

    print("\nExtracting kiwi features for LDA...")
    t0 = time()
    t1 = time()

    def tokenize_ko(doc):
        tokens = kiwi.tokenize(doc)
        tagset = {'VA-I',  'MAG', 'XR', 'NNP', 'NNG'}
        results = []
        for token in tokens:
            if token.tag in tagset:
                results.append(token.form)
        return results
    kiwi_vectorizer = CountVectorizer(min_df=2, max_features=n_features, tokenizer=tokenize_ko)
    kiwivoca = kiwi_vectorizer.fit_transform(doc)
    print("done in %0.3fs." % (time() - t0))

    print("\nFinding the optimal number of topics...")
    t0 = time()
    perplexity = []
    for i in range(2,6):
        lda = LatentDirichletAllocation(
            n_components=i,
            max_iter=n_iter,
            learning_method="online",
            learning_offset=50.0,
            random_state=0,
        )
        lda.fit(kiwivoca)
        perplexity.append(lda.perplexity(kiwivoca))
    n_topics=perplexity.index(min(perplexity))+2
    print("done in %0.3fs." % (time() - t0), f"the optimal number of topics is {n_topics}")

    print("\nFitting LDA models with KIWI features, n_features=%d, number of topics=%d, max_iter=%d" % (n_features, n_topics, n_iter))
    t0 = time()
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=n_iter,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    lda.fit(kiwivoca)

    kiwi_feature_names = kiwi_vectorizer.get_feature_names_out()
    topic_list = []
    for topic_idx, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [kiwi_feature_names[i] for i in top_features_ind]
        topic_list.append(top_features)
        print('Topic {}: {}'.format(topic_idx+1, ' '.join(top_features))) 
        
    print("done in %0.3fs." % (time() - t0))

    print("in total, %0.3fs." % (time() - t1))
    return topic_list

# 전처리함수

In [30]:
from kiwipiepy import Kiwi
from math import log1p
import numpy as np
import konlpy
import nltk
import re

def preprocess(plaintext, sep,
               morphemeAnalyzer='kiwi', targetMorphs=['NNP','NNG'],removehashtag=True, returnMorph=False,
               returnEnglishMorph=False, eeTagRule={'NNP':'NNP'},
               filterMorphemeAnalyzer='kiwi',filterTargetMorphs=['NNP','NNG','W_HASHTAG'],
               k_1Filter=1.5 ,bFilter=0.75,
               filterThreshold = 3.315):
    
    # 형태소 분석기를 텍스트로 정의 시 해당 형태소 분석기를 할당
    tma = set_morpheme_analyzer(morphemeAnalyzer)
    fma = set_morpheme_analyzer(filterMorphemeAnalyzer)
    
    # 서버에서 "글1 구분자 글2 구분자" 형식의 데이터를 받는다고 가정
    data = plaintext.split(sep)
    # 구분자로 데이터를 받으면 글이 존재하지 않는 마지막 부분을 삭제
    
    if data[-1] in ('',' ','\n',[],['\n'],[' ']):
        data=data[:-1]
    
    if removehashtag == True:
        if '#' in sep:
            sep = sep.replace('#','')
        tdata = plaintext.replace('#',' ').split(sep)
        if tdata[-1] in ('',' ','\n',[],['\n'],[' ']):
            tdata=tdata[:-1]
    else:
        tdata = data*1
            
    postLens = list()
    for post in data:
        postLens.append(len(post))

    flag = (morphemeAnalyzer==filterMorphemeAnalyzer, set(targetMorphs)==set(filterTargetMorphs))
    
    
    ftok = data_tokenize(data,fma,filterTargetMorphs,
                         returnMorph=False,
                         returnEnglishMorph=True,
                         eeTagRule={'W_HASHTAG':'W_HASHTAG'})
    ttok = data_tokenize(tdata,tma,targetMorphs,
                         returnMorph=returnMorph,
                         returnEnglishMorph=returnEnglishMorph, eeTagRule=eeTagRule)
    
    filterScores = BM25(ftok, postLens, k_1=k_1Filter, b=bFilter)
    
    spamCount=0
    for idx in range(len(filterScores)):
        if filterScores[idx] < filterThreshold:
            spamCount+=1
            ttok.pop(idx-spamCount)
    print("%s 개의 데이터가 삭제되었습니다."%spamCount)
    
    return ttok

def set_morpheme_analyzer(maText):
    if maText in ['kiwi','Kiwi','KIWI','키위']:
        return Kiwi()
    elif maText in ['Hannanum', 'hannanum', 'HANNANUM','한나눔']:
        return konlpy.tag.Hannanum()
    elif maText in ['Komoran','KOMORAN','komoran','코모란']:
        return konlpy.tag.Komoran()
    elif maText in ['Kkma','KKMA','kkma','꼬꼬마']:
        return konlpy.tag.Kkma()
    elif maText in ['Okt','OKT','okt','오픈코리안텍스트','트위터']:
        return konlpy.tag.Okt()
    elif maText in ['Mecab','mecab','MECAB','미캐브']:
        return konlpy.tag.Mecab()
    else:
        raise Exception('No such morpheme analyzer\nSupported morpheme analyzers are Kiwi, KoNLPy(Hannanum, Komoran, Kkma, Okt, Mecab)')
        
def data_tokenize(data,morphemeAnalyzer,targetMorph,
                  returnMorph=False,
                  returnEnglishMorph=False,
                  eeTagRule={'NNP':'NNP'}):
    
    returnData = list()
    
    maDir = dir(morphemeAnalyzer)
    if 'pos' in maDir:
        for post in data:
            selected = list()
            if returnEnglishMorph==True:
                eeTags, post = emoji_english_preprocess(post, tagRule=eeTagRule, returnMorph=returnMorph)
                selected += eeTags
            
            for tok in morphemeAnalyzer.pos(post):
                if tok[1] in targetMorph:
                    if returnMorph==True:
                        selected.append((tok[0],tok[1]))
                    else:
                        selected.append(tok[0])
            returnData.append(selected)
    
    elif 'tokenize' in maDir:
        for post in data:
            selected = list()
            if returnEnglishMorph==True:
                eeTags, post = emoji_english_preprocess(post, tagRule=eeTagRule, returnMorph=returnMorph)
                selected += eeTags
            
            for tok in morphemeAnalyzer.tokenize(post):
                if tok.tag in targetMorph:
                    if returnMorph==True:
                        selected.append((tok.form,tok.tag))
                    else:
                        selected.append(tok.form)
            returnData.append(selected)
    else:
        raise Exception('Not supported morpheme analyzer instance')
    return returnData

def BM25(data, postLens, k_1=1.5, b=0.75):
    avgPostLen = np.mean(postLens)
    
    N = len(data)
    
    n = dict()
    for post in data:
        uniqueToks = set(post)
        for tok in uniqueToks:
            try:
                n[tok]+=1
            except:
                n[tok] = 1
    
    IDF = dict()
    for tok in n.keys():
        IDF[tok] = log1p((N-n[tok]+0.5)/(n[tok]+0.5))


    filterScores = list()

    for postidx, post in enumerate(data):
        postScore = 0
        for tok in post:
            tokCount = post.count(tok)
            postScore += (IDF[tok] * (
                (tokCount*(k_1+1))/(
                    tokCount+(k_1*(1-b+(b*(postLens[postidx]/avgPostLen)))))))
        try:
            filterScores.append((postScore/len(post)))
        except:
            filterScores.append(0)

    return filterScores

def emoji_english_preprocess(post, tagRule={'NNP':'NNP'}, returnMorph=True):
    returnData = list()
    
    emojis = re.findall(':[_A-Za-z]+:',post)
    for emoji in set(emojis):
        emojiCounts = post.count(emoji)
        post = post.replace(emoji,'')
        returnData+=([(emoji,'EMJ')]*emojiCounts)
        
    hashTags = re.findall('#[_A-Za-z]',post)
    for hashTag in set(hashTags):
        hTagCounts = post.count(hashTag)
        post = post.replace(hashTag,'')
        returnData+=([(hashTag,'W_HASHTAG')]*hTagCounts)

    
    engChunks = re.findall('[A-Za-z]+[\' ]?[A-Za-z]+',post)
    for engChunk in set(engChunks):
        chunkCounts = post.count(engChunk)
        post = post.replace(engChunk,'')

        targetToken = list()
        for token in nltk.pos_tag(nltk.word_tokenize(engChunk)):
            if token[1] in tagRule: 
                targetToken.append((token[0],token[1]))
        returnData+=(targetToken*chunkCounts)

    filterData = list()
    for token in returnData:
        if token[1] in tagRule:
            if returnMorph==True:
                filterData.append((token[0],tagRule[token[1]]))
            else:
                filterData.append(token[0])
        
    return filterData, post
