In [1]:
from kiwipiepy import Kiwi
from math import log1p
import numpy as np
import konlpy
import nltk
import re
import emoji

In [2]:
def preprocess(plaintext, sep,
               morphemeAnalyzer='kiwi',morphemeAnalyzerParams=None, targetMorphs=['NNP','NNG'],
               returnEnglishMorph=False, EETagRule={'NNP':'NNP'},
               removeHashTag=True, returnPlain=False ,returnMorph=False,
               filterMorphemeAnalyzer='kiwi', filterMorphemeAnalyzerParams=None, filterTargetMorphs=['NNP','NNG','W_HASHTAG'],
               filterEnglishMorph=False, filterEETagRule={'NNP':'NNP'},
               k_1Filter=1.5 ,bFilter=0.75,filterThreshold = 3.315):
    """Split a corpus into posts and drop the posts whose BM25 score is too low.

    Naming convention used inside this function: names starting with ``t``
    hold the *target* data (what is handed back to the caller); names
    starting with ``f`` hold the *filter* data (used only for BM25 scoring).

    Args:
        plaintext: The whole corpus as one string.
        sep: Separator between posts. A trailing separator is tolerated.
        morphemeAnalyzer, morphemeAnalyzerParams: Name and options of the
            target analyzer, forwarded to ``setMorphemeAnalyzer``.
        targetMorphs, returnEnglishMorph, EETagRule, returnMorph: Accepted
            for the target tokenization, which this function does not
            perform; they are currently unused.
        removeHashTag: When true, every '#' in the returned posts is
            replaced by a space.
        returnPlain: When true, return the kept posts joined by ``sep``;
            otherwise return them as a list.
        filterMorphemeAnalyzer, filterMorphemeAnalyzerParams: Name and
            options of the analyzer used to tokenize posts for scoring.
        filterTargetMorphs: Tags whose tokens take part in the scoring.
        filterEnglishMorph, filterEETagRule: Currently unused.
        k_1Filter, bFilter: BM25 parameters forwarded to ``BM25``.
        filterThreshold: Posts scoring below this value are dropped.

    Returns:
        The kept posts, as one ``sep``-joined string when ``returnPlain`` is
        true and as a list of strings otherwise.

    Raises:
        Exception: If either analyzer name is not supported.
        ValueError: If ``sep`` is empty.
    """
    # The target analyzer is still constructed, although nothing below
    # tokenizes with it, so that an unsupported name keeps raising here.
    setMorphemeAnalyzer(morphemeAnalyzer, morphemeAnalyzerParams)
    fma = setMorphemeAnalyzer(filterMorphemeAnalyzer, filterMorphemeAnalyzerParams)

    # A trailing separator leaves one empty post at the end; drop it.
    data = plaintext.split(sep)
    if data[-1] == '':
        data = data[:-1]

    # Strip '#' per post, after splitting, so the target posts always line up
    # one-to-one with the scored posts no matter what the separator contains.
    if removeHashTag:
        tdata = [post.replace('#', ' ') for post in data]
    else:
        tdata = list(data)

    # BM25 normalises by the character length of the original posts.
    postLens = [len(post) for post in data]

    # NOTE(review): HEMEK_tokenize tags hashtags as 'R_W_HASHTAG', which this
    # rule does not map, so 'W_HASHTAG' in filterTargetMorphs looks like it
    # can never match -- confirm whether that is intended.
    ftok = data_tokenize(data,fma,filterTargetMorphs,
                         returnMorph=False,returnEnglishMorph=True,eeTagRule={'W_HASHTAG':'W_HASHTAG'})

    filterScores = BM25(ftok, postLens, k_1=k_1Filter, b=bFilter)

    # Keep everything that is not strictly below the threshold.
    keptPosts = [post for post, score in zip(tdata, filterScores)
                 if not score < filterThreshold]
    spamCount = len(tdata) - len(keptPosts)

    print("%s 개의 데이터가 삭제되었습니다."%spamCount)

    if returnPlain:
        return sep.join(keptPosts)
    return keptPosts

In [3]:
def data_tokenize(data,morphemeAnalyzer,
                  targetMorphs=['NNP','NNG'],
                  returnMorph=False,
                  returnEnglishMorph=False,
                  eeTagRule={'NLTK_NNP':'NNP','NLTK_NN':'NNG',
                             'R_W_HASHTAG':'W_HASHTAG'}):
    """Tokenize every post and keep only the tokens with a wanted tag.

    Args:
        data: Iterable of post strings.
        morphemeAnalyzer: Callable mapping a string to a sequence of tokens
            whose item 0 is the surface form and item 1 is the tag.
        targetMorphs: Tags to keep.
        returnMorph: When true, keep whole tokens; otherwise keep only the
            surface form (item 0).
        returnEnglishMorph: When true, tokenize with ``HEMEK_tokenize``
            (using ``nltkMA`` for non-Korean text) and rewrite tags through
            ``eeTagRule`` before filtering.
        eeTagRule: Tag rewrite table, applied only when
            ``returnEnglishMorph`` is true. Tags that are not keys are left
            unchanged.

    Returns:
        One list per post, in order, holding the kept tokens or forms.
    """
    if returnEnglishMorph:
        # One NLTK wrapper is enough for the whole corpus.
        englishAnalyzer = nltkMA()

        def tokenize(post):
            return HEMEK_tokenize(post, morphemeAnalyzer, englishAnalyzer)
    else:
        tokenize = morphemeAnalyzer

    returnData = []
    for post in data:
        partialReturn = []
        for tok in tokenize(post):
            if returnEnglishMorph and tok[1] in eeTagRule:
                # Rewritten in place so a returned token carries the new tag.
                tok[1] = eeTagRule[tok[1]]
            if tok[1] in targetMorphs:
                partialReturn.append(tok if returnMorph else tok[0])
        returnData.append(partialReturn)

    return returnData

In [4]:
# Filled on the first call to get_demojized_set(); None means "not built yet".
_DEMOJIZED_NAMES_CACHE = None


def get_demojized_set():
    """Return the set of English ``:name:`` codes found in ``emoji.EMOJI_DATA``.

    The names are pulled out of the textual representation of the table with
    a regular expression, so only ``'en'`` entries of the form ``:...:`` with
    no inner colon are collected.

    Building the set means stringifying the whole table, so the result is
    computed once and cached. This assumes ``emoji.EMOJI_DATA`` is not
    changed after the first call. Every call returns a fresh ``set`` copy,
    so callers may mutate what they get back.
    """
    global _DEMOJIZED_NAMES_CACHE
    if _DEMOJIZED_NAMES_CACHE is None:
        tableText = str(emoji.EMOJI_DATA.values())
        _DEMOJIZED_NAMES_CACHE = frozenset(
            re.findall("'en': '(:[^:]+:)'", tableText))
    return set(_DEMOJIZED_NAMES_CACHE)

In [5]:
class nltkMA:
    """Callable wrapper around NLTK's word tokenizer and POS tagger.

    Calling an instance with a string returns ``[word, tag]`` lists in which
    every tag is prefixed with ``morph_header``.
    """

    def __init__(self,
                 morph_header='NLTK_',
                 word_tokenize_language='english',
                 word_tokenize_preserve_line=False,
                 pos_tag_tagset=None,
                 pos_tag_lang='eng'):
        """Store the tag prefix and the options forwarded to NLTK.

        Args:
            morph_header: Prefix put in front of every tag.
            word_tokenize_language: ``language`` for ``nltk.word_tokenize``.
            word_tokenize_preserve_line: ``preserve_line`` for
                ``nltk.word_tokenize``.
            pos_tag_tagset: ``tagset`` for ``nltk.pos_tag``.
            pos_tag_lang: ``lang`` for ``nltk.pos_tag``.
        """
        self.morph_header = morph_header
        self.word_tokenize_language = word_tokenize_language
        self.word_tokenize_preserve_line = word_tokenize_preserve_line
        self.pos_tag_tagset = pos_tag_tagset
        self.pos_tag_lang = pos_tag_lang

    def __call__(self,text):
        """Tokenize and tag ``text``; return ``[word, prefixed_tag]`` lists."""
        words = nltk.word_tokenize(
            text,
            language=self.word_tokenize_language,
            preserve_line=self.word_tokenize_preserve_line)
        tagged = nltk.pos_tag(words,
                              tagset=self.pos_tag_tagset,
                              lang=self.pos_tag_lang)
        return [[pair[0], self.morph_header + pair[1]] for pair in tagged]

In [6]:
class setMorphemeAnalyzer:
    """Uniform callable front end for Kiwi and the KoNLPy taggers.

    The analyzer is selected by name. Calling the instance with a string
    returns a list of ``[item0, item1]`` lists built from the first two items
    of each token the underlying analyzer yields.
    """

    # Values used for options missing from ``maParamDict``. They are applied
    # only when a dict is given; with ``maParamDict=None`` the analyzer is
    # built with no arguments at all.
    _KIWI_DEFAULTS = {
        'num_workers': None,
        'model_path': None,
        'options': None,
        'integrate_allomorph': None,
        'load_default_dict': None,
        'load_typo_dict': None,
        'model_type': 'knlm',
        'typos': None,
        'typo_cost_threshold': 2.5,
    }
    _JVM_DEFAULTS = {'jvmpath': None, 'max_heap_size': 1024}
    _KOMORAN_DEFAULTS = {'jvmpath': None, 'userdic': None,
                         'modelpath': None, 'max_heap_size': 1024}
    _MECAB_DEFAULTS = {'dicpath': '/usr/local/lib/mecab/dic/mecab-ko-dic'}

    def __init__(self, maText,maParamDict=None):
        """Build the analyzer named ``maText``.

        Args:
            maText: Analyzer name; the accepted spellings are the ones
                listed in the branches below.
            maParamDict: Optional dict of constructor options. Keys the
                selected analyzer does not know are ignored; missing keys
                fall back to the class-level default tables.

        Raises:
            Exception: If ``maText`` names no supported analyzer.
        """
        if maText in ['kiwi','Kiwi','KIWI','키위']:
            self.ma = self._build(Kiwi, self._KIWI_DEFAULTS, maParamDict).tokenize

        elif maText in ['Hannanum', 'hannanum', 'HANNANUM','한나눔']:
            self.ma = self._build(konlpy.tag.Hannanum, self._JVM_DEFAULTS, maParamDict).pos

        elif maText in ['Komoran','KOMORAN','komoran','코모란']:
            self.ma = self._build(konlpy.tag.Komoran, self._KOMORAN_DEFAULTS, maParamDict).pos

        elif maText in ['Kkma','KKMA','kkma','꼬꼬마']:
            self.ma = self._build(konlpy.tag.Kkma, self._JVM_DEFAULTS, maParamDict).pos

        elif maText in ['Okt','OKT','okt','오픈코리안텍스트','트위터']:
            self.ma = self._build(konlpy.tag.Okt, self._JVM_DEFAULTS, maParamDict).pos

        elif maText in ['Mecab','mecab','MECAB','미캐브']:
            self.ma = self._build(konlpy.tag.Mecab, self._MECAB_DEFAULTS, maParamDict).pos

        else:
            raise Exception('No such morpheme analyzer\nSupported morpheme analyzers are Kiwi, KoNLPy(Hannanum, Komoran, Kkma, Okt, Mecab)')

    @staticmethod
    def _build(factory, defaults, params):
        """Call ``factory`` with ``params`` completed from ``defaults``."""
        if params is None:
            return factory()
        kwargs = {key: params.get(key, default) for key, default in defaults.items()}
        return factory(**kwargs)

    def __call__(self,text):
        """Analyze ``text`` and return ``[item0, item1]`` for every token."""
        return [[token[0], token[1]] for token in self.ma(text)]

In [7]:
# Patterns used by HEMEK_tokenize. They are raw strings so the backslashes
# reach the regex engine without Python's invalid-escape warning.
_HEMEK_EMOJI_RE = re.compile(r':[^: ]+:')
_HEMEK_HASHTAG_RE = re.compile(r'[#][^#@ ]+|#$')
_HEMEK_MENTION_RE = re.compile(r'[@][^#@ㄱ-ㅎ가-힣 ]+|@$')
# Runs of Hangul, digits, ASCII punctuation and spaces.
_HEMEK_KR_RE = re.compile(r"""[ㄱ-ㅎ가-힣0-9,./\\;'\[\]`\-=<>?:"{}|~!@#$%^&*()_+ ]+""")
# Empty, spaces only, or newlines only.
_HEMEK_BLANK_RE = re.compile(r'[ ]*|[\n]+')


def _hemek_split(text, spans, gapTag):
    """Cut ``text`` into ``[piece, tag]`` lists.

    ``spans`` holds ``(start, end, tag)`` tuples sorted by start and not
    overlapping. Each span becomes one piece with its own tag; the text
    between and around the spans becomes pieces tagged ``gapTag``.
    """
    pieces = []
    prevEnd = 0
    for start, end, tag in spans:
        if prevEnd != start:
            pieces.append([text[prevEnd:start], gapTag])
        pieces.append([text[start:end], tag])
        prevEnd = end
    if prevEnd != len(text):
        pieces.append([text[prevEnd:], gapTag])
    return pieces


def HEMEK_tokenize(text,KRmorphemeAnalyzer,NKRmorphemeAnalyzer):
    """Tokenize text that mixes emoji codes, hashtags, mentions and languages.

    Steps:
      1. ``:name:`` codes listed by ``get_demojized_set`` are cut out and
         tagged ``'R_W_EMJ'``.
      2. In the remaining text, hashtags are tagged ``'R_W_HASHTAG'`` and
         mentions ``'R_W_MENTION'``.
      3. Emoji codes that directly follow a hashtag or mention are appended
         to that hashtag or mention.
      4. What is left is split into runs of Hangul, digits, ASCII
         punctuation and spaces (``'KR_CHUNK'``) and everything else
         (``'NKR_CHUNK'``).
      5. ``'KR_CHUNK'`` pieces go through ``KRmorphemeAnalyzer`` and
         ``'NKR_CHUNK'`` pieces through ``NKRmorphemeAnalyzer``. Pieces that
         are empty, only spaces or only newlines are dropped.

    Args:
        text: The string to tokenize.
        KRmorphemeAnalyzer: Callable returning tokens whose item 0 is the
            form and item 1 the tag; used for ``'KR_CHUNK'`` pieces.
        NKRmorphemeAnalyzer: Same contract; used for ``'NKR_CHUNK'`` pieces.

    Returns:
        A list of ``[form, tag]`` lists in text order.
    """
    emojis = get_demojized_set()

    # 1. Known emoji codes.
    emojiSpans = [(found.start(), found.end(), 'R_W_EMJ')
                  for found in _HEMEK_EMOJI_RE.finditer(text)
                  if found.group() in emojis]
    chunks = _hemek_split(text, emojiSpans, 'CHUNK')

    # 2. Hashtags and mentions inside the non-emoji text.
    HEMc = []
    for chunk in chunks:
        if chunk[1] != 'CHUNK':
            HEMc.append(chunk)
            continue
        body = chunk[0]
        spans = [(found.start(), found.end(), 'R_W_HASHTAG')
                 for found in _HEMEK_HASHTAG_RE.finditer(body)]
        spans += [(found.start(), found.end(), 'R_W_MENTION')
                  for found in _HEMEK_MENTION_RE.finditer(body)]
        # The two patterns start on different characters, so sorting the
        # tuples orders them by their (unique) start offset.
        spans.sort()
        HEMc += _hemek_split(body, spans, 'CHUNK')

    # 3. Append directly following emoji codes to a hashtag or mention.
    merged = []
    absorbing = False
    for chunk in HEMc:
        if chunk[1] in ('R_W_HASHTAG','R_W_MENTION'):
            merged.append(chunk)
            absorbing = True
        elif chunk[1] == 'R_W_EMJ' and absorbing:
            merged[-1][0] += chunk[0]
        else:
            merged.append(chunk)
            absorbing = False

    # 4. Separate Korean-side runs from everything else.
    HEMEK = []
    for chunk in merged:
        if chunk[1] != 'CHUNK':
            HEMEK.append(chunk)
            continue
        krSpans = [(found.start(), found.end(), 'KR_CHUNK')
                   for found in _HEMEK_KR_RE.finditer(chunk[0])]
        HEMEK += _hemek_split(chunk[0], krSpans, 'NKR_CHUNK')

    # 5. Run the analyzers; tagged pieces pass through unchanged.
    result = []
    for piece, tag in HEMEK:
        if _HEMEK_BLANK_RE.fullmatch(piece):
            continue
        if tag == 'KR_CHUNK':
            result.extend([token[0], token[1]] for token in KRmorphemeAnalyzer(piece))
        elif tag == 'NKR_CHUNK':
            result.extend([token[0], token[1]] for token in NKRmorphemeAnalyzer(piece))
        else:
            result.append([piece, tag])

    return result

In [8]:
def BM25(data, postLens, k_1=1.5, b=0.75):
    """Score each post by the average BM25 weight of its own tokens.

    For every token occurrence in a post the BM25 term weight is computed
    with the post itself as the query, the weights are summed over all
    occurrences, and the sum is divided by the number of tokens in the post.

    Args:
        data: List of posts, each a list of hashable tokens.
        postLens: Length of each post, indexed like ``data``; used for the
            length normalisation.
        k_1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.

    Returns:
        A list with one score per post, in order. A post with no tokens
        scores ``0``.
    """
    from collections import Counter

    avgPostLen = np.mean(postLens)
    N = len(data)

    # Document frequency: in how many posts each token appears.
    docFreq = Counter(tok for post in data for tok in set(post))
    IDF = {tok: log1p((N - df + 0.5) / (df + 0.5)) for tok, df in docFreq.items()}

    filterScores = []
    for postidx, post in enumerate(data):
        if not post:
            filterScores.append(0)
            continue

        lengthNorm = k_1 * (1 - b + (b * (postLens[postidx] / avgPostLen)))
        termCounts = Counter(post)

        # Sum over occurrences, not unique tokens, so a repeated token adds
        # its weight once per occurrence.
        postScore = 0
        for tok in post:
            tokCount = termCounts[tok]
            postScore += IDF[tok] * ((tokCount * (k_1 + 1)) / (tokCount + lengthNorm))
        filterScores.append(postScore / len(post))

    return filterScores

In [9]:
def emoji_english_preprocess(post, tagRule={'NNP':'NNP'}, returnMorph=True):
    """Pull emoji codes, ASCII hashtags and English words out of a post.

    Three passes run in order, each removing what it matched from the post:
    ``:name:`` emoji codes (tagged ``'EMJ'``), hashtags made of ASCII letters
    and underscores (tagged ``'W_HASHTAG'``), then English chunks, which are
    tagged with NLTK. Tokens are reported in text order within each pass.

    Args:
        post: The post text.
        tagRule: Maps the tags to keep onto the tags to report. English
            tokens whose NLTK tag is not a key are dropped, and so are
            ``'EMJ'`` / ``'W_HASHTAG'`` tokens unless those tags are keys.
        returnMorph: When true, return ``(form, mapped_tag)`` tuples;
            otherwise return only the forms.

    Returns:
        ``(tokens, rest)`` where ``rest`` is the post with every matched
        emoji code, hashtag and English chunk removed.
    """
    emojiPattern = re.compile(r':[_A-Za-z]+:')
    hashTagPattern = re.compile(r'#[_A-Za-z]+')
    engPattern = re.compile(r"[A-Za-z]+[' ]?[A-Za-z]+")

    returnData = []

    returnData += [(name, 'EMJ') for name in emojiPattern.findall(post)]
    post = emojiPattern.sub('', post)

    returnData += [(tag, 'W_HASHTAG') for tag in hashTagPattern.findall(post)]
    post = hashTagPattern.sub('', post)

    engChunks = engPattern.findall(post)
    post = engPattern.sub('', post)

    # Tag each distinct chunk once and reuse the result for repeats.
    taggedByChunk = {}
    for engChunk in engChunks:
        if engChunk not in taggedByChunk:
            taggedByChunk[engChunk] = [
                (token[0], token[1])
                for token in nltk.pos_tag(nltk.word_tokenize(engChunk))
                if token[1] in tagRule]
        returnData += taggedByChunk[engChunk]

    filterData = []
    for token in returnData:
        if token[1] in tagRule:
            if returnMorph:
                filterData.append((token[0], tagRule[token[1]]))
            else:
                filterData.append(token[0])

    return filterData, post