In [1]:
from konlpy.tag import Kkma, Hannanum, Komoran, Mecab, Twitter
import sentencepiece as spm
import re
import string
from bs4 import BeautifulSoup
import urllib.request
import feedparser
import requests
import pickle
from nltk.corpus import words
from tqdm.notebook import tqdm

In [2]:
parameter = '--input={} --model_prefix={} --vocab_size={} --user_defined_symbols={}'

input_file = 'dc_data_9.txt'
vocab_size = 6000
prefix = 'dc_lol'
user_defined_symbols = '[UNK]'
cmd = parameter.format(input_file, prefix, vocab_size,user_defined_symbols)

spm.SentencePieceTrainer.Train(cmd)

True

In [3]:
def remove_whitespace(text):
    whitespace = re.compile(r'\s+')
    return whitespace.sub(r' ', text)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def clean_korean_emoji (text):
    em = re.compile(r'[ㅠㅜㅡ>ㄷ].[ㅠㅜㅡ<ㄷ]').findall(text)
    for thing in em:
        text = text.replace(thing, thing[0]+thing[-1])
    return text

def clean_repeated_chosung (text):
    em = re.compile(r'[ㅋㄷㅎ]{3,}').findall(text)
    for thing in em:
        text = text.replace(thing, thing[:2])
    return text

def clean_number (text):
    num = re.compile(r'[0-9]+').findall(text)
    for thing in num:
        text = text.replace(thing, '<UNK>')
    return text

def clean_unicode (text):
    unicode = re.compile(r'(\xa0|\u200b)')
    return unicode.sub(r'', text)

def clean_html_elements (text):
    elems = re.compile(r'(&lt;|&gt;|&quo|%)')
    return elems.sub(r'', text)

def clean_DC_identifier (text):
    ids = re.compile(r'(dc|official|App|국내 최대 커뮤니티 포털 디시인사이드 힛갤러리 유저이슈 등 인터넷 트렌드 총 집합)')
    return ids.sub(r'', text)
def cleaning (text):
    return clean_DC_identifier(
                clean_unicode(
                            clean_repeated_chosung (
#                                 remove_punct (
                                    remove_emoji (
                                            clean_html_elements(
                                            remove_html (
                                                remove_whitespace (
                                                    remove_URL(
                                                text))))))))

In [4]:
sp = spm.SentencePieceProcessor()
sp.Load("dc_lol.model")

sents = []

with open('./dc_data_9.txt', 'r') as f:
    for line in f:
        sents.append(line.strip())
f.close()
sents = list(map(lambda x: cleaning(x), sents))
sents = list(map(lambda x: sp.EncodeAsPieces(x), sents))

whole_sent = []
for sent in sents:
    sent = [w.replace('▁', '') for w in sent]
    sent = [w for w in sent if len(w) > 1]
    whole_sent.extend(sent)
    
import nltk
fd = nltk.FreqDist(whole_sent)

In [5]:
with open('DC_LOL_preprocessed.pkl', 'wb') as f:
    pickle.dump(sents, f)

In [6]:
fd.most_common(500)

[('ᄏᄏ', 307955),
 ('진짜', 31478),
 ('ᄅᄋ', 29744),
 ('에서', 26058),
 ('인데', 25612),
 ('으로', 21902),
 ('는데', 21373),
 ('존나', 21156),
 ('하는', 20616),
 ('씨발', 19582),
 ('하고', 19324),
 ('...', 19095),
 ('한테', 18334),
 ('슼갈', 18092),
 ('..', 17891),
 ('새끼', 16373),
 ('시발', 15090),
 ('페이커', 14538),
 ('그냥', 14235),
 ('새끼들', 13864),
 ('팩트', 13711),
 ('아니', 13327),
 ('해서', 13176),
 ('근데', 13058),
 ('ᄃᄃ', 12896),
 ('병신', 12610),
 ('하면', 12540),
 ('는거', 12539),
 ('하네', 11931),
 ('우승', 11878),
 ('대깨맥', 11622),
 ('소드', 11378),
 ('씨맥', 11012),
 ('^^', 11003),
 ('보다', 10824),
 ('까지', 10627),
 ('??', 10371),
 ('이랑', 10334),
 ('롤드컵', 10136),
 ('으면', 9737),
 ('이다', 9687),
 ('라고', 9629),
 ('ᄋᄋ', 9557),
 ('생각', 9441),
 ('경기', 9310),
 ('보고', 9079),
 ('롤갤', 8950),
 ('는게', 8939),
 ('미드', 8901),
 ('이거', 8703),
 ('이네', 8637),
 ('이라', 8546),
 ('보면', 8472),
 ('니까', 8439),
 ('!!', 8360),
 ('하노', 8144),
 ('이지', 8063),
 ('선수', 7987),
 ('지랄', 7987),
 ('같은', 7931),
 ('하는거', 7694),
 ('들이', 7637),
 ('ᄉᄇ', 7543),
 ('이건', 7

In [7]:
#국립국어대사전에 따라 해당 단어가 사전에 존재하는 지 확인.
def find_word (word):
    encText = urllib.parse.quote(word)
    encKey = urllib.parse.quote('발급받은키를 넣으시오')
    url = "https://stdict.korean.go.kr/api/search.do?key="+ encKey + "&q=" + encText

    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    rescode = response.getcode()
    if(rescode==200):
        response_body = response.read()
        soup = BeautifulSoup(response_body, 'html.parser')
        return int(soup.find_all('total')[0].text)
    else:
        print("Error Code:" + rescode)
        return -1

In [8]:
from tqdm.notebook import tqdm
new_words = []
for word in tqdm(list(fd)):
    if '%' in word:
        continue
    if len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A30-9a-zA-Z]+', word)) > 0 and not find_word(word):
        new_words.append(word)

HBox(children=(IntProgress(value=0, max=3837), HTML(value='')))




In [9]:
len(new_words)

2053

In [10]:
# Konlpy 상의 사전에 존재하지 않는 단어로 추리기.
dict_words = []
with open('./dic_system.txt', 'r', encoding = 'utf-8') as f:
    for line in f.readlines():
        dict_words.append(line.split('\t')[0])

new_words = [w for w in new_words if w not in dict_words]
len(new_words)

1849

In [11]:
new_words

['보이노',
 '정도면',
 '아닌가',
 '피지컬',
 '인데',
 '하려고',
 '안됨',
 '하는',
 '맞는데',
 '롤갤',
 '이런애들',
 '봐도',
 '모르노',
 '재밋',
 '씹덕',
 '젠라도',
 '페이커',
 '하나도',
 '는거',
 '개웃기네',
 '세체원',
 'DRX',
 '새끼들',
 '도못',
 '좆망',
 '하는거',
 '보단',
 '좋은',
 '오면',
 '100',
 '쓰고',
 '넥서스',
 '거같은데',
 '놈이',
 '비씹덕',
 '분들',
 '슼갈새끼들',
 '아펠',
 '있노',
 '월클',
 '좆으로',
 '보이냐',
 '놓고',
 '하노',
 '큐베',
 '하는거냐',
 '넛신',
 '챌코',
 '데리고',
 '씨발',
 '먹는',
 '새끼들이',
 '아니고',
 '20',
 '쳐먹',
 '스파',
 '있으면',
 '모를',
 '겠다',
 '있었',
 '만들어',
 '겠지',
 '르고',
 '감독이',
 '진심으로',
 '범인찾기',
 '했다고',
 '개병신',
 '그걸',
 '당해서',
 '좆병신',
 '레전드',
 '노ᄏᄏ',
 '더샤이',
 '새키',
 '샤이',
 '데프트',
 '레고',
 '아직도',
 '잘하긴',
 '데프트가',
 '들어간',
 '같음',
 '첫짤',
 '존나',
 '사랑해',
 '씨맥이랑',
 '애들은',
 'skt',
 '부두술',
 '잘하는데',
 '슼한테',
 '못알아',
 '말을',
 '비슷한',
 '을때',
 '둘다',
 '티어',
 '했는데',
 '찐따',
 '이런거',
 '그당시',
 '롤드컵에서',
 '있을',
 '정도로',
 'op',
 '다이브',
 '젠갈',
 '존갈',
 '하니까',
 '슼갈',
 '네ᄏᄏ',
 '대깨맥',
 '상관없',
 '45',
 '고닉',
 '한다',
 '클리드',
 '비디디',
 '노페',
 '적으로',
 '하는데',
 '하네',
 '같은',
 '했음',
 '욕처먹',
 '틀딱',
 '아무것도',
 '아닌',
 '탑솔러',
 '비벼'

In [12]:
# Naver open API를 통해 백과사전에 등재되었는 지 여부 확인.
# 정확도에 대한 신빙성 (단어를 쳤을 때 의도하는 뜻이 아닌데 뜨면 거르도록 되어있어서) 이 검증되지 못해서 일단 보류.
def find_word_naver (word, verbose = False):
    client_id = "" # 애플리케이션 등록시 발급 받은 값 입력
    client_secret = "" # 애플리케이션 등록시 발급 받은 값 입력
    headers = {'X-Naver-Client-Id' : client_id, 'X-Naver-Client-Secret': client_secret}
    params = {'query': word, 'sort' :'count', 'display' : 100}
    BASE_URL = "https://openapi.naver.com/v1/search/encyc.xml"
    resp = requests.get(BASE_URL, params=params, headers=headers)
    if (resp.status_code == 200):
        css = feedparser.parse(resp.content)
        if verbose:
            print(css['entries'])
        if any(map(str.isdigit, word)):
            return len([1 for article in css['entries'] if word in article['title']])
        else:
            return len(css['entries'])
    else:
        print("Error Code:" + str(resp.status_code))
        return -1

In [18]:
import time
final_words = []
dictionary = words.words()

def isEnglish(word):
    try:
        return word.encode('ascii').isalpha()
    except:
        return False

threshold = 5
for w in tqdm(new_words):
    if len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3!@#$%^&*()~`_=+]+', w)) > 0 and find_word_naver(w) < threshold:
#     if len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', w)) > 0:
        final_words.append(w)
    elif isEnglish(w) and w not in dictionary:
        final_words.append(w)
    time.sleep(.03)

HBox(children=(IntProgress(value=0, max=1849), HTML(value='')))




In [19]:
def find_derivatives (mc, final_words):
    derivatives = []
    duplicates = []
    
    final_words = sorted(final_words)
    prev_morphs = mc.morphs(final_words[0])
    for i in range(1, len(final_words)):
        current = final_words[i]
        current_morphs = mc.morphs(current)
        if prev in current:
            derivatives.append((prev, current))
            duplicates.append(current)
        prev_morphs = current_morphs
        
    originals = [w for w in final_words if w not in duplicates]
    return derivatives, duplicates, originals
    
def check_list (word_list):
    prev = word_list[0]
    for i in range(1, len(word_list)):
        current = word_list[i]
        if prev in current:
            return False
        prev = current
    return True

In [20]:
#중복 단어를 빼는 과정에서 필요한 파생어들이 빠지는 사례 존재 (e.g. 세체 -> 세체원, 세체미, 세체탑 누락 등)
# duplicates = []
# derivatives = []
# while not check_list(final_words):
#     print("AA")
#     der, dup, final_words = find_derivatives(final_words)
#     derivatives.extend(der)
#     duplicates.extend(dup)

In [21]:
final_words.append('페이커')
with open('dc_LOL_newWord.pkl', 'wb') as f:
    pickle.dump(final_words, f)
# with open('dc_LOL_newWord_derivatives.pkl', 'wb') as f:
#     pickle.dump(derivatives, f)