In [110]:
from konlpy.tag import Kkma, Hannanum, Komoran, Mecab, Twitter
import sentencepiece as spm
import re
import string
from bs4 import BeautifulSoup
import urllib.request
import feedparser
import requests

In [2]:
def remove_whitespace(text):
    whitespace = re.compile(r'\s+')
    return whitespace.sub(r' ', text)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def clean_korean_emoji (text):
    em = re.compile(r'[ㅠㅜㅡ>ㄷ].[ㅠㅜㅡ<ㄷ]').findall(text)
    for thing in em:
        text = text.replace(thing, thing[0]+thing[-1])
    return text

def clean_repeated_chosung (text):
    em = re.compile(r'[ㄱ-ㅎㅏ-ㅣ]{3,}').findall(text)
    for thing in em:
        text = text.replace(thing, thing[:2])
    return text

def clean_number (text):
    num = re.compile(r'[0-9]+').findall(text)
    for thing in num:
        text = text.replace(thing, '<UNK>')
    return text

def clean_unicode (text):
    unicode = re.compile(r'(\xa0|\u200b)')
    return unicode.sub(r'', text)

def clean_html_elements (text):
    elems = re.compile(r'(&lt;|&gt;|&quo|%)')
    return elems.sub(r'', text)

def clean_DC_identifier (text):
    ids = re.compile(r'(dc|official|App|국내 최대 커뮤니티 포털 디시인사이드 힛갤러리 유저이슈 등 인터넷 트렌드 총 집합)')
    return ids.sub(r'', text)
def cleaning (text):
    return clean_DC_identifier(
                clean_unicode(
                            clean_repeated_chosung (
#                                 remove_punct (
                                    remove_emoji (
                                            clean_html_elements(
                                            remove_html (
                                                remove_whitespace (
                                                    remove_URL(
                                                text))))))))

In [3]:
parameter = '--input={} --model_prefix={} --vocab_size={} --user_defined_symbols={}'

input_file = 'dc_data_9.txt'
vocab_size = 10000
prefix = 'dc_lol'
user_defined_symbols = '[UNK]'
cmd = parameter.format(input_file, prefix, vocab_size,user_defined_symbols)

spm.SentencePieceTrainer.Train(cmd)

True

In [4]:
sp = spm.SentencePieceProcessor()
sp.Load("dc_lol.model")

sents = []

with open('./dc_data_9.txt', 'r') as f:
    for line in f:
        sents.append(line.strip())
f.close()
sents = list(map(lambda x: cleaning(x), sents))
sents = list(map(lambda x: sp.EncodeAsPieces(x), sents))

whole_sent = []
for sent in sents:
    sent = [w.replace('▁', '') for w in sent]
    sent = [w for w in sent if len(w) > 1]
    whole_sent.extend(sent)
    
import nltk
fd = nltk.FreqDist(whole_sent)

In [5]:
words = dict(fd)

In [6]:
fd.most_common(500)

[('ᄏᄏ', 293461),
 ('진짜', 31477),
 ('ᄅᄋ', 30915),
 ('인데', 24123),
 ('에서', 21560),
 ('존나', 20991),
 ('으로', 20737),
 ('하고', 17884),
 ('..', 17879),
 ('는데', 17617),
 ('...', 17589),
 ('하는', 17372),
 ('한테', 17082),
 ('슼갈', 16530),
 ('씨발', 16483),
 ('그냥', 14235),
 ('새끼', 13924),
 ('페이커', 13195),
 ('근데', 13058),
 ('시발', 12964),
 ('ᄃᄃ', 12860),
 ('새끼들', 12757),
 ('팩트', 12378),
 ('해서', 12101),
 ('아니', 11367),
 ('대깨맥', 10868),
 ('씨맥', 10268),
 ('우승', 10260),
 ('??', 10240),
 ('병신', 10203),
 ('하면', 10158),
 ('롤드컵', 10136),
 ('하네', 10097),
 ('^^', 9845),
 ('까지', 9648),
 ('소드', 9476),
 ('보다', 9414),
 ('는거', 9337),
 ('이랑', 9252),
 ('ᄋᄋ', 9086),
 ('라고', 9003),
 ('경기', 8723),
 ('이다', 8647),
 ('보고', 8631),
 ('이거', 8575),
 ('보면', 8398),
 ('!!', 8311),
 ('이네', 8303),
 ('미드', 8033),
 ('롤갤', 7672),
 ('이라', 7641),
 ('니까', 7555),
 ('선수', 7515),
 ('이건', 7509),
 ('는게', 7310),
 ('이지', 7309),
 ('ᄉᄇ', 7222),
 ('같은', 7212),
 ('쵸비', 7135),
 ('이게', 7119),
 ('ᄋᄌ', 7075),
 ('지금', 6987),
 ('수준', 6941),
 ('생각', 6919),
 

In [38]:
#국립국어대사전에 따라 해당 단어가 사전에 존재하는 지 확인.
def find_word (word):
    encText = urllib.parse.quote(word)
    url = "https://stdict.korean.go.kr/api/search.do?key=0D4613D9E093577855EB25A9F3C001AB&q=" + encText

    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    rescode = response.getcode()
    if(rescode==200):
        response_body = response.read()
        soup = BeautifulSoup(response_body, 'html.parser')
        return int(soup.find_all('total')[0].text)
    else:
        print("Error Code:" + rescode)
        return -1

In [61]:
from tqdm.notebook import tqdm
new_words = []
for word in tqdm(list(fd)):
    if '%' in word:
        continue
    if len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', word)) > 0 and not find_word(word):
        new_words.append(word)

HBox(children=(IntProgress(value=0, max=7042), HTML(value='')))




In [105]:
len(new_words)

3635

In [71]:
# Konlpy 상의 사전에 존재하지 않는 단어로 추리기.
dict_words = []
with open('./dic_system.txt', 'r', encoding = 'utf-8') as f:
    for line in f.readlines():
        dict_words.append(line.split('\t')[0])

new_words = [w for w in new_words if w not in dict_words]
len(new_words)

In [148]:
#Naver open API를 통해 백과사전에 등재되었는 지 여부 확인.
def find_word_naver (word, verbose = False):
    client_id = "O7cwe4q5L4CUKnE6c0Dr" # 애플리케이션 등록시 발급 받은 값 입력
    client_secret = "sRimQiEulv" # 애플리케이션 등록시 발급 받은 값 입력
    headers = {'X-Naver-Client-Id' : client_id, 'X-Naver-Client-Secret': client_secret}
    params = {'query': word, 'sort' :'count', 'display' : 100}
    BASE_URL = "https://openapi.naver.com/v1/search/encyc.xml"
    resp = requests.get(BASE_URL, params=params, headers=headers)
    if (resp.status_code == 200):
        css = feedparser.parse(resp.content)
        if verbose:
            print(css['entries'])
        return len(css['entries'])

    else:
        print("Error Code:" + str(resp.status_code))
        return -1

In [150]:
import time
final_words = []
for w in tqdm(new_words):
    if len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', w)) > 0 and not find_word_naver(w):
        final_words.append(w)
#     time.sleep(1)

HBox(children=(IntProgress(value=0, max=3635), HTML(value='')))

Error Code:429
Error Code:429
Error Code:429
Error Code:429
Error Code:429
Error Code:429
Error Code:429
Error Code:429
Error Code:429



In [155]:
import pickle
with open('dc_LOL_newWord_naver.pkl', 'wb') as f:
    pickle.dump(final_words, f)
with open('dc_LOL_newWord.pkl', 'wb') as f:
    pickle.dump(new_words, f)

In [154]:
len(final_words)
final_words

['롤갤',
 '씹덕',
 '페이커도',
 '는거',
 '개웃기네',
 '아ᄏᄏ',
 '나보지',
 '비씹덕',
 '슼갈새끼들',
 '월클',
 '좆으로',
 '넛신',
 '쳐먹',
 '개병신새끼',
 '좆병신',
 '노ᄏᄏ',
 '더샤이',
 '보여줬',
 '첫짤',
 '씨맥이랑',
 '슼한테',
 '슼갈',
 '네ᄏᄏ',
 '대깨맥',
 'ᄋᄌ한다',
 '욕처먹',
 '애미창년',
 '롤갤은',
 '씹소리',
 '잘생겼',
 '젠첩',
 '씨맥',
 '했누',
 '개좆슼',
 '해줬',
 '팀운',
 '개역겹네',
 '괴물쥐',
 '였누',
 '롤붕이들',
 '존나웃기네',
 '씹ᄏᄏ',
 '엌ᄏᄏᄏ',
 '보지년',
 '슼팬',
 '그미드',
 '씨발년아',
 '맥천지',
 '도없는',
 '까내리',
 '였노',
 '슼갈들',
 '대깨맥들',
 '미쳤노',
 '역겹네',
 '뽑았',
 '쉴드치',
 '챔폭',
 '어돈집',
 '씨발ᄏᄏ',
 '부리그',
 '씨맥이',
 '하네ᄏᄏ',
 '슼갈인데',
 '페독들',
 '처발',
 '대깨맥들은',
 '유일갈',
 'ᄋᄌ이지',
 '개역겹',
 '고있네',
 '틀타판',
 '헬피엔딩',
 '뎊맘',
 '는거보면',
 '마렵네',
 '소름돋',
 '긴하네',
 '개많',
 '무위키',
 '원챔',
 '것같',
 '슼선족',
 '는거랑',
 '줄알았',
 '애미뒤진',
 '지말라고',
 '엠생',
 '안읽',
 '슥갈',
 '는거지',
 '왜케',
 '억까',
 '고싶어서',
 '페까',
 '맥갈',
 '위딱',
 '긴함',
 '개빡',
 '지ᄏᄏ',
 '웃기노',
 '념글',
 '대깨맥들아',
 '리헨즈',
 '왤케',
 '맥뚜기',
 '솔킬',
 '틀타',
 '할말없',
 'ᄅᄋ루',
 '팡진이',
 '기방패',
 '깨맥',
 '잖음',
 '딱대',
 '트타쿠',
 '브실골',
 '지않',
 '겠노',
 '씨발련',
 '웃음벨',
 '주작질',
 '웃겼',
 '씹좆',
 '슼갈이',
 '슼마갤',
 '임ᄏᄏ',