In [1]:
from konlpy.tag import Kkma, Hannanum, Komoran, Mecab, Twitter
import sentencepiece as spm
import re
import string

In [2]:
def remove_whitespace(text):
    whitespace = re.compile(r'\s+')
    return whitespace.sub(r' ', text)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def clean_korean_emoji (text):
    em = re.compile(r'[ㅠㅜㅡ>ㄷ].[ㅠㅜㅡ<ㄷ]').findall(text)
    for thing in em:
        text = text.replace(thing, thing[0]+thing[-1])
    return text

def clean_repeated_chosung (text):
    em = re.compile(r'[ㄱ-ㅎㅏ-ㅣ]{3,}').findall(text)
    for thing in em:
        text = text.replace(thing, thing[:2])
    return text

def clean_number (text):
    num = re.compile(r'[0-9]+').findall(text)
    for thing in num:
        text = text.replace(thing, '<UNK>')
    return text

def clean_unicode (text):
    unicode = re.compile(r'(\xa0|\u200b)')
    return unicode.sub(r'', text)

def clean_html_elements (text):
    elems = re.compile(r'(&lt;|&gt;|&quo)')
    return elems.sub(r'', text)

def clean_DC_identifier (text):
    ids = re.compile(r'(dc|official|App|국내 최대 커뮤니티 포털 디시인사이드 힛갤러리 유저이슈 등 인터넷 트렌드 총 집합)')
    return ids.sub(r'', text)
def cleaning (text):
    return clean_DC_identifier(
                clean_unicode(
                            clean_repeated_chosung (
#                                 remove_punct (
                                    remove_emoji (
                                            clean_html_elements(
                                            remove_html (
                                                remove_whitespace (
                                                    remove_URL(
                                                text))))))))

In [3]:
parameter = '--input={} --model_prefix={} --vocab_size={} --user_defined_symbols={}'

input_file = 'dc_data_9.txt'
vocab_size = 10000
prefix = 'dc_lol'
user_defined_symbols = '[UNK]'
cmd = parameter.format(input_file, prefix, vocab_size,user_defined_symbols)

spm.SentencePieceTrainer.Train(cmd)

True

In [4]:
sp = spm.SentencePieceProcessor()
sp.Load("dc_lol.model")

sents = []

with open('./dc_data_9.txt', 'r') as f:
    for line in f:
        sents.append(line.strip())
f.close()
sents = list(map(lambda x: cleaning(x), sents))
sents = list(map(lambda x: sp.EncodeAsPieces(x), sents))

whole_sent = []
for sent in sents:
    sent = [w.replace('▁', '') for w in sent]
    sent = [w for w in sent if len(w) > 1]
    whole_sent.extend(sent)
    
import nltk
fd = nltk.FreqDist(whole_sent)

In [5]:
words = dict(fd)

In [6]:
fd.most_common(500)

[('ᄏᄏ', 293461),
 ('진짜', 31477),
 ('ᄅᄋ', 30915),
 ('인데', 24123),
 ('에서', 21560),
 ('존나', 20991),
 ('으로', 20737),
 ('하고', 17884),
 ('..', 17879),
 ('는데', 17617),
 ('...', 17589),
 ('하는', 17372),
 ('한테', 17082),
 ('슼갈', 16530),
 ('씨발', 16483),
 ('그냥', 14235),
 ('새끼', 13924),
 ('페이커', 13195),
 ('근데', 13058),
 ('시발', 12964),
 ('ᄃᄃ', 12860),
 ('새끼들', 12757),
 ('팩트', 12378),
 ('해서', 12101),
 ('아니', 11367),
 ('대깨맥', 10868),
 ('씨맥', 10268),
 ('우승', 10260),
 ('??', 10240),
 ('병신', 10203),
 ('하면', 10158),
 ('롤드컵', 10136),
 ('하네', 10097),
 ('^^', 9845),
 ('까지', 9648),
 ('소드', 9476),
 ('보다', 9414),
 ('는거', 9337),
 ('이랑', 9252),
 ('ᄋᄋ', 9086),
 ('라고', 9003),
 ('경기', 8723),
 ('이다', 8647),
 ('보고', 8631),
 ('이거', 8575),
 ('보면', 8398),
 ('!!', 8311),
 ('이네', 8303),
 ('미드', 8033),
 ('롤갤', 7672),
 ('이라', 7641),
 ('니까', 7555),
 ('선수', 7515),
 ('이건', 7509),
 ('는게', 7310),
 ('이지', 7309),
 ('ᄉᄇ', 7222),
 ('같은', 7212),
 ('쵸비', 7135),
 ('이게', 7119),
 ('ᄋᄌ', 7075),
 ('지금', 6987),
 ('수준', 6941),
 ('생각', 6919),
 

In [7]:
#Naver Open API 사용해보기
import urllib.request
client_id = "O7cwe4q5L4CUKnE6c0Dr" # 애플리케이션 등록시 발급 받은 값 입력
client_secret = "sRimQiEulv" # 애플리케이션 등록시 발급 받은 값 입력
encText = urllib.parse.quote("페이커")
url = "https://openapi.naver.com/v1/search/book?query=" + encText +"&sort=count"
request = urllib.request.Request(url)
request.add_header("X-Naver-Client-Id",client_id)
request.add_header("X-Naver-Client-Secret",client_secret)
response = urllib.request.urlopen(request)
rescode = response.getcode()
if(rescode==200):
    response_body = response.read()
    print(response_body.decode('utf-8'))
else:
    print("Error Code:" + rescode)

{
"lastBuildDate": "Thu, 30 Apr 2020 16:42:03 +0900",
"total": 1039,
"start": 1,
"display": 3,
"items": [
{
"title": "혼자 공부하는 <b>파이썬</b> (<b>파이썬</b> 최신 버전 반영)",
"link": "http://book.naver.com/bookdb/book_detail.php?bid=15028688",
"image": "https://bookthumb-phinf.pstatic.net/cover/150/286/15028688.jpg?type=m1&udate=20200321",
"author": "윤인성",
"price": "18000",
"discount": "16200",
"publisher": "한빛미디어",
"pubdate": "20190610",
"isbn": "1162241888 9791162241882",
"description": "1:1 과외하듯 배우는 <b>파이썬</b> 프로그래밍 자습서(<b>파이썬</b> 최신 버전 반영)\n이 책은 독학으로 프로그래밍 언어를 처음 배우려는 입문자가, 혹은 <b>파이썬</b>을 배우려는 입문자가 ‘꼭 필요한 내용을 제대로’ 학습할 수 있도록 구성했다. ‘무엇을’, ‘어떻게’ 학습해야 할지조차 모르는 입문자의 막연한 마음을 살펴... "

},
{
"title": "Do it! 점프 투 <b>파이썬</b>",
"link": "http://book.naver.com/bookdb/book_detail.php?bid=15052904",
"image": "https://bookthumb-phinf.pstatic.net/cover/150/529/15052904.jpg?type=m1&udate=20200418",
"author": "박응용",
"price": "18800",
"discount": "16920",
"publisher": "이지스퍼블리싱",
"pubdate": "20190620",
"isbn": "1163