In [70]:
from konlpy.tag import Kkma, Hannanum, Komoran, Mecab, Twitter
import sentencepiece as spm
import re
import string
from bs4 import BeautifulSoup
import urllib.request
import feedparser
import requests
from collections import defaultdict
from operator import itemgetter
from tqdm.notebook import tqdm
from itertools import product
import pandas as pd
import pickle

In [95]:
# Load the two pickled inputs used by every cell below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced locally by a trusted pipeline.
# `words` is used below as an indexable collection of candidate new words.
with open('./nate_newWord.pkl', 'rb') as f:
    words = pickle.load(f)
# `sents` is iterated below as a collection of token sequences; tokens
# presumably carry the SentencePiece '▁' word-start marker — TODO confirm.
with open('./nate_preprocessed.pkl', 'rb') as f:
    sents = pickle.load(f)

In [96]:
def recover(tokens):
    """Rebuild a plain-text sentence from '▁'-marked sub-word tokens.

    The tokens are concatenated and every '▁' marker is turned into a space.

    Args:
        tokens: iterable of token strings.

    Returns:
        The reconstructed sentence without the leading space that an initial
        '▁' marker produces.
    """
    sent = ''.join(tokens).replace('▁', ' ')
    # Drop only a leading space. Slicing unconditionally would swallow a real
    # character whenever the first token does not start with '▁'.
    return sent[1:] if sent.startswith(' ') else sent

def example_of_word (sents, words, idx = None, by_word = None, N = 15):
    """Collect the sentences containing a target word and report co-occurring new words.

    The target is `by_word` when given, otherwise `words[idx]`. Tokens are
    compared with their '▁' markers removed. Progress information is printed:
    the target word, the number of matching sentences and the `N` most
    frequent co-occurring tokens that are themselves in `words`.

    Args:
        sents: iterable of token sequences.
        words: indexable collection of new words.
        idx: position in `words` of the target word (used if `by_word` is None).
        by_word: the target word itself; takes precedence over `idx`.
        N: how many co-occurring new words to report.

    Returns:
        List of recovered (plain-text) sentences containing the target word.

    Raises:
        ValueError: if neither `by_word` nor `idx` is supplied.
    """
    if by_word is not None:
        word = by_word
        print(f'Target word: {word}')
        print(f'Word in new word list: {word in words}')
    elif idx is not None:
        word = words[idx]
        print(f'Target word: {word}')
    else:
        # Previously fell through and failed later with an unbound `word`.
        raise ValueError('example_of_word requires either by_word or idx')

    # Set membership keeps the co-word filter linear instead of scanning the
    # whole word list for every distinct token.
    known_words = set(words)

    answer = []
    cowords = defaultdict(int)
    for sent in sents:
        comparable = [token.replace('▁', '') for token in sent]
        if word in comparable:
            answer.append(recover(sent))
            for w in comparable:
                cowords[w] += 1
    print(f'TOTAL SENTENCES: {len(answer)}')

    # One stable descending sort is enough; ties keep first-seen order.
    ranked = sorted(cowords.items(), key = itemgetter(1), reverse = True)
    TopN = dict([(w, c) for w, c in ranked if w in known_words and w != word][:N])
    print(f'TOP {N} NEAR NEW WORDS: {TopN}')
    return answer

In [97]:
# Look up every sentence containing the token '페이커'; the call also prints
# whether the word is in `words` and its top co-occurring new words.
result = example_of_word(sents, words, by_word = '페이커')

Target word: 페이커
Word in new word list: False
TOTAL SENTENCES: 0
TOP 15 NEAR NEW WORDS: {}


In [98]:
import json
# Sentiment lexicon: looked up below as a mapping from word to a polarity
# value convertible with int(). 'utf-8-sig' tolerates a leading BOM.
with open('senti_info.json', encoding='utf-8-sig', mode='r') as f:
    data = json.load(f)
# Abusive-word list, one entry per line. Entries are stripped of surrounding
# whitespace (including '\r' from CRLF files) and blank lines are dropped:
# an empty string is a substring of every word, so a single blank line would
# make is_slang() flag everything.
with open('abusivewords.txt', 'r', encoding='UTF-8') as f:
    wikidata = [entry for entry in (line.strip() for line in f) if entry]

def word_polarity(wordname):
    """Return the integer polarity stored for `wordname` in `data`.

    Returns None when the word is missing from `data` or when its stored
    value is falsy.
    """
    raw = data.get(wordname)
    return int(raw) if raw else None

from collections import Counter
def sent_with_new_word (sents, word):
    '''
    Count the tokens of every sentence that contains the target word.

    sents: collection of sentences (each a sequence of tokens).
    word: the target new word.

    Tokens are compared and counted with their '▁' markers removed.
    Returns a Counter mapping token -> number of occurrences.
    '''
    token_counts = Counter()
    for tokens in sents:
        stripped = [tok.replace('▁', '') for tok in tokens]
        if word in stripped:
            token_counts.update(stripped)
    return token_counts

def is_slang(word):
    """Return -3 if any entry of `wikidata` occurs as a substring of `word`, else None."""
    contains_abuse = any(entry in word for entry in wikidata)
    return -3 if contains_abuse else None

def new_word_sentiment (word):
    """Frequency-weighted mean score of the tokens co-occurring with `word`.

    Every token from the sentences in `sents` that contain `word` is scored:
    a truthy `is_slang` score takes precedence, otherwise the `word_polarity`
    value is used. Tokens without a truthy score are ignored.

    Returns:
        The weighted average of the scores, or None when no token was scored.
    """
    samples = sent_with_new_word(sents, word)
    weighted_total = 0
    scored_count = 0
    for token, freq in samples.items():
        polarity = word_polarity(token)
        slang = is_slang(token)
        # Slang score wins; fall back to the lexicon polarity.
        point = slang or polarity
        if not point:
            continue
        weighted_total += point * freq
        scored_count += freq
    return weighted_total / scored_count if scored_count else None

In [99]:
# Score every candidate word; words with no scorable context are skipped.
scores = {}
for word in tqdm(words):
    score = new_word_sentiment(word)
    # new_word_sentiment returns None when nothing could be scored. A plain
    # truthiness check would also discard a legitimate 0.0 (neutral) average.
    if score is not None:
        scores[word] = score
        tqdm.write('{}: {}'.format(word, scores[word]))

HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))

는거: -1.0476190476190477
그랬: -1.5681818181818181
모르겠: -1.2380952380952381
이였: -1.3095238095238095
갑수목장: -0.6666666666666666
지않: -1.0
찾아봤: -1.0
비어있: -2.0
들한테: -2.0
관심없: -1.3076923076923077
실망하: -0.6666666666666666
리니까: -2.0
나봐: -1.3333333333333333
소름돋: -0.2857142857142857
쁘단: -1.0
은듯: -1.0
개극혐: -2.5
라짐: -2.3333333333333335
다보니: -1.0714285714285714
욕심안내고: -2.4285714285714284
좋겠: -0.5
았어: -2.5
전체적으: -2.0
좋겠당: 2.0
챙겼: -1.8888888888888888
안챙기: -1.5714285714285714
기로했: 1.0
정떨어: -3.0
해놓: -2.0
기억하: -0.8666666666666667
결혼하: -1.0
었습니다: -1.4388888888888889
숨겼: -3.0
않았습니: -1.125925925925926
고있습니다: -2.5
것같: -1.2
고싶은데: -0.5
게된: -2.0
무시하: -1.8333333333333333
조롱하: -1.8333333333333333
임ᄋᄋ: -1.8333333333333333
얘기하: -1.5
려서: -0.5
중고딩: 2.0
학년때: -3.0
져있: -0.6470588235294118
끝났: -2.0
망했: -2.5
죽고싶: -3.0
공개하: -1.3333333333333333
ᄋᄌᄅ임: -1.2
틀딱: -1.0
어놓고: -1.0
다녔: -1.28125
했겠: -2.769230769230769
받고싶: -2.769230769230769
깊티: -0.08333333333333333
고마웡: -0.4
퍼가지마: -0.4
안챙: -0.4
거야ᄉᄇ: -0.4
설렜: -0.4
같이쓰: -0.4
설레ᅲᅲᅲᅲ: 0

In [88]:
# Persist the word -> score mapping so later cells can reload it.
# Opening with 'wb' truncates any existing nate_scores.pkl.
with open('nate_scores.pkl', 'wb') as f:
    pickle.dump(scores, f)

In [118]:
# Reload the pickled scores, order them ascending by score and export them
# as "word: score" lines. pickle.load raises EOFError on an empty file.
with open('dc_iu_scores.pkl', 'rb') as f:
    lol_scores = pickle.load(f)
n = sorted(lol_scores.items(), key = itemgetter(1))
with open('dc_iu_sorted_scores.txt', 'w', encoding = 'utf-8') as f:
    for w, s in n:
        f.write(f'{w}: {s}\n')

EOFError: Ran out of input

In [81]:
# Reload the pickled scores and sort them ascending by score.
with open('dc_iu_scores.pkl', 'rb') as f:
    iu_scores = pickle.load(f)
iu_sorted = sorted(iu_scores.items(), key = (lambda x:x[1]))
# The export file must be opened for writing in text mode: the previous 'rb'
# handle was read-only, and a dict cannot be passed to write() anyway.
with open('dc_iu_scores.txt', 'w', encoding = 'utf-8') as f:
    for w, s in iu_sorted:
        f.write(w + ': ' + str(s) + '\n')
# Last expression of the cell, so the sorted list is displayed.
iu_sorted

[('미친', -3.0),
 ('보소', -3.0),
 ('MV', -3.0),
 ('이드', -3.0),
 ('가즈아', -3.0),
 ('개추', -3.0),
 ('좋겠다', -3.0),
 ('ost', -3.0),
 ('됐다', -3.0),
 ('나의', -3.0),
 ('저씨', -3.0),
 ('몇개', -3.0),
 ('넣어', -3.0),
 ('xt', -3.0),
 ('빠졌네요', -3.0),
 ('해석해봄', -3.0),
 ('실화냐', -3.0),
 ('잘만', -3.0),
 ('일시후원', -3.0),
 ('는거지', -3.0),
 ('안뇽', -3.0),
 ('는건', -3.0),
 ('첫댓글', -3.0),
 ('바람직', -3.0),
 ('존나', -3.0),
 ('없었', -3.0),
 ('개쩔어', -3.0),
 ('해봤음', -3.0),
 ('어그로', -3.0),
 ('개띵곡', -3.0),
 ('좋네', -3.0),
 ('EDAM', -3.0),
 ('세계최', -3.0),
 ('위키', -3.0),
 ('우리유', -2.6666666666666665),
 ('했던', -2.6),
 ('움짤', -2.5),
 ('jpg', -2.5),
 ('팔레트', -2.5),
 ('챗셔', -2.5),
 ('ed', -2.5),
 ('알아', -2.466666666666667),
 ('충격이', -2.466666666666667),
 ('개독회사에서', -2.4444444444444446),
 ('낫다하하하', -2.4444444444444446),
 ('먹어라', -2.4444444444444446),
 ('하십시요', -2.4444444444444446),
 ('적극홍보', -2.4444444444444446),
 ('것들', -2.4285714285714284),
 ('디어', -2.4285714285714284),
 ('정신차려', -2.4285714285714284),
 ('부탁합니다ᅳᅳᅳᅳᅳ', -2.428571428571428

In [89]:
# Reload the saved scores and display them ordered from lowest to highest.
with open('nate_scores.pkl', 'rb') as f:
    lol_scores = pickle.load(f)
sorted(lol_scores.items(), key = itemgetter(1))

[('정떨어', -3.0),
 ('숨겼', -3.0),
 ('학년때', -3.0),
 ('죽고싶', -3.0),
 ('개설레', -3.0),
 ('nn', -3.0),
 ('개웃기네', -3.0),
 ('어놨', -3.0),
 ('릴듯', -3.0),
 ('몸매좋', -3.0),
 ('찾았', -3.0),
 ('안했', -3.0),
 ('입었', -3.0),
 ('을거같은', -3.0),
 ('겠네', -3.0),
 ('안오', -3.0),
 ('부럽네', -3.0),
 ('할거같', -3.0),
 ('는거아님', -3.0),
 ('대깨문', -3.0),
 ('려구', -3.0),
 ('얼마안', -3.0),
 ('네ᅲ', -3.0),
 ('개설렌', -3.0),
 ('개빡', -3.0),
 ('치겠', -3.0),
 ('했겠', -2.769230769230769),
 ('받고싶', -2.769230769230769),
 ('좋았', -2.75),
 ('함ᄏᄏ', -2.75),
 ('쳐늙어', -2.75),
 ('이클', -2.727272727272727),
 ('겠음', -2.6666666666666665),
 ('꺼지라', -2.6666666666666665),
 ('ou', -2.5714285714285716),
 ('vers', -2.5714285714285716),
 ('개극혐', -2.5),
 ('았어', -2.5),
 ('고있습니다', -2.5),
 ('망했', -2.5),
 ('못받', -2.5),
 ('게임하', -2.5),
 ('은건', -2.5),
 ('욕심안내고', -2.4285714285714284),
 ('Church', -2.4),
 ('ed', -2.4),
 ('강조하', -2.4),
 ('주문하', -2.4),
 ('ial', -2.4),
 ('pel', -2.4),
 ('ph', -2.4),
 ('라짐', -2.3333333333333335),
 ('웹드', -2.3333333333333335),
 ('당했', -2.333333