In [33]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
from kiwipiepy import Kiwi

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools

import matplotlib.pyplot as plt


import pickle
# Load the stop-word collection used by tokenize() and the CountVectorizer.
# Despite the .txt extension, the file is opened in binary mode and decoded
# with pickle, so it must contain pickled data rather than plain text.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
with open('./korean_stopwords.txt', 'rb') as f:
    stopwords = pickle.load(f)


# Create the Kiwi morphological analyzer that tokenize() calls below.
kiwi = Kiwi()
print('kwi')  # progress marker: the analyzer object has been constructed
# NOTE(review): prepare() presumably finishes initializing the analyzer before
# analyze() is called -- confirm against the installed kiwipiepy version.
kiwi.prepare()

def tokenize_for_countvectorizer(sent_df, tokenizer=None):
    """Turn each sentence into a space-joined token string for CountVectorizer.

    Args:
        sent_df: Iterable of sentences, e.g. a pandas Series or a list.
        tokenizer: Optional callable mapping one sentence (str) to a list of
            token strings. Defaults to the module-level ``tokenize``.

    Returns:
        list[str]: One string per input element, in input order. Elements
        that are not strings (e.g. None or NaN from empty CSV cells) produce
        an empty string so the result stays aligned with the input rows.
    """
    if tokenizer is None:
        # Looked up at call time, so it is fine that tokenize() is defined
        # after this function in the same cell.
        tokenizer = tokenize
    # Iterate over the values instead of indexing with sent_df[i]: on a pandas
    # Series, [] with an integer is label-based, so a non-default index (for
    # example after filtering rows) would raise KeyError or pick wrong rows.
    return [
        ' '.join(tokenizer(sent)) if isinstance(sent, str) else ''
        for sent in sent_df
    ]

def tokenize(sent):
    """Split a sentence into filtered morphemes using the module-level ``kiwi``.

    Only the first analysis returned by ``kiwi.analyze`` is used. Morphemes
    are dropped when their tag starts with 'E', 'J' or 'S' (endings,
    particles and special symbols, per the original note) or when the
    morpheme is in ``stopwords``. Morphemes whose tag starts with 'V' (verbs,
    per the original note) get '다' appended.

    Returns:
        list[str]: The kept morphemes, in analysis order.
    """
    morphemes, _score = kiwi.analyze(sent)[0]  # use the first analysis result
    dropped_tag_prefixes = ('E', 'J', 'S')
    tokens = []
    for form, tag, _, _ in morphemes:
        if tag.startswith(dropped_tag_prefixes) or form in stopwords:
            continue
        if tag.startswith('V'):
            form = form + '다'  # append '다' to verb stems
        tokens.append(form)
    return tokens


            
# Load the crawled comment CSV (the file name suggests Naver comments dated
# 2020.01.09 -- TODO confirm). The `content` column is tokenized later.
# NOTE(review): this is an absolute, machine-specific path; consider reading
# it relative to a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv('D:/crawling/crawl_육군_2020-01-01_2020-05-25/NC/temp/url/naver_comment_url육군_2020.01.09.csv')

kwi


In [27]:
# Tokenize every entry of the `content` column into one space-joined string
# per row, the input format the CountVectorizer below is fitted on.
data_lemmatized = tokenize_for_countvectorizer(df.content)

In [39]:
# Bag-of-words vectorizer for the pre-tokenized, space-joined documents.
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=4,                        # ignore terms found in fewer than 4 documents
                             max_df = 0.9,                    # ignore terms found in more than 90% of documents
                             stop_words= stopwords,             # remove stop words
                             token_pattern='[가-힣]{2,}',  # keep only tokens made of two or more Hangul syllables
                             # max_features=50000,             # max number of unique words
                            )



In [40]:
# Learn the vocabulary from the tokenized documents and build the
# document-term matrix in one step.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [43]:
# Materialize the sparse data as a dense matrix.
# NOTE(review): for a large corpus this dense copy can be very memory-hungry;
# if data_dense is not needed elsewhere, the non-zero count is also available
# from the sparse matrix itself (data_vectorized.nnz).
data_dense = data_vectorized.todense()

# Compute "sparsicity" = percentage of non-zero cells in the document-term matrix
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  14.175367568719368 %


In [56]:
# Convert the data into the formats gensim works with.
# The vectorizer's vocabulary maps term -> column index; invert it so that a
# column index looks up its term.
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

# Wrap the document-term matrix as a gensim corpus, i.e. one list of
# (word_id, frequency) tuples per document. Documents are the rows of
# data_vectorized, hence documents_columns=False.
corpus_vect_gensim = gensim.matutils.Sparse2Corpus(
    data_vectorized,
    documents_columns=False,
)

In [57]:
# Display the column-index -> term mapping to sanity-check the vocabulary.
# NOTE(review): this prints the whole dictionary; showing a small sample
# would keep the notebook output short.
id2word

{280: '본보기',
 148: '뉴스',
 325: '서울',
 114: '기자',
 634: '출연',
 425: '오후',
 250: '방송',
 672: '프로그램',
 503: '입다',
 190: '등장',
 171: '대한민국',
 456: '위하다',
 307: '사용',
 605: '참여',
 347: '소개',
 1: '가능',
 536: '전역',
 421: '예정',
 248: '밝히다',
 706: '협조',
 244: '받다',
 194: '떨어지다',
 397: '없다',
 336: '설명',
 283: '부대',
 360: '시선',
 363: '시작',
 193: '떠나다',
 377: '아주',
 118: '깊다',
 483: '이유',
 682: '한편',
 366: '시키',
 692: '허브',
 65: '관련',
 109: '기사',
 269: '보다',
 139: '네이버',
 211: '메인',
 73: '구독',
 641: '코리아',
 221: '무단',
 537: '전재',
 256: '배포',
 101: '금지',
 445: '원보',
 266: '보기',
 32: '검사',
 247: '발표',
 64: '관계자',
 36: '결과',
 429: '올해',
 571: '준비',
 493: '인상',
 102: '기간',
 23: '개월',
 83: '군대',
 644: '크다',
 178: '도움',
 183: '되다',
 446: '원본',
 368: '신문',
 595: '집단',
 197: '마을',
 525: '재산권',
 690: '행사',
 88: '권리',
 714: '회복',
 448: '원주',
 13: '강원',
 449: '원주시',
 302: '사령부',
 568: '주변',
 308: '사유지',
 169: '대하다',
 86: '군사',
 361: '시설',
 278: '보호',
 74: '구역',
 688: '해제',
 625: '최종',
 37: '결정',
 80: '국방부',
