In [40]:
import load_data as ld
import numpy as np
import pandas as pd
from eunjeon import Mecab
from soynlp.normalizer import *
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer
import re   

def clean_text(str):
    """Replace punctuation and symbol characters with spaces, then filter the text.

    Args:
        str: Input text. The parameter name shadows the ``str`` builtin; it is
            kept unchanged so existing keyword callers keep working.

    Returns:
        Whatever ``only_text`` returns for the punctuation-stripped text
        (``only_text`` presumably comes from the ``soynlp.normalizer`` star
        import -- confirm).
    """
    # Raw string literal: the character class uses regex escapes such as \? \( \[
    # which are invalid *string* escapes and raise a warning in a normal literal
    # on recent Python versions. The set of matched characters is unchanged.
    txt = re.sub(r'[-=+,#/\?:^@*\"※~ㆍ!』‘|\(\)\[\]`\'…》\”\“\’·]', ' ', str)
    return only_text(txt)

_TAGGER = None  # shared Mecab instance, created on first use by _get_tagger()


def _get_tagger():
    """Return the shared Mecab tagger, instantiating it the first time it is needed."""
    global _TAGGER
    if _TAGGER is None:
        _TAGGER = Mecab()
    return _TAGGER


def get_nouns(str):   # TODO: allow other parts of speech (e.g. adjectives) as well
    """Extract the nouns of a text, dropping one-character nouns.

    Args:
        str: Text to analyse. The parameter name shadows the ``str`` builtin;
            it is kept unchanged for compatibility with existing callers.

    Returns:
        list: the items returned by ``Mecab().nouns(str)`` whose length is
        not exactly 1, in their original order.
    """
    # Reuse one tagger instead of constructing a new Mecab() per call; this
    # function is mapped over every paragraph and every sentence.
    nouns_list = _get_tagger().nouns(str)
    return [noun for noun in nouns_list if len(noun) != 1]

def get_stcs_nouns(ListOfSentence):
    """Run ``get_nouns`` on every sentence and return the results as a list.

    Args:
        ListOfSentence: iterable of sentence strings.

    Returns:
        list: one ``get_nouns`` result per sentence, in input order.
    """
    return [get_nouns(sentence) for sentence in ListOfSentence]

def make_dic_count(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: iterable of hashable items (words).

    Returns:
        dict: word -> number of occurrences, with keys in first-seen order.
    """
    tally = {}
    for token in word_list:
        if token in tally:
            tally[token] += 1   # seen before: increment its count
        else:
            tally[token] = 1    # first occurrence
    return tally

def get_stcs_dic(ListOfSentence):
    """Build a word-count dict for every element of ``ListOfSentence``.

    Args:
        ListOfSentence: iterable whose elements are word lists (one per sentence).

    Returns:
        list: one ``make_dic_count`` result per element, in input order.
    """
    return [make_dic_count(words) for words in ListOfSentence]
    

def para2stcs(paragraph):
    """Split a paragraph into sentences on the literal delimiter ``'. '``.

    The delimiter itself is removed; a trailing period that is not followed by
    a space stays attached to the last piece.

    Args:
        paragraph: paragraph text.

    Returns:
        list: the pieces produced by ``paragraph.split('. ')``.
    """
    sentences = paragraph.split('. ')
    return sentences

def lst2str(lst):
    """Join the strings in ``lst`` into one string separated by single spaces.

    Args:
        lst: iterable of strings.

    Returns:
        str: the space-joined result (empty string for an empty input).
    """
    joined = ' '.join(lst)
    return joined

def cal_tfidf(nouns_paras):  # TODO: support other scorers (e.g. TextRank) besides TF-IDF
    """Compute a TF-IDF document-term matrix with one document per paragraph.

    Args:
        nouns_paras: iterable of noun lists, one list per paragraph.

    Returns:
        The document-term matrix returned by
        ``TfidfVectorizer().fit_transform`` (default vectorizer settings).
    """
    # Each paragraph becomes one space-separated string of its nouns.
    documents = [' '.join(nouns) for nouns in nouns_paras]
    return TfidfVectorizer().fit_transform(documents)


def preprocessing(paragraphs):  # paragraphs: ["paragraph 1", "paragraph 2", ...]
    """Clean a document and extract its nouns at three granularities.

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        tuple: ``(nouns_comb, nouns_paras, nouns_stcs)`` where
            * ``nouns_comb``  -- nouns of all paragraphs joined with single spaces,
            * ``nouns_paras`` -- one noun list per paragraph,
            * ``nouns_stcs``  -- per paragraph, one noun list per sentence
              (sentences as split by ``para2stcs``).

    Word-count dictionaries are not part of the return value; build them from
    these lists with ``make_dic_count`` / ``get_stcs_dic`` when needed.
    """
    # Materialise once so a one-shot iterable (e.g. a generator) is not
    # exhausted by the join before the per-paragraph pass runs.
    paragraphs = list(paragraphs)

    clean_comb = clean_text(' '.join(paragraphs))
    clean_paras = [clean_text(para) for para in paragraphs]    # one cleaned string per paragraph
    clean_stcs = [para2stcs(para) for para in clean_paras]     # per paragraph: list of sentence strings

    nouns_comb = get_nouns(clean_comb)
    nouns_paras = [get_nouns(para) for para in clean_paras]    # [[nouns of paragraph 1], [nouns of paragraph 2], ...]
    nouns_stcs = [get_stcs_nouns(stcs) for stcs in clean_stcs] # [[[nouns of p1 sentence 1], [p1 sentence 2], ...], ...]

    return nouns_comb, nouns_paras, nouns_stcs
    

In [41]:
# Load the source document; read_txt is expected to return one string per paragraph -- TODO confirm against load_data.
paragraphs = ld.read_txt("홍차.txt")     # [paragraph 1, paragraph 2, ...]
# Whole document as a single string, paragraphs joined with single spaces.
combine_para = ' '.join(paragraphs)


In [42]:

# NOTE(review): the "after word : ... count : ..." lines saved under this cell are
# not printed by the make_dic_count defined in this notebook -- presumably left
# over from an earlier version; re-run to refresh the output.

# Clean the text at three granularities: whole document, paragraph, sentence.
clean_comb = clean_text(combine_para)
clean_paras = [clean_text(para) for para in paragraphs]      # one cleaned string per paragraph
clean_stcs = [para2stcs(para) for para in clean_paras]       # per paragraph: list of sentence strings

# Noun extraction at the same three granularities.
nouns_comb = get_nouns(clean_comb)
nouns_paras = [get_nouns(para) for para in clean_paras]      # [[nouns of paragraph 1], [nouns of paragraph 2], ...]
nouns_stcs = [get_stcs_nouns(stcs) for stcs in clean_stcs]   # [[[nouns of p1 sentence 1], [p1 sentence 2], ...], ...]

# Word counts over the whole document; the per-paragraph and per-sentence
# variants are left disabled below.
dic_comb = make_dic_count(nouns_comb)
#dic_paras = list(map(make_dic_count, nouns_paras))
#dic_stcs = list(map(get_stcs_dic, nouns_stcs))

after word :  홍차 count :  1
after word :  차 count :  1
after word :  잎 count :  1
after word :  내부 count :  1
after word :  성분 count :  1
after word :  자체 count :  1
after word :  효소 count :  1
after word :  산 count :  1
after word :  빛 count :  1
after word :  차 count :  2
after word :  뜻 count :  1
after word :  녹차 count :  1
after word :  보이차 count :  1
after word :  효소 count :  2
after word :  작용 count :  1
after word :  중지 count :  1
after word :  쇄 count :  1
after word :  청 count :  1
after word :  햇볕 count :  1
after word :  말림 count :  1
after word :  과정 count :  1
after word :  때문 count :  1
after word :  잎 count :  2
after word :  자체 count :  2
after word :  효소 count :  3
after word :  산화가 count :  1
after word :  것 count :  1
after word :  동양 count :  1
after word :  차 count :  3
after word :  빛깔 count :  1
after word :  붉은색 count :  1
after word :  홍차 count :  2
after word :  서양 count :  1
after word :  찻잎 count :  1
after word :  색깔 count :  1
after word :  검은색 count :  1

In [36]:
# Preview the first 32 nouns extracted from the whole document.
# NOTE(review): the saved output comes from an earlier execution (In [36], while
# the definitions are In [40]) and contains one-character nouns that the current
# get_nouns filters out -- re-run to refresh.
nouns_comb[:32]

['홍차',
 '차',
 '잎',
 '내부',
 '성분',
 '자체',
 '효소',
 '산',
 '빛',
 '차',
 '뜻',
 '녹차',
 '보이차',
 '효소',
 '작용',
 '중지',
 '쇄',
 '청',
 '햇볕',
 '말림',
 '과정',
 '때문',
 '잎',
 '자체',
 '효소',
 '산화가',
 '것',
 '동양',
 '차',
 '빛깔',
 '붉은색',
 '홍차']

In [27]:
# Display the nested noun lists: paragraph -> sentence -> nouns.
# NOTE(review): the saved output comes from an earlier execution (In [27]) and
# contains one-character nouns that the current get_nouns filters out; it also
# dumps the full structure -- consider re-running and showing only a slice.
nouns_stcs

[[['차나무', '잎', '식물', '재료', '음료'],
  ['커피', '코코아', '인류', '대', '알콜', '기호음료', '중', '하나'],
  ['카페인', '카테킨', '테아닌', '등', '다량', '함유', '특유', '맛', '향', '나', '사람'],
  ['테아닌',
   '심신',
   '안정',
   '효과',
   '기대',
   '수',
   '정신',
   '안정',
   '필요',
   '사람',
   '한편',
   '도적',
   '카페인',
   '찻잎',
   '영국',
   '브렉',
   '퍼스트',
   '티',
   '아침',
   '잠',
   '용도'],
  ['차',
   '커피',
   '카테킨',
   '성분',
   '카페인',
   '체내',
   '흡수',
   '것',
   '때문',
   '커피',
   '카페인',
   '부작용',
   '자유'],
  ['카테킨',
   '폴리',
   '페놀',
   '폴리',
   '페놀',
   '항산',
   '물질',
   '신체',
   '전반',
   '건강',
   '혈관',
   '피부',
   '등',
   '도움'],
  ['효능', '때', '차', '이유', '절반', '정도', '테아닌', '때문', '말', '과언'],
  ['테아닌', '함유량', '녹차', '비발효차', '우롱차', '반', '발효차', '홍차', '발효차', '정도'],
  ['점', '활용', '테아닌', '농축', '슬로우', '카우라', '제품'],
  ['차', '본래', '차나무', '말'],
  ['소나무', '솔', '대나무', '대', '차', '본래', '차나무', '이름', '나무'],
  ['차나무', '잎', '찻잎', '문화', '자리', '찻잎', '우린', '물', '차'],
  ['나중',
   '차나무',
   '잎',
   '풀잎',
   '꽃',
   '뿌리',
   '등',
   '식물',
   '재료',
   '우린'