# 4. 토큰화 Tokenizing

### 데이터 읽어오기

In [7]:
# Load the data: read 3_spaced_text.csv (UTF-8) into a DataFrame
import pandas as pd
df = pd.read_csv("3_spaced_text.csv", encoding="utf-8")
# Display the last three rows as a quick look at what was loaded
df.tail(3)

Unnamed: 0,search_keyword,date_created,time_created,writer,is_reply,id,spaced_text
1152309,휴젤,2020-04-29,11:55:10,ㅇㅇ(123.215),0,241400125127,미국주식보다 한국꺼보면 ㄹㅇ 암걸릴 것 같음 특히 휴젤 저거 아는분이 임원진인데 회...
1152310,휴젤,2020-04-29,11:56:16,ㅇㄷ(175.223),1,241400125127,한국 상장회사들중에 ㅂㅅ은곳이 넘 많음...
1152311,휴젤,2020-04-29,11:56:38,ㅇㅇ(223.33),1,241400125127,신흥국 회사 투자 망설여지는 게 이거 때문임. 주식시장에 대한 개념이 안 잡혀있음.


In [8]:
# Print the DataFrame summary: index range, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1152312 entries, 0 to 1152311
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   search_keyword  1152312 non-null  object
 1   date_created    1152312 non-null  object
 2   time_created    1152312 non-null  object
 3   writer          1152312 non-null  object
 4   is_reply        1152312 non-null  int64 
 5   id              1152312 non-null  int64 
 6   spaced_text     1152312 non-null  object
dtypes: int64(2), object(5)
memory usage: 61.5+ MB


### 단어 학습을 위한 형태로 저장

In [12]:
# Collect the 'spaced_text' column as a plain Python list.
texts = df['spaced_text'].tolist()

# Save as a txt file: one entry per line, each terminated by a newline.
with open('4_spaced_corpus.txt', 'w', encoding='utf-8') as file:
    file.writelines(text + '\n' for text in texts)

### Noun Extraction

In [5]:
# Check the soynlp version
import sys
# Extend the module search path with the parent directory before importing soynlp
# (presumably so a local copy located there can be picked up — confirm)
sys.path.append('../')

import soynlp
print(soynlp.__version__)

0.0.493


In [13]:
from soynlp.utils import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor_v2

# Corpus setup: point soynlp's line-based corpus reader at the txt file written earlier
corpus_path = '4_spaced_corpus.txt'
# `sents` is what the noun extractor is trained on in the next cell
sents = DoublespaceLineCorpus(corpus_path, iter_sent=True)

In [14]:
# 학습
%%time
noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=True)
noun_extractor.train(sents)
nouns = noun_extractor.extract()  # nouns 는 {str: NounScore} 형식의 dict 

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2418267 from 1152312 sents. mem=0.846 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=19305814, mem=6.595 Gb
[Noun Extractor] batch prediction was completed for 611496 words
[Noun Extractor] checked compounds. discovered 744427 compounds
[Noun Extractor] postprocessing detaching_features : 605683 -> 507133
[Noun Extractor] postprocessing ignore_features : 507133 -> 506628
[Noun Extractor] postprocessing ignore_NJ : 506628 -> 500257
[Noun Extractor] 500257 nouns (744427 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=7.520 Gb                    
[Noun Extractor] 76.92 % eojeols are covered
CPU times: total: 4min 9s
Wall time: 4min 9s


In [18]:
# Preview the first 100 (compound, components) pairs.
# islice stops after 100 items instead of copying every entry into a list first.
# NOTE: _compounds_components is an underscore-prefixed (non-public) attribute of
# the extractor, so it may not be stable across soynlp versions.
from itertools import islice

list(islice(noun_extractor._compounds_components.items(), 100))

[('시리즈....txt', ('시리즈', '....', 'txt')),
 ('하루.....txt', ('하루', '.....', 'txt')),
 ('마인크래프트....', ('마인크래프트', '....')),
 ('지수추종투자자들', ('지수추종', '투자자들')),
 ('불가능하다해킹', ('불가능하다', '해킹')),
 ('K-메타버스MZ(이', ('K-메타버스', 'MZ', '(이')),
 ('레고켐바이오(141', ('레고켐바이오', '(1', '41')),
 ('위메이드(11204', ('위메이드', '(112', '04')),
 ('알테오젠(19617', ('알테오젠', '(1', '96', '17')),
 ('현대미포조선차트보고', ('현대미포조선', '차트보고')),
 ('캐릭터상품애니메이션', ('캐릭터상품', '애니메이션')),
 ('주주우대환원친화적', ('주주우대', '환원', '친화적')),
 ('회사)하이닉스에스엠', ('회사)', '하이닉스', '에스엠')),
 ('나라입니다그것', ('나라', '입니다', '그것')),
 ('중국남자입니다솔직히', ('중국남자', '입니다', '솔직히')),
 ('프로게이머인터넷방송', ('프로게이머', '인터넷방송')),
 ('쿠션파운데이션', ('쿠션', '파운데이션')),
 ('호식이jyp와플대학', ('호식이', 'jyp', '와플', '대학')),
 ('미디어컨텐츠(빅히트', ('미디어', '컨텐츠', '(빅히트')),
 ('6500018000', ('65000', '18000')),
 ('네이버바이넥스레버리', ('네이버', '바이넥스', '레버리')),
 ('화요일skc7700', ('화요일', 'skc', '7700')),
 ('안산다그룹평균키트와', ('안산다', '그룹', '평균키', '트와')),
 ('남은돈..마지막', ('남은돈', '..', '마지막')),
 ('가수준비중...인생', ('가수', '준비', '중...', '인생')),
 ('챠트1위..일본시장', ('챠

##### Noun Extraction은 정제된 언어에 적합한 방법인 듯하다.
##### 커뮤니티 text는 특성상 조사가 많이 생략되어 있다.
##### Word Extraction을 사용해 보자.

### Word Extraction 

In [23]:
# Open the txt file and keep every line, stripped of surrounding whitespace, in a list.
with open('4_spaced_corpus.txt', 'r', encoding='utf-8') as file:
    text_list = list(map(str.strip, file))

In [24]:
from soynlp.word import WordExtractor

# Build the extractor with its filtering thresholds
# (the exact meaning of each threshold is defined by soynlp — see its documentation)
word_extractor = WordExtractor(min_frequency=100,
    min_cohesion_forward=0.05, 
    min_right_branching_entropy=0.0
)
# Train on the corpus lines loaded from 4_spaced_corpus.txt, then extract the words
word_extractor.train(text_list) # list of str or like
words = word_extractor.extract()

training was done. used memory 11.130 Gbuse memory 12.026 Gb
all cohesion probabilities was computed. # words = 67400
all branching entropies was computed # words = 2148488
all accessor variety was computed # words = 2148488


In [27]:
# Inspect the score record stored for the word '주식'
words['주식']

Scores(cohesion_forward=0.33797058206485, cohesion_backward=0.10211642559233294, left_branching_entropy=4.931513173639989, right_branching_entropy=4.637964581849671, left_accessor_variety=683, right_accessor_variety=655, leftside_frequency=72493, rightside_frequency=4603)

In [28]:
# Number of entries in the extracted `words` mapping
len(words)

31658

In [32]:
# 결과 출력

import math

def word_score(score):
    """Return the ranking score of one extracted word.

    The score is the forward cohesion multiplied by the exponential of the
    right branching entropy.

    Args:
        score: Object exposing ``cohesion_forward`` and
            ``right_branching_entropy`` as numeric attributes.

    Returns:
        ``cohesion_forward * exp(right_branching_entropy)`` as a float.
    """
    entropy_weight = math.exp(score.right_branching_entropy)
    return score.cohesion_forward * entropy_weight

# Header row, followed by the 300 words with the highest word_score.
print('단어   (빈도수, cohesion, branching entropy)\n')
ranked = sorted(words.items(), key=lambda item: word_score(item[1]), reverse=True)
for word, score in ranked[:300]:
    # Columns: word, left-side frequency, forward cohesion, right branching entropy.
    fields = (
        word,
        score.leftside_frequency,
        score.cohesion_forward,
        score.right_branching_entropy,
    )
    print('%s     (%d, %.3f, %.3f)' % fields)

단어   (빈도수, cohesion, branching entropy)

..     (7922, 0.842, 5.504)
ㅋㅋㅋ     (121993, 0.817, 5.336)
...     (4516, 0.693, 5.478)
맨날     (2884, 0.814, 5.234)
갑자기     (3942, 0.742, 5.326)
때문에     (12438, 0.610, 5.493)
먼저     (4257, 0.720, 5.287)
!!     (1337, 0.748, 5.214)
ㄹㅇ     (20549, 0.982, 4.876)
펄어비스     (26674, 0.965, 4.881)
^^     (2183, 0.861, 4.992)
죄다     (2282, 0.663, 5.245)
솔직히     (5819, 0.722, 5.141)
존나     (28271, 0.703, 5.097)
카카오     (96629, 0.906, 4.820)
ㅠㅠ     (12462, 0.897, 4.827)
진짜     (32831, 0.462, 5.481)
ㅜㅜ     (3600, 0.878, 4.828)
땜에     (542, 0.771, 4.933)
캬ㅋㅋㅋ     (6933, 0.956, 4.710)
근데     (22345, 0.636, 5.111)
....     (1540, 0.547, 5.260)
셀트리온     (54208, 0.801, 4.867)
잖아     (104, 0.782, 4.878)
함께     (6068, 0.510, 5.303)
ㅇㅇ     (22735, 0.695, 4.990)
넷마블     (14876, 0.911, 4.693)
됩니다.     (1274, 0.875, 4.720)
ㅉㅉ     (1327, 0.945, 4.642)
굳이     (2083, 0.707, 4.933)
jyp     (4353, 0.891, 4.676)
밝혔다.     (6178, 0.787, 4.792)
하이닉스     (46749, 0.540, 5.144)
된

##### 구두점, 느낌표가 연달아 있는 것은 1개로 줄여야 한다.
##### 괄호는 공백(' ')으로 치환해도 될 듯하다.
##### 3_spaced_text.csv 파일을 한 번 더 전처리한다.

### 전처리 : 3_spaced_text.csv