# 텍스트 전처리
---
- 패키지 설치
    * NLTK : pip install nltk
    * KoNLPy : pip install konlpy

In [1]:
# # NLTK 패키지 설치
# 영어
# !pip install nltk 

In [2]:
# 한글
# !pip install konlpy

## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미를 지닌 작은 단위로 나누는 것
- 나누어진 단어를 토큰(Token)이라 함
- 종류
    * 문장 토큰화
    * 단어 토큰화

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
import nltk

# NLTK Corpus 말뭉치 데이터셋 다운로드 받기
nltk.download('punkt') # all

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
raw_text1="This particular tokenizer requires the sentence.\
tokenization models to be installed.\
This particular tokenizer requires the sentence."

raw_text2="Introduction to machine learning with python.\
statistical data analysis with r.\
This particular tokenizer requires the sentence."

In [6]:
# 단어 단위 토큰화
result1=word_tokenize(raw_text1)

In [7]:
print(result1)

['This', 'particular', 'tokenizer', 'requires', 'the', 'sentence.tokenization', 'models', 'to', 'be', 'installed.This', 'particular', 'tokenizer', 'requires', 'the', 'sentence', '.']


In [8]:
# 문장 단위 토큰화
raw_text=[raw_text1, raw_text2]

In [9]:
raw_text

['This particular tokenizer requires the sentence.tokenization models to be installed.This particular tokenizer requires the sentence.',
 'Introduction to machine learning with python.statistical data analysis with r.This particular tokenizer requires the sentence.']

In [10]:
result=sent_tokenize(raw_text1)

In [11]:
print(result, len(result))

['This particular tokenizer requires the sentence.tokenization models to be installed.This particular tokenizer requires the sentence.'] 1


In [12]:
# sent_result1=sent_tokenize("aaaa")
# print(sent_result1)

## 여러 문장에 대한 토큰 추출
---

In [13]:
# 문장 단위로 추출
for sent in raw_text:
    total_token=[]
    # 문장 추출
    sentResult=sent_tokenize(sent)
    
    # 문장에서 추출한 토큰
    print(f'sent => {sentResult}')
    
    for ele in sentResult:
        print(f'ele => {ele}')
        wordResult=word_tokenize(ele)
        print(f'wordResult => {wordResult}')
        total_token.append(wordResult)
    
#     sentToken=word_tokenize(sent)
#     print(f'sentToken, => {sentToken}')
#     # 모든 문장의 토큰에 추가
#     total_token.append(sentToken)
    
print('---------------------------------------------------')
print(total_token)

sent => ['This particular tokenizer requires the sentence.tokenization models to be installed.This particular tokenizer requires the sentence.']
ele => This particular tokenizer requires the sentence.tokenization models to be installed.This particular tokenizer requires the sentence.
wordResult => ['This', 'particular', 'tokenizer', 'requires', 'the', 'sentence.tokenization', 'models', 'to', 'be', 'installed.This', 'particular', 'tokenizer', 'requires', 'the', 'sentence', '.']
sent => ['Introduction to machine learning with python.statistical data analysis with r.This particular tokenizer requires the sentence.']
ele => Introduction to machine learning with python.statistical data analysis with r.This particular tokenizer requires the sentence.
wordResult => ['Introduction', 'to', 'machine', 'learning', 'with', 'python.statistical', 'data', 'analysis', 'with', 'r.This', 'particular', 'tokenizer', 'requires', 'the', 'sentence', '.']
---------------------------------------------------
[[

#### 한글
---

In [14]:
from konlpy.tag import Okt

In [15]:
# 형태소 분리 객체
okt=Okt()

In [16]:
# 형태소 분리
result=okt.morphs("오늘은 월요일입니다.")
print(result)

['오늘', '은', '월요일', '입니다', '.']


In [17]:
# 형태소 분리 후 태깅(Tagging) => 품사
result2=okt.pos("오늘은 월요일입니다.")

In [18]:
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('입니다', 'Adjective'), ('.', 'Punctuation')]


In [19]:
result2=okt.pos("오늘은 월요일입니다.", stem=True)

In [20]:
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


### [2] 정제 & 정규화
---
- 불용어 제거 => 노이즈 제거
- 텍스트의 동일화
    * 대문자 또는 소문자로 통일
    * 문장의 길이

### [2-1] 불용어 (Stopword)

In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [23]:
len(en_stopwords)

179

In [24]:
en_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### [2-2] 어간 및 표제어 처리
---

In [25]:
from nltk.stem import LancasterStemmer

In [26]:
# 어간 추출
lstem=LancasterStemmer()

In [27]:
lstem.stem('working'), lstem.stem('workded'), lstem.stem('worken')

('work', 'workd', 'work')

In [28]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [29]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [30]:
# 표제어(사전에 등록된 단어 추출)
from nltk.stem import WordNetLemmatizer

In [31]:
wlemma=WordNetLemmatizer()

In [32]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Pac

[nltk_data]    |   Unzipping misc\mwa_ppdb.zip.
[nltk_data]    | Downloading package names to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\names.zip.
[nltk_data]    | Downloading package nombank.1.0 to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\nonbreaking_prefixes.zip.
[nltk_data]    | Downloading package nps_chat to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\nps_chat.zip.
[nltk_data]    | Downloading package omw to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package omw-1.4 to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     C:\Users\User\AppData\Roaming

[nltk_data]    | Downloading package verbnet to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\verbnet.zip.
[nltk_data]    | Downloading package verbnet3 to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\verbnet3.zip.
[nltk_data]    | Downloading package webtext to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\webtext.zip.
[nltk_data]    | Downloading package wmt15_eval to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping models\wmt15_eval.zip.
[nltk_data]    | Downloading package word2vec_sample to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping models\word2vec_sample.zip.
[nltk_data]    | Downloading package wordnet to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package wordnet is already up-to-date

True

In [33]:
wlemma.lemmatize('working', 'v'), wlemma.lemmatize('working', 'v')

('work', 'work')

In [34]:
wlemma.lemmatize('amusing', 'v'), wlemma.lemmatize('amused', 'v')

('amuse', 'amuse')

### [3] 텍스트 백터화
---
- 텍스트 = 수치화
- 희소행렬(OHE) : BOW 방식 --> Count기반, TF-IDF 기반
- 밀집백터:Embedding qkdtlr, Wor2Vect

In [35]:
from sklearn.feature_extraction. text import CountVectorizer, TfidfVectorizer 

In [36]:
corpus=[raw_text1, raw_text2]

In [37]:
ohe=CountVectorizer()

In [38]:
ret=ohe.transform(corpus)

NotFittedError: Vocabulary not fitted or provided

In [None]:
print(type(ret, ret, sep='\n')

In [None]:
ret=ret.toarray()

In [None]:
print(ret.shape, ret)

In [None]:
print(type(ret, ret, sep='\n')

In [None]:
# TF-IDF 기반
tfldf=TfidfVectorizer()

In [None]:
tf_corpus=tfldf.fit()

In [None]:
type(tf_corpous)

In [None]:
tf_corpus=tr_corpus.toarray()

In [None]:
print(tf_corpus)

### 정수