# 1. 토큰화


In [2]:
# Sample sentence used by the first tokenization examples.
sentence = "Thomas Jefferson began building Monticello at the age of 26."

# Naive tokenization: split on whitespace only, so punctuation stays attached
# to the neighbouring word (the last token is '26.').
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [3]:
import re

# Split on every single whitespace, hyphen or punctuation character.
# A delimiter at the very end of the string leaves a trailing empty token.
tokens = re.compile(r'[-\s.,;!?]').split(sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [4]:
from nltk.tokenize import RegexpTokenizer

# Alternatives are tried left to right at each position:
#   \w+         a run of word characters
#   \$[0-9.]+   a dollar amount such as "$9.99"
#   \S+         any other run of non-whitespace characters (e.g. punctuation)
# The dollar sign must be escaped: a bare "$" is the end-of-input anchor, which
# would make the second alternative impossible to match.
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [5]:
from nltk.tokenize import word_tokenize

# Sample sentences for comparing tokenizers: they contain contractions
# ("can't", "They'll"), a hyphenated word ("re-use") and a name with an underscore.
text1 = "Hello! I can't wait to try the word_tokenize, WordPunctTokenizer, and TreebankWordTokenizer."
text2 = "They'll save and re-use this file."

In [6]:
# Tokenize both sample sentences with word_tokenize, keeping each token list
# under its own name.
word_tokenize1, word_tokenize2 = map(word_tokenize, (text1, text2))

# One token list per line.
print(word_tokenize1, word_tokenize2, sep="\n")


['Hello', '!', 'I', 'ca', "n't", 'wait', 'to', 'try', 'the', 'word_tokenize', ',', 'WordPunctTokenizer', ',', 'and', 'TreebankWordTokenizer', '.']
['They', "'ll", 'save', 'and', 're-use', 'this', 'file', '.']


In [7]:
# Tokenize text1 with the RegexpTokenizer instance (`tokenizer`) created earlier,
# for comparison with the word_tokenize result.
print(tokenizer.tokenize(text1))

['Hello', '!', 'I', 'can', "'t", 'wait', 'to', 'try', 'the', 'word_tokenize', ',', 'WordPunctTokenizer', ',', 'and', 'TreebankWordTokenizer', '.']


In [8]:
from nltk.tokenize import WordPunctTokenizer

word_punc_token = WordPunctTokenizer()

# Tokenize both sample sentences and print one token list per line.
print(*map(word_punc_token.tokenize, (text1, text2)), sep="\n")

['Hello', '!', 'I', 'can', "'", 't', 'wait', 'to', 'try', 'the', 'word_tokenize', ',', 'WordPunctTokenizer', ',', 'and', 'TreebankWordTokenizer', '.']
['They', "'", 'll', 'save', 'and', 're', '-', 'use', 'this', 'file', '.']


In [9]:
# TreebankWordTokenizer (more powerful than RegexpTokenizer)
from nltk.tokenize import TreebankWordTokenizer

# Example texts. `sentence` is rebound here, replacing the Jefferson sentence
# used by the earlier cells; sentence2 / sentence3 repeat text1 / text2.
sentence = """Monticello wasn't designated as UNESCO World Heritage Site until 1987."""
sentence2 = "Hello! I can't wait to try the word_tokenize, WordPunctTokenizer, and TreebankWordTokenizer."
sentence3 = "They'll save and re-use this file."

In [10]:
tree_bank = TreebankWordTokenizer()

# Tokenize the three example sentences and print one token list per line.
print(
    *(tree_bank.tokenize(sample) for sample in (sentence, sentence2, sentence3)),
    sep="\n",
)

['Monticello', 'was', "n't", 'designated', 'as', 'UNESCO', 'World', 'Heritage', 'Site', 'until', '1987', '.']
['Hello', '!', 'I', 'ca', "n't", 'wait', 'to', 'try', 'the', 'word_tokenize', ',', 'WordPunctTokenizer', ',', 'and', 'TreebankWordTokenizer', '.']
['They', "'ll", 'save', 'and', 're-use', 'this', 'file', '.']


In [None]:
from nltk.tokenize import casual_tokenize

# Informal, tweet-style text: a user handle, exaggerated letter repetition and an emoticon.
message = """RT @TJMonticello Best day everrrrrrr at Monticello.\
    Awesommmmmeeeeee day :*)"""

# reduce_len=True shortens long runs of a repeated character;
# strip_handles=True removes @username handles from the result.
casual_tokenize(message, reduce_len=True, strip_handles=True)

['RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '.',
 'Awesommmeee',
 'day',
 ':*)']

In [12]:
# Example 2 - tokenize a document, then remove stopwords

# Stopword list (words to be treated as stopwords); every entry is lowercase
stopwords = ['the', 'is', 'in', 'and', 'to', 'a', 'of']

text = "The quick brown fox jumps over the lazy dog. The dog barked loudly at the fox in the park."

In [13]:
from nltk.tokenize import word_tokenize

# Tokenize the text, then drop stopwords. The stopword list is all lowercase,
# so each token is lowercased for the membership test only; otherwise
# capitalized occurrences such as "The" would not be removed.
# Tokens that are kept retain their original casing.
tokens = word_tokenize(text)
result = [token for token in tokens if token.lower() not in stopwords]
result


['quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'lazy',
 'dog',
 '.',
 'dog',
 'barked',
 'loudly',
 'at',
 'fox',
 'park',
 '.']

In [16]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem every whitespace-separated word, then strip any apostrophe left at
# either end of the stem.
list(map(lambda word: stemmer.stem(word).strip("'"), "dish washer's washed dishes".split()))


['dish', 'washer', 'wash', 'dish']

In [18]:
# Same stemming as above, re-joined into a single space-separated string.
' '.join(
    stemmer.stem(word).strip("'")
    for word in "dish washer's washed dishes".split()
)

'dish washer wash dish'

In [19]:
# Reduce running, beautiful, believes, using, conversation, organization and
# studies to their stems, printing one "word -> stem" line per word.
print("\n".join(
    f"{word} -> {stemmer.stem(word)}"
    for word in (
        "running",
        "beautiful",
        "believes",
        "using",
        "conversation",
        "organization",
        "studies",
    )
))

running -> run
beautiful -> beauti
believes -> believ
using -> use
conversation -> convers
organization -> organ
studies -> studi


In [21]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize with an explicit part-of-speech tag; pos='a' asks for the adjective
# reading, which is how 'better' maps to 'good'.
lemmatizer.lemmatize('better', pos='a')

'good'

In [22]:
# Apply lemmatize with an explicit part of speech
from nltk.corpus import wordnet
print(f"동사 형태 : running -> {lemmatizer.lemmatize('running', pos=wordnet.VERB)}")  # lemmatized as a verb
print(f"형용사 형태 : beautiful -> {lemmatizer.lemmatize('beautiful', pos=wordnet.ADJ)}")  # lemmatized as an adjective
print(f"명사 형태 : geese -> {lemmatizer.lemmatize('geese', pos=wordnet.NOUN)}")  # lemmatized as a noun

동사 형태 : running -> run
형용사 형태 : beautiful -> beautiful
명사 형태 : geese -> goose


In [24]:
# Korean sample sentence fed to the KoNLPy morphological analyzers below.
kor_text = "인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못하고 인간과 대화를 계속할 수 있다면 컴퓨터는 지능적인 것으로 간주될 수 있습니다."

In [25]:
from konlpy.tag import Komoran

# Split the Korean sentence into morphemes with the Komoran analyzer.
komoran = Komoran()
result = komoran.morphs(kor_text)  # list of morpheme strings
print(result)

['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으로', '간주', '되', 'ㄹ', '수', '있', '습니다', '.']


In [26]:
from konlpy.tag import Hannanum

# Same sentence analyzed with Hannanum, for comparison with the Komoran result.
hannanum = Hannanum()
result = hannanum.morphs(kor_text)  # rebinds `result` from the Komoran cell
print(result)

['인간', '이', '컴퓨터', '와', '대화', '하고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능적', '이', 'ㄴ', '것', '으로', '간주', '되', 'ㄹ', '수', '있', '습니다', '.']


# 1. BoW

## 1.1 sklearn

In [2]:
# Corpus: three short Korean sentences, one document each

docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the corpus and encode every document in one step.
count_vect = CountVectorizer()
BoW = count_vect.fit_transform(docs)

# toarray() converts the result to a dense array for printing:
# one row per document, one column per vocabulary term.
print(BoW.toarray())

[[1 0 1 1 1 0 0 0]
 [1 0 2 1 0 0 0 1]
 [1 2 0 0 0 1 1 0]]


In [5]:
# Preparation for showing the counts as DataFrames

# Build the vocabulary: the feature names learned by the vectorizer
vocab = count_vect.get_feature_names_out()
print(vocab)

['동물원에서' '바나나를' '봤어' '오늘' '원숭이를' '원숭이에게' '줬어' '코끼리를']


In [6]:
import pandas as pd
from IPython.display import display

# Show each document next to its bag-of-words counts as a one-row DataFrame
# whose columns are the vocabulary terms. The dense array is built once and
# walked row by row alongside the documents.
for i, (doc_text, doc_counts) in enumerate(zip(docs, BoW.toarray())):
    print(f'{i} : {doc_text}')

    display(pd.DataFrame([doc_counts], columns=vocab))
    print("\n\n")

0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,1,1,1,0,0,0





1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,2,1,0,0,0,1





2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,2,0,0,0,1,1,0







## 1.2 gensim

In [8]:
# Same three-sentence corpus as in the sklearn section, redefined for the gensim example.
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [10]:
import gensim
import numpy as np
from gensim import corpora

In [None]:
# Tokenization: split every document on whitespace, giving one token list per document
doc_ls = [doc.split() for doc in docs]