In [1]:
from nltk.tokenize import TreebankWordTokenizer

# Create a single tokenizer instance; the cell that prints the tokens of `text`
# calls `tokenizer.tokenize(...)` on it.
tokenizer = TreebankWordTokenizer()

In [2]:
# Sample English paragraph used for word tokenization.
# Built with implicit string-literal concatenation inside parentheses: a
# backslash continuation *inside* a quoted string keeps the next line's leading
# indentation, which silently embedded runs of spaces in the middle of the text.
text = (
    'Instagram is no longer going to be a photo-sharing app, Instagram boss Adam Mosseri announced earlier this summer. '
    'So what is Instagram going to be instead? Mosseri talks to Recode’s Peter Kafka about his plans to emphasize video and '
    'commerce for the service, competing with Tik-Tok, and looming regulation.'
)
# Print the token list for `text`. In the recorded output only the final period
# is split off as its own token; the sentence-internal 'summer.' stays attached.
print(tokenizer.tokenize(text))

['Instagram', 'is', 'no', 'longer', 'going', 'to', 'be', 'a', 'photo-sharing', 'app', ',', 'Instagram', 'boss', 'Adam', 'Mosseri', 'announced', 'earlier', 'this', 'summer.', 'So', 'what', 'is', 'Instagram', 'going', 'to', 'be', 'instead', '?', 'Mosseri', 'talks', 'to', 'Recode’s', 'Peter', 'Kafka', 'about', 'his', 'plans', 'to', 'emphasize', 'video', 'and', 'commerce', 'for', 'the', 'service', ',', 'competing', 'with', 'Tik-Tok', ',', 'and', 'looming', 'regulation', '.']


In [4]:
from nltk.tokenize import sent_tokenize
import nltk
# Fetch the 'punkt' data package (the recorded log shows it being downloaded
# and unzipped into the local nltk_data directory).
# NOTE(review): presumably required by sent_tokenize; newer NLTK releases may
# ask for 'punkt_tab' instead -- confirm against the installed version.
nltk.download('punkt')

# Sample English paragraph used for sentence tokenization.
# Built with implicit string-literal concatenation inside parentheses: a
# backslash continuation *inside* a quoted string keeps the next line's leading
# indentation, which silently embedded runs of spaces between the sentences.
text = (
    "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. "
    "He looked about, to make sure no one was near."
)
# Split the paragraph into sentences; the recorded output is a list with one
# string per sentence.
sent_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stard\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['His barber kept his word.',
 'But keeping such a huge secret to himself was driving him crazy.',
 'Finally, the barber went up a mountain and almost to the edge of a cliff.',
 'He dug a hole in the midst of some reeds.',
 'He looked about, to make sure no one was near.']

In [5]:
# Sample containing "Ph.D." / "Ph.D", i.e. periods that do not end a sentence.
text_with_period = "I am actively looking for Ph.D. students. and you are a Ph.D student."
# In the recorded output the text is split into two sentences, with the only
# boundary placed after "students." -- the periods inside "Ph.D." are kept intact.
sent_tokenize(text_with_period)

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']

# 품사 태깅

In [10]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# `nltk` itself is not imported in this cell; it comes from the earlier cell
# that imports sent_tokenize, so that cell has to run first.
# Fetch the 'averaged_perceptron_tagger' data package (the recorded log shows it
# being downloaded and unzipped).
# NOTE(review): presumably the model used by pos_tag -- confirm for the
# installed NLTK version.
nltk.download('averaged_perceptron_tagger')

# Sample English paragraph used for part-of-speech tagging.
# Built with implicit string-literal concatenation inside parentheses: a
# backslash continuation *inside* a quoted string keeps the next line's leading
# indentation, which silently embedded runs of spaces in the middle of the text.
text = (
    'Instagram is no longer going to be a photo-sharing app, Instagram boss Adam Mosseri announced earlier this summer. '
    'So what is Instagram going to be instead? Mosseri talks to Recode’s Peter Kafka about his plans to emphasize video and '
    'commerce for the service, competing with Tik-Tok, and looming regulation.'
)

# Tokenize `text` into words, then tag every token; the recorded output is a
# list of (token, tag) tuples.
pos_tag(word_tokenize(text))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\stard\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Instagram', 'NNP'),
 ('is', 'VBZ'),
 ('no', 'RB'),
 ('longer', 'RB'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('photo-sharing', 'JJ'),
 ('app', 'NN'),
 (',', ','),
 ('Instagram', 'NNP'),
 ('boss', 'IN'),
 ('Adam', 'NNP'),
 ('Mosseri', 'NNP'),
 ('announced', 'VBD'),
 ('earlier', 'RBR'),
 ('this', 'DT'),
 ('summer', 'NN'),
 ('.', '.'),
 ('So', 'IN'),
 ('what', 'WP'),
 ('is', 'VBZ'),
 ('Instagram', 'NNP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('instead', 'RB'),
 ('?', '.'),
 ('Mosseri', 'NNP'),
 ('talks', 'NNS'),
 ('to', 'TO'),
 ('Recode', 'NNP'),
 ('’', 'NNP'),
 ('s', 'VBP'),
 ('Peter', 'NNP'),
 ('Kafka', 'NNP'),
 ('about', 'IN'),
 ('his', 'PRP$'),
 ('plans', 'NNS'),
 ('to', 'TO'),
 ('emphasize', 'VB'),
 ('video', 'NN'),
 ('and', 'CC'),
 ('commerce', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('service', 'NN'),
 (',', ','),
 ('competing', 'VBG'),
 ('with', 'IN'),
 ('Tik-Tok', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('looming', 'VBG'),
 ('regulation', 'NN'),
 ('.', '.')]

- PRP: 인칭 대명사
- VBP: 동사
- RB: 부사
- VBG: 현재분사
- IN: 전치사
- NNP: 고유 명사
- NNS: 복수형 명사
- CC: 접속사
- DT: 관사

## 한국어의 경우

### Okt

In [3]:
from konlpy.tag import Okt
# Instantiate the Okt analyzer once; the morphs / pos / nouns cells call
# methods on this same object.
okt = Okt()

In [5]:
# Sample Korean paragraph analysed by both Okt and Kkma.
# Built with implicit string-literal concatenation inside parentheses: a
# backslash continuation *inside* a quoted string keeps the next line's leading
# indentation, which silently embedded a run of spaces between the two sentences.
text = (
    "국내 거래소에서 가상자산인 비트코인 가격이 5개월 만에 7천만 원대를 회복했습니다. "
    "가상자산 거래소 업비트에 따르면 오늘(11일) 오후 3시 6분 현재 1비트코인은 6천970만4천 원입니다."
)

#### 형태소 추출

In [6]:
# Morpheme extraction with Okt; the recorded output is a flat list of strings.
okt.morphs(text)

['국내',
 '거래소',
 '에서',
 '가상',
 '자산',
 '인',
 '비트코인',
 '가격',
 '이',
 '5',
 '개월',
 '만에',
 '7천만',
 '원대',
 '를',
 '회복했습니다',
 '.',
 '가상',
 '자산',
 '거래소',
 '업',
 '비트',
 '에',
 '따르면',
 '오늘',
 '(',
 '11일',
 ')',
 '오후',
 '3시',
 '6분',
 '현재',
 '1',
 '비트코인',
 '은',
 '6천',
 '970만',
 '4천',
 '원',
 '입니다',
 '.']

#### 품사 태깅

In [7]:
# Part-of-speech tagging with Okt; the recorded output is a list of
# (morpheme, tag) tuples with tags such as 'Noun', 'Josa', 'Number'.
okt.pos(text)

[('국내', 'Noun'),
 ('거래소', 'Noun'),
 ('에서', 'Josa'),
 ('가상', 'Noun'),
 ('자산', 'Noun'),
 ('인', 'Josa'),
 ('비트코인', 'Noun'),
 ('가격', 'Noun'),
 ('이', 'Josa'),
 ('5', 'Number'),
 ('개월', 'Noun'),
 ('만에', 'Josa'),
 ('7천만', 'Number'),
 ('원대', 'Noun'),
 ('를', 'Josa'),
 ('회복했습니다', 'Adjective'),
 ('.', 'Punctuation'),
 ('가상', 'Noun'),
 ('자산', 'Noun'),
 ('거래소', 'Noun'),
 ('업', 'Noun'),
 ('비트', 'Noun'),
 ('에', 'Josa'),
 ('따르면', 'Verb'),
 ('오늘', 'Noun'),
 ('(', 'Punctuation'),
 ('11일', 'Number'),
 (')', 'Punctuation'),
 ('오후', 'Noun'),
 ('3시', 'Number'),
 ('6분', 'Number'),
 ('현재', 'Noun'),
 ('1', 'Number'),
 ('비트코인', 'Noun'),
 ('은', 'Josa'),
 ('6천', 'Number'),
 ('970만', 'Number'),
 ('4천', 'Number'),
 ('원', 'Noun'),
 ('입니다', 'Adjective'),
 ('.', 'Punctuation')]

#### 명사 추출

In [9]:
# Noun extraction with Okt; the recorded output is a list of noun strings
# (duplicates are kept, e.g. '가상' appears twice).
okt.nouns(text)

['국내',
 '거래소',
 '가상',
 '자산',
 '비트코인',
 '가격',
 '개월',
 '원대',
 '가상',
 '자산',
 '거래소',
 '업',
 '비트',
 '오늘',
 '오후',
 '현재',
 '비트코인',
 '원']

### 꼬꼬마

In [10]:
from konlpy.tag import Kkma

# Instantiate the Kkma analyzer once; the following cells call morphs / pos /
# nouns on this same object.
kkma = Kkma()

In [11]:
# Morpheme extraction with Kkma; per the recorded output it splits more finely
# than Okt (e.g. '회복', '하', '었', '습니다' instead of a single '회복했습니다').
kkma.morphs(text)

['국내',
 '거래소',
 '에서',
 '가상',
 '자산',
 '이',
 'ㄴ',
 '비트',
 '코',
 '이',
 'ㄴ',
 '가격',
 '이',
 '5',
 '개월',
 '만',
 '에',
 '7',
 '천만',
 '원대',
 '를',
 '회복',
 '하',
 '었',
 '습니다',
 '.',
 '가상',
 '자산',
 '거래소',
 '업',
 '비트',
 '에',
 '따르',
 '면',
 '오늘',
 '(',
 '11',
 '일',
 ')',
 '오후',
 '3',
 '시',
 '6',
 '분',
 '현재',
 '1',
 '비트',
 '코',
 '인은',
 '6',
 '천',
 '970',
 '만',
 '4',
 '천',
 '원',
 '이',
 'ㅂ니다',
 '.']

In [12]:
# Part-of-speech tagging with Kkma; the recorded output is a list of
# (morpheme, tag) tuples using tags such as 'NNG', 'JKM', 'NR'.
kkma.pos(text)

[('국내', 'NNG'),
 ('거래소', 'NNG'),
 ('에서', 'JKM'),
 ('가상', 'NNG'),
 ('자산', 'NNG'),
 ('이', 'VCP'),
 ('ㄴ', 'ETD'),
 ('비트', 'NNG'),
 ('코', 'NNG'),
 ('이', 'VCP'),
 ('ㄴ', 'ETD'),
 ('가격', 'NNG'),
 ('이', 'JKS'),
 ('5', 'NR'),
 ('개월', 'NNM'),
 ('만', 'NNB'),
 ('에', 'JKM'),
 ('7', 'NR'),
 ('천만', 'NR'),
 ('원대', 'NNG'),
 ('를', 'JKO'),
 ('회복', 'NNG'),
 ('하', 'XSV'),
 ('었', 'EPT'),
 ('습니다', 'EFN'),
 ('.', 'SF'),
 ('가상', 'NNG'),
 ('자산', 'NNG'),
 ('거래소', 'NNG'),
 ('업', 'NNG'),
 ('비트', 'NNG'),
 ('에', 'JKM'),
 ('따르', 'VV'),
 ('면', 'ECE'),
 ('오늘', 'NNG'),
 ('(', 'SS'),
 ('11', 'NR'),
 ('일', 'NNM'),
 (')', 'SS'),
 ('오후', 'NNG'),
 ('3', 'NR'),
 ('시', 'NNM'),
 ('6', 'NR'),
 ('분', 'NNM'),
 ('현재', 'MAG'),
 ('1', 'NR'),
 ('비트', 'NNG'),
 ('코', 'NNG'),
 ('인은', 'NNG'),
 ('6', 'NR'),
 ('천', 'NR'),
 ('970', 'NR'),
 ('만', 'NR'),
 ('4', 'NR'),
 ('천', 'NR'),
 ('원', 'NNM'),
 ('이', 'VCP'),
 ('ㅂ니다', 'EFN'),
 ('.', 'SF')]

In [13]:
# Noun extraction with Kkma; the recorded output also contains compound
# candidates and numerals (e.g. '가상자산', '5개월', '11일'), unlike the Okt result.
kkma.nouns(text)

['국내',
 '거래소',
 '가상',
 '가상자산',
 '자산',
 '비트',
 '비트코',
 '코',
 '가격',
 '5',
 '5개월',
 '개월',
 '만',
 '7',
 '7천만',
 '천만',
 '원대',
 '회복',
 '업',
 '업비트',
 '오늘',
 '11',
 '11일',
 '일',
 '오후',
 '3',
 '3시',
 '시',
 '6',
 '6분',
 '분',
 '1',
 '1비트코인은',
 '인은',
 '6천970만',
 '천',
 '970',
 '만',
 '4',
 '4천',
 '원']