In [1]:
!pip install nltk



In [3]:
import nltk
from nltk.corpus import wordnet as wn

# Fetch the WordNet corpus (nothing is re-downloaded when the local copy is up to date).
nltk.download('wordnet')

# Make sure the corpus is actually loaded before it is queried in the cells below.
wn.ensure_loaded()

[nltk_data] Downloading package wordnet to /Users/Jerry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Find which synsets (groups of synonyms) exist for the word 'car'.
wn.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [14]:
# Look at the definition of the 'car.n.01' synset.
# `car` is reused by later cells.
car = wn.synset('car.n.01')
car.definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [10]:
# List the words (lemma names) that share this meaning.
car.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [22]:
# Semantic similarity between word senses.
car = wn.synset('car.n.01')
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
# (sic) the misspelt name is kept: it is a notebook-level variable and is part of the printed label.
motocycle = wn.synset('motorcycle.n.01')

# For each synset compared against `car`: print a label, the path similarity, then a blank line.
for label, other in (('dog', dog), ('cat', cat), ('motocycle', motocycle)):
    print(f'car.path_similarity({label}):')
    print(car.path_similarity(other))
    print()

car.path_similarity(dog):
0.07692307692307693

car.path_similarity(cat):
0.05555555555555555

car.path_similarity(motocycle):
0.3333333333333333



In [25]:
# Look up the hypernyms (more general concepts) of `car`; only the first path is shown.
car.hypernym_paths()[0]

# The further up the path, the more abstract the concept; the further down, the more specific.
# Comparable to the biological ranks: kingdom, phylum, class, order, family, genus, species.

[Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('artifact.n.01'),
 Synset('instrumentality.n.03'),
 Synset('container.n.01'),
 Synset('wheeled_vehicle.n.01'),
 Synset('self-propelled_vehicle.n.01'),
 Synset('motor_vehicle.n.01'),
 Synset('car.n.01')]

## 分詞

In [30]:
from nltk.tokenize import word_tokenize

# Tokenizer data needed by word_tokenize. Newer NLTK releases look up the
# 'punkt_tab' resource while older ones look up 'punkt', so both are fetched
# to keep the notebook runnable on a fresh environment with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample sentence used by the following cells, taken from:
# https://www.space.com/norad-tracks-santa-claus-trip-to-international-space-station
string = "NORAD regularly tracks Santa's trip around the world each Christmas, but this year is a bit different."

[nltk_data] Downloading package punkt to /Users/Jerry/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [31]:
# Tokenization: split the sample sentence into word and punctuation tokens with word_tokenize.
word_tokenize(string)

['NORAD',
 'regularly',
 'tracks',
 'Santa',
 "'s",
 'trip',
 'around',
 'the',
 'world',
 'each',
 'Christmas',
 ',',
 'but',
 'this',
 'year',
 'is',
 'a',
 'bit',
 'different',
 '.']

## 詞性標註

#### 詞性說明: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [32]:
from nltk import pos_tag

# Tagger model needed by pos_tag. Newer NLTK releases look up
# 'averaged_perceptron_tagger_eng' while older ones look up
# 'averaged_perceptron_tagger', so both are fetched. The original download is
# kept as the last expression so the cell still displays its return value.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Jerry/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [33]:
# Tokenize the sample sentence and tag every token with its part of speech
# (the tag abbreviations are explained at the link in this section's heading).
pos_tag(word_tokenize(string))

[('NORAD', 'NNP'),
 ('regularly', 'RB'),
 ('tracks', 'VBZ'),
 ('Santa', 'NNP'),
 ("'s", 'POS'),
 ('trip', 'NN'),
 ('around', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('each', 'DT'),
 ('Christmas', 'NNP'),
 (',', ','),
 ('but', 'CC'),
 ('this', 'DT'),
 ('year', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('bit', 'RB'),
 ('different', 'JJ'),
 ('.', '.')]

## Stemming 詞幹提取

In [34]:
from nltk.stem.porter import PorterStemmer

# One shared stemmer instance; it is reused by the next cell as well.
porter = PorterStemmer()

# Stemming: apply PorterStemmer().stem() to every token of the sample sentence.
list(map(porter.stem, word_tokenize(string)))

['norad',
 'regularli',
 'track',
 'santa',
 "'s",
 'trip',
 'around',
 'the',
 'world',
 'each',
 'christma',
 ',',
 'but',
 'thi',
 'year',
 'is',
 'a',
 'bit',
 'differ',
 '.']

In [35]:
# Part-of-speech tagging (pos_tag) applied to the stemmed tokens.
# NOTE(review): the recorded output differs from tagging the raw tokens
# (e.g. 'NORAD' -> NNP, but 'norad' -> JJ) — presumably a deliberate
# demonstration of tagging after stemming; confirm.
pos_tag(list(map(porter.stem, word_tokenize(string))))

[('norad', 'JJ'),
 ('regularli', 'NN'),
 ('track', 'NN'),
 ('santa', 'NN'),
 ("'s", 'POS'),
 ('trip', 'NN'),
 ('around', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('each', 'DT'),
 ('christma', 'NN'),
 (',', ','),
 ('but', 'CC'),
 ('thi', 'JJ'),
 ('year', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('bit', 'NN'),
 ('differ', 'NN'),
 ('.', '.')]

## 分句

In [36]:
# Two-sentence sample text for the sentence-splitting demo; note the abbreviation "Dec." inside the second sentence.
strs = "NORAD regularly tracks Santa's trip around the world each Christmas, but this year is a bit different. On Wednesday (Dec. 23), the Federal Aviation Administration gave Santa and his reindeer-powered sleigh an official commercial space license for launches and landings"

In [37]:
from nltk.tokenize import sent_tokenize

# Sentence splitting: break the sample text into sentences with sent_tokenize().
sent_tokenize(strs)

["NORAD regularly tracks Santa's trip around the world each Christmas, but this year is a bit different.",
 'On Wednesday (Dec. 23), the Federal Aviation Administration gave Santa and his reindeer-powered sleigh an official commercial space license for launches and landings']