# chapter_01 Tokenizing Text and WordNet Basics

## Tokenizing Text into sentences

In [17]:
# sent_tokenize splits a paragraph into its sentences.
from nltk.tokenize import sent_tokenize

para1 = "Hello World. It's good to see you. Thanks for buying this book."
sent_tokenize(para1)
# The paragraph is split into 3 sentences.
# How does sent_tokenize manage that?
# It relies on the nltk.tokenize.punkt module, whose model has already been
# trained to recognise where sentences begin and end.

['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

In [11]:
para2 = '大家好，我叫李白。这是我的好朋友，杜甫。我们都是唐朝著名的诗人，哈哈哈！' 
sent_tokenize(para2)
# Gotcha: sent_tokenize copes poorly with a Chinese paragraph — the whole
# text comes back as a single sentence.

['大家好，我叫李白。这是我的好朋友，杜甫。我们都是唐朝著名的诗人，哈哈哈！']

In [18]:
# For longer paragraphs it is more efficient to load the punkt pickle file
# once and reuse the resulting tokenizer object.
# NOTE(review): this unpickles a model file through NLTK's data loader; only
# load pickles from trusted sources, and confirm the installed NLTK version
# still provides this resource in pickle form.
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(para1)

['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

In [19]:
# For paragraphs in another language, load that language's pickle instead.
# Spanish example:
para3 = 'hola amigo. estoy bien.'
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
spanish_tokenizer.tokenize(para3)

['hola amigo.', 'estoy bien.']

## Tokenizing sentences into words

In [1]:
# word_tokenize splits a sentence into word and punctuation tokens.
from nltk.tokenize import word_tokenize
word_tokenize('hello world.')

['hello', 'world', '.']

In [2]:
# Per the original notes, word_tokenize delegates to the
# TreebankWordTokenizer class, so the code below also performs word tokenizing.
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('hello world.')
# How it works: tokens are split on whitespace and punctuation.
# By default, punctuation marks are emitted as separate tokens.
# Class hierarchy (per the original notes — verify against the installed NLTK):
# TokenizerI has three families: PunktWordTokenizer, TreebankWordTokenizer, RegexpTokenizer.
# RegexpTokenizer in turn has two subclasses: WordPunctTokenizer and WhitespaceTokenizer.

['hello', 'world', '.']

In [3]:
# Separating contractions: how does word_tokenize handle a contraction?
word_tokenize("can't")
# The contraction is not kept as one token (it is split into "ca" and "n't"),
# which may be unacceptable for some uses.
# A regexp tokenizer can be used instead.

['ca', "n't"]

In [4]:
# WordPunctTokenizer also breaks the contraction apart: the apostrophe
# becomes its own token.
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("can't is a contraction")

['can', "'", 't', 'is', 'a', 'contraction']

## Tokenizing sentences using regular expressions

In [5]:
# Regular-expression tokenizing is more complex and slower, so it is only
# recommended when the results of the previous tokenizers are unacceptable.
from nltk.tokenize import RegexpTokenizer
# Raw string: "\w" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning. The pattern value itself is unchanged.
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction']

In [6]:
# The helper function below produces the same result as RegexpTokenizer.
from nltk.tokenize import regexp_tokenize
# Raw string avoids the invalid "\w" escape warning; the pattern is unchanged.
regexp_tokenize("Can't is a contraction.", r"[\w']+")

["Can't", 'is', 'a', 'contraction']

In [7]:
# Simple whitespace tokenizer: with gaps=True the pattern describes the
# separators between tokens rather than the tokens themselves.
# Raw string avoids the invalid "\s" escape warning; the pattern is unchanged.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction.']

## Training a sentence tokenizer

In [10]:
# When the text is not in a conventional article format (e.g. dialogue),
# a tokenizer trained on that text can be used instead.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
# Train a PunktSentenceTokenizer on the raw corpus text.
sent_tokenizer = PunktSentenceTokenizer(text)

In [14]:
# Peek at the first 200 characters of the raw corpus text.
text[:200]

'White guy: So, do you have any plans for this evening?\nAsian girl: Yeah, being angry!\nWhite guy: Oh, that sounds good.\n\nGuy #1: So this Jack guy is basically the luckiest man in the world.\nGuy #2: Why'

In [15]:
# Split the corpus into sentences with the trained tokenizer and look at
# the first ten results.
sents1 = sent_tokenizer.tokenize(text)
sents1[:10]

['White guy: So, do you have any plans for this evening?',
 'Asian girl: Yeah, being angry!',
 'White guy: Oh, that sounds good.',
 'Guy #1: So this Jack guy is basically the luckiest man in the world.',
 "Guy #2: Why, because he's survived like 5 attempts on his life and it's not even noon?",
 'Guy #1: No; he could totally nail those two chicks.',
 'Dad: Could you tell me where the auditorium is?',
 "Security guy: It's on the second floor.",
 "Dad: Wait, you mean it's actually in the building?",
 "Girl: But, I mean, it's not like I ever plan on giving birth."]

In [16]:
# Split the same text with the default sent_tokenize for comparison.
from nltk.tokenize import sent_tokenize
sents2 = sent_tokenize(text)

In [17]:
sents2[:10]
# No apparent difference in the first ten sentences?

['White guy: So, do you have any plans for this evening?',
 'Asian girl: Yeah, being angry!',
 'White guy: Oh, that sounds good.',
 'Guy #1: So this Jack guy is basically the luckiest man in the world.',
 "Guy #2: Why, because he's survived like 5 attempts on his life and it's not even noon?",
 'Guy #1: No; he could totally nail those two chicks.',
 'Dad: Could you tell me where the auditorium is?',
 "Security guy: It's on the second floor.",
 "Dad: Wait, you mean it's actually in the building?",
 "Girl: But, I mean, it's not like I ever plan on giving birth."]

In [18]:
# Sentence at index 678 from the tokenizer trained on this corpus.
sents1[678]

'Girl: But you already have a Big Mac...'

In [19]:
sents2[678]
# Not quite: the trained tokenizer does better here — the default one glued
# the following line onto this sentence.
# How training works: an unsupervised algorithm learns the sentence boundaries.

'Girl: But you already have a Big Mac...\nHobo: Oh, this is all theatrical.'

In [23]:
# The training corpus can also be read directly from a local file.
# The file is located through NLTK's data search path instead of a hardcoded
# absolute path under one user's home directory, so the cell is not tied to a
# single machine.
import nltk.data
# assumes the webtext corpus is unpacked on disk (as the original path
# implied) so the resolved location can be opened as a regular file — TODO confirm
overheard_path = nltk.data.find('corpora/webtext/overheard.txt')
# Same encoding the original cell used for this file.
with open(str(overheard_path), encoding='ISO-8859-2') as f:
    text = f.read()
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
sents1[0:10]

['White guy: So, do you have any plans for this evening?',
 'Asian girl: Yeah, being angry!',
 'White guy: Oh, that sounds good.',
 'Guy #1: So this Jack guy is basically the luckiest man in the world.',
 "Guy #2: Why, because he's survived like 5 attempts on his life and it's not even noon?",
 'Guy #1: No; he could totally nail those two chicks.',
 'Dad: Could you tell me where the auditorium is?',
 "Security guy: It's on the second floor.",
 "Dad: Wait, you mean it's actually in the building?",
 "Girl: But, I mean, it's not like I ever plan on giving birth."]

In [24]:
# Sentence at index 678 after training on the text read from the local file.
sents1[678]

'Girl: But you already have a Big Mac...'

## Filtering stopwords in a tokenized sentence

In [25]:
# Stopwords need little introduction.
# NLTK keeps the stopword lists under nltk_data/corpora/stopwords/.
from nltk.corpus import stopwords
words = ["Can't", 'is', 'a', 'contraction']
# A set gives fast membership checks while filtering.
english_stops = set(stopwords.words('english'))
[token for token in words if token not in english_stops]

["Can't", 'contraction']

In [26]:
# Stopword files available, one per language.
stopwords.fileids()

['arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'kazakh',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

In [27]:
# The French stopword list.
stopwords.words('french')

['au',
 'aux',
 'avec',
 'ce',
 'ces',
 'dans',
 'de',
 'des',
 'du',
 'elle',
 'en',
 'et',
 'eux',
 'il',
 'je',
 'la',
 'le',
 'leur',
 'lui',
 'ma',
 'mais',
 'me',
 'même',
 'mes',
 'moi',
 'mon',
 'ne',
 'nos',
 'notre',
 'nous',
 'on',
 'ou',
 'par',
 'pas',
 'pour',
 'qu',
 'que',
 'qui',
 'sa',
 'se',
 'ses',
 'son',
 'sur',
 'ta',
 'te',
 'tes',
 'toi',
 'ton',
 'tu',
 'un',
 'une',
 'vos',
 'votre',
 'vous',
 'c',
 'd',
 'j',
 'l',
 'à',
 'm',
 'n',
 's',
 't',
 'y',
 'été',
 'étée',
 'étées',
 'étés',
 'étant',
 'étante',
 'étants',
 'étantes',
 'suis',
 'es',
 'est',
 'sommes',
 'êtes',
 'sont',
 'serai',
 'seras',
 'sera',
 'serons',
 'serez',
 'seront',
 'serais',
 'serait',
 'serions',
 'seriez',
 'seraient',
 'étais',
 'était',
 'étions',
 'étiez',
 'étaient',
 'fus',
 'fut',
 'fûmes',
 'fûtes',
 'furent',
 'sois',
 'soit',
 'soyons',
 'soyez',
 'soient',
 'fusse',
 'fusses',
 'fût',
 'fussions',
 'fussiez',
 'fussent',
 'ayant',
 'ayante',
 'ayantes',
 'ayants',
 'eu'

## Looking up synsets for a word in WordNet

In [4]:
# Take the first synset returned for 'cookbook' and show its name.
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
syn.name()

'cookbook.n.01'

In [2]:
# Definition text of the synset.
syn.definition()

'a book of recipes and cooking directions'

In [5]:
# All synsets found for 'cookbook'.
wordnet.synsets('cookbook')

[Synset('cookbook.n.01')]

In [14]:
# Usage examples attached to the first synset of 'cooking'.
wordnet.synsets('cooking')[0].examples()

['cooking can be a great art',
 'people are needed who have experience in cookery',
 'he left the preparation of meals to his wife']

In [19]:
# hypernyms: more general terms
# hyponyms: more specific terms
syn.hypernyms()

[Synset('reference_book.n.01')]

In [20]:
# Hyponyms of the first hypernym of the cookbook synset.
syn.hypernyms()[0].hyponyms()

[Synset('annual.n.02'),
 Synset('atlas.n.02'),
 Synset('cookbook.n.01'),
 Synset('directory.n.01'),
 Synset('encyclopedia.n.01'),
 Synset('handbook.n.01'),
 Synset('instruction_book.n.01'),
 Synset('source_book.n.01'),
 Synset('wordbook.n.01')]

In [21]:
syn.root_hypernyms()
# The hypernym of cookbook is reference_book.
# reference_book has many hyponyms, e.g. annual, cookbook, and so on.
# The root hypernym reached from here is entity.

[Synset('entity.n.01')]

In [22]:
# The full path from the root down to this synset can be retrieved as well.
syn.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('creation.n.02'),
  Synset('product.n.02'),
  Synset('work.n.02'),
  Synset('publication.n.01'),
  Synset('book.n.01'),
  Synset('reference_book.n.01'),
  Synset('cookbook.n.01')]]

In [23]:
# Part of speech (POS) and POS tags used by WordNet lookups:
# noun n
# adjective a
# adverb r
# verb v
# Count the synsets for 'great' across all parts of speech.
len(wordnet.synsets('great'))

7

In [24]:
# Count only the noun synsets of 'great' (wordnet.NOUN is the 'n' tag).
len(wordnet.synsets('great', pos=wordnet.NOUN))

1

In [25]:
# Count only the adjective synsets of 'great' (wordnet.ADJ is the 'a' tag).
len(wordnet.synsets('great', pos=wordnet.ADJ))

6

In [26]:
# List the noun synsets of 'great' (wordnet.NOUN is the 'n' tag).
wordnet.synsets('great', pos=wordnet.NOUN)

[Synset('great.n.01')]

## Looking up lemmas and synonyms in WordNet

In [27]:
# A lemma is the canonical (morphological) form of a word.
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
lemmas = syn.lemmas()
# Number of lemmas in the cookbook synset.
len(lemmas)

2

In [28]:
# Name of the first lemma.
lemmas[0].name()

'cookbook'

In [29]:
# Name of the second lemma.
lemmas[1].name()

'cookery_book'

In [31]:
# Check whether both lemmas point back to the same synset.
lemmas[0].synset() == lemmas[1].synset()

True