#### 載入相關模組

In [2]:
# NLTK pieces used in this notebook: word/sentence tokenizers, the stop-word
# corpus reader and the Porter stemmer.
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
# NOTE(review): wildcard import. The cells visible here only use
# `PorterStemmer` from this module; consider importing it explicitly once
# the rest of the notebook has been checked.
from nltk.stem.porter import *
# Download the NLTK data packages needed below: the Gutenberg corpus, the
# Punkt sentence-tokenizer data and the stop-word lists.
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np

import string

import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec

from sklearn.manifold import TSNE

import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
%matplotlib inline

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\brite\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brite\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 11.1.1 斷句與斷字（tokenization）

In [3]:
from nltk.corpus import gutenberg

In [4]:
len(gutenberg.fileids())

18

In [5]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
len(gutenberg.words())

2621613

##### 斷句處理

In [7]:
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [8]:
gberg_sent_tokens[0]

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.'

In [9]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

##### 斷字處理

In [10]:
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [11]:
word_tokenize(gberg_sent_tokens[1])[14]

'father'

##### 用 sents() method 一次做完斷句、斷字

In [12]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gberg_sents = gutenberg.sents()

In [13]:
gberg_sents[0:3]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I']]

In [14]:
gberg_sents[4][14]

'father'

#### 11.1.2 將大寫字母轉成小寫 (以艾瑪第 1 句為例)

##### 原本索引 4 這句的內容

In [15]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

##### 將索引 4 這句轉成小寫

In [16]:
[w.lower() for w in gberg_sents[4]]

['she',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

#### 11.1.3 移除停用字與標點符號 (以艾瑪第 1 句為例)

In [17]:
# Tokens to discard: NLTK's English stop words followed by every single
# ASCII punctuation character from `string.punctuation`.
stpwrds = [*stopwords.words('english'), *string.punctuation]

In [18]:
stpwrds

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# Lowercase each token of sentence 4 once, then keep only the tokens that are
# neither stop words nor punctuation characters.
[lw for lw in (w.lower() for w in gberg_sents[4]) if lw not in stpwrds]

['youngest',
 'two',
 'daughters',
 'affectionate',
 'indulgent',
 'father',
 'consequence',
 'sister',
 'marriage',
 'mistress',
 'house',
 'early',
 'period']

#### 11.1.4 字根提取 (stemming) (以艾瑪第 1 句為例)

In [20]:
stemmer = PorterStemmer()

In [21]:
# Lowercase sentence 4, drop stop words / punctuation, and reduce every
# remaining token to its Porter stem.
[stemmer.stem(lw)
 for lw in (w.lower() for w in gberg_sents[4])
 if lw not in stpwrds]

['youngest',
 'two',
 'daughter',
 'affection',
 'indulg',
 'father',
 'consequ',
 'sister',
 'marriag',
 'mistress',
 'hous',
 'earli',
 'period']

#### 11.1.5 找出 n-gram 詞彙並串成單一詞彙

##### 將語料庫中的 2-gram 詞彙檢測出來

In [22]:
phrases = Phrases(gberg_sents) 

In [23]:
bigram = Phraser(phrases) 

In [24]:
bigram.phrasegrams 

{(b'two', b'daughters'): 11.966813731181546,
 (b'her', b'sister'): 17.7960829227865,
 (b"'", b's'): 31.066242737744524,
 (b'very', b'early'): 11.01214147275924,
 (b'Her', b'mother'): 13.529425062715127,
 (b'long', b'ago'): 63.22343628984788,
 (b'more', b'than'): 29.023584433996874,
 (b'had', b'been'): 22.306024648925288,
 (b'an', b'excellent'): 39.063874851750626,
 (b'Miss', b'Taylor'): 453.75918026073305,
 (b'very', b'fond'): 24.134280468850747,
 (b'passed', b'away'): 12.35053642325912,
 (b'too', b'much'): 31.376002029426687,
 (b'did', b'not'): 11.728416217142811,
 (b'any', b'means'): 14.096964108090186,
 (b'wedding', b'-'): 17.4695197740113,
 (b'Her', b'father'): 13.129571562488772,
 (b'after', b'dinner'): 21.5285481168817,
 (b'self', b'-'): 47.79018053120332,
 (b'sixteen', b'years'): 107.0461671612265,
 (b'five', b'years'): 40.128755673408115,
 (b'years', b'old'): 54.735425236061104,
 (b'seven', b'years'): 52.59411150244507,
 (b'each', b'other'): 79.4168405322873,
 (b'a', b'mile'): 

##### 測試看看 bigram 物件能否運作

In [25]:
test_sentence = "Miss Taylor has two daughters".split()

In [26]:
test_sentence

['Miss', 'Taylor', 'has', 'two', 'daughters']

In [27]:
bigram[test_sentence]

['Miss_Taylor', 'has', 'two_daughters']

#### 11.1.6 處理整個古騰堡語料庫

#### 去大寫、刪除標點符號

In [28]:
# Lowercase every token of the corpus and drop tokens that are a single
# punctuation character.
#
# The punctuation characters are collected into a set once, up front: the
# membership test runs for every token in the corpus, so rebuilding a list
# (and scanning it linearly) per token is wasted work. Each token is also
# lowercased only once.
#
# Membership is tested against single characters only, so multi-character
# punctuation tokens (e.g. '--') are kept, exactly as before.
punct_chars = set(string.punctuation)
lower_sents = [
    [tok for tok in (w.lower() for w in sent) if tok not in punct_chars]
    for sent in gberg_sents
]

#### 建立 2-gram 詞彙物件

In [29]:
lower_bigram = Phraser(Phrases(lower_sents))

In [30]:
lower_bigram.phrasegrams # 全小寫的 2-gram

{(b'two', b'daughters'): 11.080802900992637,
 (b'her', b'sister'): 16.93971298099339,
 (b'very', b'early'): 10.516998773665177,
 (b'her', b'mother'): 10.70812618607742,
 (b'long', b'ago'): 59.226442015336005,
 (b'more', b'than'): 28.529926612065935,
 (b'had', b'been'): 21.583193129694834,
 (b'an', b'excellent'): 37.41859680854167,
 (b'sixteen', b'years'): 131.42913000977515,
 (b'miss', b'taylor'): 420.4340982546865,
 (b'mr', b'woodhouse'): 104.19907841850323,
 (b'very', b'fond'): 24.185726346489627,
 (b'passed', b'away'): 11.751473221742694,
 (b'too', b'much'): 30.36309017383541,
 (b'did', b'not'): 10.846196223896685,
 (b'any', b'means'): 14.294148100212627,
 (b'after', b'dinner'): 18.60737125272944,
 (b'mr', b'weston'): 91.63290824201266,
 (b'five', b'years'): 37.66428596665674,
 (b'years', b'old'): 48.599094446190286,
 (b'seven', b'years'): 50.3345604292756,
 (b'each', b'other'): 71.31277029783762,
 (b'well', b'informed'): 14.185028016786625,
 (b'a', b'mile'): 11.700110753652233,
 (b

##### 進一步過濾 2-gram 語法

In [31]:
# Rebuild the bigram detector with stricter settings and rebind
# `lower_bigram` to it (this replaces the default-settings detector created
# in the previous cell).
#   min_count=32 -> ignore words/bigrams whose total collected count is lower
#   threshold=64 -> score a pair must reach to be treated as a phrase
#                   (higher means fewer phrases)
# The last line displays the surviving bigrams with their scores.
lower_bigram = Phraser(Phrases(lower_sents, 
                min_count=32, threshold=64))
lower_bigram.phrasegrams

{(b'miss', b'taylor'): 156.44059469941823,
 (b'mr', b'woodhouse'): 82.04651843976633,
 (b'mr', b'weston'): 75.87438262077481,
 (b'mrs', b'weston'): 160.68485093258923,
 (b'great', b'deal'): 93.36368125424357,
 (b'mr', b'knightley'): 161.74131790625913,
 (b'miss', b'woodhouse'): 229.03802722366902,
 (b'years', b'ago'): 74.31594785893046,
 (b'mr', b'elton'): 121.3990121932397,
 (b'dare', b'say'): 89.94000515807346,
 (b'frank', b'churchill'): 1316.4456593286038,
 (b'miss', b'bates'): 276.39588291692513,
 (b'drawing', b'room'): 84.91494947493561,
 (b'mrs', b'goddard'): 143.57843432545658,
 (b'miss', b'smith'): 73.03442128232508,
 (b'few', b'minutes'): 204.16834974753786,
 (b'john', b'knightley'): 83.03755747111268,
 (b'don', b't'): 250.30957446808512,
 (b'good', b'natured'): 88.69936184891343,
 (b'few', b'moments'): 107.77584531675087,
 (b'thousand', b'pounds'): 166.51834523092802,
 (b'o', b'clock'): 89.14789088153573,
 (b'jane', b'fairfax'): 654.5565917587609,
 (b'miss', b'fairfax'): 196.

##### 最終處理：將語料庫的所有 2-gram 串成單一詞彙

In [32]:
# Run every lowercase sentence through the filtered bigram detector so that
# each detected pair becomes one underscore-joined token (e.g. 'miss_taylor').
clean_sents = [lower_bigram[sentence] for sentence in lower_sents]

In [33]:
clean_sents[6]

['sixteen',
 'years',
 'had',
 'miss_taylor',
 'been',
 'in',
 'mr_woodhouse',
 's',
 'family',
 'less',
 'as',
 'a',
 'governess',
 'than',
 'a',
 'friend',
 'very',
 'fond',
 'of',
 'both',
 'daughters',
 'but',
 'particularly',
 'of',
 'emma']

#### 11.2 用 word2vec 建立詞向量空間

#### 11.2.3 使用 word2vec

#### 建立詞向量空間模型物件

In [34]:
# Train a word2vec model on the cleaned, bigram-merged sentences.
#   size=64      -> dimensionality of each word vector
#   sg=1         -> skip-gram architecture (0 would select CBOW)
#   window=10    -> maximum distance between the target word and a context word
#   iter=5       -> number of passes (epochs) over the corpus
#   min_count=10 -> ignore words that occur fewer than 10 times
#   workers=4    -> number of worker threads used for training
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs`.
# NOTE(review): no `seed` is passed and several workers are used, so the
# learned vectors will presumably differ from run to run — confirm whether
# reproducible output is required.
model = Word2Vec(sentences=clean_sents, size=64, 
                  sg=1, window=10, iter=5,
                  min_count=10, workers=4)

### 查看模型 (詞向量空間) 的內容

In [35]:
len(model.wv.vocab)

10329

In [36]:
model.wv['dog']

array([-0.6392187 ,  0.32716945, -0.16128439,  0.08041843, -0.09984614,
        0.21967773, -0.37162575, -0.29825193,  0.10480378,  0.00332568,
       -0.00281923, -0.2570237 , -0.47766465, -0.36439824, -0.08074535,
        0.23875275,  0.20045453,  0.12561241,  0.14000611, -0.41371027,
       -0.3014792 ,  0.43775827,  0.08139855, -0.50572604, -0.1443411 ,
       -0.37272337, -0.00157249,  0.15708981, -0.3468983 , -0.08786645,
       -0.0664263 , -0.29021332,  0.7195751 , -0.07143471,  0.3807386 ,
        0.3916732 , -0.2563773 ,  0.118199  , -0.16973594, -0.47874153,
       -0.24956603, -0.00809414, -0.09477931,  0.0909716 ,  0.09328753,
       -0.29775488, -0.14586735, -0.46415055, -0.8037281 , -0.1770414 ,
        0.20697641, -0.18647572, -0.28388846, -0.14065123, -0.45862898,
       -0.01913709, -0.13530183,  0.19671495, -0.16909339, -0.28705767,
        0.24103735, -0.02131163,  0.27344474,  0.2716697 ], dtype=float32)

#### 評估生成的詞向量空間

In [37]:
model.wv.most_similar('father', topn=3)

[('mother', 0.801602303981781),
 ('brother', 0.7300387024879456),
 ('curseth', 0.6988061666488647)]

In [38]:
model.wv.most_similar('dog', topn=3)

[('puppy', 0.8174463510513306),
 ('cage', 0.7660421133041382),
 ('brahmin', 0.7550937533378601)]

In [39]:
model.wv.most_similar('eat', topn=3)

[('bread', 0.8495417833328247),
 ('drink', 0.8142164945602417),
 ('meat', 0.7619834542274475)]

In [40]:
model.wv.most_similar('day', topn=3)

[('morning', 0.8028388023376465),
 ('night', 0.7907911539077759),
 ('week', 0.7091270089149475)]

In [41]:
model.wv.doesnt_match("mother father sister brother dog".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'dog'

In [42]:
model.wv.similarity('father', 'dog')

0.48291725

In [43]:
model.wv.most_similar(positive=['father', 'woman'], negative=['man']) 

[('mother', 0.7811132073402405),
 ('husband', 0.7449092268943787),
 ('sister', 0.7351561784744263),
 ('wife', 0.7219641804695129),
 ('daughter', 0.7178728580474854),
 ('womb', 0.6863711476325989),
 ('child', 0.6800585389137268),
 ('rachel', 0.6715828776359558),
 ('maiden', 0.6706640124320984),
 ('rebekah', 0.6674748659133911)]

In [44]:
model.wv.most_similar(positive=['husband', 'woman'], negative=['man']) 

[('wife', 0.7162209749221802),
 ('sister', 0.6940473914146423),
 ('daughter', 0.6512503027915955),
 ('mother', 0.6497961282730103),
 ('conceived', 0.643883466720581),
 ('maid', 0.6316143274307251),
 ('child', 0.6275677680969238),
 ('widow', 0.6212283372879028),
 ('maiden', 0.6208077073097229),
 ('womb', 0.6127462983131409)]