In [1]:
#P.34
""" NLPIA Chapter 2 Section 2.1 Code Listings and Snippets """
import pandas as pd


sentence = "Thomas Jefferson began building Monticello at the age of 26."#將字串指定為sentence
sentence.split()#以空格為分割標準，分解句子成token
# ['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'at', 'the', 'age', 'of', 'twenty-six.']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [2]:
str.split(sentence)#分割的另一種方法

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [3]:
#P.35 把token轉成onehot的形式
import numpy as np
token_sequence = str.split(sentence)#將sentence分割成list並指定為token_sequence
vocab = sorted(set(token_sequence))#將token_sequence放進集合中刪除一樣的token，按數字、大寫、小寫排列，
                                   #並指定為vocab
', '.join(vocab)#以comma將vocab裡的每個token連結在一起

'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [4]:
num_tokens = len(token_sequence)#查看token_sequence的長度並指定為num_tokens
vocab_size = len(vocab)#查看vocab的長度並指定為vocab_size
onehot_vectors = np.zeros((num_tokens, vocab_size), int)#建一個num_tokens(10) x vocab_size(10)的零矩陣，
                                                        #dtype為int，並指定為onehot_vectors

In [5]:
for i, word in enumerate(token_sequence):#利用enumerate依序抓出index(i)和token(word)
    onehot_vectors[i, vocab.index(word)] = 1#先找出token在vocab裡的index，將此做為column
                                            #row代表第幾個token，column代表這個token是什麼字
                                            #將onehot_vectors中每個row代表的token(column)放入1
' '.join(vocab)#以whitespace將vocab裡的每個token連結在一起

'26. Jefferson Monticello Thomas age at began building of the'

In [6]:
onehot_vectors#顯示出onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [7]:
import pandas as pd
pd.DataFrame(onehot_vectors, columns=vocab)#利用pd.DataFrame將columns名設成vocab
                                           #使onehot_vectors容易找到對應的token

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [8]:
df = pd.DataFrame(onehot_vectors, columns=vocab)#將較漂亮的onehot_vectors指定為df
df[df == 0] =''#df裡等於0的element設成whitespace，更美觀的呈現資料
df#顯示出df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,,,,1.0,,,,,,
1,,1.0,,,,,,,,
2,,,,,,,1.0,,,
3,,,,,,,,1.0,,
4,,,1.0,,,,,,,
5,,,,,,1.0,,,,
6,,,,,,,,,,1.0
7,,,,,1.0,,,,,
8,,,,,,,,,1.0,
9,1.0,,,,,,,,,


In [9]:
#P.40 可利用字典的形式儲存分解後的token，較onehot省空間
sentence_bow = {}#指定sentence_bow為一個dictionary
# As you can see, this simple Python function already does a decent job tokenizing the example sentence. A couple more vanilla python statements and you can create numerical vector representations for each word.
for token in sentence.split():#將sentence分割成token，並用迴圈依序抓出
    sentence_bow[token] = 1#將token指定為key，1指定為value，放入sentence_bow

sorted(sentence_bow.items())#將sentence_bow的item按順序列出

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [10]:
# A slightly better data structure
sentence = "Thomas Jefferson began building Monticello at the age of 26."#將字串指定為sentence
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T
#sentence切割成token後依序抓出token並返回(token, 1)的tuple，然後放入dictionary，將此字典設為Series的形式
#產生一個DataFrame，index名為token，column名為'sent'
#最後做transpose並指定為df
df#顯示出df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [11]:
#P.41 建一個bag-of-words(BOW)
# And a pandas dataframe is great for holding multiple texts (sentences, tweets, or documents)
sentences  = "Thomas Jefferson began building Monticello at the age of 26. \n"#將字串指定為sentences
sentences += "Construction was done mostly by local masons and carpenters.\n"#在sentences裡增加字串
sentences += "He moved into the South Pavilion in 1770.\n" #在sentences裡增加字串
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obession.\n"#在sentences裡增加字串
sentences#顯示出sentences

"Thomas Jefferson began building Monticello at the age of 26. \nConstruction was done mostly by local masons and carpenters.\nHe moved into the South Pavilion in 1770.\nTurning Monticello into a neoclassical masterpiece was Jefferson's obession.\n"

In [15]:
corpus = {}#指定corpus為一個dictionary
for i, sent in enumerate(sentences.split('\n')):#按sentence裡的'\n'分解句子，並依續抓出index和句子
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())
    #將sentence切割成token後依序抓出token並返回(token, 1)的tuple，然後放入dictionary
    #再將dictionary作為value存入corpus的字典中
    #{}會存入i(sent0~sent3)作為corpus的key
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
#在pd.DataFrame.from_records中sent0有sent1沒有的token原本會返回NAN，利用fillna將NAN改為數字0，並指定為df

In [13]:
df[df.columns[:10]]#顯示出df前10個column

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent0,1,1,1,1,1,1,1,1,1,1
sent1,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0
sent4,0,0,0,0,0,0,0,0,0,0


In [14]:
#P.42 dot product(點積)計算，矩陣相乘或向量相乘
v1 = pd.np.array([1, 2, 3])#建立一個array指定為v1
v2 = pd.np.array([2, 3, 4])#建立一個array指定為v2
v1.dot(v2)#利用dot算內資

  """Entry point for launching an IPython kernel.
  


20

In [15]:
(v1*v2).sum()#手動內積的算法

20

In [16]:
sum([x1*x2 for x1, x2 in zip(v1, v2)])#利用zip打包v1、v2的element，來計算內積
#不佳，降低運算速度

20

In [16]:
#P.42 利用dot product(點積)看不同句子中重複多少不同的token
df = df.T
df

Unnamed: 0,sent0,sent1,sent2,sent3,sent4
Thomas,1,0,0,0,0
Jefferson,1,0,0,0,0
began,1,0,0,0,0
building,1,0,0,0,0
Monticello,1,0,0,1,0
at,1,0,0,0,0
the,1,0,1,0,0
age,1,0,0,0,0
of,1,0,0,0,0
26.,1,0,0,0,0


In [18]:
df.sent0.dot(df.sent1)#沒有重複的token

0

In [19]:
df.sent0.dot(df.sent2)#有1個重複的token

1

In [20]:
df.sent0.dot(df.sent3#有1個重複的token

1

In [21]:
#P.43 re.split()可處理多個字或正則表達式
import re
sentence = "Thomas Jefferson began building Monticello at the age of 26."#將字串指定為sentence
tokens = re.split(r'[-\s.,;!?]+', sentence)#利用正則表達式分解sentence成token，並指定為tokens
#正則表達式中的\s代表空格； - 必須放在第一個，不然會以為是0-9範圍的意思
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [22]:
#P.45 re.compile()可預編譯正則表達式並將其作為參數，但會出現空格
sentence = "Thomas Jefferson began building Monticello at the age of 26."#將字串指定為sentence
pattern = re.compile(r'([-\s.,;!?])+')#先利用compile編譯正則表達式並指定為pattern
tokens = pattern.split(sentence)#呼叫編譯好的正則表達式來分解sentence，並指定為token

#P.45 多寫一個迴圈把re.compile()造成的空格刪掉
for x in tokens:#從tokens依序抓出token並指定為x
    if(x and x not in '- \t\n.,;!?'):#如果x不在'- \t\n.,;!?'裡
        print(x)#就印出x

Thomas
Jefferson
began
building
Monticello
at
the
age
of
26


In [23]:
#P.46 NLTK的RegexpTokenizer，NLP比賽常用，可直接分割空白和字尾含有標點符號的字
#比re.compile()好
from nltk.tokenize import RegexpTokenizer#從nltk.tokenize載入RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')#將RegexpTokenizer指定為tokenizer
tokenizer.tokenize(sentence)#呼叫tokenizer做tokenize分解sentence

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [24]:
#P.46 NLTK的TreebankWordTokenizer，可直接分割空白、字尾含有標點符號的字，但保留小數、分割縮寫(don't ->do、n't)
#最好的分解器(Tokenizer)
from nltk.tokenize import TreebankWordTokenizer#從nltk.tokenize載入TreebankWordTokenizer
sentence = "Thomas Jefferson began building Monticello at the age of 26."#將字串指定為sentence
tokenizer = TreebankWordTokenizer()#將RegexpTokenizer指定為tokenizer
tokenizer.tokenize(sentence)#呼叫tokenizer做tokenize分解sentence

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [28]:
# P49-50 1-gram tokenizer(將句子分解成一個單詞)
sentence = "Thomas Jefferson began building Monticello at the age of 26."#將字串指定為sentence
pattern = re.compile(r'([-\s.,;!?])+')#先利用compile編譯正則表達式並指定為pattern
tokens = pattern.split(sentence)#呼叫編譯好的正則表達式來分解sentence，並指定為token
#P.45 多寫一個迴圈把re.compile()造成的空格刪掉
tokens = [x for x in tokens if(x and x not in '- \t\n.,;!?')]
#從tokens依序抓出token並指定為x，如果x不在'- \t\n.,;!?'裡，就在list裡放入x
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [29]:
#P.50 n-gram tokenizer(把句子分解成n個單詞一組的字，可增加information)
#ngrams一次僅生成一個元素，不是返回整個序列進內存，可利用list()檢查所有返回的n-gram
from nltk.util import ngrams#從nltk.util載入ngrams
list(ngrams(tokens, 2))#把list裡的token切成2個單詞一組的tuple

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [27]:
list(ngrams(tokens, 3))##把list裡的token切成3個單詞一組的tuple

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [30]:
#P50-51  ngrams返回的list是tuple形式，將tuple轉變為字符串，使資料型態統一
two_grams = list(ngrams(tokens, 2))#把list裡的token切成2個單詞一組的tuple，並指定為two_grams
[" ".join(x) for x in two_grams]#將tuple裡的逗號換成空格

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [29]:
#P.33 stop word(出現頻率很高，但攜帶的information很少)
#有些詞攜帶較多的information，所以就算把句子中的stop word刪除也不會影響意思
#且減少詞彙量可降低overfitting
#但有時刪除stop word會影響整句意義，且NLTK和sklearn裡的stop words會一直更新，每次結果都不同
#所以已從NLP pipeline去除過濾stop word的動作
stop_words = ["a", "an", "the", "on", "of", "off", "this", "is"]#建立一個list並指定為stop_words
tokens = ["the", "house", "is", "on", "fire"]#建立一個list並指定為tokens
tokens_without_stopwords = [x for x in tokens if x not in stop_words]#從tokens依序抓出token並指名為x
#如果x不在stop_words裡，就把x放入list，最後指定為tokens_without_stopwords
print(tokens_without_stopwords)#印出tokens_without_stopwords

['house', 'fire']


In [30]:
#P.53 NLTK裡有stop word
import nltk#載入nltk

nltk.download('stopwords')#從nltk下載stopwords
stop_words = nltk.corpus.stopwords.words('english')#抓出nltk裡英文的topwords，並指定為stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
len(stop_words)#計算stop_words的長度

179

In [32]:
stop_words[:7]#顯示stop_words前7個字

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [33]:
#如果stopwords只有一個字就顯示出來
for sw in stop_words:#從stop_words裡依序抓出token並指定為sw
    if(len(sw) == 1):#如果sw的單詞長度為1(只有1個字)
        print(sw)#就印出sw

i
a
s
t
d
m
o
y


In [34]:
#P.54 sklearn裡的stop word較NLTK多
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
#從sklearn.feature_extraction.text載入ENGLISH_STOP_WORDS並命名為sklearn_stop_words
len(sklearn_stop_words)#計算sklearn_stop_words的長度

318

In [35]:
type(sklearn_stop_words)#查看sklearn_stop_words的type

frozenset

In [36]:
sk = [x for x in sklearn_stop_words]#從sklearn_stop_words依序抓出token並指定為為x，再把x放入list中，
#最後指定list為sk
sk

['last',
 'all',
 'found',
 'elsewhere',
 'mostly',
 'serious',
 'least',
 'what',
 'one',
 'her',
 'about',
 'become',
 'any',
 'before',
 'once',
 'among',
 'been',
 'these',
 'are',
 'well',
 'mine',
 'alone',
 'everyone',
 'whoever',
 'since',
 'again',
 'go',
 'me',
 'fifteen',
 'thereupon',
 'amoungst',
 'hereupon',
 'whose',
 'in',
 'side',
 'sixty',
 'indeed',
 'within',
 'against',
 'hereby',
 'an',
 'above',
 'due',
 'over',
 'behind',
 'thick',
 'whereas',
 'detail',
 'please',
 'sometime',
 'those',
 'sometimes',
 'as',
 'may',
 'during',
 'third',
 'either',
 'its',
 'whereafter',
 'has',
 'and',
 'with',
 'amount',
 'somewhere',
 'see',
 'nobody',
 'why',
 'below',
 'con',
 'less',
 'seem',
 'whenever',
 'thus',
 'cannot',
 'was',
 'noone',
 'itself',
 'becomes',
 'other',
 'whence',
 'down',
 'hundred',
 'out',
 'wherever',
 'my',
 'were',
 'ourselves',
 'seeming',
 'so',
 'by',
 'twenty',
 're',
 'beforehand',
 'up',
 'eight',
 'on',
 'us',
 'formerly',
 'done',
 'very'

In [37]:
len(stop_words)#計算stop_words的長度

179

In [38]:
intersection = 0#把0放入變數intersection
for x in sk:#從sk依序抓出token並命名為x
    if x in stop_words:#如果x在stop_words中
        intersection = intersection + 1#intersection就加1再放回到intersection變數中
intersection#印出intersection

119

In [39]:
# case folding(大小寫摺疊)，將大寫轉換成小寫，使降低詞彙量
#無法處理詞性變化、複數、所有格...
tokens = ['House', 'Visitor', 'Center']#建立list並指定為tokens
normalized_tokens = [x.lower() for x in tokens]#從tokens依序抓出token並命名為x，將x化為小寫後放入list，
#指定list為normalized_tokens
print(normalized_tokens)#印出normalized_tokens

['house', 'visitor', 'center']


In [33]:
#P.58 Stemmimg提取字幹，除去word的後綴詞(s)，使降低相同字的詞彙量
def stem(phrase):#建立一個名為stem的函數
    #先把句子全部改為小寫，在分解成字 (for迴圈出來是list形式)
    #re.findall()返回list形式的[(word,後綴(suffix))]，加上前面for迴圈的list，變為[[(,)]]
        #字尾超過一個s，返回(字,空字串) #字尾為一個s，返回(去除s的字，s) #字尾沒有s，返回(字，空字串)
    #[0][0]進入第一個list，返回tuple裡的第一個值(word)
    #word裡如果'在字前或字後，可利用.strip("'")刪除
    #用' '.join()空格間隔每個字
    return ' '.join([re.findall('^(.*ss|.*?)(s)?$', word)[0][0].strip("'") for word in phrase.lower().split()])

In [41]:
stem("houses")#呼叫stem對houses做詞幹提取

'house'

In [42]:
stem("Doctor House's calls")#呼叫stem對"Doctor House's calls"做詞幹提取

'doctor house call'

In [35]:
#P.58 NLK的Stemmer(詞幹提取器)
from nltk.stem.porter import PorterStemmer#從nltk.stem.porter載入PorterStemmer
stemmer = PorterStemmer()#將PorterStemmer指定為stemmer
' '.join([stemmer.stem(w).strip("'") for w in "dish washer's waseded dishes".split()])
#將"dish washer's waseded dishes"分解成token並依序抓出指定為w，呼叫stem對w做詞幹提取，並刪除引號(')，放入list
#以whitespace將list裡的每個token連結在一起
#stemmer

'dish washer wased dish'

In [44]:
#P.61 NLTK的Lemmatizer(詞形還原器)，根據詞根做轉換
#僅限於Princeton WordNet圖形中的word
#優於Stemmer和case folding，通常先使用Lemmatizer，再用Stemmer
nltk.download('wordnet')#從nltk下載wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [45]:
from nltk.stem import WordNetLemmatizer#從nltk.stem載入WordNetLemmatizer
lemmatizer = WordNetLemmatizer()#將WordNetLemmatizer指定為lemmatizer
lemmatizer.lemmatize("better")#默認為名詞#呼叫lemmatizer對better做詞形還原

'better'

In [57]:
lemmatizer.lemmatize("better", pos = "a")#呼叫lemmatizer對better做詞形還原，詞性指定為adjective

'good'

In [58]:
lemmatizer.lemmatize("good", pos = "a")#呼叫lemmatizer對good做詞形還原，詞性指定為adjective

'good'

In [59]:
lemmatizer.lemmatize("goods", pos = "a")#呼叫lemmatizer對goods做詞形還原，詞性指定為adjective

'goods'

In [60]:
lemmatizer.lemmatize("goods", pos = "n")#呼叫lemmatizer對goods做詞形還原，詞性指定為noun #產品

'good'

In [61]:
lemmatizer.lemmatize("goodness", pos = "n")#呼叫lemmatizer對goodness做詞形還原，詞性指定為noun #善良

'goodness'

In [62]:
lemmatizer.lemmatize("best", pos = "a")#呼叫lemmatizer對best做詞形還原，詞性指定為adjective #best無法轉換

'best'

In [63]:
stemmer.stem('goodness')#呼叫stemmer對goodness做詞幹提取
#Stemmer會盲目的把ness刪除

'good'

In [65]:
#P.64-65 Sentiment analysis測量句子和文本的情感
#human-designe rules(heuristics)，將文本中可在VANDER字典找到的關鍵字的分數相加
#不會查看文本中的所有單詞，只查看VADER字典裡大約7,500個單詞
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#從vaderSentiment.vaderSentiment載入SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()#將SentimentIntensityAnalyzer指定為sa
sa.lexicon#顯示出sa的辭典
#文字、標點符號、表情符號都有相對應的情緒分數

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [66]:
#VADER lexicon中含有空格的字
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]
#從sa.lexicon抓依序抓出字典裡的key指定為tok，value指定為score，如果token裡有whitespace，
#就將tok和score的tuple放入list

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [67]:
#從positive、negative、neutral三種polarity(極性)的分數，得出復合的positivity scores
sa.polarity_scores(text = "Python is very readable and it's great for NLP")
#利用sa.polarity_scores計算字串的極性分數

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [68]:
sa.polarity_scores(text = "Python is not a bad choice for most applications")
#利用sa.polarity_scores計算字串的極性分數

{'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'compound': 0.431}

In [69]:
corpus = ["Absolutely perfect! Love it! :-) :-) :-)",
"Horrible! Completely useless. :(",
"It was OK. Some good and some bad things."]
#建立一個放入sentence(字串)的list並指定為corpus

In [70]:
for doc in corpus:#從corpus中依序抓出sentence，並指定為doc
    scores = sa.polarity_scores(doc)#利用sa.polarity_scores計算doc的極性分數
    print('{:+}: {}'.format(scores['compound'], doc))#印出compound score和doc
    #(:+)數字為正時，前面加上(+)

+0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
-0.1531: It was OK. Some good and some bad things.


In [71]:
""" Section 2.3 code listings from NLPIA """

' Section 2.3 code listings from NLPIA '

In [72]:
import pandas as pd#仔入pandas並命名為pd
#設置DataFrame的格式
pd.options.display.max_colwidth = 40  # default: 50
pd.options.display.width = 75  # default: 80
pd.options.display.max_columns = 12  # default: 0

In [57]:
#P.65 準備Naive Bayes moddel要用的資料
with open('nlpia_movieReviewSnippets_GroundTruth.txt', 'r', encoding="utf-8") as f:
#開啟編碼為utf-8的txt檔資料，並命名為f
    raw_data = []#建立一個list指定為raw_data
    for line in f:#依序抓出f裡每一行的內容並命名為line
        raw_data.append(line.split("\t"))#以tab分割line成list，並放入raw_data

In [58]:
pd.DataFrame(raw_data).head()#以DataFrame的形式顯示raw_data的前5行

Unnamed: 0,0,1,2
0,id,sentiment,text\n
1,1,2.26666666667,The Rock is destined to be the 21st Century's ...
2,2,3.53333333333,The gorgeously elaborate continuation of ''The...
3,3,-0.6,Effective but too tepid biopic\n
4,4,1.46666666667,If you sometimes like to go to the movies to h...


In [59]:
type(raw_data)#查看raw_data的type

list

In [60]:
for i in range(0, len(raw_data)):#從數字0到raw_data的長度依序抓出數字並指定為i
    if(i>0):#如果i>0 #第0列為column名
        raw_data[i][1] = float(raw_data[i][1])#將raw_data的第i列第1行改為float並放回 #原本type為object
    raw_data[i][2] = raw_data[i][2].replace("\n", "")#將raw_data的第i列第2行的字串中"/n"改為whitespace並放回
pd.DataFrame(raw_data).head()#利用DataFrame顯示raw_data的前5行

Unnamed: 0,0,1,2
0,id,sentiment,text
1,1,2.26667,The Rock is destined to be the 21st Century's ...
2,2,3.53333,The gorgeously elaborate continuation of ''The...
3,3,-0.6,Effective but too tepid biopic
4,4,1.46667,If you sometimes like to go to the movies to h...


In [61]:
movies = pd.DataFrame(raw_data)#將DataFrame格式的raw_data的前5行指定為movies
movies.columns= raw_data[0]#將raw_data的第0列指定為movies的columns
movies = movies.drop(movies.index[0])#刪掉movies的第0列，並指定為movies
movies.head()#顯示movies的前5行

Unnamed: 0,id,sentiment,text
1,1,2.26667,The Rock is destined to be the 21st Century's ...
2,2,3.53333,The gorgeously elaborate continuation of ''The...
3,3,-0.6,Effective but too tepid biopic
4,4,1.46667,If you sometimes like to go to the movies to h...
5,5,1.73333,"Emerges as something rare, an issue movie that..."


In [62]:
movies.describe()
#查看movies的內容
#字段有count（非空值數）、unique（唯一值數）、top（頻數最高者）、freq（最高頻數）

Unnamed: 0,id,sentiment,text
count,10605,10605.0,10605
unique,10605,1417.0,10603
top,6640,2.0,'Stock up on silver bullets for director Neil ...
freq,1,151.0,2


In [63]:
movies["sentiment"].describe()#查看movies裡sentiment欄位的內容
#movies的"sentiment"還是object，所以自訂計算自段內容

count     10605.0
unique     1417.0
top           2.0
freq        151.0
Name: sentiment, dtype: float64

In [64]:
movies["sentiment"].astype(float).describe()#將movies的"sentiment"轉為小數形式，並查看其內容
#轉為數值型內容

count    10605.000000
mean         0.004831
std          1.922050
min         -3.875000
25%         -1.769231
50%         -0.080000
75%          1.833333
max          3.941176
Name: sentiment, dtype: float64

In [65]:
#P.66 先使用casual_tokenize分解文本，計算token出現的次數，將其顯示出來
import pandas as pd#載入pandas並命名為pd
pd.set_option('display.width', 75)#show出較寬為75的DataFrame格式
from nltk.tokenize import casual_tokenize#載入nltk.tokenize並命名為casual_tokenize
#比其他tokenizer好，可處理表情符號、標點符號、里語
bags_of_words = []#建立空list並命名為bags_of_words
from collections import Counter#載入collections並命名為Counter

In [66]:
for text in movies.text:#從movies的text欄位依序抓出document並命名為text
    bags_of_words.append(Counter(casual_tokenize(text)))#利用casual_tokenize分解text成token
    #計算每個token出現的字數，並放入bags_of_words
#Counter()每個token出現的次數，返回字典形式{word:次數}

In [67]:
len(bags_of_words)#計算bags_of_words的長度

10605

In [68]:
bags_of_words[0]#顯示第0個文本的bags_of_words

Counter({'The': 1,
         'Rock': 1,
         'is': 1,
         'destined': 1,
         'to': 2,
         'be': 1,
         'the': 1,
         '21st': 1,
         "Century's": 1,
         'new': 1,
         "'": 4,
         'Conan': 1,
         'and': 1,
         'that': 1,
         "he's": 1,
         'going': 1,
         'make': 1,
         'a': 1,
         'splash': 1,
         'even': 1,
         'greater': 1,
         'than': 1,
         'Arnold': 1,
         'Schwarzenegger': 1,
         ',': 1,
         'Jean': 1,
         'Claud': 1,
         'Van': 1,
         'Damme': 1,
         'or': 1,
         'Steven': 1,
         'Segal': 1,
         '.': 1})

In [69]:
type(bags_of_words[0])

collections.Counter

In [70]:
type(bags_of_words)#查看bags_of_words的type

list

In [93]:
pd.DataFrame.from_records(bags_of_words[0:5])
#利用from_records將bags_of_words前5個document裡字典的key轉為column name，value放在適當的位置，
#如果該column下沒有value就會返回NaN

Unnamed: 0,The,Rock,is,destined,to,be,...,keenly,observed,it,doesn't,feel,one
0,1.0,1.0,1.0,1.0,2.0,1.0,...,,,,,,
1,2.0,,1.0,,,,...,,,,,,
2,,,,,,,...,,,,,,
3,,,1.0,,4.0,,...,,,,,,
4,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0


In [94]:
pd.DataFrame.from_records(bags_of_words[0:5]).fillna(0).astype(int)
#NaN會顯示成float形式，所以缺失值補0後轉為整數

Unnamed: 0,The,Rock,is,destined,to,be,...,keenly,observed,it,doesn't,feel,one
0,1,1,1,1,2,1,...,0,0,0,0,0,0
1,2,0,1,0,0,0,...,0,0,0,0,0,0
2,0,0,0,0,0,0,...,0,0,0,0,0,0
3,0,0,1,0,4,0,...,0,0,0,0,0,0
4,0,0,0,0,0,0,...,1,1,1,1,1,1


In [95]:
for i in range(0, 5):#從0-4中依序抓出數字並命名為i
    print(len(bags_of_words[i]))#印出每個文檔字的長度

33
34
5
18
21


In [96]:
df_bows = pd.DataFrame.from_records(bags_of_words)
#利用from_records將bags_of_words前5個document裡字典的key轉為column name，value放在適當的位置，
#如果該column下沒有value就會返回NaN，並指定為df_bows

In [97]:
df_bows.head()#顯示df_bows的前5行

Unnamed: 0,The,Rock,is,destined,to,be,...,Bearable,Staggeringly,’,ve,muttering,dissing
0,1.0,1.0,1.0,1.0,2.0,1.0,...,,,,,,
1,2.0,,1.0,,,,...,,,,,,
2,,,,,,,...,,,,,,
3,,,1.0,,4.0,,...,,,,,,
4,,,,,,,...,,,,,,


In [98]:
df_bows = df_bows.fillna(0).astype(int)#將df_bows的缺失值補0後轉為整數

In [99]:
df_bows.shape#查看df_bows的shape

(10605, 20756)

In [100]:
movies.sentiment.shape#查看movies裡sentiment欄位的shape

(10605,)

In [102]:
#P.67 以機器學習貝是分類器來測量情緒分數
from sklearn.naive_bayes import MultinomialNB  #從sklearn.naive_bayes載入MultinomialNB
nb = MultinomialNB()#將MultinomialNB指定為nb
nb = nb.fit(df_bows, movies.sentiment > 0)
#x帶入df_bows，y帶入movies的sentiment欄位並轉成True or False，並呼叫nb進行配適，指定為nb
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
#貝氏分類器output=>0~1，真實分數是-4~4，先做轉換以比較準確度，並放入在movies新建的predicted_sentiment欄位
movies['error'] = (movies.predicted_sentiment - movies.sentiment).abs()
#取預測和真實值誤差的絕對值，並放入在movies新建的error欄位
movies.error.mean().round(1)#取movies裡error欄位的平均值，並取小數第1位
#round只能用在series
# 2.4
movies['sentiment_ispositive'] = (movies.sentiment > 0).astype(int)#真實情緒好(正)的為1，不好(負)為0
#movies的sentiment欄位大於0返回true，否則返回false，並轉為int形式
movies['predicted_ispositive'] = (movies.predicted_sentiment > 0).astype(int)
#movies的predicted_sentiment欄位大於0返回true，否則返回false，並轉為int形式
#預測情緒好(正)的為1，不好(負)為0
movies['sentiment predicted_sentiment sentiment_ispositive predicted_ispositive'.split()].head(8)

Unnamed: 0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
1,2.26667,4,1,1
2,3.53333,4,1,1
3,-0.6,-4,0,0
4,1.46667,4,1,1
5,1.73333,4,1,1
6,2.53333,4,1,1
7,2.46667,4,1,1
8,1.26667,-4,1,0


In [103]:
(movies.predicted_ispositive == movies.sentiment_ispositive).sum() / len(movies)
#如果預測值等於真實值，就返回1，並加總，最後除以movie的總長度，已算出準確度

0.9344648750589345

In [104]:
#P.68 用movies建模的model去跑products
with open('nlpia_amazonReviewSnippets_GroundTruth.txt', 'r', encoding="utf-8") as f:
#開啟編碼為utf-8的txt檔資料，並命名為f
    product_data = []#建立一個list指定為product_data
    for line in f:#依序抓出f裡每一行的內容並命名為line
        product_data.append(line.split("\t"))#以tab分割line成list，並放入raw_data

In [105]:
for i in range(0, len(product_data)):#從數字0到product_data的長度依序抓出數字並指定為i
    if(i>0):#如果i>0 #第0列為column名
        product_data[i][1] = float(product_data[i][1])#將product_data的第i列第1行改為float並放回 #原本type為object
    product_data[i][2] = product_data[i][2].replace("\n", "")#將product_data的第i列第2行的字串中"/n"改為whitespace並放回
products = pd.DataFrame(product_data)#將DataFrame格式的products_data的前5行指定為movies
products.columns= product_data[0]#將products_data的第0列指定為products的columns
products = products.drop(products.index[0])#刪掉products的第0列，並指定為products
pd.DataFrame(products).head()#顯示products的前5行

Unnamed: 0,id,sentiment,text
1,1_1,-0.9,troubleshooting ad-2500 and ad-2600 ...
2,1_2,-0.15,"repost from january 13, 2004 with a ..."
3,1_3,-0.2,does your apex dvd player only play ...
4,1_4,-0.1,or does it play audio and video but ...
5,1_5,-0.5,before you try to return the player ...


In [106]:
len(products)#計算products的長度

3708

In [107]:
bags_of_words = []#建立一個空的list並指定為bags_of_words
for text in products.text:#從products的text欄位依序抓出document並命名為text
    bags_of_words.append(Counter(casual_tokenize(text)))#利用casual_tokenize分解text成token
    #計算每個token出現的字數，並放入bags_of_words

In [108]:
len(bags_of_words)#計算bags_of_words的長度

3708

In [109]:
df_product_bows = pd.DataFrame.from_records(bags_of_words)
#利用from_records將bags_of_words前5個document裡字典的key轉為column name，value放在適當的位置，
#如果該column下沒有value就會返回NaN，並指定為df_bows
df_product_bows = df_product_bows.fillna(0).astype(int)#將df_bows的缺失值補0後轉為整數，並指定為df_product_bows
df_all_bows = df_bows.append(df_product_bows)#將products的詞袋與前面的movies的詞袋做結合，並指定為df_all_bows

In [110]:
df_product_bows.shape#查看df_product_bows的shape

(3708, 5442)

In [111]:
df_bows.shape#查看df_bows的shape

(10605, 20756)

In [112]:
df_all_bows.shape#查看df_all_bows的shape

(14313, 23057)

In [113]:
df_all_bows.columns#查看df_all_bows的欄位名

Index(['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st',
       'Century's', 'new',
       ...
       'sligtly', 'owner', '81', 'defectively', 'warrranty', 'expire',
       'expired', 'voids', 'baghdad', 'harddisk'],
      dtype='object', length=23057)

In [114]:
df_product_bows2 = df_all_bows.iloc[len(movies):][df_bows.columns]
#先將product的BOW抓出來(index從movie的長度到最後)
#再將貝氏建模時使用的movie的column抓出來
#最後指定為df_product_bows2
df_product_bows2.shape#查看df_product_bows2的shape

(3708, 20756)

In [115]:
products['ispos'] = (products.sentiment > 0).astype(int)
#products的sentiment欄位大於0返回true，否則返回false，並轉為int形式
#並放入在products新建的ispos欄位
products['ispos'].head()#抓出products的ispos欄位的前5行

1    0
2    0
3    0
4    0
5    0
Name: ispos, dtype: int32

In [116]:
products.sentiment.head()#顯示products的sentiment欄位的前5行

1    -0.9
2   -0.15
3    -0.2
4    -0.1
5    -0.5
Name: sentiment, dtype: object

In [117]:
df_product_bows2.shape#查看df_product_bows2的shape

(3708, 20756)

In [118]:
df_product_bows2 = df_product_bows2.fillna(0).astype(int)
#將df_product_bows2的缺失值補0後轉為整數，並指定為df_product_bows2

In [119]:
nb.predict(df_product_bows2.values).astype(int)
#呼叫nb對df_product_bows2的值做預測，並轉為int

array([0, 0, 0, ..., 0, 0, 0])

In [120]:
products['pred'] = nb.predict(df_product_bows2.values).astype(int)
#呼叫nb對df_product_bows2的值做預測，並轉為int，並放入在products新建的pred欄位

In [121]:
(products.pred == products.ispos).sum() / len(products)
#如果預測值等於真實值，就返回1，並加總，最後除以movie的總長度，以算出準確度

0.557982740021575