In [1]:
from scipy.stats import sem
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
#from wordcloud import WordCloud
import numpy as np  # 書中遺漏此列程式碼
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
import nltk
# Fetch the NLTK resources this notebook uses. The recorded output shows the
# download is a no-op when the package is already up to date locally.
nltk.download('punkt')  # tokenizer data; presumably what nltk.word_tokenize relies on (one-time download)
nltk.download('stopwords')  # stop-word corpus read through nltk.corpus.stopwords
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/gaoming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gaoming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the review table and discard the columns that the rest of the notebook
# never reads. The variable keeps its historical `sms_` prefix.
sms_raw = (
    pd.read_csv("./Comment.csv")
    .drop(columns=['useful', 'funny', 'cool', 'Unnamed: 0', 'date'])
)


def stars_level(star):
    """Binarise a star rating.

    Returns 1 when ``star`` is strictly greater than 3, otherwise 0
    (which includes exactly 3 and values such as NaN that compare False).
    """
    return 1 if star > 3 else 0


# Binary target: 1 when stars > 3, else 0 (see stars_level).
sms_raw['type'] = sms_raw['stars'].map(stars_level)
# Column dtypes after adding the target; `text` holds the review text.
print(sms_raw.dtypes)

# Relative class frequencies. In the recorded run class 1 is the majority
# (about 64%), so the classes are uneven but not severely imbalanced.
print(sms_raw['type'].value_counts() / len(sms_raw))


business_id     object
stars          float64
text            object
Tmp_number     float64
type             int64
dtype: object
1    0.640913
0    0.359087
Name: type, dtype: float64


In [3]:
# Tokenise every review with NLTK (Natural Language Toolkit):
# one list of tokens per review.
token_list0 = list(map(nltk.word_tokenize, sms_raw['text']))
# Spot-check: a slice of the fourth review and the whole fifth review.
print(token_list0[3][1:7])
print(token_list0[4])

['wrap', '-', 'falafel', 'with', 'greens', 'pickles']
['I', 'ate', 'here', 'recently', 'on', 'the', 'recommendations', 'of', 'some', 'Muslim', 'brothers', 'at', 'the', 'Islamic', 'Society', 'of', 'Boston', '(', 'beautiful', 'masjid', ')', '.', 'They', 'said', 'it', 'was', 'halal', 'and', 'the', 'Arab', 'owners', 'confirmed', 'that', 'it', 'was', '.', 'This', 'is', 'a', 'nice', 'downtown', 'spot', 'across', 'the', 'street', 'from', 'the', 'park', 'and', 'right', 'next', 'to', 'the', 'Park', 'Street', 'subway', 'station.We', 'enjoyed', 'the', 'chicken', 'kabob', 'and', 'baked', 'haddock', 'dinners', 'which', 'both', 'came', 'with', 'rice', 'and', 'salad', '.', 'Very', 'enjoyable', 'hot', 'and', 'fresh', 'food', '.', 'When', 'I', 'went', 'to', 'buy', 'some', 'rice', 'pudding', 'for', 'dessert', 'the', 'owner', 'gave', 'it', 'to', 'us', 'for', 'free', '.', 'What', 'a', 'nice', 'touch', '.', 'InshaAllah', 'my', 'wife', 'and', 'I', 'will', 'return', 'soon', '.']


In [4]:
# Lower-case every token (doc = one review's token list) so that the
# stop-word lookup below is effectively case-insensitive.
token_list1 = [list(map(str.lower, doc)) for doc in token_list0]
print(token_list1[3][1:7])


# Remove stop words. NLTK's English stop-word list is turned into a set
# so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stop words.
token_list2 = [
    list(filter(lambda tok: tok not in stop_words, doc))
    for doc in token_list1
]
print(token_list2[3][1:7])

['wrap', '-', 'falafel', 'with', 'greens', 'pickles']
['wrap', '-', 'falafel', 'greens', 'pickles', 'hummus']


In [5]:
# Drop punctuation tokens. string.punctuation is a str, so `in` is a substring
# test: a token is removed when it occurs as a contiguous run inside that
# string, which covers every single punctuation character.
token_list3 = [
    [tok for tok in doc if tok not in string.punctuation]
    for doc in token_list2
]
print(token_list3[3][1:7])

# Drop tokens that consist solely of digits.
token_list4 = [
    [tok for tok in doc if not tok.isdigit()]
    for doc in token_list3
]
print(token_list4[3][1:7])

# Inside the remaining tokens, delete every individual digit or punctuation
# character (doc: one review, tok: one token, ch: one character).
# NOTE(review): this also strips apostrophes, so tokenizer fragments such as
# "n't" / "'s" presumably become "nt" / "s"; the recorded vocabulary contains
# entries like 'area nt' and 'area s'.
token_list5 = [
    [
        ''.join(ch for ch in tok if not (ch.isdigit() or ch in string.punctuation))
        for tok in doc
    ]
    for doc in token_list4
]
print(token_list5[3][1:7])

# Tokens emptied by the previous step are now '' and are discarded here.
token_list6 = [[tok for tok in doc if tok] for doc in token_list5]
print(token_list6[3][1:7])



['wrap', 'falafel', 'greens', 'pickles', 'hummus', 'toasted']
['wrap', 'falafel', 'greens', 'pickles', 'hummus', 'toasted']
['wrap', 'falafel', 'greens', 'pickles', 'hummus', 'toasted']
['wrap', 'falafel', 'greens', 'pickles', 'hummus', 'toasted']


In [8]:
# Fetch the WordNet corpus; presumably required by the WordNetLemmatizer that is instantiated afterwards.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/gaoming/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [9]:
# Lemmatise with NLTK's WordNet lemmatizer (e.g. 'greens' -> 'green').
lemma = WordNetLemmatizer()
# NOTE: this rebinds token_list6 to its lemmatised version, so re-running the
# cell lemmatises tokens that have already been lemmatised.
token_list6 = [list(map(lemma.lemmatize, doc)) for doc in token_list6]
print(token_list6[3][1:7])

# Re-assemble each review into one space-separated string, the input format
# expected by the vectorizer.
token_list7 = list(map(' '.join, token_list6))
print(token_list7[:2])

['wrap', 'falafel', 'green', 'pickle', 'hummus', 'toasted']
['setting perfectly adequate food come close dining chain like chili victoria station barbecue betterit s surprise always pick coupon linwood restaurantcom', 'nothing special good enough like another one much better dorchester hardly get area le s goto place pho']


In [13]:
# Build the document-term matrix (DTM) with scikit-learn's CountVectorizer.
# Parameters (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html):
#   min_df=500          : ignore terms found in fewer than 500 documents (the "cut-off").
#   max_df=0.1          : ignore terms found in more than 10% of the documents
#                         (corpus-specific stop words).
#   ngram_range=(1, 2)  : use unigrams and bigrams.
#   token_pattern       : one or more word characters, so single-character
#                         tokens are kept as well.
#   stop_words='english': apply scikit-learn's built-in English stop-word list
#                         (a list would supply custom stop words; None disables them).
vec = CountVectorizer(min_df=500, max_df=0.1, ngram_range=(1, 2), token_pattern="(?u)\\b\\w+\\b", stop_words='english')
# Learn the vocabulary and transform the reviews into a sparse DTM.
X = vec.fit_transform(token_list7)

# X is a SciPy sparse matrix.
print(type(X))

# Sparse storage format: (row, column) count, shown for the first two reviews.
print(X[:2])

# Dense view of the same two rows. Slice BEFORE densifying so that only two
# rows are materialised instead of the whole matrix.
# (Use numpy.set_printoptions(threshold=sys.maxsize) to print rows in full.)
print(X[:2].toarray())

# Vocabulary in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; prefer get_feature_names_out() and fall
# back to the old method on older releases. Fetched once and reused below.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()

# Dense DataFrame version of the DTM, kept dense because later cells slice it
# with .iloc and pass it to the classifier.
# NOTE(review): in the recorded run this is 512221 x 7202 (about 3.7 billion
# cells), so this step needs a lot of memory.
sms_dtm = pd.DataFrame(X.toarray(), columns=feature_names)
# (documents, terms): (512221, 7202) in the recorded run.
print(sms_dtm.shape)

# Vocabulary size: 7202 in the recorded run.
print(len(feature_names))

print(feature_names[200:305])


# Row indices of the reviews in which 'awesome' occurs at least once. The
# boolean vector is reshaped to a column, so argwhere yields [row, 0] pairs.
# Older NumPy versions also accepted: np.argwhere(sms_dtm['awesome'] > 0)
print(np.argwhere((sms_dtm['awesome'] > 0).values.reshape((-1,1))))

# A small window of the DTM.
print(sms_dtm.iloc[4460:4470, 300:305])
# sms_dtm.max().max() gives the largest raw count; raw counts are what
# MultinomialNB is designed for.

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 5696)	1
  (0, 4494)	1
  (0, 63)	1
  (0, 1074)	1
  (0, 1591)	1
  (0, 913)	1
  (0, 1008)	1
  (0, 6064)	1
  (0, 386)	1
  (0, 6226)	1
  (0, 4516)	1
  (0, 1283)	1
  (0, 2188)	1
  (0, 1131)	1
  (1, 5948)	1
  (1, 1675)	1
  (1, 2862)	1
  (1, 207)	1
  (1, 3314)	1
  (1, 2661)	1
  (1, 4511)	1
  (1, 2560)	1
  (1, 2662)	1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(512221, 7202)
7202
['appropriate', 'appropriately', 'approximately', 'apps', 'april', 'aquarium', 'arancini', 'area', 'area definitely', 'area food', 'area great', 'area nt', 'area place', 'area s', 'argue', 'arm', 'aroma', 'arrangement', 'array', 'arrival', 'arrive', 'arrived', 'arrived minute', 'arrived pm', 'arrives', 'arriving', 'art', 'artichoke', 'artichoke dip', 'artist', 'artwork', 'arugula', 'arugula salad', 'asada', 'asap', 'asia', 'asian', 'asian food', 'asian fusion', 'asian restaurant', 'aside', 'ask', 'ask u', 'asked', 'asked server', 'asked u', 'asked waiter', 'asked waitress', 'aske

In [14]:
# Train/test split by position: the first TRAIN_SIZE rows are used for
# training, the remainder for testing. The same cut is applied to the raw
# frame, the DTM and the token lists so that they stay row-aligned.
TRAIN_SIZE = 417000

# All three containers are sliced by position, so they must have equal length.
assert len(sms_raw) == len(sms_dtm) == len(token_list6), \
    "sms_raw, sms_dtm and token_list6 must have one entry per review"

sms_raw_train = sms_raw.iloc[:TRAIN_SIZE, :]
sms_raw_test = sms_raw.iloc[TRAIN_SIZE:, :]
sms_dtm_train = sms_dtm.iloc[:TRAIN_SIZE, :]
sms_dtm_test = sms_dtm.iloc[TRAIN_SIZE:, :]
token_list6_train = token_list6[:TRAIN_SIZE]
token_list6_test = token_list6[TRAIN_SIZE:]
# Check that the class proportions are similar in both subsets.
print(sms_raw_train['type'].value_counts()/len(sms_raw_train['type']))

print(sms_raw_test['type'].value_counts()/len(sms_raw_test['type']))


1    0.641686
0    0.358314
Name: type, dtype: float64
1    0.637527
0    0.362473
Name: type, dtype: float64


In [10]:
# WordCloud() 統計詞頻須跨篇組合所有詞項
#tokens_train = [token for doc in token_list6_train for token in doc]
#print(len(tokens_train))

# 邏輯值索引結合zip() 綑綁函數，再加判斷句與串列推導
#tokens_train_spam = [token for is_spam, doc in zip(sms_raw_train['type'] == '0', token_list6_train) if is_spam for token in doc]
# 取出正常簡訊
#tokens_train_ham = [token for is_ham, doc in zip(sms_raw_train['type'] == '1', token_list6_train) if is_ham for token in doc]
# 逗號接合訓練與spam 和ham 兩子集tokens
#str_train = ','.join(tokens_train)
#str_train_spam = ','.join(tokens_train_spam)
#str_train_ham = ','.join(tokens_train_ham)
# Python 文字雲套件(conda install -c conda-forge wordcloud --y, !conda install -c conda-forge wordcloud --y)
# 宣告文字雲物件(最大字數max_words 預設為200)
#wc_train = WordCloud(background_color="white", prefer_horizontal=0.5)
# 傳入資料統計，並產製文字雲物件
#wc_train.generate(str_train_spam)  # str_train -> str_train_ham, str_train_spam
# 呼叫matplotlib.pyplot 模組下的imshow() 方法繪圖
#plt.imshow(wc_train)
#plt.axis("off")
# plt.show()
# plt.savefig('wc_train.png')
# 限於篇幅，str_train_spam 和str_train_ham 文字雲繪製代碼省略

In [15]:
# Multinomial naive Bayes: define, fit and predict.
# (MultinomialNB is already imported in the notebook's first cell.)
clf = MultinomialNB()

clf.fit(sms_dtm_train, sms_raw_train['type'])
train = clf.predict(sms_dtm_train)
# Accuracy = share of matching labels. A vectorised mean replaces the Python
# builtin sum(), which iterated over the Series one element at a time.
print(" 訓練集正確率為{}".format(np.mean(sms_raw_train['type'].values == train)))

pred = clf.predict(sms_dtm_test)
print(" 測試集正確率為{}".format(np.mean(sms_raw_test['type'].values == pred)))

# Number of training samples seen per class (class 0 first, then class 1).
print(clf.class_count_)

# Per-class feature counts, one row per class and one column per term
# ((2, 7202) in the recorded run).
print(clf.feature_count_)

print(clf.feature_count_.shape)

# Log of the conditional probability Pr[x_i | y] of each term given the class.
print(clf.feature_log_prob_[:, :4])

print(clf.feature_log_prob_.shape)

# Convert the log conditional probabilities back to probabilities.
feature_prob = np.exp(clf.feature_log_prob_)
print(feature_prob.shape)
print(feature_prob[:, :4])
# Sanity check: within each class the term probabilities sum to 1 -> [1. 1.]
print(feature_prob.sum(axis=1))
# Largest single-term probability within each class.
print(feature_prob.max(axis=1))
# The ten most probable terms per class; row 0 is class 0 (stars <= 3) and
# row 1 is class 1 (stars > 3). Recorded result:
# class 0: ['minute' 'said' 'bad' 'asked' 'star' 'way' 'dish' 'think' 'know' 'say']
# class 1: ['dish' 'sauce' 'fresh' 'dinner' 'meal' 'recommend' 'favorite' 'day' 'right' 'beer']
print(sms_dtm.columns.values[np.argsort(-feature_prob)[:, :10]])
# ------------------------------------------------------------------------------
# Next: scikit-learn's cross-validation helpers and a custom function that
# reports k-fold cross-validation performance.

 訓練集正確率為0.8489568345323741
 測試集正確率為0.8472395795045211
[149417. 267583.]
[[355. 256. 299. ... 673. 231. 314.]
 [501. 388. 387. ...  14. 417. 790.]]
(2, 7202)
[[ -9.91215884 -10.23801348 -10.08330709  -7.29891333]
 [ -9.95995355 -10.21497433 -10.21754833  -7.0856464 ]]
(2, 7202)
(2, 7202)
[[4.95683101e-05 3.57838643e-05 4.17710479e-05 6.76273265e-04]
 [4.72549309e-05 3.66178648e-05 3.65237315e-05 8.37033558e-04]]
[1. 1.]
[0.00413185 0.00361867]
[['minute' 'said' 'bad' 'asked' 'star' 'way' 'dish' 'think' 'know' 'say']
 ['dish' 'sauce' 'fresh' 'dinner' 'meal' 'recommend' 'favorite' 'day'
  'right' 'beer']]


In [17]:
# Terms ranked 11th-20th by conditional probability within each class
# (row 0: class 0, row 1: class 1).
next_ten_idx = np.argsort(-feature_prob, axis=1)[:, 10:20]
print(sms_dtm.columns.values[next_ten_idx])
# Recorded result:
#[['want' 'going' 'meal' 'sauce' 'told' 'waitress' 'customer' 'day' 'took' 'dinner']
# ['small' 'bit' 'cheese' 'sandwich' 'lunch' 'area' 'lot' 'flavor' 'pizza' 'perfect']]

[['want' 'going' 'meal' 'sauce' 'told' 'waitress' 'customer' 'day' 'took'
  'dinner']
 ['small' 'bit' 'cheese' 'sandwich' 'lunch' 'area' 'lot' 'flavor' 'pizza'
  'perfect']]


In [16]:
def evaluate_cross_validation(clf, X, y, K):
    """Run K-fold cross-validation and print the per-fold and mean scores.

    Parameters
    ----------
    clf : estimator passed to ``cross_val_score``.
    X, y : features and labels passed to ``cross_val_score``.
    K : number of folds.

    The folds are shuffled with a fixed ``random_state`` of 0. The function
    prints the individual fold scores followed by their mean and standard
    error; it returns nothing.
    """
    # K-fold iterator used to split X and y.
    folds = KFold(n_splits=K, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, X, y, cv=folds)
    print("{}摺交叉驗證結果如下：\n{}".format(K, fold_scores))
    summary = " 平均正確率：{0:.3f}(+/-標準誤{1:.3f})".format(
        np.mean(fold_scores), sem(fold_scores)
    )
    print(summary)


# 5-fold cross-validation of the classifier on the full DTM and labels.
# NOTE(review): sms_dtm's vocabulary was fitted on all documents
# (vec.fit_transform(token_list7)), so the min_df/max_df term selection has
# already seen every fold.
evaluate_cross_validation(clf, X=sms_dtm, y=sms_raw['type'], K=5)

5摺交叉驗證結果如下：
[0.84438479 0.84542775 0.84719456 0.84763383 0.84817071]
 平均正確率：0.847(+/-標準誤0.001)
