# 題目:電商產品評分文件以機器學習方式分辨是否為正向或負向

說明：輸入文件 positive.review 和 negative.review，兩者都是XML檔。我們用BeautifulSoup讀進來，擷取review_text，然後用NLTK自建Tokenizer。先產生 word-to-index map 再產生 word-frequency vectors。之後 shuffle data 創造 train/test splits，留100個給 test 用。接著用Logistic Regression 分類器找出訓練組和測試組的準確度(Accuracy)。接著我們可以看看每個單字的正負權重，可以訂一個閾值，比方權重的絕對值大於0.5，以確認情緒是顯著的。最後我們找出根據現有演算法歸類錯誤最嚴重的正向情緒和負向情緒的例子。

延伸:可用不同的tokenizer，不同的tokens_to_vector，不同的ML分類器做改進準確率的比較。最後可用您的model去預測unlabeled.review檔的內容。

範例程式檔名: sentiment_情緒分析.py，以LogisticRegression 方式完成情緒分析。

模組: sklearn, bs4, numpy, nltk

輸入檔：stopwords.txt, /electronics 下 positive.review, negative.review

成績：辨識百分率

In [2]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
import nltk
import numpy as np
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

# Stopword list taken from http://www.lextek.com/manuals/onix/stopwords1.html
# The file is read inside a context manager so the handle is closed promptly.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

# Another possible source of stopwords:
# from nltk.corpus import stopwords
# stopwords.words('english')

# Load the positive and negative reviews.
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
# Each file is parsed with BeautifulSoup and only the <review_text> elements
# are kept; `find_all` is the current spelling of the deprecated `findAll`.
with open('positive.review', encoding='utf-8') as positive_file:
    positive_soup = BeautifulSoup(positive_file.read(), features="html5lib")
positive_reviews = positive_soup.find_all('review_text')

with open('negative.review', encoding='utf-8') as negative_file:
    negative_soup = BeautifulSoup(negative_file.read(), features="html5lib")
negative_reviews = negative_soup.find_all('review_text')

# 基於nltk自建 tokenizer

def my_tokenizer(s):
    """Turn a raw review string into a list of cleaned tokens.

    The text is lower-cased and split with NLTK's word tokenizer. Tokens of
    two characters or fewer are dropped, the remaining tokens are lemmatized
    (reduced to their base form), and any lemma found in the module-level
    `stopwords` set is removed.
    """
    raw_tokens = nltk.tokenize.word_tokenize(s.lower())
    # Lemmatize only the tokens that survive the length filter.
    lemmas = (
        wordnet_lemmatizer.lemmatize(tok) for tok in raw_tokens if len(tok) > 2
    )
    # Stopwords are checked after lemmatization, i.e. against the base form.
    return [lemma for lemma in lemmas if lemma not in stopwords]

In [5]:
# Build the word-to-index map first; word-frequency vectors are created later.
# The tokenized reviews are stored as well so they never need re-tokenizing.
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

# Positive reviews are processed before negative ones, so `orig_reviews`
# lists all positive texts first, followed by all negative texts.
for reviews, tokenized_store in (
    (positive_reviews, positive_tokenized),
    (negative_reviews, negative_tokenized),
):
    for review in reviews:
        text = review.text
        orig_reviews.append(text)
        tokens = my_tokenizer(text)
        tokenized_store.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            # First time this token is seen: give it the next free index.
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

len(word_index_map): 11092


In [7]:
# now let's create our input matrices
def tokens_to_vector(tokens, label, index_map=None):
    """Convert a token list into a normalized frequency vector plus label.

    Args:
        tokens: iterable of token strings for one document.
        label: class label stored in the last element of the result.
        index_map: optional token -> column-index mapping. When omitted, the
            module-level `word_index_map` is used.

    Returns:
        A 1-D numpy array of length ``len(index_map) + 1``. The first
        ``len(index_map)`` entries are the token counts divided by the total
        count of known tokens; the last entry is `label`.

    Tokens missing from the mapping are ignored, so documents that were not
    used to build the vocabulary can still be vectorized. A document with no
    known tokens yields an all-zero feature vector instead of NaNs from a
    0/0 division.
    """
    if index_map is None:
        index_map = word_index_map
    x = np.zeros(len(index_map) + 1)  # the last element holds the label
    for t in tokens:
        i = index_map.get(t)
        if i is not None:
            x[i] += 1
    total = x.sum()  # label slot is still 0 here, so this is the token count
    if total > 0:
        x = x / total  # normalize counts to relative frequencies
    x[-1] = label
    return x

# Pair every tokenized review with its class label (1 = positive, 0 = negative),
# positives first, matching the order of `orig_reviews`.
labelled_docs = [(doc, 1) for doc in positive_tokenized]
labelled_docs += [(doc, 0) for doc in negative_tokenized]

N = len(labelled_docs)
# (N x D+1) matrix - features and label are kept together so that a single
# shuffle keeps them aligned.
data = np.zeros((N, len(word_index_map) + 1))
for row, (doc, doc_label) in enumerate(labelled_docs):
    data[row, :] = tokens_to_vector(doc, doc_label)

# Shuffle the data to create train/test splits (texts and rows are shuffled
# together). Try it several times!
orig_reviews, data = shuffle(orig_reviews, data)

# Every column except the last is a feature; the last column is the label.
X, Y = data[:, :-1], data[:, -1]

# The last 100 rows are held out for testing.
n_test = 100
Xtrain, Ytrain = X[:-n_test], Y[:-n_test]
Xtest, Ytest = X[-n_test:], Y[-n_test:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.7826315789473685
Test accuracy: 0.68


In [11]:
# List every word whose learned weight is further from zero than the threshold.
# Try different threshold values!
threshold = 0.5
word_weights = model.coef_[0]
for word, index in iteritems(word_index_map):
    weight = word_weights[index]
    if abs(weight) > threshold:
        print(word, weight)


# Find misclassified examples (over the whole data set, train and test rows).
preds = model.predict(X)
P = model.predict_proba(X)[:, 1]  # p(y = 1 | x)

# Only the single worst example of each kind is reported.
minP_whenYis1, maxP_whenYis0 = 1, 0
wrong_positive_review = wrong_negative_review = None
wrong_positive_prediction = wrong_negative_prediction = None

# Positive reviews that the model scored below 0.5.
missed_positives = np.flatnonzero((Y == 1) & (P < 0.5))
if missed_positives.size:
    # argmin returns the first occurrence, i.e. the earliest row on ties.
    worst = missed_positives[np.argmin(P[missed_positives])]
    wrong_positive_review = orig_reviews[worst]
    wrong_positive_prediction = preds[worst]
    minP_whenYis1 = P[worst]

# Negative reviews that the model scored above 0.5.
missed_negatives = np.flatnonzero((Y == 0) & (P > 0.5))
if missed_negatives.size:
    worst = missed_negatives[np.argmax(P[missed_negatives])]
    wrong_negative_review = orig_reviews[worst]
    wrong_negative_prediction = preds[worst]
    maxP_whenYis0 = P[worst]

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

unit -0.7010595154469509
bad -0.7059793799785382
cable 0.7275943649758804
time -0.7262634923307855
've 0.7378994913992879
month -0.7405258833059363
sound 1.0903452364670843
lot 0.6391836696373069
you 1.0957396949954246
n't -1.9847789677553316
easy 1.8345738872346866
quality 1.4230501828143534
company -0.5024813614130383
card -0.5584481850471422
item -0.9003133512633421
wa -1.611789288996959
perfect 1.0162825197363154
fast 0.8757364397379382
ha 0.6723839077543783
price 2.9202296089006006
value 0.5649784626180324
money -1.1880277009286946
memory 0.9741276399883112
picture 0.5117812363423602
buy -0.9270604819792434
bit 0.6016819962353869
happy 0.5640104741180315
pretty 0.8111063902495742
doe -1.2535149858529508
highly 1.0261693014909732
recommend 0.73436739312428
customer -0.7557421732546674
support -0.8826361691141432
little 0.9006236825646172
returned -0.7335940612594839
excellent 1.32176679702361
love 1.2176271176117985
feature 0.5281932022766801
home 0.5212342115388671
useless -0.5012

In [10]:
# classification of sentiment
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Display names, in the same order as the estimators in `classifiers`.
classifier_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier()]

# Materialized as a list so the pairs can be iterated more than once.
paired_c = list(zip(classifier_names, classifiers))

# Fit each classifier on the training split and report train/test accuracy.
# The loop variable is named `clf` so the `classifiers` list is not rebound
# to a single estimator while it is being iterated.
for name_c, clf in paired_c:
    clf.fit(Xtrain, Ytrain)
    print("Classification method: {}, Train accuracy: {:.2f}, Test accuracy: {:.2f}".format(
        name_c, clf.score(Xtrain, Ytrain), clf.score(Xtest, Ytest)))

Classification method: Nearest Neighbors, Train accuracy: 0.81, Test accuracy: 0.52
Classification method: Linear SVM, Train accuracy: 0.50, Test accuracy: 0.45
Classification method: RBF SVM, Train accuracy: 0.83, Test accuracy: 0.73
Classification method: Decision Tree, Train accuracy: 0.62, Test accuracy: 0.54
Classification method: Random Forest, Train accuracy: 0.56, Test accuracy: 0.53
Classification method: Neural Net, Train accuracy: 0.64, Test accuracy: 0.56
Classification method: AdaBoost, Train accuracy: 0.81, Test accuracy: 0.76
