## 載入資料

In [1]:
import numpy as np
import codecs
import jieba
import jieba.posseg as pseg
import re
import operator
# Folder that holds every dictionary file. The path is relative, so it is
# resolved against the kernel's current working directory.
DICT_DIR = './Workspace/sentiment/0616'

# Replace jieba's main dictionary, then register the user dictionaries in the
# same order as before (the main dictionary file is also loaded as a user
# dictionary, exactly as the original cell did).
jieba.set_dictionary(DICT_DIR + '/big/jieba356726.txt')
for _userdict in (
    '/big/jieba356726.txt',
    '/food/fooddict2027.txt',
    '/menu/menu50806.txt',
    '/sentiment/negativewords.txt',
    '/sentiment/positivewords.txt',
    '/sentiment/negative.txt',
    '/sentiment/more.txt',
    '/sentiment/question.txt',
):
    jieba.load_userdict(DICT_DIR + _userdict)


def load_words(path):
    """Read a word list: the first whitespace-separated token of each line.

    Blank (or whitespace-only) lines are skipped instead of raising
    IndexError. The file is decoded with 'utf-8-sig', which is plain UTF-8
    that additionally drops a byte-order mark if the file starts with one.

    Returns a list of unicode strings in file order.
    """
    words = []
    with codecs.open(path, 'r', 'utf-8-sig') as f:
        for line in f:
            fields = line.split()
            if fields:
                words.append(fields[0])
    return words


# Negative sentiment words
negdict = load_words(DICT_DIR + '/sentiment/negativewords.txt')
# Positive sentiment words
posdict = load_words(DICT_DIR + '/sentiment/positivewords.txt')
# Negation words
nodict = load_words(DICT_DIR + '/sentiment/negative.txt')
# Degree (intensifier) words
plusdict = load_words(DICT_DIR + '/sentiment/more.txt')
# Words expressing uncertainty
question = load_words(DICT_DIR + '/sentiment/question.txt')

Building prefix dict from /Users/fan/anaconda/bin/Workspace/sentiment/0616/big/jieba356726.txt ...
DEBUG:jieba:Building prefix dict from /Users/fan/anaconda/bin/Workspace/sentiment/0616/big/jieba356726.txt ...
Loading model from cache /var/folders/ws/qq63c39d43l_20sybqcwcf900000gn/T/jieba.u60b07a305259704c4cd827282b04b44e.cache
DEBUG:jieba:Loading model from cache /var/folders/ws/qq63c39d43l_20sybqcwcf900000gn/T/jieba.u60b07a305259704c4cd827282b04b44e.cache
Loading model cost 0.420 seconds.
DEBUG:jieba:Loading model cost 0.420 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


## 建立情緒分析類別

In [2]:
class SentimentAnalysis(object):
    """Rule-based sentiment scorer that also collects the subject of each opinion.

    The word lists are supplied at construction time. Scores accumulate in
    ``self.score`` across calls to ``sentimentScore``; the (subject, sentiment
    word) pairs accumulate in ``self.positiveSubject`` / ``self.negativeSubject``.
    """

    # Number of tokens inspected in front of a sentiment word.
    WINDOW = 4

    def __init__(self, negdict, posdict, nodict, plusdict, question):
        """Store the word lists and reset the accumulators.

        negdict  -- negative sentiment words
        posdict  -- positive sentiment words
        nodict   -- negation words
        plusdict -- degree (intensifier) words
        question -- uncertainty words; stored, but the rule that used them is
                    currently disabled, so no method reads this attribute
        """
        self.score = 0
        self.positiveSubject = []
        self.negativeSubject = []
        self.negdict = negdict
        self.posdict = posdict
        self.nodict = nodict
        self.plusdict = plusdict
        self.question = question

    def checkDegree(self, sdrange, positive=True):
        """Score one sentiment word from the tokens that precede it.

        sdrange  -- list of word strings found just before the sentiment word
        positive -- True when the sentiment word is positive, False otherwise

        The sentiment word's own sign is applied first, then negation and
        degree words modify it:
            好吃的義大利麵     (tasty pasta)          => +1
            不好吃的義大利麵   (not tasty pasta)      => -1
            很好吃的義大利麵   (very tasty pasta)     => +2
            很不好吃的義大利麵 (very not-tasty pasta) => -2

        People tend to phrase criticism mildly, so the two directions are not
        treated symmetrically: a negated negative word is only neutral, while
        a negated positive word counts as negative:
            這個漢堡不算難吃 (this burger is not really bad)  => 0
            這個漢堡不算好吃 (this burger is not really good) => -1

        A degree word standing between a negation and the sentiment word is
        scored as mildly negative in both directions:
            這個漢堡不算非常難吃 (not really very bad)  => -0.5
            這個漢堡不算非常好吃 (not really very good) => -0.5

        An empty ``sdrange`` returns the bare sign (+1 or -1).

        NOTE(review): the neutral rule for negative words is decided by the
        LAST token only (it must not be a degree word), so a degree word
        followed by any filler token still yields 0. This matches the previous
        behaviour; confirm whether it is intended.
        """
        polarity = 1 if positive else -1
        degree = 1
        for i, token in enumerate(sdrange):
            # Every negation word flips the direction.
            if token in self.nodict:
                polarity *= -1
            if token in self.plusdict:
                negated_before = any(prev in self.nodict for prev in sdrange[:i])
                if not negated_before:
                    # Degree word with no negation in front of it: amplify.
                    degree = 2
                elif polarity == 1:
                    # Degree word sandwiched between a negation and the
                    # sentiment word: pick the multiplier whose product with
                    # the current polarity is -0.5.
                    degree = -0.5
                else:
                    degree = 0.5
        # Negated negative word whose last preceding token is not a degree
        # word: treat as neutral.
        if (not positive and sdrange
                and sdrange[-1] not in self.plusdict
                and any(token in self.nodict for token in sdrange)):
            return 0
        return polarity * degree

    def sentimentScore(self, sd):
        """Score every sentiment word of one segmented sentence.

        sd -- list of tokens exposing ``.word`` and ``.flag`` (as produced by
              ``jieba.posseg.cut``)

        For each sentiment word, up to ``WINDOW`` preceding words are passed
        to ``checkDegree``; the result is added to ``self.score``. When the
        result is non-zero, ``getSubject`` looks up the subject, and subjects
        longer than one character are appended to ``self.positiveSubject`` or
        ``self.negativeSubject`` as ``(subject, sentiment_word)``.

        The sentence is scanned right to left so that subjects are claimed in
        that order. Returns None.
        """
        words = [token.word for token in sd]
        for i in range(len(words) - 1, -1, -1):
            word = words[i]
            if word in self.negdict:
                positive = False
            elif word in self.posdict:
                positive = True
            else:
                continue
            # A sentiment word at the very start has an empty window and gets
            # its bare sign, so a leading negative word scores -1 and is filed
            # as negative.
            window = words[max(0, i - self.WINDOW):i]
            p = self.checkDegree(window, positive=positive)
            self.score += p
            if p == 0:
                continue
            subject = self.getSubject(i, sd)
            # Only subjects of two or more characters are kept.
            if subject and len(subject) > 1:
                if p > 0:
                    self.positiveSubject.append((subject, word))
                else:
                    self.negativeSubject.append((subject, word))

    def getSubject(self, index, sd):
        """Return the subject word for the sentiment word at ``sd[index]``.

        Looks to the right for the nearest token whose POS flag contains 'n';
        if there is none, or that word was already recorded as a subject, the
        nearest such token to the left is tried instead. Returns None when
        neither side yields an unused word.

        Any flag containing the letter 'n' matches, which is broader than the
        plain noun flag.
        """
        # Subjects already claimed by earlier sentiment words.
        seen = set(subject for subject, _ in self.positiveSubject)
        seen.update(subject for subject, _ in self.negativeSubject)

        # Nearest candidate to the right; only the first one is considered.
        for pos in range(index + 1, len(sd)):
            if 'n' in sd[pos].flag:
                if sd[pos].word not in seen:
                    return sd[pos].word
                break
        # Fall back to the nearest candidate to the left.
        for pos in range(index - 1, -1, -1):
            if 'n' in sd[pos].flag:
                if sd[pos].word not in seen:
                    return sd[pos].word
                break
        return None

## 測試文章

In [4]:
# 測試文章
import re
import requests
import time
import math
from bs4 import BeautifulSoup as bs

def removePunctuation(source):
    """Replace every run of non-letter characters with a single space.

    Only ASCII letters and characters in the range U+4E00..U+9FFF are kept;
    everything else (punctuation, whitespace, digits, other symbols) is
    collapsed into one space per run.

    The character class is negated by a single leading '^'. Further '^'
    characters inside the class would be literal carets and would wrongly
    protect '^' in the input from being replaced.
    """
    return re.sub(u"[^a-zA-Z\u4e00-\u9fff]+", ' ', source)

# Download the blog post. The timeout keeps the cell from hanging forever on
# an unresponsive server, and raise_for_status() stops the run on an HTTP
# error instead of scoring an error page.
res = requests.get('http://a59407908.pixnet.net/blog/post/115152439', timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
soup = bs(res.text, 'html.parser')

# Remove <script> and <a> elements so their text is not analysed.
for selector in ('script', 'a'):
    for tag in soup.select(selector):
        tag.extract()

# Collect the article text, strip all whitespace and the "延伸閱讀" marker,
# then let removePunctuation() turn separators into spaces so that split()
# yields one fragment per run of letters.
art = soup.select('.article-content-inner')
line = [a.text for a in art if a.text != ""]
st = "".join("".join(line).split()).replace(u'延伸閱讀', '')
s = removePunctuation(st)
sentence = s.split()
sa = SentimentAnalysis(negdict, posdict, nodict, plusdict, question)

# Segment each fragment with POS tags and feed it to the scorer.
for obj in sentence:
    sd = list(pseg.cut(obj))
    sa.sentimentScore(sd)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('情緒總分：%s' % sa.score)

for ele in sa.positiveSubject:
    print('positive %s %s' % (ele[0], ele[1]))
for ele in sa.negativeSubject:
    print('negative %s %s' % (ele[0], ele[1]))

情緒總分：56
positive 秉持著 新鮮
positive 新鮮 用料實在
positive 新潮 創意
positive 成績 亮眼
positive 回頭客 口碑
positive 對話框 可愛
positive 秘訣 好喝
positive 新鮮度 新鮮
positive 茶飲 特別
positive 檸檬原汁 新鮮
positive 蔗糖 天然
positive 口感 酸甜
positive 酸度 香氣
positive 茶香 獨特的
positive 紅茶 甘醇
positive 果香 甘甜
positive 高雅 清新
positive 青茶 高雅
positive 女孩 最愛
positive 果酸 香氣
positive 果肉 酸甜
positive 百香果 新鮮
positive 柳丁 酸甜
positive 陽光 增添
positive 愛玉 香
positive 奶茶 香氣
positive 花生豆花 出乎意料
positive 香醇 不膩口
positive 豆花 香
positive 香氣 濃郁
positive 阿嬤 香氣
positive 古法 特別
positive 熱門 人氣
positive 冰淇淋 飽滿
positive 茶味 飽滿
negative onebyone 很久


## 測試讀取文章

## 測試句子

In [20]:
# 測試自定義句子

# Sentence to score; whitespace separates independent fragments.
saysomething = "這個漢堡不會太難吃"
sentence2 = saysomething.split()
sa2 = SentimentAnalysis(negdict, posdict, nodict, plusdict, question)

# Segment each fragment with POS tags and feed it to the scorer.
for obj in sentence2:
    sd2 = list(pseg.cut(obj))
    sa2.sentimentScore(sd2)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('情緒總分：%s' % sa2.score)

for ele in sa2.positiveSubject:
    print('positive %s %s' % (ele[0], ele[1]))
for ele in sa2.negativeSubject:
    print('negative %s %s' % (ele[0], ele[1]))

情緒總分：-0.5
negative 漢堡 難吃
