In [3]:
from collections import defaultdict, OrderedDict
import os
import re

import jieba
import jieba.analyse
import codecs
import numpy as np
import pandas as pd
from CHlikelihood.likelihood import Likelihood
import pymysql

In [4]:
def readLines(text):
    lst = []
    with open(text) as f:
        for line in f.readlines():
            line = line.replace('\n', '').replace('\t', '')
            lst.append(line)
    return lst

### 1. 文本切割

    段落 --> 句子集
    句子 --> 词汇集 （分词词包 + 停用词包）
    词汇 --> 字典（词汇：文本序号）

In [253]:
"""
1. 文本切割
"""
def para2sent(text):
    # 替换掉情感分析无关的文本内容
    pattern = r'[、a-zA-Z0-9_.%（）-]+'
    text = re.sub(pattern, '', text)
    # 为分句做准备    
    text = text.replace('？', '。').replace('！', '。').replace('\u3000', '').replace('\n', '')
    # 去除文末的说明
    if '责任编辑' in text:
        text = text.split('责任编辑')[0]
    elif '本文来自' in text:
        text = text.split('本文来自')[0] 
    elif '文章关键词' in text:
        text = text.split('文章关键词')[0] 
    # 分词
    sents = text.split('。')
    sents = [sent for sent in sents if len(sent) != 0 and '报记者' not in sent]
    return sents

In [233]:
def sent2word(sentence):
    """
    Segment a sentence to words
    Delete stopwords
    """
    jieba.load_userdict('财经词典.txt')
    segResult = jieba.lcut(sentence)
    stopwords = readLines('新停用词包.txt')
    newSent = []
    for word in segResult:
        if word in stopwords:
            continue
        else:
            newSent.append(word)

    return newSent

In [234]:
def ordered_word(lst):
    """
    turn cutted keywords into dict.keys, and their order as dict.values.
    """
    wordDict = defaultdict(int)
    for i in range(len(lst)):
        if wordDict[lst[i]] != 0:
            continue
        wordDict[lst[i]] = i 
    return wordDict

### 2. 情感定位
    针对分词后的一个句子

    词汇字典 --> sentWord, notWord, degreeWord
    sentWord[index] = sentScore
    notWord[index] = notScore
    degreeWord[index] = degreeScore

In [251]:
"""
2. 情感定位
"""
def classifyWords(wordDict, senDoc='财经词情感得分.txt', negDoc='否定词.txt', levelDoc='程度副词_得分.txt'):
    # (1) 情感词: dict
    senList = readLines(senDoc)
    senDict = defaultdict()
    for s in senList:
        try:
            senDict[s.split(' ')[0]] = s.split(' ')[1]
        except Exception as e:
            pass
    # (2) 否定词: list
    notList = readLines(negDoc)
    # (3) 程度副词: dict
    degreeList = readLines(levelDoc)
    degreeDict = defaultdict()
    for d in degreeList:
        try:
            degreeDict[d.split(',')[0]] = d.split(',')[1]
        except Exception as e:
            print('adverb Dict error: %s' %(e))
    
    senWord = defaultdict()
    notWord = defaultdict()
    degreeWord = defaultdict()
    
    for word in wordDict.keys():
        if word in senDict.keys() and word not in notList and word not in degreeDict.keys():
            senWord[wordDict[word]] = senDict[word]  # 映射出（地址：senti得分）
        elif word in notList and word not in degreeDict.keys():
            notWord[wordDict[word]] = -1    # 映射出（地址：否定）
        elif word in degreeDict.keys():
            degreeWord[wordDict[word]] = degreeDict[word]  # 映射出（地址：程度）
    return senWord, notWord, degreeWord

### 3. 情感聚合
    根据senWord, notWord, degreeWord计算分词后的情感得分。
    计算方法中依赖了词语间的排序，并未使用依存分析的结论。

In [236]:
"""
3. 情感聚合
"""
def scoreSent(senWord, notWord, degreeWord):
    score = 0
    # 存所有情感词的位置的列表
    senLoc = list(senWord.keys())
    notLoc = list(notWord.keys())
    degreeLoc = list(degreeWord.keys())
    senloc = -1
    
    # 遍历句中所有情感单词senWord，sent_score_word = sent_word * degree * not
    for i in senLoc:
        W = 1
        senloc += 1
        W *= float(senWord[i])

        for j in range(senLoc[senloc-1], senLoc[senloc]):
            if j in notLoc:
                W *= -1
            if j in degreeLoc:
                if W < 0:
                    W *= float(degreeWord[j])**-1  # 程序词在否定意义上表示消弱程度，而非加强，因此取倒数
                else:
                    W *= float(degreeWord[j])
                
        # 调整权重, sent_score_sentence = sum(sent_score_word)
        score += W
            
        # i定位至下一个情感词
        i += 1
    return score

In [None]:
# if len(sentences) < 4:
#     all is important
# elif len(sentences) < 10:
#     first & last & top(1) TFIDF
# else:
#     len(sentences) * 0.3

## 按句分开的情感计算

In [148]:
"""
综合以上的函数，按分句汇总计算感情
"""

def calc_senti(text):
    sents = para2sent(text)  # turn text into sentences
    lst = []
    for sent in sents:
        lst_word = sent2word(sent)  # turn each sentence into list of keywords
        dct_word = ordered_word(lst_word) # turn each sentence into dict of keywords with value as index
        senWord, notWord, degreeWord = classifyWords(dct_word) 
        lst.append(scoreSent(senWord, notWord, degreeWord))
    return np.array(lst).mean()

## 还需要解决的问题
0. 句子的依存分析，判断主从关系
        （没学会；目前来看，也没有必要））
1. 关键词重复时，wordDict无法记录  
        (一个句子中出现重复词的可能很小，影响不大）
2. 没有记录title中的权重  
        (给title内容加3倍权重，具体的是senWord中的value * 3，这个系数3未来是可能优化的）
3. 一个段落中不同的句子权重不同，因此需要添加一个给句子计算权重的函数
4. 整理分词以及停用词的词库（针对金融） 
        （载入”财经词典.txt“）
5. 载入金融舆情分析词库
        （载入”财经词情感得分.txt“）
6. 模型评估将要使用的带标记数据

7. 余弦相似度的计算方法，需要根据业务需求，定制一份

In [306]:
def title_senti_weight(senWord):
    senTitle = dict()
    for key in senWord.keys():
        value = float(senWord[key]) * 3
        senTitle[key] = value
    return senTitle

In [307]:
def calc_senti(title, text):
    # text score
    sents = para2sent(text)  # turn text into sentences
    lst = []
    for sent in sents:
        lst_word = sent2word(sent)  # turn each sentence into list of keywords
        dct_word = ordered_word(lst_word) # turn each sentence into dict of keywords with value as index
        senWord, notWord, degreeWord = classifyWords(dct_word) 
        lst.append(scoreSent(senWord, notWord, degreeWord))
    
    # title score
    lst_word = sent2word(title)
    dct_word = ordered_word(lst_word)
    senWord, notWord, degreeWord = classifyWords(dct_word) 
    senTitle = title_senti_weight(senWord)
    lst.append(scoreSent(senTitle, notWord, degreeWord))
    return np.array(lst).mean()

In [308]:
def get_data(company, time):
    conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='zz6901877', db='big_train',
                           charset="utf8", use_unicode=True)
    cursor = conn.cursor()
    
    sql = 'select title, content from sina WHERE company = "{company}" and type="news" and time LIKE "{time}%"'.format(company=company, time=time)
    cursor.execute(sql)
    results = cursor.fetchall()
    return results

In [348]:
results = get_data('aoma', '2019-04-08')

In [336]:
def senti_doc(result):
    # get title and content text
    title = result[0]
    content = main_text(result[1])
    
    main_content = main_text(content, topK=5)
    senti = calc_senti(title, main_content)
    return senti

In [349]:
def senti_day(results):
    denominator = len(results)
    numerator = 0
    for result in results:
        numerator += senti_doc(result)
    score = int(numerator / denominator)
    return score

In [350]:
senti_day(results)

-19

In [328]:
"""
得出最大tfidf列表
"""
def content4tfidf(content):
    new_content = ''
    sentences = para2sent(content)
    for sentence in sentences:
        words = sent2word(sentence)
        new_content += ' '.join(words)
    return new_content

def sentence_tfidf(content, topK=5):
    """
    调用content4tfidf;
    
    return: a string contents the main keywords in the content text
    """
    content = content4tfidf(content)
    jieba.analyse.set_idf_path('财经词典.txt')
    result = jieba.analyse.extract_tags(new_content, topK=topK)
    text_essence = ' '.join(result)
    return text_essence

def level_likelihood(content, topK=5):
    """
    调用sentence_tfidf;
    
    return: a list of tuple, (likelyhood, sentences_index)
    """
    # 初始化余弦相似度计算器
    a = Likelihood()
    # 获取所有句子
    sentences = para2sent(content)
    # 获取文章主干
    text_essence = sentence_tfidf(content, topK=topK)
    # 计算并保存
    alike_lst = []
    i = 0
    for sentence in sentences:
        alike_value = a.likelihood(sentence, text_essence)
        alike_lst.append((alike_value, i))  # 相似度, 文章中句子index
        i += 1
        
    result = sorted(alike_lst, reverse=True)
    return result

In [321]:
def content_text(sentences):
    text_len = int(len(sentences) * 0.3) - 2
    s = ''
    for i in range(text_len):
        sentence_index = like_result[i][1]
        s += sentences[sentence_index]
    return s

def main_text(content, topK=5):
    """
    调用 content_text
    return: text relevant to the theme according to tfidf algorithm
    """
    sentences = para2sent(content)
    like_result = level_likelihood(content, topK=topK)
    if len(sentences) < 4:
        text = ''.join(sentences)
    elif len(sentences) < 10:
        text = sentences[0] + sentences[like_result[0][1]] + sentences[like_result[1][1]]
    else:
        text = content_text(sentences)
    return text