## 分析新闻正负情感值：
1. 读取新闻并且进行文本处理和分词
2. 建立模型 build_and_evaluate(X,y,classifier,outpath='model')
    - 采用了 gridsearch，可以一次性尝试多组参数组合
    - 也可以gridsearch = False 如果不需要调参数
    * 根据 precision 和 recall 值，对比 MultinomialNB、Logistic Regression、SGD 等模型，最终采用**随机森林**模型
    * outpath 允许模型保存到本地
3. 用**随机森林**模型，得出每个词的重要性，也就是feature importance
4. 寻找对于模型预测最有决定性的文字：show_most_informative_features(model, text=None, n=20):
    - 这个 function 只适用于 pipeline 结构，也就是在训练模型的时候，需要把 gridsearch 关掉，训练出来的模型才能用
    - 随机森林不可用
    - 会产出哪些词是正面导向和负面导向以及向量
 

In [118]:
import numpy as np
import pickle
import logging
np.random.seed(1337)
import re
import codecs
import jieba
import math
import numpy as np
from itertools import product, count  
from heapq import nlargest  
from gensim.models import word2vec  
from sklearn.utils import shuffle
try:
    # Python 2 only: make UTF-8 the implicit str/unicode codec.
    # `sys` has to be imported here; without it the block could never
    # succeed and the failure was hidden by a bare `except`.
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # Python 3 has no builtin `reload`; there is nothing to configure.
    pass

## 打开正面和负面的新闻
格式是 {标题：新闻内容}

In [119]:
import json

# Each file holds one JSON object mapping article title -> article body.
# Decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding, which differs between operating systems.
with open("pos.JSON", 'r', encoding='utf-8') as fp:
    pos_text = json.load(fp)
with open("neg.JSON", 'r', encoding='utf-8') as fp:
    neg_text = json.load(fp)

In [120]:
# Sentence splitting
def sentence_split(str_centence):
    """Split text into sentences on Chinese and ASCII sentence terminators.

    The text is cut at every full stop ``。`` and at every question or
    exclamation mark, in both full-width (``？``, ``！``) and ASCII
    (``?``, ``!``) form.

    Args:
        str_centence: the text to split.

    Returns:
        A list with one whitespace-stripped string per fragment.  Empty
        fragments (for example the one after a trailing terminator) are
        kept, so ``"a。"`` yields ``["a", ""]``.
    """
    # One character class handles all terminators in a single pass, so a
    # fragment containing both '?' and '!' is split on both.
    return [piece.strip() for piece in re.split('[。？！?!]', str_centence)]

In [121]:
# Build the stop-word set: one entry per line of the stop-word file, with
# only the trailing newline removed.
stopwordset = set()
with open('sa/jieba_dict/stopwords.txt','r',encoding = 'utf-8') as sw:
    stopwordset.update(line.strip('\n') for line in sw)

# Segmentation wrapper
def seg_art_list(art_list):
    """Segment every article of a ``{title: body}`` mapping.

    Args:
        art_list: mapping whose values are article texts; the keys are
            not used.

    Returns:
        A list with one string per article: the tokens returned by
        ``seg_sent`` joined by single spaces, in the mapping's iteration
        order.
    """
    return [" ".join(seg_sent(body)) for body in art_list.values()]

# Tokenise one article: remove ASCII punctuation, lowercase Latin letters and
# digits, segment with jieba, then drop stop words.
# art: text of a single article
# return: list of the remaining tokens
# Every kept token is also appended to word_seg.txt through `output`.  The
# handle is opened as UTF-8 (like the stop-word file) so writing Chinese
# tokens does not depend on the platform's default encoding.
output = open('word_seg.txt', 'w', encoding='utf-8')
def seg_sent(art):
    """Return the tokens of ``art`` that survive cleaning and stop-word removal.

    Args:
        art: text of a single article.

    Returns:
        List of tokens in their original order.  As a side effect each kept
        token, followed by a space, is written to the module-level
        ``output`` file.
    """
    r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
    art = art.strip()
    art = re.sub(r, '', art)
    # NOTE(review): only lowercase a-z is removed, so uppercase Latin letters
    # stay in the text — confirm whether "remove English" should cover them.
    art = re.sub("[a-z]", "", art)
    art = re.sub("[0-9]", "", art)

    kept = []
    for word in jieba.cut(art, cut_all=False):
        # Skip stop words and the whitespace tokens jieba emits.
        if word in stopwordset or word in (' ', "\n", "\n\n"):
            continue
        output.write(word + ' ')
        kept.append(word)
    return kept


In [122]:
# Segment the positive and the negative articles and save both corpora.
# NOTE: despite their names, pos_tfidf / neg_tfidf hold space-joined tokens
# (one string per article), not TF-IDF values.

pos_tfidf = seg_art_list(pos_text)
neg_tfidf = seg_art_list(neg_text)

# seg_sent() streams tokens into the module-level `output` handle
# (word_seg.txt), which is never closed; flush it so the file on disk is
# complete once segmentation has finished.
output.flush()

with open("pos_tfidf.JSON", 'w') as fp:
    json.dump(pos_tfidf, fp)
with open("neg_tfidf.JSON", 'w') as fp:
    json.dump(neg_tfidf, fp)

## 建模

In [131]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics.scorer import make_scorer
from sklearn import linear_model
from sklearn import metrics
import operator
from time import time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split as tts
import sklearn
import timeit
from sklearn.model_selection import GridSearchCV

In [124]:
# Labels: 1 marks a positive article, -1 a negative one.
# X lists the positive documents first, then the negative ones; y follows
# the same order.
X = [*pos_tfidf, *neg_tfidf]
y = [1 for _ in pos_tfidf] + [-1 for _ in neg_tfidf]

In [126]:
len(y)  # number of labelled documents (positive + negative)

925

### 模型训练 

In [157]:
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=False, grid_Search=True, parameters=True):
    """Train a bag-of-words text classifier, print hold-out metrics, return it.

    The labels are integer-encoded with ``LabelEncoder``, the data is split
    80/20, a ``CountVectorizer -> TfidfTransformer -> classifier`` pipeline
    is fitted on the training part, and a classification report plus the
    confusion matrix for the test part are printed.

    Args:
        X: sequence of documents (strings).
        y: one label per document.  The report labels the two encoded
            classes 'neg' and 'pos', in the encoder's sorted order.
        classifier: estimator class or ready-made estimator instance; a
            class is instantiated with its default arguments.
        outpath: path the fitted model is pickled to (protocol 2); a falsy
            value disables saving.
        grid_Search: when true, tune the pipeline with ``GridSearchCV``;
            otherwise fit a single pipeline using 1- and 2-grams.
        parameters: a dict is used as the parameter grid for the search;
            any other value selects the built-in grid.

    Returns:
        The fitted ``GridSearchCV`` object when ``grid_Search`` is true,
        otherwise the fitted ``Pipeline``.
    """

    def build(classifier, X, y=None, parameters=True):
        """Fit and return a single model on (X, y)."""
        if isinstance(classifier, type):
            classifier = classifier()

        if not grid_Search:
            pipeline = Pipeline([
                ('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', classifier),
            ])
            t0 = time()
            pipeline.fit(X, y)
            print("done in %0.3fs" % (time() - t0))
            return pipeline

        if not isinstance(parameters, dict):
            # Built-in search space.  Further candidates that can be added:
            # vect__max_features, tfidf__norm, clf__alpha, clf__penalty,
            # clf__n_iter.
            parameters = {
                'vect__max_df': (0.5, 0.75, 1.0),
                'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
                'tfidf__use_idf': (True, False),
            }

        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
        print("Performing grid search...")
        print("pipeline:", [name for name, _ in pipeline.steps])
        print("parameters:")
        print(parameters)
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return grid_search

    # Begin evaluation
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

    # Forward the caller's grid; otherwise a custom `parameters` dict would
    # be silently ignored.
    model = build(classifier, X_train, y_train, parameters)

    y_pred = model.predict(X_test)
    print("classifier result:")
    print(clsr(y_test, y_pred, target_names=['neg', 'pos']))

    # The model is written exactly once, and the confirmation is printed
    # only when a file was actually written.
    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f, protocol=2)
        print("Model written out to {}".format(outpath))

    print("confusion matrix")
    print(metrics.confusion_matrix(y_test, y_pred))

    return model


## 1. MultinomialNB

In [175]:
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes baseline, fitted as a plain pipeline (no grid search).
classifier = MultinomialNB
model_Mb = build_and_evaluate(
    X, y, classifier=classifier, outpath='model', grid_Search=False
)

Building for evaluation
done in 0.841s
classifier result:
             precision    recall  f1-score   support

        neg       0.97      0.59      0.73        61
        pos       0.83      0.99      0.90       124

avg / total       0.88      0.86      0.85       185

Model written out to model
confusion matrix
[[ 36  25]
 [  1 123]]


## 2. Logistic Regression

In [149]:

from sklearn.linear_model import LogisticRegression

# Logistic regression tuned with the default grid search.  The stray
# trailing `gir` argument made the call a SyntaxError and has been removed.
classifier = LogisticRegression

model_Lr = build_and_evaluate(X, y, classifier, outpath='model')

Building for evaluation
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False)}
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    6.2s finished


done in 7.354s

Best score: 0.877
Best parameters set:
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Model written out to model
classifier result:
             precision    recall  f1-score   support

        neg       0.93      0.67      0.78        78
        pos       0.80      0.96      0.87       107

avg / total       0.85      0.84      0.83       185

Model written out to model
confusion matrix
[[ 52  26]
 [  4 103]]


## 3. Stochastic gradient descent

In [151]:
# SGDClassifier is the function's default estimator; it is named here only
# for readability.  Grid search stays enabled.
model_default = build_and_evaluate(X, y, classifier=SGDClassifier, outpath='model')


Building for evaluation
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False)}
Fitting 3 folds for each of 12 candidates, totalling 36 fits






[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    7.0s finished


done in 8.214s

Best score: 0.950
Best parameters set:
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)
Model written out to model
classifier result:
             precision    recall  f1-score   support

        neg       0.96      0.92      0.94        71
        pos       0.95      0.97      0.96       114

avg / total       0.95      0.95      0.95       185

Model written out to model
confusion matrix
[[ 65   6]
 [  3 111]]


## 4. Random Forest

In [61]:
# NOTE(review): in the visible source `_X` is first assigned in the
# "Feature Importance" cell further down, so a top-to-bottom run fails here
# with NameError.  Looks like a leftover inspection cell — confirm before
# keeping it.
_X.ravel()

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [168]:
from sklearn.ensemble import RandomForestClassifier

# A configured instance (not the class) is handed over, so these
# hyper-parameters are the ones used for fitting.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=5,
    min_samples_leaf=2,
)

model_RF = build_and_evaluate(
    X, y, classifier=classifier, outpath='model', grid_Search=False
)

Building for evaluation
done in 1.574s
classifier result:
             precision    recall  f1-score   support

        neg       0.93      0.83      0.87        63
        pos       0.91      0.97      0.94       122

avg / total       0.92      0.92      0.92       185

Model written out to model
confusion matrix
[[ 52  11]
 [  4 118]]


## Feature Importance

In [162]:

# `cv` is still fitted so its vocabulary stays available, but its
# document-term matrix is not kept: the model below is trained on the
# TF-IDF matrix, so building a dense count array would only waste memory.
cv = CountVectorizer(ngram_range=(1,2), min_df=20,max_df = 0.9,lowercase=False)
cv.fit(X)

# TF-IDF features of the whole corpus; the column order of `_X` matches
# `feature_names`.
tfidf = TfidfVectorizer()
_X = tfidf.fit_transform(X)
feature_names = list(tfidf.get_feature_names())
classifier= RandomForestClassifier(n_estimators=100,random_state=5,min_samples_leaf=2)



In [163]:
# Hold out 20% of the TF-IDF rows; the forest is fitted on the rest.
X_train, X_test, y_train, y_test = tts(_X, y, test_size=0.2)

# fit() hands back the estimator it was called on, so `model` and
# `classifier` name the same fitted forest.
classifier.fit(X_train, y_train)
model = classifier

In [164]:
# pandas is not imported by any cell above, so bring it into scope here.
import pandas as pd

# One row per TF-IDF feature: the feature text and the importance the fitted
# forest assigns to it.
feature_importance = pd.DataFrame(
    {'name': feature_names, 'importance': model.feature_importances_},
    columns=['name', 'importance'],
)

In [165]:
# Rank the features from most to least important and show the first 50.
sort_importance = feature_importance.sort_values('importance', ascending=False)
sort_importance.head(n=50)

Unnamed: 0,name,importance
2060,买入,0.053069
1724,中性,0.028459
9458,收入,0.014834
11027,毛利率,0.014178
5513,同比,0.013133
7207,对应,0.010716
17396,风险,0.009473
3521,公司,0.009059
15145,评级,0.008855
1323,下降,0.008594


In [166]:
# Save the ranked feature table to CSV without the index column.
sort_importance.to_csv("sort_importance.csv",index=False)

## 几个模型的训练结果是随机森林效果最佳

### 最有决定性的文字：


#### 这个需要 grid_Search = False，而且取决于模型的功能。随机森林不可用

In [167]:
def show_most_informative_features(model, text=None, n=20):
    """Return a two-column text table of the highest and lowest weighted features.

    Without ``text`` the weights are the first row of the classifier's
    ``coef_``.  With ``text`` the weights are the feature values the
    pipeline's transformers produce for that text, and the predicted class
    is put in front of the table.

    Args:
        model: a fitted pipeline with a ``'vect'`` and a ``'clf'`` step, or
            a fitted search object exposing such a pipeline as
            ``best_estimator_``.
        text: optional document to explain.
        n: number of rows; each row pairs one of the ``n`` largest weights
            with one of the ``n`` smallest.

    Returns:
        The table as one newline-joined string.

    Raises:
        TypeError: if the classifier has no ``coef_`` attribute.
    """
    # A fitted GridSearchCV keeps the refitted pipeline on best_estimator_.
    pipeline = getattr(model, 'best_estimator_', model)

    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = pipeline.named_steps['vect']
    classifier = pipeline.named_steps['clf']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Run the text through every step except the final classifier; a
        # classifier has no transform(), so the pipeline's own transform()
        # cannot be used when the last step is a classifier.
        features = [text]
        for _, step in pipeline.steps[:-1]:
            if step is None or step == 'passthrough':
                continue
            features = step.transform(features)
        tvec = features.toarray() if hasattr(features, 'toarray') else features
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # Newer scikit-learn releases name this method get_feature_names_out;
    # fall back to the older get_feature_names otherwise.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names

    # Zip the feature names with the weights and sort, largest first
    coefs = sorted(
        zip(tvec[0], names_getter()),
        key=operator.itemgetter(0), reverse=True
    )

    # Pair the top n with the bottom n (weight, name) tuples
    topn = zip(coefs[:n], coefs[:-(n + 1):-1])

    # Collected output lines
    lines = []

    # If text, add the predicted value to the output.
    if text is not None:
        lines.append("\"{}\"".format(text))
        lines.append(
            "Classified as: {}".format(model.predict([text]))
        )
        lines.append("")

    # Left column: largest weights; right column: smallest weights.
    for (cp, fnp), (cn, fnn) in topn:
        lines.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

    return "\n".join(lines)


In [177]:
# model_Mb was built with grid_Search=False, so it is the pipeline object
# whose named steps the helper reads.
words_Mb = show_most_informative_features(model_Mb)
print(words_Mb)

-8.2820             公司    -11.8568          龙雨 电子
-9.2924             亿元    -11.8568             龙雨
-9.3279             增长    -11.8568        龙头企业 拥有
-9.5122             业务    -11.8568        龙头企业 布局
-9.6053             有望    -11.8568      龙头企业 实力雄厚
-9.6294             预计    -11.8568        龙头企业 上海
-9.6427             产品    -11.8568          龙头 雏形
-9.6530             项目    -11.8568        龙头 稳定增长
-9.6546             产能    -11.8568          龙头 现已
-9.6635             行业    -11.8568        龙头 火星时代
-9.7327             我们    -11.8568          龙头 每年
-9.7488             提升    -11.8568          龙头 挤压
-9.7504             业绩    -11.8568          龙头 战略
-9.7550             同比    -11.8568          龙头 实施
-9.7746             市场    -11.8568          龙头 主营
-9.7939             分别    -11.8568          龙大 鲜肉
-9.8158             预期    -11.8568          龙大 肉食
-9.8608             万吨    -11.8568          龙大 五块
-9.8809             投资    -11.8568             龙大
-9.9133             未来    -11.8568         齿式 联轴器
