# **4： 情感分析**

利用训练好的词向量，我们来做一个简单的情感分析案例。对于Stanford Sentiment Treebank数据集中的每条句子，将句中所有词向量的平均值作为其特征，并据此预测该句子的情感等级。短语的情感等级在原始数据集中用实数表示，这里我们将其划分为以下5个类别： 
“超级消极”，“比较消极”，“中立”，“积极”，“非常积极”

对其分别进行从0到4的编码。在这一部分，你将学习用SGD来训练一个softmax回归机，并且通过不断地训练／调试验证来提高回归机的泛化能力。 


在斯坦福情感树库做5个类别的情感分析，模型是简单的softmax，仅要求准确率至少36.5%。

参考资料：

http://www.hankcs.com/nlp/cs224n-assignment-1.html/2


## 4-1 特征向量


实现一个句子的特征生成器和softmax回归机。

一种最简单的特征选择方法就是取所有词向量的平均：

In [2]:
#!/usr/bin/env python

import argparse
import numpy as np
import matplotlib

matplotlib.use('agg')
import matplotlib.pyplot as plt
import itertools

from utils.treebank import StanfordSentiment
import utils.glove as glove

from q3_sgd import load_saved_params, sgd

# We will use sklearn here because it will run faster than implementing
# ourselves. However, for other parts of this assignment you must implement
# the functions yourself!
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


def getArguments():
    """Parse the command-line switch that selects the word vectors.

    Exactly one of ``--pretrained`` or ``--yourvectors`` has to be supplied;
    the two flags are mutually exclusive.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``, carrying the
        boolean attributes ``pretrained`` and ``yourvectors``.
    """
    argParser = argparse.ArgumentParser()
    vectorChoice = argParser.add_mutually_exclusive_group(required=True)

    # (flag, destination attribute, help text) for each supported switch.
    switches = (
        ("--pretrained", "pretrained", "Use pretrained GloVe vectors."),
        ("--yourvectors", "yourvectors", "Use your vectors from q3."),
    )
    for flag, destination, helpText in switches:
        vectorChoice.add_argument(
            flag, dest=destination, action="store_true", help=helpText)

    return argParser.parse_args()


# Build the feature vector of a sentence
def getSentenceFeatures(tokens, wordVectors, sentence):
    """Return the average of the word vectors of the words in a sentence.

    Arguments:
    tokens -- a dictionary that maps words to their row indices in
              wordVectors
    wordVectors -- word vectors (one per row) for all tokens
    sentence -- a list of words in the sentence of interest

    Returns:
    sentVector -- 1-D feature vector of length wordVectors.shape[1]. An empty
                  sentence yields the all-zero vector instead of dividing by
                  zero.

    Raises:
    KeyError -- if a word of the sentence is missing from tokens.
    """
    dimVectors = wordVectors.shape[1]

    # Look up every word once; repeated words contribute once per occurrence.
    indices = [tokens[word] for word in sentence]
    if not indices:
        # Nothing to average: fall back to a neutral (zero) feature vector.
        return np.zeros((dimVectors,))

    sentVector = np.mean(wordVectors[indices, :], axis=0)

    assert sentVector.shape == (dimVectors,)
    return sentVector


def getRegularizationValues():
    """Produce the regularization strengths to sweep over.

    Returns:
        An ascending list of 100 values spaced evenly on a base-10
        logarithmic scale from 1e-4 to 1e2.
    """
    candidates = list(np.logspace(-4, 2, num=100, base=10))
    candidates.sort()
    return candidates


def chooseBestModel(results):
    """Pick the run with the highest dev-set accuracy.

    Arguments:
    results -- A list of python dictionaries of the following format:
        {
            "reg": regularization,
            "clf": classifier,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        }

    Returns:
    The entry of results whose "dev" value is largest; when several entries
    tie, the one that appears first in the list.
    """
    def devAccuracy(entry):
        # Model selection looks at the dev accuracy only.
        return entry["dev"]

    return max(results, key=devAccuracy)


def accuracy(y, yhat):
    """Return the percentage of positions where yhat equals y.

    Both arguments must be arrays of identical shape (checked by assert).
    """
    assert y.shape == yhat.shape
    nCorrect = (y == yhat).sum()
    return nCorrect * 100.0 / y.size


def plotRegVsAccuracy(regValues, results, filename):
    """Plot train and dev accuracy against regularization and save the figure.

    Arguments:
    regValues -- x values, one regularization strength per entry of results
    results -- list of dictionaries holding "train" and "dev" accuracies
    filename -- path the figure is saved to

    The curves are drawn on the current matplotlib figure with a logarithmic
    x axis.
    """
    splits = ['train', 'dev']
    for split in splits:
        curve = [entry[split] for entry in results]
        plt.plot(regValues, curve)

    plt.xscale('log')
    plt.xlabel("regularization")
    plt.ylabel("accuracy")
    plt.legend(splits, loc='upper left')
    plt.savefig(filename)


def outputConfusionMatrix(features, labels, clf, filename):
    """Draw the 5-class confusion matrix of clf on features and save it.

    Arguments:
    features -- feature matrix passed to clf.predict
    labels -- true labels for the rows of features
    clf -- fitted classifier exposing predict()
    filename -- path the figure is saved to
    """
    predicted = clf.predict(features)
    cm = confusion_matrix(labels, predicted, labels=range(5))

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Reds)
    plt.colorbar()

    classes = ["- -", "-", "neut", "+", "+ +"]
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum count get white text for contrast.
    thresh = cm.max() / 2.
    nRows, nCols = cm.shape[0], cm.shape[1]
    for i in range(nRows):
        for j in range(nCols):
            if cm[i, j] > thresh:
                textColor = "white"
            else:
                textColor = "black"
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color=textColor)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(filename)


def outputPredictions(dataset, features, labels, clf, filename):
    """Write true and predicted labels plus the sentence text to a TSV file.

    Arguments:
    dataset -- sequence of examples; element 0 of each example is the list of
               words of the sentence
    features -- feature matrix passed to clf.predict
    labels -- true labels, indexable in step with dataset
    clf -- fitted classifier exposing predict()
    filename -- path of the text file to (over)write

    The file starts with a header line and then holds one tab-separated line
    per example: true label, predicted label, space-joined sentence.
    """
    pred = clf.predict(features)
    with open(filename, "w") as f:
        # file.write() adds no line terminator, so every record needs an
        # explicit "\n"; without it the whole file collapses into one line.
        f.write("True\tPredicted\tText\n")
        for i, example in enumerate(dataset):
            f.write("%d\t%d\t%s\n" % (
                labels[i], pred[i], " ".join(example[0])))


def _buildFeatures(sentences, tokens, wordVectors):
    """Turn (words, label) pairs into a feature matrix and a label vector."""
    nSentences = len(sentences)
    features = np.zeros((nSentences, wordVectors.shape[1]))
    labels = np.zeros((nSentences,), dtype=np.int32)
    for i, (words, label) in enumerate(sentences):
        labels[i] = label
        features[i, :] = getSentenceFeatures(tokens, wordVectors, words)
    return features, labels


def main(par):
    """Train a model to do sentiment analysis.

    Arguments:
    par -- which word vectors to use: "yourvectors" (parameters returned by
           load_saved_params) or "pretrained" (vectors loaded through
           glove.loadWordVectors).

    For every value returned by getRegularizationValues() a logistic
    regression is fitted on the train split and scored on the train, dev and
    test splits. The accuracies are printed, the best model is chosen with
    chooseBestModel(), and for "pretrained" the regularization plot, the dev
    confusion matrix and the dev predictions are written to disk.

    Raises:
    ValueError -- if par is not one of the two supported values.
    """
    # Validate up front so a typo fails with a clear message instead of an
    # unbound wordVectors further down.
    if par not in ("yourvectors", "pretrained"):
        raise ValueError(
            "par must be 'yourvectors' or 'pretrained', got %r" % (par,))

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if par == "yourvectors":
        _, wordVectors, _ = load_saved_params()
        # The first nWords rows and the remaining rows of the saved matrix are
        # placed side by side (presumably input and output vectors -- confirm
        # against q3).
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]),
            axis=1)
    else:
        wordVectors = glove.loadWordVectors(tokens)

    # Load the train set
    trainset = dataset.getTrainSentences()
    trainFeatures, trainLabels = _buildFeatures(trainset, tokens, wordVectors)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    devFeatures, devLabels = _buildFeatures(devset, tokens, wordVectors)

    # Prepare test set features
    testset = dataset.getTestSentences()
    testFeatures, testLabels = _buildFeatures(testset, tokens, wordVectors)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    # Decide on the function's own argument, not on a module-level name.
    if par == "pretrained":
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"], "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"], "q4_dev_pred.txt")




In [None]:
#ipykernel_launcher.py main(getArguments()) -h --pretrained  
# Run the experiment with the self-trained word vectors.
args="yourvectors"
main(args)

# Result when the word vectors come from a training run that did not converge:
#Best regularization value: 1.15E-04
#Test accuracy (%): 29.502262

# Result with the self-trained word vectors:
#Best regularization value: 1.15E-04
#Test accuracy (%): 29.502262

#http://liuchengxu.org/pelican-blog/jupyter-notebook-tips.html


## 4-2 正则化的意义

用不超过三句话解释：为什么在做分类时要引入正则化（实际上大多数机器学习任务都是如此）。

避免过拟合，提高对未知实例的泛化能力

## 4-3 调参

在`q4_sentiment.py`中完成超参数搜索的代码，从而找到“最佳”的正则化惩罚因子。你选择了什么值？报告你在训练集、开发集和测试集上的准确率，并用最多一句话说明你的超参数搜索方法的合理性。 注释：在开发集上应至少达到30%的准确率。 

解答：参考值为1e-4，在训练集、开发集和测试集上的准确率分别为27.072%， 25.341%，22.896%

在验证集上搜索超参数代码如下：


In [None]:
def getRegularizationValues():
    """Produce the regularization strengths to sweep over.

    Returns:
        An ascending list of 100 values spaced evenly on a base-10
        logarithmic scale from 1e-4 to 1e2.
    """
    candidates = list(np.logspace(-4, 2, num=100, base=10))
    candidates.sort()
    return candidates

def chooseBestModel(results):
    """Pick the run with the highest dev-set accuracy.

    Arguments:
    results -- A list of python dictionaries of the following format:
        {
            "reg": regularization,
            "clf": classifier,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        }

    Returns:
    The entry of results whose "dev" value is largest; when several entries
    tie, the one that appears first in the list.
    """
    def devAccuracy(entry):
        # Model selection looks at the dev accuracy only.
        return entry["dev"]

    return max(results, key=devAccuracy)

## 4-4 词向量的影响

绘出训练集和开发集上的分类准确率随正则化值变化的曲线，x轴使用对数刻度。这一步应当由程序自动完成。请在作业中附上生成的坐标图`q4_reg_v_acc.png`，并用最多三句话简要解释你在图中看到的现象。 



用自己的词向量训练模型，与用GloVe预训练的词向量的模型作比较，为什么后者好？

后者更好的原因是：

- 后者在维基上训练，数据量更大

- 后者维度更高（50维）

- GloVe利用了全局统计信息，而word2vec（SG）没有


## 4-5 惩罚因子对效果的影响

从程序运行完保存的`q4_reg_v_acc.png`可以看出正则化的惩罚因子对结果的影响：
![q4_reg_v_acc.png](q4_reg_v_acc.png)

In [None]:
# Run the experiment with the pretrained GloVe vectors.
args="pretrained"
main(args)

显然，使用预训练的词向量效果更好。

In [6]:
import numpy as np
import matplotlib.pyplot as plt

from utils import *

from q3_sgd import load_saved_params, sgd
from q1_softmax import softmax

import imp
#imp.reload(q3_sgd.load_saved_params)
#from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy

def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """Softmax regression with L2 regularization.

    Inputs:
    - features: feature vectors, each row is a feature vector; a single 1-D
      feature vector is also accepted and treated as one example
    - labels: labels corresponding to the feature vectors
    - weights: weights of the regressor
    - regularization: L2 regularization constant
    - nopredictions: when True, the predictions are left out of the result

    Outputs:
    - cost: mean cross-entropy over the examples plus
      0.5 * regularization * sum(weights ** 2)
    - grad: gradient of the cost with respect to weights
    - pred: predicted label per row when there is more than one example,
      otherwise a single label (only returned when nopredictions is False)
    """
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
        # Promote a lone example to a one-row matrix; the row-wise indexing
        # and the gradient product below need two dimensions.
        features = features.reshape(1, -1)

    rows = np.arange(N)
    prob = softmax(features.dot(weights))

    # Vectorized  1/N * sum(cross_entropy(x_i, y_i)) + reg/2 * |w|^2
    cost = np.sum(-np.log(prob[rows, labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    # Gradient of the cross-entropy w.r.t. the scores is (prob - one_hot).
    grad = np.array(prob)
    grad[rows, labels] -= 1.0
    grad = features.T.dot(grad) / N
    grad += regularization * weights

    if N > 1:
        pred = np.argmax(prob, axis=1)
    else:
        pred = np.argmax(prob)

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred

def softmax_wrapper(features, labels, weights, regularization = 0.0):
    """Return only (cost, grad) of softmaxRegression, the pair sgd consumes."""
    return softmaxRegression(
        features, labels, weights, regularization, nopredictions=True)

# 试试不同的正则化系数，选最好的
REGULARIZATION = [0.0, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01]

# 载入数据集
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# 载入预训练好的词向量 
_, wordVectors0, _ = load_saved_params()
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])
dimVectors = wordVectors.shape[1]

# 载入训练集
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

# 准备好训练集的特征
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros((nDev, dimVectors))
devLabels = np.zeros((nDev,), dtype=np.int32)
for i in range(nDev):
    words, devLabels[i] = devset[i]
    devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

# 尝试不同的正则化系数
results = []
for regularization in REGULARIZATION:
    np.random.seed(3141)
    np.random.seed(59265)
    weights = np.random.randn(dimVectors, 5)
    print("Training for reg=%f" % regularization )

    # batch optimization
    weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels, 
        weights, regularization), weights, 3.0, 10000, PRINT_EVERY=100)

    # 训练集上测效果
    _, _, pred = softmaxRegression(trainFeatures, trainLabels, weights)
    trainAccuracy = accuracy(trainLabels, pred)
    print("Train accuracy (%%): %f" % trainAccuracy)

    # dev集合上看效果
    _, _, pred = softmaxRegression(devFeatures, devLabels, weights)
    devAccuracy = accuracy(devLabels, pred)
    print("Dev accuracy (%%): %f" % devAccuracy)

    # 保存结果权重
    results.append({
        "reg" : regularization, 
        "weights" : weights, 
        "train" : trainAccuracy, 
        "dev" : devAccuracy})

# 输出准确率
print("")
print("=== Recap ===")
print("Reg\t\tTrain\t\tDev")
for result in results:
    print("%E\t%f\t%f" % (
        result["reg"], 
        result["train"], 
        result["dev"]))
print("")

# 选最好的正则化系数
BEST_REGULARIZATION = None
BEST_WEIGHTS = None

best_dev = 0
for result in results:
    if result["dev"] > best_dev:
        best_dev = result["dev"]
        BEST_REGULARIZATION = result["reg"]
        BEST_WEIGHTS = result["weights"]

# Test your findings on the test set
testset = dataset.getTestSentences()
nTest = len(testset)
testFeatures = np.zeros((nTest, dimVectors))
testLabels = np.zeros((nTest,), dtype=np.int32)


un_expect_num= 107
Training for reg=0.000000
iter 100: 1.575228
iter 200: 1.575209
iter 300: 1.575174
iter 400: 1.575125
iter 500: 1.575062
iter 600: 1.574987
iter 700: 1.574902
iter 800: 1.574808
iter 900: 1.574705
iter 1000: 1.574595
iter 1100: 1.574479
iter 1200: 1.574358
iter 1300: 1.574231
iter 1400: 1.574100
iter 1500: 1.573966
iter 1600: 1.573829
iter 1700: 1.573689
iter 1800: 1.573548
iter 1900: 1.573404
iter 2000: 1.573259
iter 2100: 1.573113
iter 2200: 1.572967
iter 2300: 1.572820
iter 2400: 1.572673
iter 2500: 1.572526
iter 2600: 1.572379
iter 2700: 1.572233
iter 2800: 1.572087
iter 2900: 1.571942
iter 3000: 1.571798
iter 3100: 1.571655
iter 3200: 1.571512
iter 3300: 1.571371
iter 3400: 1.571231
iter 3500: 1.571092
iter 3600: 1.570955
iter 3700: 1.570819
iter 3800: 1.570685
iter 3900: 1.570551
iter 4000: 1.570420
iter 4100: 1.570290
iter 4200: 1.570161
iter 4300: 1.570034
iter 4400: 1.569908
iter 4500: 1.569784
iter 4600: 1.569662
iter 4700: 1.569541
iter 4800: 1.569422
iter

iter 9800: 1.571161
iter 9900: 1.571155
iter 10000: 1.571150
Train accuracy (%): 28.604869
Dev accuracy (%): 29.155313
Training for reg=0.000300
iter 100: 1.580777
iter 200: 1.580704
iter 300: 1.580575
iter 400: 1.580405
iter 500: 1.580204
iter 600: 1.579982
iter 700: 1.579745
iter 800: 1.579498
iter 900: 1.579246
iter 1000: 1.578993
iter 1100: 1.578741
iter 1200: 1.578493
iter 1300: 1.578249
iter 1400: 1.578010
iter 1500: 1.577779
iter 1600: 1.577555
iter 1700: 1.577339
iter 1800: 1.577130
iter 1900: 1.576930
iter 2000: 1.576738
iter 2100: 1.576553
iter 2200: 1.576377
iter 2300: 1.576209
iter 2400: 1.576047
iter 2500: 1.575894
iter 2600: 1.575747
iter 2700: 1.575607
iter 2800: 1.575474
iter 2900: 1.575347
iter 3000: 1.575227
iter 3100: 1.575112
iter 3200: 1.575002
iter 3300: 1.574898
iter 3400: 1.574799
iter 3500: 1.574705
iter 3600: 1.574615
iter 3700: 1.574530
iter 3800: 1.574449
iter 3900: 1.574372
iter 4000: 1.574299
iter 4100: 1.574230
iter 4200: 1.574163
iter 4300: 1.574101
iter

iter 9300: 1.576124
iter 9400: 1.576124
iter 9500: 1.576124
iter 9600: 1.576124
iter 9700: 1.576123
iter 9800: 1.576123
iter 9900: 1.576123
iter 10000: 1.576123
Train accuracy (%): 27.071629
Dev accuracy (%): 25.340599

=== Recap ===
Reg		Train		Dev
0.000000E+00	28.862360	30.245232
1.000000E-05	28.768727	29.972752
3.000000E-05	28.651685	29.881926
1.000000E-04	28.604869	29.155313
3.000000E-04	27.949438	26.793824
1.000000E-03	27.118446	25.249773
3.000000E-03	27.083333	25.340599
1.000000E-02	27.071629	25.340599



NameError: name 'getSentenceFeature' is not defined

In [7]:
def getSentenceFeature(tokens, wordVectors, sentence):
    """Average the word vectors of a sentence into one feature vector.

    A deliberately simple representation: the mean of all word vectors of the
    sentence is used as the input for sentiment analysis.

    Inputs:
    - tokens: a dictionary that maps words to their indices in the word
      vector list
    - wordVectors: word vectors (each row) for all tokens
    - sentence: a list of words in the sentence of interest

    Output:
    - sentVector: feature vector for the sentence; the all-zero vector for an
      empty sentence (averaging zero rows would otherwise yield NaN)

    Raises:
    - KeyError: if a word of the sentence is missing from tokens
    """
    indices = [tokens[word] for word in sentence]
    if not indices:
        return np.zeros((wordVectors.shape[1],))

    return np.mean(wordVectors[indices, :], axis=0)


# Fill the test features and labels (the arrays were allocated in the
# previous cell)
for i in range(nTest):
    words, testLabels[i] = testset[i]
    testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Score the weights that did best on the dev set against the test set
_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS)
print("Best regularization value: %E" % BEST_REGULARIZATION)
print("Test accuracy (%%): %f" % accuracy(testLabels, pred))

# Plot accuracy against the regularization constant
# NOTE(review): REGULARIZATION contains 0.0, which has no position on a
# logarithmic axis -- confirm how that point is rendered.
plt.plot(REGULARIZATION, [x["train"] for x in results])
plt.plot(REGULARIZATION, [x["dev"] for x in results])
plt.xscale('log')
plt.xlabel("regularization")
plt.ylabel("accuracy")
plt.legend(['train', 'dev'], loc='upper left')
plt.savefig("q4_reg_v_acc_backup.png")
plt.show()

Best regularization value: 0.000000E+00
Test accuracy (%): 28.099548


## 4-6 混淆矩阵

上述运行程序过程已经保存confusion matrix的结果图：

![q4_dev_conf.png](q4_dev_conf.png)

这个矩阵主对角线上的数值越大，说明预测越准确；其他位置的数值都是误判。可见模型很难分辨“中性”情感，并倾向于将其分入负面。但模型没有犯下大是大非的错误（将--分入++，或反之）。

## 4-7 错误分析

查看q4_dev_pred.txt中的输出，想想看什么特征可能提高效果？
“超级消极”，“比较消极”，“中立”，“积极”，“非常积极”
对其分别进行从0到4的编码

比如：
例子：
1	3     nothing is sacred in this gut-buster 
标注是1，预测是3。

说反话，词袋模型的软肋

再比如：

3	1	and if you 're not nearly moved to tears by a couple of scenes , you 've got ice water in your veins
标注是3，预测是1。

不理解习语“moved to tears”，不理解整句话

3	0	... routine , harmless diversion and little else
标注是3，预测是0.

语料标注错误还是？？
