##  文本分类之 - 情感分析 

In [2]:
from matplotlib import pyplot as plt
import jieba # 分词
import re # 正则
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [8]:
def read_data(path, is_pos=None):
    """
    给定文件的路径，读取文件
    path: path to the data
    is_pos: 是否数据是postive samples. 
    return: (list of review texts, list of labels) 
    """
    # reviews表示文本内容列表，labels表示与文本对应的标签列表
    reviews, labels  = [], []
    with open(path, 'r', encoding='utf-8') as file:
        # review_start表示是否为文本内容行
        review_start  = False
        review_text = []
        
        # 按行读取文件内容
        for line in file:
            # 去除行两端的空格
            line = line.strip()
            # 如果此行为空时结束此次循环，进入下次循环（读取下一行）
            if not line: continue
            # 如果不是文本内容行，并且此行以"<review"开头（此时为一段文本的开始）。
            if not review_start and line.startswith("<review"):
                # 因为以"<review"开头之后的下一行即为文本内容行，此处将review_start设置为True
                review_start = True
                # <review id="0">，当label存在时
                if "label" in line:
                    # 将id中的数字提取出来加入labels列表中
                    labels.append(int(line.split('"')[-2]))
                continue                
            # 如果review_start为真，并且此行为"</review>"（说明此时是一段文本的结束）。
            if review_start and line == "</review>":
                # 将review_start设置为假为下一个文本判断做准备
                review_start = False
                # 将review_text中的文本内容以空格作为分隔符分隔并加入reviews的文本列表中
                reviews.append(" ".join(review_text))
                # 将review_text中内容清空，为下一段文本内容做准备
                review_text = []
                continue
            # 此时为文本内容行，将文本内容加入review_text列表中
            if review_start:
                review_text.append(line)
    # 如果是积极的文本，则将其标签设置为1
    if is_pos:
        labels = [1]*len(reviews)
    # 如果是消极的文本，则将其标签设置为0
    elif not is_pos is None:
        labels = [0]*len(reviews)
    # 最终返回文本内容及其对应的标签
    return reviews, labels


def process_file():
    """
    读取训练数据和测试数据，并对它们做一些预处理
    """    
    train_pos_file = "data_sentiment/train.positive.txt"
    train_neg_file = "data_sentiment/train.negative.txt"
    test_comb_file = "data_sentiment/test.combined.txt"
    
    # 读取文件部分，把具体的内容写入到变量里面
    train_pos_cmts, train_pos_lbs = read_data(train_pos_file, True)
    train_neg_cmts, train_neg_lbs = read_data(train_neg_file, False)
    train_comments = train_pos_cmts + train_neg_cmts
    train_labels = train_pos_lbs + train_neg_lbs
    test_comments, test_labels = read_data(test_comb_file)
    return train_comments, train_labels, test_comments, test_labels
train_comments, train_labels, test_comments, test_labels = process_file()

In [5]:
# 训练数据和测试数据大小
print (len(train_comments), len(test_comments))

print (train_comments[1], train_labels[1])

8064 2500
手感超好，而且黑色相比白色在转得时候不容易眼花，找童年的记忆啦。 1


In [10]:
def load_stopwords(path):
    """
    从外部文件中导入停用词
    """
    stopwords = set()
    with open(path, 'r',encoding='utf-8') as in_file:
        for line in in_file:
            stopwords.add(line.strip())
    return stopwords


def clean_non_chinese_symbols(text):
    """
    处理非中文字符
    """
    text = re.sub('[!！]+', "!", text)
    text = re.sub('[?？]+', "?", text)
    text = re.sub("[a-zA-Z#$%&\'()*+,-./:;：<=>@，。★、…【】《》“”‘’[\\]^_`{|}~]+", " UNK ", text)
    # 将text中任意多个字符替换为空格
    return re.sub("\s+", " ", text)  

def clean_numbers(text):
    """
    处理数字符号  128  190  NUM 
    """
    return re.sub("\d+", ' NUM ', text)

def preprocess_text(text, stopwords):
    """
    文本的预处理过程
    """
    text = clean_non_chinese_symbols(text)
    text = clean_numbers(text)
    # 循环遍历所有文本将文本进行jieba分词，并将其中的停用词去除
    text = " ".join([term for term in jieba.cut(text) if term and not term in stopwords])
    return text

In [11]:
path_stopwords = "./data_sentiment/stopwords.txt"
stopwords = load_stopwords(path_stopwords)

In [12]:
# 对于train_comments, test_comments进行字符串的处理，几个考虑的点：
#   1. 停用词过滤
#   2. 去掉特殊符号
#   3. 去掉数字（比如价格..)
#   4. ...
#   需要注意的点是，由于评论数据本身很短，如果去掉的太多，很可能字符串长度变成0
#   预处理部部分，可以自行选择合适的方案，只要注释就可以。

train_comments_new = [preprocess_text(comment, stopwords) for comment in train_comments]
test_comments_new = [preprocess_text(comment, stopwords) for comment in test_comments]

print (train_comments_new[0], test_comments_new[0])


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Public\Documents\Wondershare\CreatorTemp\jieba.cache
Loading model cost 0.598 seconds.
Prefix dict has been built successfully.


发短信 特别 不 方便 ! 背后 屏幕 很大 起来 不 舒服   UNK   手触 屏 ! 切换 屏幕 很 麻烦 ! 终于 找到 同道中人 初中   UNK   已经 喜欢 上   UNK   同学 都 鄙夷 眼光 看   UNK   人为   UNK   样子 古怪 说 ＂ 丑 ＂ 当场 气晕 现在 同道中人   UNK   好开心 !   UNK   !   UNK  


In [13]:
#   利用tf-idf从文本中提取特征,写到数组里面. 
#   参考：https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf = TfidfVectorizer()
X_train =  tfidf.fit_transform(train_comments_new) # 训练数据的特征
y_train =  train_labels # 训练数据的label
X_test = tfidf.transform(test_comments_new) # 测试数据的特征
y_test = test_labels# 测试数据的label

print (np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))

(8064, 23101) (2500, 23101) (8064,) (2500,)


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf = MultinomialNB()
# 利用朴素贝叶斯做训练
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))

accuracy on test data:  0.6368


In [15]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))

accuracy on test data:  0.524


In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# StandardScaler()归一化，归一化可以：1、加快梯度下降求最优解的速度。2、有可能提高精度。
normalizer = StandardScaler()  # data is no longer sparse
X_train_normalized = normalizer.fit_transform(X_train.toarray())
X_test_normalized = normalizer.transform(X_test.toarray())

knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train_normalized, y_train)

#Now we can predict prices:
y_pred = knn.predict(X_test_normalized)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [17]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy on test data: ", accuracy_score(y_test, y_pred))

accuracy on test data:  0.7136
