In [1]:
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Load the review dataset from the CSV file in the working directory.
data = pd.read_csv('out.csv')

In [3]:
# Feature frame holding only the raw review text. Take an explicit copy so that
# adding columns to X later does not write into a slice of `data`, which pandas
# can flag with SettingWithCopyWarning.
X = data[['review']].copy()
# Target labels; they share the index of `data`, and therefore of X.
y = data['label']

In [4]:
def cutword(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    The space-separated form lets a whitespace/regex based vectorizer treat
    each jieba token as one word.
    """
    # NOTE(review): use_paddle=True is requested here, but jieba.enable_paddle()
    # is never called in this notebook -- confirm paddle mode is actually active.
    tokens = jieba.cut(text, use_paddle=True)
    return ' '.join(tokens)

In [5]:
# Add the space-joined jieba tokens as a new column. assign() returns a new
# frame instead of writing into X in place, so this never mutates a slice of
# `data` (no SettingWithCopyWarning) and re-running the cell is safe.
X = X.assign(cutted_review=X['review'].apply(cutword))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.879 seconds.
Prefix dict has been built successfully.


In [6]:
# Split features and labels into train and test parts; random_state pins the
# shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [7]:
def get_custom_stopwords(stop_words_file):
    """Load a stop-word list from a text file containing one word per line.

    Args:
        stop_words_file: Path to a UTF-8 encoded text file. A leading
            byte-order mark is tolerated.

    Returns:
        list[str]: The stop words in file order, with surrounding whitespace
        removed and blank lines dropped.
    """
    # 'utf-8-sig' also decodes plain UTF-8, but strips a BOM so it cannot end
    # up glued onto the first stop word.
    with open(stop_words_file, encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines so the empty string never becomes a stop word.
        return [word for word in stripped if word]

In [8]:
# Read the stop-word list from the stop-word file in the working directory.
stop_words_file = 'cn_stopwords.txt'
stopwords = get_custom_stopwords(stop_words_file)

In [9]:
# Disabled experiment: the code below is wrapped in a string literal, so none
# of it runs; the cell only echoes the string. If executed, it would build a
# term matrix from a default CountVectorizer.
'''vect = CountVectorizer()
term_matrix = pd.DataFrame(vect.fit_transform(X_train.cutted_review).toarray(), columns=vect.get_feature_names())'''


'vect = CountVectorizer()\nterm_matrix = pd.DataFrame(vect.fit_transform(X_train.cutted_review).toarray(), columns=vect.get_feature_names())'

In [10]:
# Disabled experiment: the code below is wrapped in a string literal, so none
# of it runs; the cell only echoes the string. If executed, it would build a
# CountVectorizer term matrix with the custom stop words removed.
'''vect = CountVectorizer(stop_words=frozenset(stopwords))
term_matrix = pd.DataFrame(vect.fit_transform(X_train.cutted_review).toarray(), columns=vect.get_feature_names())
term_matrix.head()'''

'vect = CountVectorizer(stop_words=frozenset(stopwords))\nterm_matrix = pd.DataFrame(vect.fit_transform(X_train.cutted_review).toarray(), columns=vect.get_feature_names())\nterm_matrix.head()'

In [11]:
max_df = 0.8 # drop terms that occur in more than this fraction of documents (too common)
min_df = 3 # drop terms that occur in fewer than this many documents (too rare)
# TF-IDF vectorizer over the space-separated jieba tokens, with the custom stop words removed.
vect = TfidfVectorizer(max_df = max_df,
                       min_df = min_df,
                       # "(?u)" is the Unicode flag of Python's re module (it is not a
                       # case-insensitivity switch); "\b" anchors at word boundaries.
                       # The pattern keeps tokens of at least two word characters whose
                       # first character is not a digit.
                       token_pattern='(?u)\\b[^\\d\\W]\\w+\\b',
                       stop_words=frozenset(stopwords))

In [12]:
# Raw-count vectorizer configured like `vect`: it reuses the max_df / min_df
# values defined alongside `vect`, the same token pattern and the same stop words.
vect1 = CountVectorizer(max_df = max_df,
                       min_df = min_df,
                       # "(?u)" is the Unicode flag of Python's re module (it is not a
                       # case-insensitivity switch); "\b" anchors at word boundaries.
                       # The pattern keeps tokens of at least two word characters whose
                       # first character is not a digit.
                       token_pattern='(?u)\\b[^\\d\\W]\\w+\\b',
                       stop_words=frozenset(stopwords))

In [13]:
# Raw term counts for the training reviews, one column per vocabulary term.
# This fits the CountVectorizer `vect1`; fitting the TF-IDF `vect` here would
# make term_matrix1 a duplicate of term_matrix and leave vect1 unused.
_train_counts = vect1.fit_transform(X_train.cutted_review)
# Newer scikit-learn releases no longer provide get_feature_names(); use its
# replacement when available and fall back to the old name otherwise.
_count_terms = (vect1.get_feature_names_out()
                if hasattr(vect1, 'get_feature_names_out')
                else vect1.get_feature_names())
term_matrix1 = pd.DataFrame(_train_counts.toarray(), columns=_count_terms)

In [14]:
# TF-IDF weights for the training reviews, one column per vocabulary term.
_train_tfidf = vect.fit_transform(X_train.cutted_review)
# Newer scikit-learn releases no longer provide get_feature_names(); use its
# replacement when available and fall back to the old name otherwise.
_tfidf_terms = (vect.get_feature_names_out()
                if hasattr(vect, 'get_feature_names_out')
                else vect.get_feature_names())
term_matrix = pd.DataFrame(_train_tfidf.toarray(), columns=_tfidf_terms)

In [15]:
# Spot-check the TF-IDF weights of a few hand-picked terms in the training matrix.
term_matrix[['分钟','小时','迟到','终于','三点']]

Unnamed: 0,分钟,小时,迟到,终于,三点
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
8985,0.0,0.0,0.0,0.0,0.0
8986,0.0,0.0,0.0,0.0,0.0
8987,0.0,0.0,0.0,0.0,0.0
8988,0.0,0.0,0.0,0.0,0.0


In [16]:
# Total each term's column in term_matrix1, order the totals ascending and
# write them to index.csv for inspection.
(
    term_matrix1
    .sum()
    .sort_values()
    .to_csv('index.csv')
)

In [17]:
# Multinomial naive Bayes classifier with default settings.
nb = MultinomialNB()

In [18]:
from sklearn.pipeline import make_pipeline
# Chain the TF-IDF vectorizer and the classifier so that both are fitted
# together on whatever text the pipeline is given.
pipe = make_pipeline(vect, nb)

In [19]:
# Show the (name, estimator) pairs that make up the pipeline.
pipe.steps

[('tfidfvectorizer',
  TfidfVectorizer(max_df=0.8, min_df=3,
                  stop_words=frozenset({'', '$', '0', '1', '2', '3', '4', '5',
                                        '6', '7', '8', '9', '?', '_', '“', '”',
                                        '、', '。', '《', '》', '一', '一些', '一何', '一切',
                                        '一则', '一方面', '一旦', '一来', '一样', '一般', ...}),
                  token_pattern='(?u)\\b[^\\d\\W]\\w+\\b')),
 ('multinomialnb', MultinomialNB())]

In [20]:
# Mean 5-fold cross-validated accuracy on the training split.
from sklearn.model_selection import cross_val_score
cross_val_score(
    pipe,
    X_train.cutted_review,
    y_train,
    cv=5,
    scoring='accuracy',
).mean()

0.8466073414905451

In [21]:
# Fit the vectorizer and the classifier on the segmented training reviews.
pipe.fit(X_train.cutted_review, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.8, min_df=3,
                                 stop_words=frozenset({'', '$', '0', '1', '2',
                                                       '3', '4', '5', '6', '7',
                                                       '8', '9', '?', '_', '“',
                                                       '”', '、', '。', '《', '》',
                                                       '一', '一些', '一何', '一切',
                                                       '一则', '一方面', '一旦', '一来',
                                                       '一样', '一般', ...}),
                                 token_pattern='(?u)\\b[^\\d\\W]\\w+\\b')),
                ('multinomialnb', MultinomialNB())])

In [22]:
# Preview the predicted labels for the segmented test reviews (displayed only, not stored).
pipe.predict(X_test.cutted_review)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [23]:
# Keep the pipeline's predicted labels for the test reviews for the metrics below.
y_pred = pipe.predict(X_test.cutted_review)

In [24]:
from sklearn import metrics

In [25]:
# Fraction of test reviews whose predicted label equals the true label.
metrics.accuracy_score(y_test, y_pred)

0.8575241908575242

In [26]:
# Confusion matrix for the pipeline; in scikit-learn, rows are true labels and
# columns are predicted labels.
metrics.confusion_matrix(y_test, y_pred)

array([[1863,  119],
       [ 308,  707]], dtype=int64)

In [27]:
from snownlp import SnowNLP
def get_sentiment(text):
    """Return the ``sentiments`` score SnowNLP computes for ``text``.

    NOTE(review): presumably a positive-sentiment probability in [0, 1];
    confirm against the SnowNLP documentation.
    """
    analysis = SnowNLP(text)
    return analysis.sentiments

In [28]:
# Score every raw (unsegmented) test review with SnowNLP as a baseline.
y_pred_snownlp = X_test.review.apply(get_sentiment)

In [29]:
# Binarise the SnowNLP scores: strictly above 0.5 becomes 1, everything else 0.
# The comparison is vectorised; int64 matches what the element-wise version yields.
y_pred_snownlp_normalized = (y_pred_snownlp > 0.5).astype('int64')

In [30]:
# Peek at the first five binarised SnowNLP predictions.
y_pred_snownlp_normalized[:5]

3259    0
1488    0
9341    0
2619    0
9799    0
Name: review, dtype: int64

In [31]:
# Accuracy of the SnowNLP baseline on the same test labels.
metrics.accuracy_score(y_test, y_pred_snownlp_normalized)

0.7901234567901234

In [32]:
# Confusion matrix for the SnowNLP baseline; in scikit-learn, rows are true
# labels and columns are predicted labels.
metrics.confusion_matrix(y_test, y_pred_snownlp_normalized)

array([[1629,  353],
       [ 276,  739]], dtype=int64)