In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# jieba分词
import jieba

# 统计词频
from sklearn.feature_extraction.text import CountVectorizer

# 朴素贝叶斯 特征变量是离散变量，符合多项分布，在文档分类中特征变量体现在一个单词出现的次数，或者是单词的 TF-IDF 值等。|
from sklearn.naive_bayes import MultinomialNB

# 获取数据

In [3]:
data = pd.read_csv("../data/book/书籍评价.csv", encoding="gbk")
data[:5]

Unnamed: 0.1,Unnamed: 0,内容,评价
0,0,从编程小白的角度看，入门极佳。,好评
1,1,很好的入门书，简洁全面，适合小白。,好评
2,2,讲解全面，许多小细节都有顾及，三个小项目受益匪浅。,好评
3,3,前半部分讲概念深入浅出，要言不烦，很赞,好评
4,4,看了一遍还是不会写，有个概念而已,差评


# 数据基本处理
## 取出内容列，对数据进行分析

In [4]:
content = data["内容"]
content.head()

0              从编程小白的角度看，入门极佳。
1            很好的入门书，简洁全面，适合小白。
2    讲解全面，许多小细节都有顾及，三个小项目受益匪浅。
3          前半部分讲概念深入浅出，要言不烦，很赞
4             看了一遍还是不会写，有个概念而已
Name: 内容, dtype: object

## 判定评判标准 -- 1好评;0差评(训练没有用到)

In [6]:
data.loc[:, "评价"] == "好评"

0      True
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9      True
10    False
11    False
12    False
Name: 评价, dtype: bool

In [7]:
data["评价"] == "好评"

0      True
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9      True
10    False
11    False
12    False
Name: 评价, dtype: bool

In [8]:
# loc[]
data.loc[data["评价"] == "好评", "评论称号"] = 1
data.loc[data["评价"] == "差评", "评论称号"] = 0
data.head()

Unnamed: 0.1,Unnamed: 0,内容,评价,评论称号
0,0,从编程小白的角度看，入门极佳。,好评,1.0
1,1,很好的入门书，简洁全面，适合小白。,好评,1.0
2,2,讲解全面，许多小细节都有顾及，三个小项目受益匪浅。,好评,1.0
3,3,前半部分讲概念深入浅出，要言不烦，很赞,好评,1.0
4,4,看了一遍还是不会写，有个概念而已,差评,0.0


## 加载停用词

In [13]:
with open("../data/book/stopwords.txt", "r", encoding="utf-8") as f:
    # 取出所有行
    lines = f.read().splitlines()
# set去重
stopwords = list(set(lines))
stopwords[:10]

['碰巧', '老', '全体', '练习', '表示', '存心', '除此而外', '何以', '几番', '＝［']

## 把“内容”处理，转化成标准格式

In [14]:
comment_list = []
for temp in content:
    # 对文本数据进行切割
    # cut_all 参数默认为 False, 不一次性切割所有, 所有使用 cut 方法时默认为精确模式
    seg_split = jieba.cut(temp, cut_all=False)

    # 使用逗号分隔成字符串
    seg_str = ",".join(seg_split)
    comment_list.append(seg_str)
comment_list

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.895 seconds.
Prefix dict has been built successfully.


[' ,从,编程,小白,的,角度看,，,入门,极佳,。',
 '很,好,的,入门,书,，,简洁,全面,，,适合,小白,。',
 '讲解,全面,，,许多,小,细节,都,有,顾及,，,三个,小,项目,受益匪浅,。',
 '前半部,分讲,概念,深入浅出,，,要言不烦,，,很赞',
 '看,了,一遍,还是,不会,写,，,有个,概念,而已',
 '中规中矩,的,教科书,，,零,基础,的,看,了,依旧,看不懂',
 '内容,太,浅显,，,个人,认为,不,适合,有,其它,语言,编程,基础,的,人',
 '破书,一本',
 '适合,完完全全,的,小白读,，,有,其他,语言,经验,的,可以,去,看,别的,书',
 '基础知识,写,的,挺,好,的,！',
 '太,基础',
 '略,_,嗦,。,。,适合,完全,没有,编程,经验,的,小白',
 '真的,真的,不,建议,买']

# 统计词的个数

In [15]:
cv = CountVectorizer(stop_words=stopwords)

In [18]:
x = cv.fit_transform(comment_list).toarray()
x
# 0 1 是一句话包含哪一些词

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0

In [19]:
# 通过 get_feature_names_out()可获取词袋中所有文本的关键字
cv.get_feature_names_out()

array(['一本', '一遍', '三个', '中规中矩', '依旧', '入门', '内容', '分讲', '前半部', '受益匪浅',
       '基础', '基础知识', '完完全全', '小白', '小白读', '建议', '很赞', '教科书', '有个', '极佳',
       '概念', '浅显', '深入浅出', '看不懂', '真的', '破书', '简洁', '细节', '经验', '编程',
       '要言不烦', '角度看', '讲解', '语言', '适合', '项目', '顾及'], dtype=object)

In [21]:
pd.DataFrame(x, columns=cv.get_feature_names_out())

Unnamed: 0,一本,一遍,三个,中规中矩,依旧,入门,内容,分讲,前半部,受益匪浅,...,细节,经验,编程,要言不烦,角度看,讲解,语言,适合,项目,顾及
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,1,1
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,1,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 准备训练集和测试集

In [22]:
# 准备训练集   这里将文本前10行当做训练集  后3行当做测试集
x_train = x[:10]
x_val = x[10:]
x_train.shape, x_val.shape

((10, 37), (3, 37))

In [33]:
# 这里使用 "评价" 和 "评论称号" 都可以训练, 模型可以使用字符串作为label
y_train = data["评论称号"][:10]
y_val = data["评论称号"][10:]
y_train.shape, y_val.shape

((10,), (3,))

In [34]:
y_train, y_val

(0    1.0
 1    1.0
 2    1.0
 3    1.0
 4    0.0
 5    0.0
 6    0.0
 7    0.0
 8    0.0
 9    1.0
 Name: 评论称号, dtype: float64,
 10    0.0
 11    0.0
 12    0.0
 Name: 评论称号, dtype: float64)

# 模型训练

In [35]:
# alpha 为可选参数，默认 1.0，添加拉普拉修/Lidstone 平滑参数
mnb = MultinomialNB(alpha=1)

In [36]:
mnb.fit(x_train, y_train)

In [37]:
mnb.predict(x_val)

array([0., 0., 0.])

In [38]:
y_val

10    0.0
11    0.0
12    0.0
Name: 评论称号, dtype: float64

In [39]:
mnb.predict_proba(x_val)

array([[0.75319149, 0.24680851],
       [0.51708778, 0.48291222],
       [0.51281802, 0.48718198]])

# 模型评估

In [40]:
mnb.score(x_val, y_val)

1.0