In [None]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import numpy
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from gensim.models import KeyedVectors

In [None]:
#读取数据
df = pd.read_csv('分词后data.csv')
df = df.dropna()
print(df.head())

In [None]:
data = df['文本'].tolist()
label = df['标签'].tolist()
print(len(data), len(label)) #查看语料信息
print(Counter(label)) #查看不同标签文本数量

In [None]:
texts = [each.split() for each in data]
print(texts[0:5])

In [None]:
sentence_length = [len(x.split()) for x in data] #得到数据所有的文本长度
#画图
matplotlib.rcParams['font.sans-serif'] = 'SimHei' #matplotlib画图显示中文
plt.hist(sentence_length, 50, density = True)
plt.xlim(0,60)
plt.xlabel('文本长度')
plt.ylabel('文本数量占比')
plt.show()

In [None]:
plt.hist(sentence_length, 50, density = True, cumulative=True)
plt.xlim(0,30)
plt.xlabel('文本长度')
plt.ylabel('文本数量累计占比')
plt.show()

In [None]:
#划分训练数据和测试数据
train_sets, test_sets, train_label, test_label = train_test_split(data, label, test_size=0.3, random_state=42)

In [None]:
print(len(train_sets), len(train_label))
print(len(test_sets), len(test_label))

In [None]:
print(train_sets[0:5])
print(train_label[0:5])

In [None]:
#word2vec词向量
cn_model = KeyedVectors.load_word2vec_format('..\\4 词向量文档表示模型\\sgns.weibo.word.bz2', binary=False)

In [None]:
#对每个句子的所有词向量取均值，来生成一个句子的vector
def build_sentence_vector(sentence,size,w2v_model):
    sen_vec=np.zeros(size).reshape((1,size))
    count=0
    for word in sentence:
        try:
            sen_vec+=w2v_model[word].reshape((1,size))
            count+=1
        except KeyError:
            continue
    if count!=0:
        sen_vec/=count
    return sen_vec

In [None]:
train_data = np.zeros([len(train_sets),300], dtype = float)
for i in range(0, len(train_sets)):
    train_data[i] = build_sentence_vector(train_sets[i], 300, cn_model)

In [None]:
test_data = np.zeros([len(test_sets),300], dtype = float)
for i in range(0, len(test_sets)):
    test_data[i] = build_sentence_vector(test_sets[i], 300, cn_model)

In [None]:
print(train_data.shape)

In [None]:
#创建svm分类器
clf = svm.SVC(C=20, kernel='rbf', gamma=0.1)
#训练模型
svm_clf = clf.fit(train_data, train_label)

In [None]:
#推断预测集的分类结果
pre_label = svm_clf.predict(test_data)
print(pre_label[0:50])

In [None]:
#计算准确率和召回率。
p = metrics.precision_score(test_label, pre_label, average='binary') #准确率
r = metrics.recall_score(test_label, pre_label, average='binary') #召回率
f1score = metrics.f1_score(test_label, pre_label, average='binary') #f值
 
print(p)
print(r)
print(f1score)

In [None]:
#输出结果报告
print(classification_report(test_label, pre_label, digits=4, target_names = ['正常短信', '垃圾短信']))

In [None]:
#输出混淆矩阵
confusion_matrix = metrics.confusion_matrix(test_label, pre_label) #混淆矩阵
print(confusion_matrix) 

In [None]:
#预测新语句
inputs = build_sentence_vector("尊敬 会员 你好 建设路 意尔康 专卖店 依旧 换 新", 300, cn_model)
print(svm_clf.predict(inputs))

In [23]:
#网格搜索，寻找最优参数
from sklearn.model_selection import GridSearchCV
parameters={'kernel':['linear','rbf'],'C':np.linspace(0.1,20,5),'gamma':np.linspace(0.1,20,5)}
svc = svm.SVC()
model = GridSearchCV(svc,parameters,cv=5,scoring='accuracy')
model.fit(train_data, train_label)
print(model.best_params_)
print(model.score(test_data,test_label))

{'C': 10.049999999999999, 'gamma': 0.1, 'kernel': 'rbf'}
0.9597855227882037


In [24]:
#保存模型
import pickle

In [25]:
s=pickle.dumps(svm_clf)
f=open('svm.model', "wb+")
f.write(s)
f.close()

In [26]:
#读取模型，判断新的句子
f2=open('svm.model','rb')
s2=f2.read()
f2.close()
model1=pickle.loads(s2)


In [27]:
inputs = build_sentence_vector("作者 长篇大论 借用 详细", 300, cn_model)
print(model1.predict(inputs))

[0]


In [28]:
#使用朴素贝叶斯进行训练
mnb = GaussianNB()   # 使用默认配置初始化朴素贝叶斯
mnb.fit(train_data, train_label)    # 利用训练数据对模型参数进行估计

In [29]:
#推断预测集的分类结果
pre_label = mnb.predict(test_data)
print(classification_report(test_label, pre_label, digits=4, target_names = ['正常短信', '垃圾短信']))

              precision    recall  f1-score   support

        正常短信     0.9967    0.9159    0.9546       333
        垃圾短信     0.5821    0.9750    0.7290        40

    accuracy                         0.9223       373
   macro avg     0.7894    0.9455    0.8418       373
weighted avg     0.9523    0.9223    0.9304       373



In [30]:
#使用随机森林进行训练
rf_clf = RandomForestClassifier()
rf_clf.fit(train_data, train_label) 

In [31]:
#推断预测集的分类结果
pre_label = rf_clf.predict(test_data)
print(classification_report(test_label, pre_label, digits=4, target_names = ['正常短信', '垃圾短信']))

              precision    recall  f1-score   support

        正常短信     0.9024    1.0000    0.9487       333
        垃圾短信     1.0000    0.1000    0.1818        40

    accuracy                         0.9035       373
   macro avg     0.9512    0.5500    0.5653       373
weighted avg     0.9129    0.9035    0.8665       373

