## 数据读取

In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import sklearn
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import jieba
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']

In [2]:
data_path = "../data/data.csv"
stopword_path = "../data/stop_words.txt"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,类别名字,文本数据
0,娱乐,黎明喜当爹满面春风，乐基儿发福成大妈！离婚后两人差距这么大？
1,旅游,亚洲排名第一都市，北京GDP不及它一半，上海10年都赶超不了
2,国际,悠仁：日本明仁天皇孙辈唯一男丁堪称“独苗”，将来可能继位
3,娱乐,被意大利炮干掉的阿部规秀到底是个什么官？
4,财经,公司亏损时，股东如何维护自己的利益？


In [3]:
# 去除证券类
data = data[data["类别名字"] != "证券"]

## 分词去停用词

In [4]:
with open(stopword_path, "r", encoding="utf-8") as f:
    stopwords = f.readlines()

stopwords = [i.strip() for i in stopwords]

def segment(data):
    data = jieba.cut(data)
    data = [i for i in data if i not in stopwords]
    return data

In [5]:
data["分词后的数据"] = data["文本数据"].apply(segment)
data.head()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/bj/34ww9tjd5mz5wrq2jqgqrtw40000gn/T/jieba.cache
Loading model cost 0.590 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,类别名字,文本数据,分词后的数据
0,娱乐,黎明喜当爹满面春风，乐基儿发福成大妈！离婚后两人差距这么大？,"[黎明, 喜当爹, 满面春风, 乐基儿, 发福, 成, 大妈, 离婚, 两人, 差距, 大]"
1,旅游,亚洲排名第一都市，北京GDP不及它一半，上海10年都赶超不了,"[亚洲, 排名, 第一, 都市, 北京, GDP, 不及, 它, 一半, 上海, 10, 年..."
2,国际,悠仁：日本明仁天皇孙辈唯一男丁堪称“独苗”，将来可能继位,"[悠仁, 日本, 明, 仁天皇, 孙辈, 唯一, 男丁, 堪称, 独苗, 将来, 可能, 继位]"
3,娱乐,被意大利炮干掉的阿部规秀到底是个什么官？,"[被, 意大利, 炮, 干掉, 阿部规秀, 到底, 是, 个, 什么, 官]"
4,财经,公司亏损时，股东如何维护自己的利益？,"[公司, 亏损, 时, 股东, 如何, 维护, 自己, 利益]"


In [25]:
data_all = data["分词后的数据"].tolist()
print(np.shape(data_all))
print(data_all[0])

(294304,)
['黎明', '喜当爹', '满面春风', '乐基儿', '发福', '成', '大妈', '离婚', '两人', '差距', '大']


In [26]:
def get_word2vec(sentences,fname,min_count_num,embedding_dim):
    if not os.path.exists(fname):
        model = Word2Vec(sentences, min_count=min_count_num, size=embedding_dim)
        model.save(fname)
    else:
        model = Word2Vec.load(fname)
    
    unk_vector = np.zeros(shape=[embedding_dim,])
    
    data_tensor = []
    for sent in sentences:        
        sent_matrix = []
        for word in sent:
            try:
                temp_vector = model[word]
            except KeyError:
                temp_vector = unk_vector
            sent_matrix.append(temp_vector)
        sent_matrix = np.mean(sent_matrix,axis=0)
        data_tensor.append(sent_matrix)
        
    return np.array(data_tensor)

In [27]:
embedding_dim = 50
min_count_num=5
fname = '../data/word2vec.model'
data_all = get_word2vec(data_all,fname,min_count_num,embedding_dim=embedding_dim)
print(np.shape(data_all))

  from ipykernel import kernelapp as app


(294304, 50)


## 模型

In [11]:
label_all = data["类别名字"].tolist()

def change_label(data):
    dict_all = {}
    set_all = list(set(data))
    for i,content in enumerate(set_all):
        dict_all[content] = i
    return dict_all
dict_all = change_label(label_all)
label_id = [dict_all[i] for i in label_all]

In [12]:
print(np.shape(data_all))
print(np.shape(label_id))
print(dict_all)

(294304, 50)
(294304,)
{'文化': 0, '科技': 1, '电竞': 2, '体育': 3, '汽车': 4, '农业': 5, '房产': 6, '军事': 7, '旅游': 8, '民生': 9, '教育': 10, '国际': 11, '财经': 12, '娱乐': 13}


## 训练集和测试集的划分

In [13]:
from sklearn.model_selection import train_test_split

train_data,test_data,train_label,test_label = train_test_split(data_all,label_id,train_size=0.8,test_size=0.2)
print(np.shape(train_data))
print(np.shape(test_data))
print(np.shape(train_label))
print(np.shape(test_label))

(235443, 50)
(58861, 50)
(235443,)
(58861,)


## 模型部分

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

model = LogisticRegression()
model.fit(train_data,train_label)
pred_result = model.predict(test_data)

train_pred_result = model.predict(train_data)



In [15]:
print("训练集结果：")
accuracy = accuracy_score(y_pred=train_pred_result,y_true=train_label)
precision = np.average(precision_score(y_pred=train_pred_result,y_true=train_label,average=None))
recall = np.average(recall_score(y_pred=train_pred_result,y_true=train_label,average=None))
f1score = np.average(f1_score(y_pred=train_pred_result,y_true=train_label,average=None))
print("准确率：",accuracy)
print("紧缺率：",precision)
print("召回率：",recall)
print("f1值：",f1score)

print("测试集结果：")
accuracy = accuracy_score(y_pred=pred_result,y_true=test_label)
precision = np.average(precision_score(y_pred=pred_result,y_true=test_label,average=None))
recall = np.average(recall_score(y_pred=pred_result,y_true=test_label,average=None))
f1score = np.average(f1_score(y_pred=pred_result,y_true=test_label,average=None))
print("准确率：",accuracy)
print("紧缺率：",precision)
print("召回率：",recall)
print("f1值：",f1score)


训练集结果：
准确率： 0.7020382852749922
紧缺率： 0.6885845691203977
召回率： 0.6702205195433673
f1值： 0.6776100379896441
测试集结果：
准确率： 0.7025534734374204
紧缺率： 0.6890568968628153
召回率： 0.6706090696632047
f1值： 0.6781137466293281


## 测试

In [24]:
input_data = input("请输入待分类的文本：")

with open(stopword_path,"r",encoding="utf-8") as f:
    stopwords = f.readlines()

stopwords = [i.strip() for i in stopwords]

def segment(data):
    data = list(jieba.cut(data))
    data = [i for i in data if i not in stopwords]
    return data


def get_sent_word2vec(sent,seq_length=30):
    
    seg_data = segment(sent)
    word2vec_model = Word2Vec.load("../data/word2vec.model")
    
    unk_vector = np.zeros(shape=[embedding_dim,])
    sent_matrix = []
    for word in sent:
        try:
            temp_vector = word2vec_model[word]
        except KeyError:
            temp_vector = unk_vector
        sent_matrix.append(temp_vector)
    tensor = np.reshape(np.mean(sent_matrix,axis=0),(1,embedding_dim))
    return tensor
    


data_vector = get_sent_word2vec(input_data)

pred_result = model.predict(data_vector)
dict_all_reverse = dict(zip(dict_all.values(),dict_all.keys()))
print("当前文本类别为:",dict_all_reverse[pred_result[0]])

请输入待分类的文本：DF-17导弹搭载核弹头发向美国
当前文本类别为: 军事


