# 文本分类实现

## 1 数据预处理

## 1.1 导入包

In [1]:
import jieba
import pandas as pd

## 1.2 读取所有文本数据

In [2]:
df_technology = pd.read_csv('../data/technology_news.csv',encoding='utf-8')
df_technology = df_technology.dropna()

df_tour = pd.read_csv('../data/tour.csv',encoding='gbk')
df_tour = df_tour.dropna()

df_entertainment = pd.read_csv('../data/entertainment_news.csv',encoding='utf-8')
df_entertainment = df_entertainment.dropna()

df_military = pd.read_csv('../data/military_news.csv',encoding='utf-8')
df_military = df_military.dropna()

df_sports = pd.read_csv('../data/sports_news.csv',encoding='utf-8')
df_sports = df_sports.dropna()


technology = df_technology.content.values.tolist()[1000:6000]
tour = df_tour.content.values.tolist()[1000:6000]
entertainment = df_entertainment.content.values.tolist()[:5000]
military = df_military.content.values.tolist()[:5000]
sports = df_sports.content.values.tolist()[:5000]

## 1.3 读取预测的单一文本

In [3]:
ori_data = pd.read_csv('C:/Users/Mars/Desktop/july/pachong/data/data.txt',index_col=False,names = ['content'])
ori_data = ori_data.dropna()
# ori_data.head(10)
content = ori_data.content.values.tolist()
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)
        for seg in segs:
            if len(seg)>1 and seg!='\r\n':
                segment.append(seg)
    except:
        print(line)
        continue
# print(segment)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Mars\AppData\Local\Temp\jieba.cache
Loading model cost 0.888 seconds.
Prefix dict has been built succesfully.


## 1.4 读取停用词

In [4]:
stopwords = pd.read_csv('../data/stopwords.txt',index_col=False,quoting=3,sep='\t',
                        names=['stopword'],encoding='utf-8')
stopwords = stopwords['stopword'].values

## 1.5 定义数据预处理函数

In [5]:
def preprocess(content_line,sentences,category):
    for line in content_line:
        try:
            segs = jieba.lcut(line)
            segs = filter(lambda x:len(x)>1,segs)
            segs = filter(lambda x:x not in stopwords,segs)
            sentences.append((" ".join(segs),category))
        except:
            print(line)
            continue
            
sentences = []
preprocess(technology,sentences,'tenchnology')
preprocess(tour,sentences,'tour')
preprocess(entertainment,sentences,'entertainment')
preprocess(military,sentences,'military')
preprocess(sports,sentences,'sports')

print("preprocess ending!")

preprocess ending!


In [6]:
segs = []
segment = filter(lambda x:x not in stopwords,segment)
#print(list(segment))
segs.append(" ".join(segment))
# print(segs)

## 1.6 切分数据集

In [7]:
import random
random.shuffle(sentences)

In [8]:
from sklearn.model_selection import train_test_split
x,y = zip(*sentences)
train_data,test_data,train_target,test_target = train_test_split(x,y,random_state=1234)

In [9]:
len(train_data),len(test_data),len(train_data)/len(test_data)

(18750, 6250, 3.0)

In [10]:
# print(train_data[:10])

# 2 特征抽取

# 2.1 词袋模型

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    analyzer = 'word',
    ngram_range=(1,4),
    max_features = 1000,
)
# 训练数据向量化
X_train = vec.fit_transform(train_data)

#测试数据向量化
X_test = vec.transform(test_data)

#预测数据向量化
pre_data = vec.transform(segs)
#print(X)
word = vec.get_feature_names()
# print(word)
# def get_feature(x):
#     vec.transform(x)

# 2.2 TF-IDF模型

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf_X_train = transformer.fit_transform(X_train)
tfidf_X_test = transformer.transform(X_test)

# 3 机器学习实现文本分类

## 3.1 朴素贝叶斯模型

In [13]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
classifier = MultinomialNB()

## 3.2 使用词袋模型

In [14]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
def calculate_result(actual,pred):
    m_precision = precision_score(actual,pred,average='micro')
    m_recall = recall_score(actual,pred,average='macro')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:.3f}'.format(m_recall))
    print('f1-socre:{0:.3f}'.format(f1_score(actual,pred,average='macro')))

In [15]:
classifier.fit(X_train,train_target)
# socre = classifier.score(X_test,test_target)
# pred = classifier.predict(X_test)
# test_target = np.array(test_target)
# calculate_result(test_target,pred)
classifier.predict(X_test)
classifier.score(X_test,test_target)
pred = classifier.predict(X_test)
calculate_result(test_target,pred)

precision:0.781
recall:0.781
f1-socre:0.780


## 3.3 使用TF-IDF

In [17]:
classifier.fit(tfidf_X_train,train_target)
classifier.score(tfidf_X_test,test_target)
pred_2 = classifier.predict(tfidf_X_test)
calculate_result(test_target,pred_2)

precision:0.784
recall:0.785
f1-socre:0.784


## 3.4 预测属于类别

In [18]:
classifier.predict(pre_data)

array(['tour'], 
      dtype='<U13')

# 3.5 SVM模型

In [20]:
from sklearn import svm

In [21]:
clf = svm.SVC()

In [22]:
clf.fit(X_train,train_target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
clf.score(X_test,test_target)

0.68655999999999995

In [24]:
pred_3 = clf.predict(X_test)
calculate_result(pred_3,test_target)

precision:0.687
recall:0.827
f1-socre:0.695


# 神经网络实现文本分类

# CNN网络模型

## 1.导入包

In [6]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import numpy as np
from sklearn import metrics
import tensorflow as tf

## 2.定义超参数，后面根据数据调整

In [7]:
learn = tf.contrib.learn

FLAGS = None

MAX_DOCUMENT_LENGTH = 100

MIN_WORD_FREQUENCE = 2

EMBEDDING_SIZE = 20

N_FILTERS = 10

WINDOW_SIZE = 20

FILTER_SHAPE1 = [WINDOW_SIZE,EMBEDDING_SIZE]
FILTER_SHAPE2 = [WINDOW_SIZE,N_FILTERS]

POOLING_WINDOW = 4
POOLING_STRIDE = 2

n_words = 0

## 3.定义cnn模型，两个隐层

In [8]:
def cnn_model(features,target):
    
    target = tf.one_hot(target,15,1,0)
    word_vectors = tf.contrib.layers.embed_sequence(
                    features,vocab_size=n_words,embed_dim=EMBEDDING_SIZE,scope='words')
    word_vectors = tf.expand_dims(word_vectors,3)
    
    with tf.variable_scope('CNN_Layer1'):
        
        conv1 = tf.contrib.layers.convolution2d(
                word_vectors,N_FILTERS,FILTER_SHAPE1,padding='VALID')
        
        conv1 = tf.nn.relu(conv1)
        
        pool1 = tf.nn.max_pool(
                conv1,
                ksize=[1,POOLING_WINDOW,1,1],
                strides = [1,POOLING_STRIDE,1,1],
                padding='SAME')
        
        pool1 = tf.transpose(pool1,[0,1,3,2])
        
    with tf.variable_scope('CNN_Layer2'):
        
        conv2 = tf.contrib.layers.convolution2d(
                pool1,N_FILTERS,FILTER_SHAPE2,padding='VALID')
        
        pool2 = tf.squeeze(tf.reduce_max(conv2,1),squeeze_dims=[1])
        
    
    logits = tf.contrib.layers.fully_connected(pool2,15,activation_fn=None)
    loss = tf.losses.softmax_cross_entropy(target,logits)
    
    train_op = tf.contrib.layers.optimize_loss(
                loss,
                tf.contrib.framework.get_global_step(),
                optimizer='Adam',
                learning_rate=0.01)
    
    return ({
        'class':tf.argmax(logits,1),
        'prob':tf.nn.softmax(logits)
    },loss,train_op)

## 4.数据向量化

In [9]:
global n_words
vocab_processor = learn.preprocessing.VocabularyProcessor(
                    MAX_DOCUMENT_LENGTH,min_frequency=MIN_WORD_FREQUENCE)
x_train = np.array(list(vocab_processor.fit_transform(train_data)))
x_test = np.array(list(vocab_processor.transform(test_data)))
n_words = len(vocab_processor.vocabulary_)
print('Total words:%d'%n_words)

Total words:50281


## 5.类别对应表

In [10]:
cate_dic = {'technology':1,'car':2,'entertainment':3,'military':4,'sports':5}
train_target = map(lambda x:cate_dic[x],train_target)
test_target = map(lambda x:cate_dic[x],test_target)
y_train = pd.Series(train_target)
y_test = pd.Series(test_target)

## 6.构造分类器

In [11]:
classifier = learn.SKCompat(learn.Estimator(model_fn=cnn_model))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026B63EB2DA0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Mars\\AppData\\Local\\Temp\\tmptt32vsl4'}


## 7.输出准确率

In [12]:
classifier.fit(x_train, y_train, steps=1000)
y_predicted = classifier.predict(x_test)['class']
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Mars\AppData\Local\Temp\tmptt32vsl4\model.ckpt.
INFO:tensorflow:loss = 2.70804, step = 1
INFO:tensorflow:global_step/sec: 12.1625
INFO:tensorflow:loss = 0.906659, step = 101 (8.243 sec)
INFO:tensorflow:global_step/sec: 13.5506
INFO:tensorflow:loss = 0.522719, step = 201 (7.360 sec)
INFO:tensorflow:global_step/sec: 13.967
INFO:tensorflow:loss = 0.438215, step = 301 (7.161 sec)
INFO:tensorflow:global_step/sec: 13.8374
INFO:tensorflow:loss = 0.466484, step = 401 (7.226 sec)
INFO:tensorflow:global_step/sec: 13.0419
INFO:tensorflow:loss = 0.193651, step = 501 (7.668 sec)
INFO:tensorflow:global_step/sec: 13.267
INFO:tensorflow:loss = 0.128035, step = 601 (7.538 sec)
INFO:tensorflow:global_step/sec: 13.0368
INFO:tensorflow:loss = 0.230962, step = 701 (7.671 sec)
INFO:tensorflow:global_step/sec: 13.1172
INFO:tensorflow:loss = 0.205118, step = 801 (7.623 sec)
INFO:tensorflow:global_step/sec: 12.57

# 词袋模型

In [13]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import numpy as np
import pandas
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import encoders

In [14]:
learn = tf.contrib.learn

FLAGS = None

MAX_DOCUMENT_LENGTH = 15
MIN_WORD_FREQUENCE = 1
EMBEDDING_SIZE = 50

In [15]:
global n_words

vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH,
                                                        min_frequency=MIN_WORD_FREQUENCE)

x_train = np.array(list(vocab_processor.fit_transform(train_data)))
x_test = np.array(list(vocab_processor.transform(test_data)))
n_words = len(vocab_processor.vocabulary_)
print('Total words:%d'%n_words)

Total words:67631


In [16]:
def bag_of_words_model(features,target):
    target = tf.one_hot(target,15,1,0)
    features = encoders.bow_encoder(
                features,
                vocab_size=n_words,
                embed_dim=EMBEDDING_SIZE)
    logits = tf.contrib.layers.fully_connected(features,15,activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits,target)
    train_op = tf.contrib.layers.optimize_loss(
                loss,
                tf.contrib.framework.get_global_step(),
                optimizer = 'Adam',
                learning_rate = 0.01)
    
    return ({
        'class':tf.argmax(logits,1),
        'prob':tf.nn.softmax(logits)
    },loss,train_op)

In [17]:
model_fn = bag_of_words_model
classifier = learn.SKCompat(learn.Estimator(model_fn=model_fn))

classifier.fit(x_train,y_train,steps=1000)
y_predicted = classifier.predict(x_test)['class']
score = metrics.accuracy_score(y_test,y_predicted)
print('Acc:{0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001AE0E0978D0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Mars\\AppData\\Local\\Temp\\tmpi3ma6bzf'}
Instructions for updating:
Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed.
Instructions for updating:
Use tf.losses.compute_weighted_loss instead.
Instructions for updating:
Use tf.losses.add_loss instead.
INFO:tensorflow:Create CheckpointSaverH

# GRU模型

In [18]:
def rnn_model(features,target):
    word_vectors = tf.contrib.layers.embed_sequence(
                    features,
                    vocab_size=n_words,
                    embed_dim=EMBEDDING_SIZE,
                    scope='words')
    word_list = tf.unstack(word_vectors,axis=1)
    
    cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)
    
    _,encoding = tf.contrib.rnn.static_rnn(cell,word_list,dtype=tf.float32)
    
    target = tf.one_hot(target,15,1,0)
    
    logits = tf.contrib.layers.fully_connected(encoding,15,activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits,target)
    
    train_op = tf.contrib.layers.optimize_loss(
                loss,
                tf.contrib.framework.get_global_step(),
                optimizer='Adam',
                learning_rate=0.01)
    
    return ({
        'class':tf.argmax(logits,1),
        'prob':tf.nn.softmax(logits)
    },loss,train_op)

In [19]:
model_fn = rnn_model
classifier = learn.SKCompat(learn.Estimator(model_fn=model_fn))

classifier.fit(x_train,y_train,steps=1000)
y_predicted = classifier.predict(x_test)['class']
score = metrics.accuracy_score(y_test,y_predicted)
print('Acc:{0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026B656F05C0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Mars\\AppData\\Local\\Temp\\tmpnv3_stbn'}
Instructions for updating:
Use tf.losses.softmax_cross_entropy instead. Note that the order of the logits and labels arguments has been changed.
Instructions for updating:
Use tf.losses.compute_weighted_loss instead.
Instructions for updating:
Use tf.losses.add_loss instead.
INFO:tensorflow:Create CheckpointSaverH