In [1]:
import os
import jieba
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Suppress every warning for the rest of the session. This also hides
# library deprecation notices, so disable it when debugging.
warnings.filterwarnings('ignore')

In [7]:
def cut_words(file_path):
    """
    Segment the text of one file into words with jieba.

    :param file_path: path to a txt file; it is decoded as gb18030 and bytes
        that cannot be decoded are dropped (``errors='ignore'``)
    :return: the segmented text as a single string in which every token is
        followed by one space (so a non-empty result ends with a space)
    """
    # Use a context manager so the handle is closed as soon as the text has
    # been read, instead of staying open until garbage collection.
    with open(file_path, 'r', encoding='gb18030', errors='ignore') as text_file:
        text = text_file.read()
    # A single join avoids repeated string concatenation while keeping the
    # "token + space" output format.
    return ''.join(word + ' ' for word in jieba.cut(text))

In [3]:
def loadfile(file_dir, label):
    """
    Segment every file found directly inside a directory and label it.

    :param file_dir: directory holding the txt documents of one category
    :param label: label assigned to every document in ``file_dir``
    :return: tuple ``(words_list, labels_list)``; ``words_list`` holds the
        ``cut_words`` output of each file and ``labels_list`` holds ``label``
        repeated once per file
    """
    words_list = []
    labels_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order reproducible across runs and platforms.
    for file_name in sorted(os.listdir(file_dir)):
        file_path = os.path.join(file_dir, file_name)
        # Sub-directories cannot be read as text, so skip them instead of
        # failing part-way through the load.
        if not os.path.isfile(file_path):
            continue
        words_list.append(cut_words(file_path))
        labels_list.append(label)
    return words_list, labels_list

In [4]:
# Training data: one (segmented documents, labels) pair per category folder.
train_words_list1, train_labels1 = loadfile(file_dir='./text classification/train/女性', label='女性')
train_words_list2, train_labels2 = loadfile(file_dir='./text classification/train/体育', label='体育')
train_words_list3, train_labels3 = loadfile(file_dir='./text classification/train/文学', label='文学')
train_words_list4, train_labels4 = loadfile(file_dir='./text classification/train/校园', label='校园')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/nz/4h9y8lkj6kg717r9k610_fph0000gn/T/jieba.cache
Loading model cost 0.637 seconds.
Prefix dict has been built successfully.


In [5]:
# Merge the per-category lists into one training corpus; documents and
# labels stay aligned by position.
train_words_list = [*train_words_list1, *train_words_list2, *train_words_list3, *train_words_list4]
train_labels = [*train_labels1, *train_labels2, *train_labels3, *train_labels4]

In [8]:
# Test data: one (segmented documents, labels) pair per category folder.
test_words_list1, test_labels1 = loadfile(file_dir='./text classification/test/女性', label='女性')
test_words_list2, test_labels2 = loadfile(file_dir='./text classification/test/体育', label='体育')
test_words_list3, test_labels3 = loadfile(file_dir='./text classification/test/文学', label='文学')
test_words_list4, test_labels4 = loadfile(file_dir='./text classification/test/校园', label='校园')

In [9]:
# Merge the per-category lists into one test corpus; documents and labels
# stay aligned by position.
test_words_list = [*test_words_list1, *test_words_list2, *test_words_list3, *test_words_list4]
test_labels = [*test_labels1, *test_labels2, *test_labels3, *test_labels4]

In [10]:
# Load the stop-word list, one entry per line. The 'utf-8-sig' codec drops a
# leading BOM (\ufeff) if present, and the context manager closes the file.
with open('./text classification/stop/stopword.txt', 'r', encoding='utf-8-sig') as stop_file:
    # Strip surrounding whitespace and drop blank lines so that no empty
    # entries (e.g. from a trailing newline) end up in the list.
    stop_words = [line.strip() for line in stop_file if line.strip()]

In [11]:
# TF-IDF term weighting: filter the stop words and ignore terms that appear
# in more than half of the documents (max_df=0.5).
tf = TfidfVectorizer(max_df=0.5, stop_words=stop_words)

In [12]:
# Echo the vectorizer so the notebook shows its full configuration.
tf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=[',', '?', '、', '。', '“', '”', '《', '》', '！', '，',
                            '：', '；', '？', '人民', '#', '###', '啊', '阿', '哎',
                            '哎呀', '哎哟', '唉', '俺', '俺们', '按', '按照', '吧', '吧哒',
                            '把', '罢了', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [13]:
# Fit the vectorizer on the training corpus and build the training feature
# matrix in one step.
train_features = tf.fit_transform(train_words_list)

In [17]:
# The vectorizer was already fitted on the training data above, so the test
# corpus is only transformed here (no re-fitting).
test_features = tf.transform(test_words_list)

In [18]:
# Multinomial naive Bayes classifier trained on the TF-IDF features.
clf = MultinomialNB(alpha=0.001)
clf.fit(train_features, train_labels)
clf

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)

In [19]:
# Predict a category label for every test document.
predict_labels = clf.predict(test_features)

In [21]:
# Accuracy: fraction of test documents whose predicted label matches the
# true label.
accuracy = metrics.accuracy_score(test_labels, predict_labels)
'准确率为：%f' % accuracy

'准确率为：0.910000'