In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets as sd
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.metrics as sm  # 评估模块

In [2]:
# 加载数据集
data = sd.load_files('./data/20news', encoding='latin1', random_state=7)
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
len(data.data)  # 2968封邮件

2968

In [4]:
data.data[0], data.target[0], data.target_names[4]

('From: gene@theporch.raider.net (Gene Wright)\nSubject: NASA Special Publications for Voyager Mission?\nOrganization: The MacInteresteds of Nashville, Tn.\nLines: 12\n\nI have two books, both NASA Special Publications, on the Voyager \nMissions. One is titled "Voyages to Jupiter" the other "Voyage to Saturn" \nThese were excellent books put together after the encounters with each \nplanet. \n\nThe question is: Did NASA ever put together a similar book for either the \nUranus encounter or Neptune? If so, what SP number is it and where can it \nbe obtained? If not, why didn\'t they?\n\n--\n  gene@theporch.raider.net (Gene Wright)\ntheporch.raider.net  615/297-7951 The MacInteresteds of Nashville\n',
 4,
 'sci.space')

**逻辑回归测试和引入朴素贝叶斯**

In [5]:
import sklearn.feature_extraction.text as ft
# 整理输入集与输出集  TFIDF
cv = ft.CountVectorizer()
bow = cv.fit_transform(data.data)
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow)

# 拆分测试集与训练集
train_x, test_x, train_y, test_y = ms.train_test_split(tfidf, data.target, test_size=0.2, random_state=7)

# 交叉验证
model = lm.LogisticRegression()

# 使用朴素贝叶斯
import sklearn.naive_bayes as nb
model = nb.MultinomialNB()
score = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(score.mean())

# 训练模型
model.fit(train_x, train_y)

# 测试模型，评估模型
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))

0.9354517048605283
              precision    recall  f1-score   support

           0       0.95      0.78      0.86       131
           1       0.93      0.97      0.95       122
           2       0.96      0.99      0.97       110
           3       0.89      0.98      0.93       121
           4       0.97      0.99      0.98       110

    accuracy                           0.94       594
   macro avg       0.94      0.94      0.94       594
weighted avg       0.94      0.94      0.94       594



**整理一组测试样本进行模型测试**

In [6]:
new_data = ["At the last game, a spectator was hit by a baseball and was hospitalized. ",
        "Recently, Lao Wang is working on asymmetric encryption algorithms. ",
        "The two-wheeled car runs well on the highway. ",
        "Next year, China will explore Mars. "]
# 把样本按照训练时的方式转换为tfidf矩阵，才可以交给模型
bow = cv.transform(new_data)
test_data = tt.transform(bow)
pred_test_y = model.predict(test_data)
print(pred_test_y)

[2 3 1 4]


In [7]:
data.target_names

['misc.forsale',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.crypt',
 'sci.space']