## 单词计数向量

In [1]:
sample = ["Machine learning is fascinating, it is wonderful",
          "Machine learning is a sensational techonology",
          "Elsa is a popular character"]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
vec = CountVectorizer()
X = vec.fit_transform(sample)
X

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [4]:
# 使用接口get_feature_names()调用每个列的名称

vec.get_feature_names()  # 按照字母的顺序排列

['character',
 'elsa',
 'fascinating',
 'is',
 'it',
 'learning',
 'machine',
 'popular',
 'sensational',
 'techonology',
 'wonderful']

In [7]:
X.toarray()

array([[0, 0, 1, 2, 1, 1, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0],
       [1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]])

In [5]:
import pandas as pd

In [8]:
# 注意稀疏矩阵是无法输入pandas的
CVresult = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
CVresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0,0,1,2,1,1,1,0,0,0,1
1,0,0,0,1,0,1,1,0,1,1,0
2,1,1,0,1,0,0,0,1,0,0,0


## TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

In [10]:
vec = TFIDF()
X = vec.fit_transform(sample)
X  # 每一个单词作为一个特征，每个单词在这个句子中所占的比例

<3x11 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [11]:
# 同样使用接口get_feature_names()调用每个列的名称
TFIDFresult = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
TFIDFresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0.0,0.0,0.424396,0.50131,0.424396,0.322764,0.322764,0.0,0.0,0.0,0.424396
1,0.0,0.0,0.0,0.315444,0.0,0.406192,0.406192,0.0,0.534093,0.534093,0.0
2,0.546454,0.546454,0.0,0.322745,0.0,0.0,0.0,0.546454,0.0,0.0,0.0


### #使用TF-IDF编码之后，出现得多的单词的权重被降低了么？ 

In [16]:
CVresult.sum(axis=0)

character      1
elsa           1
fascinating    1
is             4
it             1
learning       2
machine        2
popular        1
sensational    1
techonology    1
wonderful      1
dtype: int64

In [12]:
CVresult.sum(axis=0).sum()

16

In [14]:
CVresult.sum(axis=0)/CVresult.sum(axis=0).sum()

character      0.0625
elsa           0.0625
fascinating    0.0625
is             0.2500
it             0.0625
learning       0.1250
machine        0.1250
popular        0.0625
sensational    0.0625
techonology    0.0625
wonderful      0.0625
dtype: float64

In [15]:
TFIDFresult.sum(axis=0) / TFIDFresult.sum(axis=0).sum()
# 将原本出现次数比较多的词压缩我们的权重
# 将原本出现次数比较少的词增加我们的权重

character      0.083071
elsa           0.083071
fascinating    0.064516
is             0.173225
it             0.064516
learning       0.110815
machine        0.110815
popular        0.083071
sensational    0.081192
techonology    0.081192
wonderful      0.064516
dtype: float64

## 探索文本数据

In [17]:
from sklearn.datasets import fetch_20newsgroups

In [18]:
# 初次使用这个数据集的时候，会在实例化的时候开始下载
data = fetch_20newsgroups()

In [19]:
# 通常我们使用data来查看data里面到底包含了什么内容
# 但由于fetch_20newsgourps这个类加载出的数据巨大，数据结构中混杂很多文字，因此很难去看清

In [20]:
# 不同类型的新闻
# 标签的分类都有哪些
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [26]:
# 其实fetch_20newsgroups也是一个类，既然是类，应该就有可以调用的参数
# 面对简单数据集，我们往往在实例化的过程中什么都不写，但是现在data中数据量太多，不方便探索
# 因此我们需要来看看我们的类fetch_20newsgroups都有什么样的参数可以帮助我们

In [27]:
import numpy as np
import pandas as pd

categories = ["sci.space",  # 科学技术 - 太空
              "rec.sport.hockey",  # 运动 - 曲棍球
              "talk.politics.guns",  # 政治 - 枪支问题
              "talk.politics.mideast"]  # 政治 - 中东问题

train = fetch_20newsgroups(subset="train", categories=categories)
test = fetch_20newsgroups(subset="test", categories=categories)

In [28]:
train
# 可以观察到，里面依然是类字典结构，我们可以通过使用键的方式来提取内容

{'data': ["From: tvartiai@vipunen.hut.fi (Tommi Vartiainen)\nSubject: Re: Finland/Sweden vs.NHL teams (WAS:Helsinki/Stockholm & NHL expansion)\nNntp-Posting-Host: vipunen.hut.fi\nOrganization: Helsinki University of Technology, Finland\nLines: 51\n\nIn <1993Apr16.195754.5476@ousrvr.oulu.fi> mep@phoenix.oulu.fi (Marko Poutiainen) writes:\n\n>: FINLAND:  \n>: \n>: D-Jyrki Lumme.......20\n>: D-Teppo Numminen....20\n>: D-Peter Ahola.......13\n>: \n>Well well, they don't like our defenders (mainly Lumme and Numminen)...\n\nAbout 25 is correct for Numminen and Lumme.\n\n\n>: R-Teemu Selanne.....27\n>: \n>Compared to Kurri, Selanne's points are too high, lets make it 25 or 26.\n\nNo, Kurri's points are too low. 27 for Kurri and 28 for Sel{nne.\n\n>: well in the Canada Cup and World Championships largely due to the efforts of\n>: Markus Ketterer (the goalie), 3-4 or the players listed above and luck. There's\n>: presumably a lot of decent players in Finland that wouldn't be superstars at\n>: t

In [29]:
train.target_names  # 四个类别，四个目录下的标签的分类

['rec.sport.hockey',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.mideast']

In [32]:
# 查看总共有多少篇文章存在
len(train.data)

2303

In [33]:
# 随意提取一篇文章来看看
print(train.data[0])

From: tvartiai@vipunen.hut.fi (Tommi Vartiainen)
Subject: Re: Finland/Sweden vs.NHL teams (WAS:Helsinki/Stockholm & NHL expansion)
Nntp-Posting-Host: vipunen.hut.fi
Organization: Helsinki University of Technology, Finland
Lines: 51

In <1993Apr16.195754.5476@ousrvr.oulu.fi> mep@phoenix.oulu.fi (Marko Poutiainen) writes:

>: FINLAND:  
>: 
>: D-Jyrki Lumme.......20
>: D-Teppo Numminen....20
>: D-Peter Ahola.......13
>: 
>Well well, they don't like our defenders (mainly Lumme and Numminen)...

About 25 is correct for Numminen and Lumme.


>: R-Teemu Selanne.....27
>: 
>Compared to Kurri, Selanne's points are too high, lets make it 25 or 26.

No, Kurri's points are too low. 27 for Kurri and 28 for Sel{nne.

>: well in the Canada Cup and World Championships largely due to the efforts of
>: Markus Ketterer (the goalie), 3-4 or the players listed above and luck. There's
>: presumably a lot of decent players in Finland that wouldn't be superstars at
>: the highest level but still valuable rol

In [34]:
# 查看一下我们的标签
np.unique(train.target)

array([0, 1, 2, 3])

In [35]:
len(train.target)

2303

In [36]:
# 是否存在样本不平衡问题？
for i in [1, 2, 3]:
    print(i, (train.target == i).sum()/len(train.target))

1 0.25749023013460703
2 0.23708206686930092
3 0.24489795918367346


## 使用TF-IDF将文本数据编码

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

In [38]:
Xtrain = train.data
Xtest = test.data
Ytrain = train.target
Ytest = test.target

In [39]:
tfidf = TFIDF().fit(Xtrain)

In [40]:
Xtrain_ = tfidf.transform(Xtrain)
Xtest_ = tfidf.transform(Xtest)

In [41]:
Xtrain_

<2303x40725 sparse matrix of type '<class 'numpy.float64'>'
	with 430306 stored elements in Compressed Sparse Row format>

In [42]:
tosee = pd.DataFrame(Xtrain_.toarray(), columns=tfidf.get_feature_names())

In [43]:
tosee.head()

Unnamed: 0,00,000,0000,00000,000000,000021,000062david42,000152,000246,000256,...,zwrm,zx,zx6wre,zxp,zxqi,zy,zyg,zz,zz_g9q3,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.058046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
tosee.shape

(2303, 40725)

## 在贝叶斯上分别建模，查看结果

In [48]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS

names = ["Multinomial", "Complement", "Bournulli"]
# 注意高斯朴素贝叶斯不接受稀疏矩阵
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]

In [50]:
for name, clf in zip(names, models):
    clf.fit(Xtrain_, Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_, Ytest)
    print(name)

    # 4个不同的标签取值下的布里尔分数
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        bs = BS((Ytest == i).astype(int), proba[:, i], pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i], bs))

    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial
	Brier under rec.sport.hockey:0.018
	Brier under sci.space:0.033
	Brier under talk.politics.guns:0.030
	Brier under talk.politics.mideast:0.026
	Average Brier:0.027
	Accuracy:0.975


Complement
	Brier under rec.sport.hockey:0.023
	Brier under sci.space:0.039
	Brier under talk.politics.guns:0.039
	Brier under talk.politics.mideast:0.033
	Average Brier:0.033
	Accuracy:0.986


Bournulli
	Brier under rec.sport.hockey:0.068
	Brier under sci.space:0.025
	Brier under talk.politics.guns:0.045
	Brier under talk.politics.mideast:0.053
	Average Brier:0.048
	Accuracy:0.902




In [52]:
from sklearn.calibration import CalibratedClassifierCV

name = ["Multinomial",
        "Multinomial + Isotonic",
        "Multinomial + Sigmoid",
        "Complement",
        "Complement + Isotonic",
        "Complement + Sigmoid",
        "Bernoulli",
        "Bernoulli + Isotonic",
        "Bernoulli + Sigmoid"]

models = [MultinomialNB(),
          CalibratedClassifierCV(MultinomialNB(), cv=2, method='isotonic'),
          CalibratedClassifierCV(MultinomialNB(), cv=2, method='sigmoid'),
          ComplementNB(),
          CalibratedClassifierCV(ComplementNB(), cv=2, method='isotonic'),
          CalibratedClassifierCV(ComplementNB(), cv=2, method='sigmoid'),
          BernoulliNB(),
          CalibratedClassifierCV(BernoulliNB(), cv=2, method='isotonic'),
          CalibratedClassifierCV(BernoulliNB(), cv=2, method='sigmoid')]

In [53]:
for name, clf in zip(names, models):
    clf.fit(Xtrain_, Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_, Ytest)
    print(name)
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        bs = BS((Ytest == i).astype(int), proba[:, i], pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i], bs))
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial
	Brier under rec.sport.hockey:0.018
	Brier under sci.space:0.033
	Brier under talk.politics.guns:0.030
	Brier under talk.politics.mideast:0.026
	Average Brier:0.027
	Accuracy:0.975


Complement
	Brier under rec.sport.hockey:0.006
	Brier under sci.space:0.012
	Brier under talk.politics.guns:0.013
	Brier under talk.politics.mideast:0.009
	Average Brier:0.010
	Accuracy:0.973


Bournulli
	Brier under rec.sport.hockey:0.006
	Brier under sci.space:0.012
	Brier under talk.politics.guns:0.013
	Brier under talk.politics.mideast:0.009
	Average Brier:0.010
	Accuracy:0.973




In [None]:
a