1.读取数据

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import casual_tokenize
import numpy as np
import pandas as pd

In [3]:
# Load the SMS spam corpus from a project-relative CSV file into a DataFrame.
# NOTE(review): the previews of this frame show an 'Unnamed: 0' column, which
# suggests the CSV's first column is a saved row index; passing index_col=0
# would likely drop it -- confirm before changing.
sms = pd.read_csv(filepath_or_buffer='dataset/sms-spam.csv')

In [4]:
# Preview the first 20 rows of the raw corpus.
sms.head(n=20)

Unnamed: 0.1,Unnamed: 0,spam,text
0,0,0,"Go until jurong point, crazy.. Available only ..."
1,1,0,Ok lar... Joking wif u oni...
2,2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,0,U dun say so early hor... U c already then say...
4,4,0,"Nah I don't think he goes to usf, he lives aro..."
5,5,1,FreeMsg Hey there darling it's been 3 week's n...
6,6,0,Even my brother is not like to speak with me. ...
7,7,0,As per your request 'Melle Melle (Oru Minnamin...
8,8,1,WINNER!! As a valued network customer you have...
9,9,1,Had your mobile 11 months or more? U R entitle...


In [7]:
# Report the DataFrame's dimensions as (rows, columns).
sms.shape

(4837, 3)

处理数据

In [8]:
# Build one row label per message: 'sms<N>' followed by '!' repeated `spam`
# times, so that rows flagged as spam stand out in printed output.
index = [f"sms{i}{'!' * j}" for i, j in enumerate(sms.spam)]

In [9]:
# Replace the row index of `sms` with the labels held in `index`.
# This mutates `sms` in place.
sms.index = index

In [10]:
# Print the first six rows to check the new row labels.
print(sms.iloc[:6])

       Unnamed: 0  spam                                               text
sms0            0     0  Go until jurong point, crazy.. Available only ...
sms1            1     0                      Ok lar... Joking wif u oni...
sms2!           2     1  Free entry in 2 a wkly comp to win FA Cup fina...
sms3            3     0  U dun say so early hor... U c already then say...
sms4            4     0  Nah I don't think he goes to usf, he lives aro...
sms5!           5     1  FreeMsg Hey there darling it's been 3 week's n...


计算词袋向量

In [24]:
# Compute bag-of-words count vectors, one row per SMS message.
np.random.seed(42)

# CountVectorizer converts the texts into a term-count matrix: fit_transform
# learns the vocabulary and counts how often each token occurs in each message.
# Tokenisation is delegated to NLTK's casual_tokenize.
counter = CountVectorizer(tokenizer=casual_tokenize)
bow_docs = pd.DataFrame(
    counter.fit_transform(raw_documents=sms.text).toarray(),
    index=index,
)

# vocabulary_ maps term -> column number; sort by column number so the term
# labels line up with the columns of the count matrix.
column_nums, terms = zip(
    *sorted((col, term) for term, col in counter.vocabulary_.items())
)
bow_docs.columns = terms


再次检查，看这里的词频对标记为“sms0”的第一条短消息是否有意义

In [25]:
# Show the text of the first message, the row labelled 'sms0'.
print(sms.loc['sms0', 'text'])


Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [26]:
# Print the first few tokens that have a non-zero count in message 'sms0'.
print(bow_docs.loc['sms0'].loc[lambda counts: counts > 0].head())

,            1
..           1
...          2
amore        1
available    1
Name: sms0, dtype: int64


用 LDiA 为短消息语料库创建主题

In [27]:
from sklearn.decomposition import LatentDirichletAllocation as LDiA

In [28]:
# Fit a 16-topic Latent Dirichlet Allocation (LDiA) model on the bag-of-words
# counts.
# random_state is pinned on the estimator so that re-running this cell on its
# own reproduces the same topics. Without it the fit draws from NumPy's global
# random state, and the result depends on np.random.seed(42) having been
# executed immediately beforehand in an earlier cell.
# NOTE(review): seeding the estimator with 42 is expected to give the same
# topics as a fresh top-to-bottom run of the original notebook -- confirm.
ldia = LDiA(n_components=16, learning_method='batch', random_state=42)
ldia = ldia.fit(bow_docs)

In [29]:
# Shape of the fitted topic-term weight matrix (one row per topic).
ldia.components_.shape

(16, 9232)

因此，上述模型已经将 9232 个词分配到 16 个主题中。

In [30]:
# Lay out the fitted topic-term weights as a table to see how terms are
# distributed across the topics.
pd.options.display.width = 75  # show at most 75 characters per output line
columns = [f'topic{i}' for i in range(ldia.n_components)]
# Transpose so that rows are terms and columns are topics.
components = pd.DataFrame(data=ldia.components_.T, columns=columns, index=terms)

In [35]:
# Preview the topic weights of the first ten terms, rounded to two decimals.
components.head(10).round(2)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
!,184.03,15.0,72.22,394.95,45.48,36.14,9.55,44.81,0.43,90.23,37.42,44.18,64.4,297.29,41.16,11.7
"""",0.68,4.22,2.41,0.06,152.35,0.06,0.06,0.06,0.45,0.68,8.42,11.42,0.07,62.72,12.27,0.06
#,0.06,0.06,0.06,0.06,0.06,2.07,0.06,0.06,0.06,0.06,0.06,0.06,1.07,4.05,0.06,0.06
#150,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,1.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06
#5000,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,3.06,0.06,0.06,0.06,0.06,0.06,0.06
$,1.09,2.99,0.06,0.06,1.13,0.06,0.06,1.06,8.68,0.06,0.06,1.06,0.06,5.42,2.06,0.06
%,0.06,0.06,0.06,1.06,0.06,4.95,0.06,0.06,0.06,0.06,0.06,2.17,0.06,0.06,2.06,0.06
&,10.26,0.06,0.06,47.49,22.58,9.97,19.01,0.06,0.06,107.26,10.09,0.06,0.06,50.24,7.42,10.31
',0.06,0.06,0.06,0.06,21.08,0.06,0.06,0.06,0.06,3.39,0.06,0.06,0.06,7.87,0.06,127.92
(,0.06,0.06,0.35,2.16,9.95,0.06,13.42,0.06,0.06,52.09,3.75,0.06,0.06,0.89,4.88,0.06


'!' 被分配到大多数主题中，但它其实是 topic3 中特别重要的一部分，而在该主题中引号（"）几乎不起作用。或许 topic3 关注的是情感的强度或强调，并不太在意数值或者引用。

In [34]:
# List the ten terms with the largest weights in topic3.
components['topic3'].sort_values(ascending=False).head(10)

!       394.952246
.       218.049724
to      119.533134
u       118.857546
call    111.948541
£       107.358914
,        96.954384
*        90.314783
your     90.215961
is       75.750037
Name: topic3, dtype: float64

在拟合 LDA 分类器之前，需要为所有短消息计算出 LDiA 主题向量。

In [36]:
# Project every message's bag-of-words vector onto the fitted topics, then
# label the rows with the message ids and the columns with the topic names.
ldia16_topic_vectors = pd.DataFrame(
    ldia.transform(bow_docs),
    index=index,
    columns=columns,
)

与 PCA、SVD 相比，LDiA 产生的主题之间分隔更加清晰。

In [39]:
# Print the topic mixture of the first five messages; round(2) rounds each
# value to two decimal places.
print(ldia16_topic_vectors.head().round(2))

       topic0  topic1  topic2  topic3  topic4  topic5  topic6  topic7  \
sms0     0.00    0.62    0.00    0.00    0.00    0.00    0.00    0.00   
sms1     0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.01   
sms2!    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00   
sms3     0.00    0.00    0.00    0.00    0.09    0.00    0.00    0.00   
sms4     0.39    0.00    0.33    0.00    0.00    0.00    0.14    0.00   

       topic8  topic9  topic10  topic11  topic12  topic13  topic14  \
sms0     0.34    0.00     0.00     0.00     0.00     0.00     0.00   
sms1     0.78    0.01     0.01     0.12     0.01     0.01     0.01   
sms2!    0.00    0.98     0.00     0.00     0.00     0.00     0.00   
sms3     0.85    0.00     0.00     0.00     0.00     0.00     0.00   
sms4     0.00    0.00     0.00     0.00     0.09     0.00     0.00   

       topic15  
sms0      0.00  
sms1      0.01  
sms2!     0.00  
sms3      0.00  
sms4      0.00  


上述主题之间分隔更加清晰。在为消息分配主题时，会出现很多 0。这是使得 LDiA 主题更容易向同事解释的做法之一。

下面用LDiA向量来训练LDA模型

In [40]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [42]:
from sklearn.model_selection import train_test_split

In [56]:
# Split topic vectors and spam labels, holding out 99% of the rows for testing.
# The fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    ldia16_topic_vectors,
    sms.spam,
    test_size=0.99,
    random_state=271828,
)

In [57]:
# Display the per-message topic vectors (pandas abbreviates the output of long frames).
ldia16_topic_vectors

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.002500,0.620075,0.002500,0.002500,0.002500,0.002500,0.002500,0.002500,0.344925,0.002500,0.002500,0.002500,0.002500,0.002500,0.002500,0.002500
sms1,0.006944,0.006944,0.006944,0.006944,0.006944,0.006944,0.006944,0.006944,0.781721,0.006944,0.006944,0.121056,0.006944,0.006944,0.006944,0.006944
sms2!,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.976562,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563
sms3,0.004464,0.004464,0.004464,0.004464,0.088350,0.004464,0.004464,0.004464,0.849150,0.004464,0.004464,0.004464,0.004464,0.004464,0.004464,0.004464
sms4,0.394632,0.004167,0.333172,0.004167,0.004167,0.004167,0.136218,0.004167,0.004167,0.004167,0.004167,0.004167,0.085978,0.004167,0.004167,0.004167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sms4832!,0.001563,0.001563,0.001563,0.927395,0.001563,0.001563,0.001563,0.001563,0.001563,0.050730,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563
sms4833,0.006250,0.006250,0.006250,0.006250,0.006250,0.589556,0.006250,0.006250,0.322944,0.006250,0.006250,0.006250,0.006250,0.006250,0.006250,0.006250
sms4834,0.003906,0.003906,0.494008,0.003906,0.003906,0.003906,0.003906,0.003906,0.354560,0.003906,0.003906,0.003906,0.003906,0.003906,0.003906,0.100650
sms4835,0.002315,0.002315,0.681962,0.002315,0.002315,0.002315,0.002315,0.002315,0.002315,0.285630,0.002315,0.002315,0.002315,0.002315,0.002315,0.002315


In [58]:
# Display the spam labels that serve as the classification target.
sms['spam']

sms0        0
sms1        0
sms2!       1
sms3        0
sms4        0
           ..
sms4832!    1
sms4833     0
sms4834     0
sms4835     0
sms4836     0
Name: spam, Length: 4837, dtype: int64

In [60]:
# Dimensions of the held-out feature matrix as (rows, topic columns).
x_test.shape

(4789, 16)

In [61]:
# Number of held-out labels; should match the row count of x_test.
y_test.shape

(4789,)

In [62]:
# Create a linear discriminant analysis classifier with a single component.
lda = LDA(n_components=1)

In [63]:
# Train the classifier on the training split of the topic vectors.
lda = lda.fit(X=x_train, y=y_train)

In [64]:
# Store the classifier's prediction for every message (training and test rows
# alike) as a new column on `sms`.
# NOTE(review): the column key is spelled 'ldial6' (lowercase L, not the digit
# 1), unlike `ldia16_topic_vectors`; possibly meant 'ldia16_spam' -- confirm
# before renaming, since the same key is assigned again in a later cell.
sms['ldial6_spam']=lda.predict(ldia16_topic_vectors)

In [65]:
# Mean accuracy of the classifier on the held-out split.
lda.score(X=x_test, y=y_test)

0.8730423888076843

改变训练集测试集比例

In [66]:
# Redo the split with 80% of the rows held out for testing (20% for training).
# The random_state is the same as in the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    ldia16_topic_vectors,
    sms.spam,
    test_size=0.8,
    random_state=271828,
)

In [67]:
# Start from a fresh, unfitted classifier for the new split.
lda = LDA(n_components=1)

In [68]:
# Train on the current training split.
lda = lda.fit(X=x_train, y=y_train)

In [69]:
# Overwrite the prediction column with the output of the refitted classifier
# for every message.
# NOTE(review): the key 'ldial6_spam' uses a lowercase L where the digit 1 may
# have been intended ('ldia16_spam') -- confirm before renaming.
sms['ldial6_spam']=lda.predict(ldia16_topic_vectors)

In [70]:
# Mean accuracy of the refitted classifier on the current held-out split.
lda.score(X=x_test, y=y_test)

0.9304909560723514