In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


# Directory holding the input files; note that the value already ends with '/'.
data_dir = "../input/"
# Append the bare file name: the previous '/spam.csv' suffix produced '../input//spam.csv'.
# NOTE(review): encoding='latin-1' is passed explicitly -- presumably the CSV is not valid UTF-8; confirm.
df = pd.read_csv(data_dir + 'spam.csv', encoding='latin-1')
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Confirm that selecting a single column yields a pandas Series.
print(type(df["v1"]))

<class 'pandas.core.series.Series'>


In [3]:

# Split the data into a training set and a test set:
# column v2 holds the message text, column v1 the ham/spam label.
# 20% of the rows are held out for testing; the seed is fixed so the split is repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(
    df["v2"], df["v1"], test_size=0.2, random_state=0
)

# Peek at the first ten training messages and their labels.
print('拆分过后的每个邮件内容')
print(data_train.head(10))
print('拆分过后每个邮件是否是垃圾邮件')
print(labels_train.head(10))

拆分过后的每个邮件内容
1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
4130    \Its Ur luck to Love someone. Its Ur fortune t...
1197     He also knows about lunch menu only da. . I know
5426        Oh yeah! And my diet just flew out the window
624     Nah it's straight, if you can just bring bud o...
2260    SplashMobile: Choose from 1000s of gr8 tones e...
Name: v2, dtype: object
拆分过后每个邮件是否是垃圾邮件
1114     ham
3589     ham
3095     ham
1012     ham
3320     ham
4130     ham
1197     ham
5426     ham
624      ham
2260    spam
Name: v1, dtype: object


In [4]:
# First ten messages of the held-out test set.
print(data_test.head(10))

4456    Aight should I just plan to come up later toni...
690                                    Was the farm open?
944     I sent my scores to sophas and i had to do sec...
3768    Was gr8 to see that message. So when r u leavi...
1189    In that case I guess I'll see you at campus lodge
4437    Nothing will ever be easy. But don't be lookin...
3587    If you were/are free i can give. Otherwise nal...
1982    Hey i will be late... i'm at amk. Need to drin...
2038          Hey are we going for the lo lesson or gym? 
2078                       85233 FREE>Ringtone!Reply REAL
Name: v2, dtype: object


In [5]:
# Number of training messages (shape of the 1-D training Series).
data_train.shape

(4457,)

In [6]:
# Number of test messages (shape of the 1-D test Series).
data_test.shape

(1115,)

建立词汇表，并统计两个类别下每个词的出现次数

In [7]:
# NOTE: this cell contains only a bare string, so the notebook echoes it as the cell output.
# In English: "Store the vocabulary in a dictionary and give every word a unique id."
'''
    用一个dictionary保存词汇，并给每个词汇赋予唯一的一个id
'''



'\n    用一个dictionary保存词汇，并给每个词汇赋予唯一的一个id\n'

In [8]:
def GetVocabulary(data):
    """Build a word -> integer id mapping from an iterable of documents.

    Every document is split on whitespace (e.g. "I am a student" ->
    ['I', 'am', 'a', 'student']) and each token is lower-cased before lookup.
    Ids are assigned in order of first appearance, starting at 0.

    Args:
        data: iterable of strings, one per document.

    Returns:
        dict mapping each distinct lower-cased token to its id.
    """
    vocab_dict = {}
    for document in data:
        for token in document.split():
            # setdefault only inserts unseen tokens; the current size is the next free id.
            vocab_dict.setdefault(token.lower(), len(vocab_dict))
    return vocab_dict


In [9]:
# The vocabulary is built from the training messages only.
vocab_dict = GetVocabulary(data_train)
print('Number of all the unique words : ' + str(len(vocab_dict)))


Number of all the unique words : 11706


In [10]:
# First five training messages.
data_train.head()

1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
Name: v2, dtype: object

In [11]:
# Not needed by the rest of the notebook -- just for curiosity:
# how many distinct words appear in the test messages.
vocab_dict_test = GetVocabulary(data_test)
print('Number of all the unique words in test : ' + str(len(vocab_dict_test)))

Number of all the unique words in test : 4957


把文字变成词向量，以便于计算

In [12]:
def Document2Vector(vocab_dict, data):
    """Convert one document into a bag-of-words count vector.

    The document is split on whitespace and each token is lower-cased.
    Tokens that are missing from the vocabulary are ignored.

    Args:
        vocab_dict: mapping from word to its integer position in the vector.
        data: the document text (a single string).

    Returns:
        1-D float numpy array of length len(vocab_dict); entry k holds how
        many times the word with id k occurs in the document.
    """
    counts = np.zeros(len(vocab_dict))
    for token in data.split():
        slot = vocab_dict.get(token.lower())
        if slot is not None:
            counts[slot] += 1
    return counts

# Example: vectorize a short string, then look up the id of 'world'
# and the count stored at that position.
example = Document2Vector(vocab_dict, " hello world")
print(example)
print(vocab_dict['world'])
print(example[vocab_dict['world']])
    
    
    

[0. 0. 0. ... 0. 0. 0.]
122
1.0


In [13]:
# Vocabulary size, i.e. the length of every count vector.
len(vocab_dict)

11706

In [14]:
# Turn every training message into a count vector. Each vector is purely numeric:
# position k holds how often the vocabulary word with id k occurs in that message,
# and words that do not occur are still present with a count of 0.
train_matrix = [
    Document2Vector(vocab_dict, document) for document in data_train.values
]

print(len(train_matrix))  # number of documents
train_matrix[0:10]


4457


[array([1., 1., 2., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 1., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 1., ..., 0., 0., 0.])]

进行 Naive Bayes 训练，得到训练集中每个词汇在各类别下的概率

In [15]:
print(len(train_matrix[0]))  # length of one count vector, i.e. the number of vocabulary words
train_matrix[0]

11706


array([1., 1., 2., ..., 0., 0., 0.])

In [16]:
# Scratch check: np.ones(12) builds a numpy array of twelve 1.0 values.
ee = np.ones(12)
print(type(ee))
ee

<class 'numpy.ndarray'>


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [17]:
def NaiveBayes_train(train_matrix, labels_train):
    """Estimate Naive Bayes parameters from per-document word-count vectors.

    Two kinds of probabilities are computed on the training set:
        1. the probability of each word within a class, e.g. P('email' | spam);
        2. the prior probability of each class, e.g. P(spam).

    One counter vector of vocabulary size is kept per class. While walking
    through the documents, each document's count vector is added to the counter
    of its class; afterwards the counters are divided by the class's total word
    count to obtain per-word probabilities. Laplace (add-one) smoothing is
    applied: every word starts with a count of 1 and the vocabulary size is
    added to the denominator. All results are returned as natural logarithms.

    Args:
        train_matrix: sequence of 1-D count vectors, one per document, all of
            the same length (the vocabulary size).
        labels_train: labels aligned by position with train_matrix. A label
            equal to 'spam' marks spam; any other value is treated as ham.
            A list, a numpy array or a pandas Series is accepted; a Series is
            read by position, regardless of its index.

    Returns:
        Tuple (p_spam_vector, log P(spam), p_ham_vector, log P(ham)) where the
        vectors hold the smoothed log-probability of every word per class.
    """
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])  # vocabulary size, taken from the first sample

    # Index labels by position: `series[i]` would be a label-based lookup and
    # would misalign (or raise KeyError) when the Series index is shuffled.
    labels = np.asarray(labels_train)

    # Per-class word counters, initialised to 1 (Laplace smoothing).
    spam_word_counter = np.ones(num_words)
    ham_word_counter = np.ones(num_words)

    # Total number of word occurrences (not de-duplicated) seen in each class.
    spam_total_count = 0
    ham_total_count = 0

    # Number of documents in each class.
    spam_count = 0
    ham_count = 0

    for i in range(num_docs):
        if i % 1000 == 0:
            print('Train on the doc id:' + str(i))

        doc_vector = train_matrix[i]
        if labels[i] == 'spam':
            spam_word_counter += doc_vector
            # Sum the document's own counts, not the running counter (which
            # already includes earlier documents and the smoothing offset).
            spam_total_count += np.sum(doc_vector)
            spam_count += 1
        else:
            ham_word_counter += doc_vector
            ham_total_count += np.sum(doc_vector)
            ham_count += 1

    # Per-word probability in each class, with the smoothing term added to the
    # denominator as well. Logs are taken so that downstream code can add
    # log-probabilities instead of multiplying many tiny numbers (underflow).
    p_spam_vector = np.log(spam_word_counter / (spam_total_count + num_words))
    p_ham_vector = np.log(ham_word_counter / (ham_total_count + num_words))

    return p_spam_vector, np.log(spam_count / num_docs), p_ham_vector, np.log(ham_count / num_docs)

# Train on the count vectors. The labels are passed as a plain array (.values)
# so that position i of the labels corresponds to train_matrix[i].
p_spam_vector, p_spam, p_ham_vector, p_ham = NaiveBayes_train(train_matrix, labels_train.values)
    
    

Train on the doc id:0
Train on the doc id:1000
Train on the doc id:2000
Train on the doc id:3000
Train on the doc id:4000


In [18]:
# Vocabulary size (unchanged by training).
len(vocab_dict)

11706

In [19]:
# Smoothed log-probability of every vocabulary word under the spam class.
p_spam_vector

array([ -6.89649987, -10.1545964 ,  -5.30256614, ..., -10.1545964 ,
       -10.1545964 , -10.1545964 ])

In [20]:
# Vocabulary size; p_spam_vector has one entry per vocabulary word.
len(vocab_dict)

11706

In [21]:
def Predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham):
    """Classify one document as 'spam' or 'ham'.

    The log-score of each class is the class's log prior plus the sum, over all
    words, of (word count in the document) * (log-probability of the word in
    that class). The class with the larger score is returned.

    Args:
        test_word_vector: 1-D count vector of the document.
        p_spam_vector: per-word log-probabilities under the spam class.
        p_spam: log prior of the spam class.
        p_ham_vector: per-word log-probabilities under the ham class.
        p_ham: log prior of the ham class.

    Returns:
        'spam' if the spam score is strictly greater than the ham score,
        otherwise 'ham' (so ties resolve to 'ham').
    """
    # np.dot computes the weighted sum in one vectorized call instead of
    # iterating over the element-wise product with the builtin sum().
    spam = np.dot(test_word_vector, p_spam_vector) + p_spam
    ham = np.dot(test_word_vector, p_ham_vector) + p_ham

    if spam > ham:
        return 'spam'
    return 'ham'
    
# Classify every test message, logging progress every 200 documents.
predictions = []

for doc_idx, document in enumerate(data_test.values):
    if doc_idx % 200 == 0:
        print('test on the doc id: ' + str(doc_idx))
    # The test message is vectorized with the training vocabulary.
    test_word_vector = Document2Vector(vocab_dict, document)
    predictions.append(
        Predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham)
    )

test on the doc id: 0
test on the doc id: 200
test on the doc id: 400
test on the doc id: 600
test on the doc id: 800
test on the doc id: 1000


In [22]:
# Length of the most recently built test vector.
len(test_word_vector)

11706

In [23]:
# Vocabulary size, for comparison with the test vector length.
len(vocab_dict)

11706

In [24]:
# Evaluate the model on the held-out test set

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score  # NOTE(review): not used in the visible cells


# Overall accuracy, then per-class precision/recall/F1, then the confusion matrix
# (scikit-learn convention: rows are true labels, columns are predicted labels).
print (accuracy_score(labels_test, predictions))
print (classification_report(labels_test, predictions))
print (confusion_matrix(labels_test, predictions))

0.9775784753363229
             precision    recall  f1-score   support

        ham       0.98      1.00      0.99       949
       spam       0.99      0.86      0.92       166

avg / total       0.98      0.98      0.98      1115

[[948   1]
 [ 24 142]]
