In [2]:
# Read spam.csv, keeping only the 'v1' and 'v2' columns, and preview the first rows.
import pandas as pd

df = pd.read_csv(
    "data_spam/spam.csv",
    usecols=['v1', 'v2'],
    encoding='latin',
)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Give the v1/v2 columns more readable names (renamed in place on df).
readable_names = {'v1': 'Label', 'v2': 'Text'}
df.rename(columns=readable_names, inplace=True)
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Add a numeric label column: 'ham' -> 0, 'spam' -> 1.
label_codes = dict(ham=0, spam=1)
df['numLabel'] = df['Label'].map(label_codes)
df.head()

Unnamed: 0,Label,Text,numLabel
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
# Count how many messages are ham (numLabel == 0) and how many are spam (numLabel == 1).
ham_count = int((df['numLabel'] == 0).sum())
spam_count = int((df['numLabel'] == 1).sum())
print("# of ham : ", ham_count,
      " # of spam: ", spam_count)
print("# of total samples: ", len(df))

# of ham :  4825  # of spam:  747
# of total samples:  5572


In [6]:
# Summarise the message lengths (in characters) and plot their distribution.
import matplotlib.pyplot as plt
# NOTE(review): mlab and np are not referenced in this cell; the imports are
# kept unchanged in case other cells rely on them.
import matplotlib.mlab as mlab
import numpy as np

# Vectorised length computation. Unlike looking rows up with
# df.loc[i, 'Text'] for i in range(len(df)), this does not depend on the
# index being exactly 0..n-1, and it avoids one label lookup per row.
# .tolist() keeps text_lengths a plain Python list.
text_lengths = df['Text'].str.len().tolist()
print("the minimum length is: ", min(text_lengths))

# 100 bins are computed over the full range of lengths; the x-axis is then
# clipped to [0, 200], so longer messages are binned but not displayed.
plt.hist(text_lengths, 100, facecolor='blue', alpha=0.5)
plt.xlim([0, 200])
plt.xlabel("message length (characters)")
plt.ylabel("number of messages")
plt.show()

the minimum length is:  2


<Figure size 640x480 with 1 Axes>

In [7]:
# Load NLTK's English stop-word list and turn the messages into a
# bag-of-words feature matrix.
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Stop words are very common words such as: he, she, the, an, a, that, this ...
stopset = set(stopwords.words("english"))

# Build the text vectors.
# - stop_words is passed as a list: scikit-learn documents this parameter as
#   'english', a list, or None, and releases that validate constructor
#   parameters reject a set.
# - binary=True records only whether a word occurs in a message (0/1), not
#   how many times it occurs. Drop binary=True to get raw counts instead.
vectorizer = CountVectorizer(stop_words=sorted(stopset), binary=True)

# X is a sparse document-term matrix (one row per message); y holds the
# numeric labels.
# NOTE(review): the vocabulary is fitted on all rows here, i.e. before the
# train/test split performed later in the notebook.
X = vectorizer.fit_transform(df.Text)
y = df.numLabel

In [25]:
# Show the shape of the feature matrix, then the stored entries for the
# message at row index 1 (each on its own line).
print(X.shape, X[1][0], sep="\n")

(5572, 8536)
  (0, 5427)	1
  (0, 4448)	1
  (0, 4255)	1
  (0, 8264)	1
  (0, 5454)	1


In [26]:
# Split the data into training and test sets: 20% is held out for testing,
# and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.20, random_state=100)
X_train, X_test, y_train, y_test = split

# Prints the number of training samples and the number of test samples.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print("训练数据中的样本个数: ", n_train, "测试数据中的样本个数: ", n_test)

训练数据中的样本个数:  4457 测试数据中的样本个数:  1115


In [30]:
# Train a multinomial Naive Bayes classifier and evaluate it on the test split.
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
clf = MultinomialNB(alpha=1.0, fit_prior=True).fit(X_train, y_train)
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("accuracy on test data: ", test_accuracy)

# Confusion matrix (displayed as the cell output). labels=[1, 0] puts spam (1)
# first; rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred, labels=[1, 0])

accuracy on test data:  0.9757847533632287


array([[136,   9],
       [ 18, 952]])