# 4.1 基于贝叶斯决策理论的分类方法

# 4.2 条件概率

# 4.3 使用条件概率来分类

# 4.4 使用朴素贝叶斯进行文档分类

# 4.5 使用Python进行文本分类

## 4.5.1 准备数据：从文本中构建词向量

**程序清单4-1** 词表到向量的转换函数

In [2]:
def load_data_set():
    posting_list = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0,1,0,1,0,1]
    return posting_list, class_vec

def create_vocab_list(data_set):
    vocab_set = set([])
    for document in data_set:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)

def set_of_words2vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] = 1
        else:
            print('The word: {} is not in my vocabulary!'.format(word))
    return return_vec

In [3]:
list_of_posts, list_classes = load_data_set()

In [4]:
my_vocab_list = create_vocab_list(list_of_posts)

In [5]:
my_vocab_list

['flea',
 'mr',
 'stop',
 'dog',
 'stupid',
 'love',
 'not',
 'ate',
 'to',
 'posting',
 'take',
 'steak',
 'worthless',
 'how',
 'so',
 'my',
 'problems',
 'I',
 'has',
 'is',
 'help',
 'buying',
 'food',
 'garbage',
 'park',
 'please',
 'quit',
 'maybe',
 'him',
 'cute',
 'licks',
 'dalmation']

In [6]:
set_of_words2vec(my_vocab_list, list_of_posts[0])

[1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0]

In [7]:
set_of_words2vec(my_vocab_list, list_of_posts[3])

[0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

## 4.5.2 训练算法：从词向量计算概率

**程序清单4-2** 朴素贝叶斯分类器训练函数

In [8]:
from numpy import *

In [9]:
def train_NB0(train_matrix, train_category):
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    p_abusive = sum(train_category)/float(num_train_docs)
    p0_num = zeros(num_words)
    p1_num = zeros(num_words)
    p0_denom = 0.0
    p1_denom = 0.0
    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])
    p1_vect = p1_num/p1_denom
    p0_vect = p0_num/p0_denom
    return p0_vect, p1_vect, p_abusive

In [10]:
list_of_posts, list_classes = load_data_set()

In [11]:
my_vocab_list = create_vocab_list(list_of_posts)

In [12]:
train_mat = []

In [13]:
for post_in_doc in list_of_posts:
    train_mat.append(set_of_words2vec(my_vocab_list, post_in_doc))

In [15]:
p0V, p1V, pAb = train_NB0(train_mat, list_classes)

In [16]:
pAb

0.5

In [17]:
p0V

array([ 0.04166667,  0.04166667,  0.04166667,  0.04166667,  0.        ,
        0.04166667,  0.        ,  0.04166667,  0.04166667,  0.        ,
        0.        ,  0.04166667,  0.        ,  0.04166667,  0.04166667,
        0.125     ,  0.04166667,  0.04166667,  0.04166667,  0.04166667,
        0.04166667,  0.        ,  0.        ,  0.        ,  0.        ,
        0.04166667,  0.        ,  0.        ,  0.08333333,  0.04166667,
        0.04166667,  0.04166667])

In [18]:
p1V

array([ 0.        ,  0.        ,  0.05263158,  0.10526316,  0.15789474,
        0.        ,  0.05263158,  0.        ,  0.05263158,  0.05263158,
        0.05263158,  0.        ,  0.10526316,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.05263158,  0.05263158,  0.05263158,  0.05263158,
        0.        ,  0.05263158,  0.05263158,  0.05263158,  0.        ,
        0.        ,  0.        ])

## 4.5.3 测试算法：根据现实情况修改分类器

In [19]:
def train_NB0(train_matrix, train_category):
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    p_abusive = sum(train_category)/float(num_train_docs)
    p0_num = ones(num_words)
    p1_num = ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])
    p1_vect = log(p1_num/p1_denom)
    p0_vect = log(p0_num/p0_denom)
    return p0_vect, p1_vect, p_abusive

**程序清单4-3** 朴素贝叶斯分类函数

In [22]:
def classify_NB(vec2classify, p0_vec, p1_vec, p_class1):
    p1 = sum(vec2classify * p1_vec) + log(p_class1)
    p0 = sum(vec2classify * p0_vec) + log(1.0 - p_class1)
    if p1 > p0:
        return 1
    if p1 < p0:
        return 0

def testing_NB():
    list_of_posts, list_classes = load_data_set()
    my_vocab_list = create_vocab_list(list_of_posts)
    train_mat = []
    for post_in_doc in list_of_posts:
        train_mat.append(set_of_words2vec(my_vocab_list, post_in_doc))
    p0V, p1V, pAb = train_NB0(array(train_mat), array(list_classes))
    test_entry = ['love', 'my', 'dalmation']
    this_doc = array(set_of_words2vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:',classify_NB(this_doc, p0V, p1V, pAb))
    test_entry = ['stupid', 'garbage']
    this_doc = array(set_of_words2vec(my_vocab_list, test_entry))
    print(test_entry, 'classified as:',classify_NB(this_doc, p0V, p1V, pAb))

In [23]:
testing_NB()

['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1


## 4.5.4 准备数据：文档词袋模型

**程序清单4-4** 朴素贝叶斯模型

In [24]:
def bag_of_words2vec_MN(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec

# 4.6 示例：使用朴素贝叶斯过滤垃圾邮件

## 4.6.1 准备数据：切分文本

In [25]:
my_sent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'

In [26]:
my_sent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [27]:
import re
reg_ex = re.compile('\\W*')
list_of_tokens = reg_ex.split(my_sent)

  app.launch_new_instance()


In [28]:
list_of_tokens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [29]:
[tok for tok in list_of_tokens if len(tok) > 0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [30]:
[tok.lower() for tok in list_of_tokens if len(tok) > 0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [31]:
email_text = open('email/ham/6.txt').read()

In [32]:
list_of_tokens = reg_ex.split(email_text)

  if __name__ == '__main__':


In [33]:
list_of_tokens

['Hello',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 'pages',
 'or',
 'files',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 'Instead',
 'of',
 'these',
 'features',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 'For',
 'example',
 'you',
 'can',
 'easily',
 'create',
 'your',
 'pages',
 'on',
 'Google',
 'Sites',
 'and',
 'share',


## 4.6.2 测试算法：使用朴素贝叶斯进行交叉验证

**程序清单4-5** 文件解析及完整的垃圾邮件测试函数

In [53]:
def text_parse(big_string):
    import re
    list_of_tokens = re.split(r'\W*', big_string)
    return [tok.lower() for tok in list_of_tokens if len(tok) > 2]

def spam_test():
    doc_list = []
    class_list = []
    full_text = []
    for i in range(1, 26):
        word_list = text_parse(open('email/spam/{}.txt'.format(i)).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = text_parse(open('email/ham/{}.txt'.format(i)).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocab_list = create_vocab_list(doc_list)
    training_set = list(range(50))
    test_set = []
    for i in range(10):
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del(training_set[rand_index])
    train_mat = []
    train_classes = []
    for doc_index in training_set:
        train_mat.append(set_of_words2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    p0V, p1V, p_spam = train_NB0(array(train_mat), array(train_classes))
    error_count = 0
    for doc_index in test_set:
        word_vector = set_of_words2vec(vocab_list, doc_list[doc_index])
        if classify_NB(array(word_vector), p0V, p1V, p_spam) != class_list[doc_index]:
            error_count += 1
    print('The error rate is: ', float(error_count)/len(test_set))        

In [54]:
spam_test()

The error rate is:  0.1


  return _compile(pattern, flags).split(string, maxsplit)


In [55]:
spam_test()

The error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


# 4.7 示例：使用朴素贝叶斯分类器从个人广告中获取区域倾向

## 4.7.1 收集数据：导入RSS源

In [56]:
import feedparser

In [61]:
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

In [62]:
ny['entries']

[{'dc_source': 'http://newyork.craigslist.org/brk/stp/6269059846.html',
  'dc_type': 'text',
  'enc_enclosure': {'resource': 'https://images.craigslist.org/00o0o_lWpVzu6teeE_300x300.jpg',
   'type': 'image/jpeg'},
  'id': 'http://newyork.craigslist.org/brk/stp/6269059846.html',
  'language': 'en-us',
  'link': 'http://newyork.craigslist.org/brk/stp/6269059846.html',
  'links': [{'href': 'http://newyork.craigslist.org/brk/stp/6269059846.html',
    'rel': 'alternate',
    'type': 'text/html'}],
  'published': '2017-08-19T23:31:42-04:00',
  'published_parsed': time.struct_time(tm_year=2017, tm_mon=8, tm_mday=20, tm_hour=3, tm_min=31, tm_sec=42, tm_wday=6, tm_yday=232, tm_isdst=0),
  'rights': 'copyright 2017 craiglist',
  'rights_detail': {'base': 'https://newyork.craigslist.org/search/stp?format=rss',
   'language': None,
   'type': 'text/plain',
   'value': 'copyright 2017 craiglist'},
  'summary': 'Looking for a girl for friendship... please add pictures 9two9 threefive5 four6two9',
  

In [63]:
len(ny['entries'])

25

**程序清单4-6** RSS源分类器及高频词去除函数

In [64]:
def calc_most_freq(vocab_list, full_text):
    import operator
    freq_dict = {}
    for token in vocab_list:
        freq_dict[token] = full_text.count(token)
    sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sorted_freq[:30]

def local_words(feed1, feed0):
    import feedparser
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(min_len):
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocab_list = create_vocab_list(doc_list)
    top_30_words = calc_most_freq(vocab_list, full_text)
    for pair_w in top_30_words:
        if pair_w[0] in vocab_list:
            vocab_list.remove(pair_w[0])
    training_set = list(range(2*min_len))
    test_set = []
    for i in range(20):
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del(training_set[rand_index])
    train_mat = []
    train_classes = []
    for doc_index in training_set:
        train_mat.append(bag_of_words2vec_MN(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    p0V, p1V, p_spam = train_NB0(array(train_mat), array(train_classes))
    error_count = 0
    for doc_index in test_set:
        word_vector = bag_of_words2vec_MN(vocab_list, doc_list[doc_index])
        if classify_NB(array(word_vector), p0V, p1V, p_spam) != class_list[doc_index]:
            error_count += 1
    print('The error rate is:', float(error_count)/len(test_set))
    return vocab_list, p0V, p1V

In [66]:
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

In [67]:
vocab_list, p_SF, p_NY = local_words(ny, sf)

The error rate is: 0.6


  return _compile(pattern, flags).split(string, maxsplit)


In [68]:
vocab_list, p_SF, p_NY = local_words(ny, sf)

The error rate is: 0.35


  return _compile(pattern, flags).split(string, maxsplit)


## 4.7.2 分析数据：显示地域相关的用词

**程序清单4-7** 最具表征性的词汇显示函数

In [77]:
def get_top_words(ny, sf):
    import operator
    vocab_list, p0V, p1V = local_words(ny, sf)
    top_NY = []; top_SF = []
    for i in range(len(p0V)):
        if p0V[i] > -4.5:
            top_SF.append((vocab_list[i], p0V[i]))
        if p1V[i] > -4.5:
            top_NY.append((vocab_list[i], p1V[i]))
    sorted_SF = sorted(top_SF, key=lambda pair: pair[1], reverse=True)
    print('SF**' * 15)
    for item in sorted_SF:
        print(item[0])
    sorted_NY = sorted(top_NY, key=lambda pair: pair[1], reverse=True)
    print('NY**' * 15)
    for item in sorted_NY:
        print(item[0])

In [78]:
get_top_words(ny, sf)

The error rate is: 0.4
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
something
sensual
pretty
tonight
latino
very
real
sex
passionate
going
young
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
more
girl
feet
such
them
asian
but
hang
over
may
time
look
know
talk
mom
our
pic
best
very
ride
these
there
girls
friend
some
ladies
massages
friends
game


  return _compile(pattern, flags).split(string, maxsplit)


# 4.8 本章小结