# Chap 4. Naive Bayes and Sentiment Classification

In [1]:
import pandas as pd
import numpy as np

## 4.1

Assume the following likelihoods for each word being part of a positive or negative movie review, and equal prior probabilities for each class.

What class will Naive bayes assign to the sentence “I always like foreign films.”?

In [2]:
a = 'I 0.09 0.16 always 0.07 0.06 like 0.29 0.06 foreign 0.04 0.15 films 0.08 0.11'.split()
b = pd.DataFrame(a).values.reshape([-1,3])
df = pd.DataFrame(b, columns=['word', 'pos', 'neg']).set_index('word')
df

Unnamed: 0_level_0,pos,neg
word,Unnamed: 1_level_1,Unnamed: 2_level_1
I,0.09,0.16
always,0.07,0.06
like,0.29,0.06
foreign,0.04,0.15
films,0.08,0.11


In [3]:
p_pos = 0.5
p_neg = 0.5
for index, row in df.iterrows():
    p_pos *= float(row['pos'])
    p_neg *= float(row['neg'])

cate = 'pos' if p_pos > p_neg else 'neg'
print('category: %s, P(pos)=%.2e, P(neg)=%.2e' % (cate, p_pos, p_neg))

category: neg, P(pos)=2.92e-06, P(neg)=4.75e-06


In [4]:
# Only count: I, like, foreign
# I and foreign are the main words that scoring neg
print('pos: %.2e' % (0.09 * 0.29 * 0.04))
print('neg: %.2e' % (0.16 * 0.06 * 0.15))

pos: 1.04e-03
neg: 1.44e-03


#### Notes

the classification is not good.

## 4.2

Given the following short movie reviews, each labeled with a genre, either comedy or action:

1. fun, couple, love, love comedy
2. fast, furious, shoot action
3. couple, fly, fast, fun, fun comedy
4. furious, shoot, shoot, fun action
5. fly, fast, shoot, love action

a new document D:

fast, couple, shoot, fly

compute the most likely class for D. Assume a naive Bayes classifier and use add-1 smoothing for the likelihoods.

In [5]:
a = '''1. fun, couple, love, love comedy
2. fast, furious, shoot action
3. couple, fly, fast, fun, fun comedy
4. furious, shoot, shoot, fun action
5. fly, fast, shoot, love action'''
docs = [line.replace(',', '').split()[1:] for line in a.split('\n')]
docs

[['fun', 'couple', 'love', 'love', 'comedy'],
 ['fast', 'furious', 'shoot', 'action'],
 ['couple', 'fly', 'fast', 'fun', 'fun', 'comedy'],
 ['furious', 'shoot', 'shoot', 'fun', 'action'],
 ['fly', 'fast', 'shoot', 'love', 'action']]

In [6]:
label_cnt = {}
word_cnt = {}
corpus = set()

for row in docs:
    label = row[-1]
    if label not in label_cnt:
        label_cnt[label] = 1
        word_cnt[label] = {}
    else:
        label_cnt[label] += 1

    label_word_cnt = word_cnt[label]
    for word in row[:-1]:
        corpus.add(word)
        if word not in label_word_cnt:
            label_word_cnt[word] = 1
        else:
            label_word_cnt[word] +=1

doc_cnt = len(docs)
p_label = {label: 1.0 * cnt / doc_cnt for label, cnt in label_cnt.items()}
print(p_label)
print(word_cnt)
print(corpus)

{'comedy': 0.4, 'action': 0.6}
{'comedy': {'fun': 3, 'couple': 2, 'love': 2, 'fly': 1, 'fast': 1}, 'action': {'fast': 2, 'furious': 2, 'shoot': 4, 'fun': 1, 'fly': 1, 'love': 1}}
{'love', 'fun', 'shoot', 'couple', 'fast', 'fly', 'furious'}


In [7]:
likely_hoods = {}
V = len(corpus)
for label, cnts in word_cnt.items():
    likely_hood = {}
    total = sum(cnts.values()) + V
    
    for w in corpus:
        likely_hood[w] = 1.0 * (cnts.get(w, 0) + 1) / total
    likely_hoods[label] = likely_hood

df=pd.DataFrame(likely_hoods)
print('verify the sum of likely hood in each label:')
print(df.sum())
df

verify the sum of likely hood in each label:
comedy    1.0
action    1.0
dtype: float64


Unnamed: 0,comedy,action
couple,0.1875,0.055556
fast,0.125,0.166667
fly,0.125,0.111111
fun,0.25,0.111111
furious,0.0625,0.166667
love,0.1875,0.111111
shoot,0.0625,0.277778


In [8]:
doc = 'fast, couple, shoot, fly'.replace(',', '').split()
doc

['fast', 'couple', 'shoot', 'fly']

In [9]:
for label, likely in likely_hoods.items():
    p_i = p_label[label]
    for w in doc:
        p_i *= likely[w]
    print('prob of %s: %.2e' % (label, p_i))

prob of comedy: 7.32e-05
prob of action: 1.71e-04


属于 action，其中 shoot 是关键识别词。

## 4.3

Train two models, multinominal naive Bayes and binarized naive Bayes, both with add-1 smoothing, on the following document counts for key sentiment words, with positive or negative class assigned as noted.

Use both naive Bayes models to assign a class (pos or neg) to this sentence:

A good, good plot and great characters, but poor acting.

Do the two models agree or disagree?

In [10]:
a = 'd1.3 0 3 pos d2.0 1 2 pos d3.1 3 0 neg d4.1 5 2 neg d5.0 2 0 neg'.replace('.', ' ').split()
b = pd.DataFrame(a).values.reshape([-1, 5])
df = pd.DataFrame(b, columns='doc good poor great class'.split()).set_index('doc')
df

Unnamed: 0_level_0,good,poor,great,class
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d1,3,0,3,pos
d2,0,1,2,pos
d3,1,3,0,neg
d4,1,5,2,neg
d5,0,2,0,neg


In [11]:
doc = 'A good, good plot and great characters, but poor acting'.lower().replace(',', '').split()
doc

['a',
 'good',
 'good',
 'plot',
 'and',
 'great',
 'characters',
 'but',
 'poor',
 'acting']

#### NB

In [12]:
label_cnt = {}
word_cnt = {}

corpus = 'good poor great'.split()

for index, row in df.iterrows():
    label = row[-1]
    if label not in label_cnt:
        label_cnt[label] = 1
        word_cnt[label] = {}
    else:
        label_cnt[label] += 1

    label_word_cnt = word_cnt[label]
    for word, cnt in zip(corpus, row[:-1]):
        if word not in label_word_cnt:
            label_word_cnt[word] = int(cnt)
        else:
            label_word_cnt[word] += int(cnt)

doc_cnt = len(df)
p_label = {label: 1.0 * cnt / doc_cnt for label, cnt in label_cnt.items()}
print(p_label)
print(word_cnt)
print(corpus)

{'pos': 0.4, 'neg': 0.6}
{'pos': {'good': 3, 'poor': 1, 'great': 5}, 'neg': {'good': 2, 'poor': 10, 'great': 2}}
['good', 'poor', 'great']


In [13]:
likely_hoods = {}
V = len(corpus)
for label, cnts in word_cnt.items():
    likely_hood = {}
    total = sum(cnts.values()) + V
    
    for w in corpus:
        likely_hood[w] = 1.0 * (cnts.get(w, 0) + 1) / total
    likely_hoods[label] = likely_hood

lh = pd.DataFrame(likely_hoods)
print('verify the sum of likely hood in each label:')
print(lh.sum())
lh

verify the sum of likely hood in each label:
pos    1.0
neg    1.0
dtype: float64


Unnamed: 0,pos,neg
good,0.333333,0.176471
great,0.5,0.176471
poor,0.166667,0.647059


In [14]:
for label, likely in likely_hoods.items():
    p_i = p_label[label]
    for w in doc:
        p_i *= likely.get(w, 1)
    print('prob of %s: %.2e' % (label, p_i))

prob of pos: 3.70e-03
prob of neg: 2.13e-03


#### Notes

the classification is not good, since the freqencies of pos words in the target doc are too high.

#### binarized NB

In [15]:
label_cnt = {}
word_cnt = {}

corpus = 'good poor great'.split()

for index, row in df.iterrows():
    label = row[-1]
    if label not in label_cnt:
        label_cnt[label] = 1
        word_cnt[label] = {}
    else:
        label_cnt[label] += 1

    label_word_cnt = word_cnt[label]
    for word, cnt in zip(corpus, row[:-1]):
        if word not in label_word_cnt:
            label_word_cnt[word] = int(cnt != '0')
        else:
            label_word_cnt[word] += int(cnt != '0')

doc_cnt = len(df)
p_label = {label: 1.0 * cnt / doc_cnt for label, cnt in label_cnt.items()}
print(p_label)
print(word_cnt)
print(corpus)

{'pos': 0.4, 'neg': 0.6}
{'pos': {'good': 1, 'poor': 1, 'great': 2}, 'neg': {'good': 2, 'poor': 3, 'great': 1}}
['good', 'poor', 'great']


In [16]:
likely_hoods = {}
V = len(corpus)
for label, cnts in word_cnt.items():
    likely_hood = {}
    total = sum(cnts.values()) + V
    
    for w in corpus:
        likely_hood[w] = 1.0 * (cnts.get(w, 0) + 1) / total
    likely_hoods[label] = likely_hood

lh = pd.DataFrame(likely_hoods)
print('verify the sum of likely hood in each label:')
print(lh.sum())
lh

verify the sum of likely hood in each label:
pos    1.0
neg    1.0
dtype: float64


Unnamed: 0,pos,neg
good,0.285714,0.333333
great,0.428571,0.222222
poor,0.285714,0.444444


In [17]:
for label, likely in likely_hoods.items():
    p_i = p_label[label]
    for w in doc:
        p_i *= likely.get(w, 1)
    print('prob of %s: %.2e' % (label, p_i))

prob of pos: 4.00e-03
prob of neg: 6.58e-03


#### Notes

the outout is 'neg' which is expected. the improvement is caused by reducing the frequencies of pos words.

#### Summary

the two models are disagree.

binarized NB works better for sentiment classification