# Week 3
# Rasika Bhalerao

# Agenda

- Inspect dataset
- Bernoulli Naive Bayes (binary features)
- Multinomial Naive Bayes (discrete features)
- Naive Bayes to generate documents
- (If time) Gaussian Naive Bayes (continuous features)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# This tells matplotlib not to try opening a new window for each plot
%matplotlib inline

In [None]:
# Inspect dataset

# Toy corpus: one (document, category) pair per training example.
labelled_docs = [
    ('whiskers tail tail paw purr', 'cat'),
    ('meow whiskers whiskers', 'cat'),
    ('meow meow paw purr', 'cat'),
    ('paw bark woof bark', 'dog'),
    ('paw paw bark bark', 'dog'),
]
df = pd.DataFrame(labelled_docs, columns=['document', 'category'])

# Quick summary: table dimensions and the distinct class labels.
n_rows_cols = df.shape
class_labels = df["category"].unique()
print(f'Shape: {n_rows_cols}')
print(f'Categories: {class_labels}')
print()
df.head()

Shape: (5, 2)
Categories: ['cat' 'dog']



Unnamed: 0,document,category
0,whiskers tail tail paw purr,cat
1,meow whiskers whiskers,cat
2,meow meow paw purr,cat
3,paw bark woof bark,dog
4,paw paw bark bark,dog


In [None]:
# Get features

# Bag-of-words: fit a CountVectorizer on the training documents and turn
# each document into a row of per-word counts.
train_docs = np.array(df['document'])
count = CountVectorizer()
X_train = count.fit_transform(train_docs)

# get_feature_names() is no longer available in current scikit-learn;
# get_feature_names_out() is its replacement. It returns an array, so
# .tolist() keeps the printed form a plain Python list of words.
print(f'features: {count.get_feature_names_out().tolist()}')
print(f'X_train:\n{X_train.toarray()}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_train:
[[0 0 1 1 2 1 0]
 [0 1 0 0 0 2 0]
 [0 2 1 1 0 0 0]
 [2 0 1 0 0 0 1]
 [2 0 2 0 0 0 0]]


In [None]:
# Make training set

# Densify the count matrix and label each column with its vocabulary word.
# get_feature_names_out() replaces get_feature_names(), which current
# scikit-learn releases no longer provide.
X_train = pd.DataFrame(
    X_train.toarray(),
    columns=count.get_feature_names_out(),
)

y_train = df['category']

# Class label next to the word counts, one row per training document.
train_df = pd.concat((y_train, X_train), axis=1)
train_df

Unnamed: 0,category,bark,meow,paw,purr,tail,whiskers,woof
0,cat,0,0,1,1,2,1,0
1,cat,0,1,0,0,0,2,0
2,cat,0,2,1,1,0,0,0
3,dog,2,0,1,0,0,0,1
4,dog,2,0,2,0,0,0,0


## Bernoulli: all features are 0 or 1

In [None]:
# Binarize the word counts: every feature becomes 1 when the word occurs in
# the document at least once, and 0 when it does not occur at all

# Compare the whole table at once, then turn True/False into 1/0.
X_bernoulli = X_train.ge(1).astype(int)
X_bernoulli.head()

Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
0,0,0,1,1,1,1,0
1,0,1,0,0,0,1,0
2,0,1,1,1,0,0,0
3,1,0,1,0,0,0,1
4,1,0,1,0,0,0,0


In [None]:
# Test set

test_docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])

# transform() reuses the vocabulary fitted on the training documents, so a
# word that never appeared in training ('hello') gets no column.
X_test = count.transform(test_docs).toarray()

# get_feature_names_out() replaces get_feature_names(), which current
# scikit-learn releases no longer provide; .tolist() prints a plain list.
print(f'features: {count.get_feature_names_out().tolist()}')
print(f'X_test before Bernoulli:\n{X_test}')

# Binarize with the same rule as the training features (count >= 1 -> 1).
X_test = np.where(X_test >= 1, 1, 0)

print(f'X_test after Bernoulli:\n{X_test}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_test before Bernoulli:
[[1 0 3 0 0 1 0]
 [0 0 0 1 0 0 0]]
X_test after Bernoulli:
[[1 0 1 0 0 1 0]
 [0 0 0 1 0 0 0]]


In [None]:
# Sklearn Bernoulli Naive Bayes

# fit() hands back the fitted estimator, so build and train in one step.
bnb = BernoulliNB().fit(X_bernoulli, y_train)

# Predicted class label for each binarized test document.
test_predictions = bnb.predict(X_test)
print(f'Test set predictions: {test_predictions}')

Test set predictions: ['dog' 'cat']


In [None]:
# Explore priors

# The model stores log-priors; exponentiate to read them as probabilities.
sklearn_priors = np.exp(bnb.class_log_prior_)
print(f"Sklearn's priors: {sklearn_priors}")

Sklearn's priors: [0.6 0.4]


In [None]:
# Class priors by hand: the share of training documents in each class.
count_y = train_df.shape[0]
count_y_train_dog = train_df[train_df['category'] == 'dog'].shape[0]
count_y_train_cat = train_df[train_df['category'] == 'cat'].shape[0]

print('Our P(class=dog)=', count_y_train_dog / count_y)
print('Our P(class=cat)=', count_y_train_cat / count_y)

Our P(class=dog)= 0.4
Our P(class=cat)= 0.6


In [None]:
# Explore conditional probabilities

# feature_log_prob_ holds log P(word present | class), one row per class;
# exponentiate to read the values as probabilities.
print(f'Sklearn\'s conditional probabilities:\n{np.exp(bnb.feature_log_prob_)}')
print()
# get_feature_names_out() replaces get_feature_names(), which current
# scikit-learn releases no longer provide; .tolist() prints a plain list.
print(f'features: {count.get_feature_names_out().tolist()}')

Sklearn's conditional probabilities:
[[0.2  0.6  0.6  0.6  0.4  0.6  0.2 ]
 [0.75 0.25 0.75 0.25 0.25 0.25 0.5 ]]

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']


In [None]:
# Conditional probabilities by hand for the Bernoulli model:
# P(word present | class) = (# class documents containing the word)
#                           / (# class documents).
#
# No smoothing is applied here, so the values differ from sklearn's output,
# which corresponds to (count + 1) / (class documents + 2), i.e. alpha = 1.
# Rows here are ordered dog, cat; sklearn's rows are ordered cat, dog.

# Per class, how many documents contain each word. The column labels of
# X_bernoulli are the vocabulary words, so no vectorizer call is needed.
count_1 = pd.DataFrame({
    feature: {
      category: X_bernoulli.loc[train_df['category'] == category, feature].sum()
      for category in ['dog', 'cat']
    }
    for feature in X_bernoulli.columns
})

# Divide each class row by that class's document count. div() builds a new
# float table instead of writing float rows into integer columns, which
# newer pandas versions deprecate.
class_sizes = pd.Series({'dog': count_y_train_dog, 'cat': count_y_train_cat})
count_1 = count_1.div(class_sizes, axis=0)

print('Our conditional probabilities:')
count_1.head()

Our conditional probabilities:


Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
dog,1.0,0.0,1.0,0.0,0.0,0.0,0.5
cat,0.0,0.666667,0.666667,0.666667,0.333333,0.666667,0.0


## Multinomial: discrete features (word counts)

In [None]:
# The multinomial model works directly on the raw word counts, so its
# feature table is simply an independent copy of the count table

X_multinomial = X_train.copy(deep=True)
X_multinomial

Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
0,0,0,1,1,2,1,0
1,0,1,0,0,0,2,0
2,0,2,1,1,0,0,0
3,2,0,1,0,0,0,1
4,2,0,2,0,0,0,0


In [None]:
# Test set

test_docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])

# Raw word counts this time (no binarization). transform() reuses the
# vocabulary fitted on the training documents, so a word that never
# appeared in training ('hello') gets no column.
X_test = count.transform(test_docs).toarray()

# get_feature_names_out() replaces get_feature_names(), which current
# scikit-learn releases no longer provide; .tolist() prints a plain list.
print(f'features: {count.get_feature_names_out().tolist()}')
print(f'X_test:\n{X_test}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_test:
[[1 0 3 0 0 1 0]
 [0 0 0 1 0 0 0]]


In [None]:
# Sklearn Multinomial Naive Bayes

# Default smoothing (alpha is 1.0); fit() hands back the fitted estimator,
# so build and train in one step.
mnb = MultinomialNB().fit(X_multinomial, y_train)

# Predicted class label for each test document's word counts.
test_predictions = mnb.predict(X_test)
print(f'Test set predictions: {test_predictions}')

Test set predictions: ['dog' 'cat']


In [None]:
# Explore priors

# The model stores log-priors; exponentiate to read them as probabilities.
sklearn_priors = np.exp(mnb.class_log_prior_)
print(f"Sklearn's priors: {sklearn_priors}")

Sklearn's priors: [0.6 0.4]


In [None]:
# Class priors by hand (same computation as for the Bernoulli model):
# the share of training documents that belong to each class.
count_y = train_df.shape[0]
count_y_train_dog = train_df[train_df['category'] == 'dog'].shape[0]
count_y_train_cat = train_df[train_df['category'] == 'cat'].shape[0]

print('Our P(class=dog)=', count_y_train_dog / count_y)
print('Our P(class=cat)=', count_y_train_cat / count_y)

Our P(class=dog)= 0.4
Our P(class=cat)= 0.6


In [None]:
# Explore conditional probabilities

# feature_log_prob_ holds log P(word | class), one row per class;
# exponentiate to read the values as probabilities.
print(f'Sklearn\'s conditional probabilities:\n{np.exp(mnb.feature_log_prob_)}')
print()
# get_feature_names_out() replaces get_feature_names(), which current
# scikit-learn releases no longer provide; .tolist() prints a plain list.
print(f'features: {count.get_feature_names_out().tolist()}')

Sklearn's conditional probabilities:
[[0.05263158 0.21052632 0.15789474 0.15789474 0.15789474 0.21052632
  0.05263158]
 [0.33333333 0.06666667 0.26666667 0.06666667 0.06666667 0.06666667
  0.13333333]]

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']


In [None]:
# Conditional probabilities by hand for the Multinomial model, with
# additive (Laplace) smoothing:
# P(word | class) = (occurrences of word in class + alpha)
#                   / (total words in class + alpha * vocabulary size)
# Rows here are ordered dog, cat; sklearn's rows are ordered cat, dog.

# Smoothing strength; 1.0 is MultinomialNB's default, so the results match.
alpha = 1.0

# Per class, total occurrences of each word. The column labels of
# X_multinomial are the vocabulary words, so no vectorizer call is needed.
count_m = pd.DataFrame({
    feature: {
      category: X_multinomial.loc[train_df['category'] == category, feature].sum()
      for category in ['dog', 'cat']
    }
    for feature in X_multinomial.columns
})

# note the new denominators here! Total words in the class, plus alpha for
# every vocabulary word (derived from the table rather than hard-coded).
vocabulary_size = count_m.shape[1]
count_words_train_dog = count_m.loc['dog'].sum() + alpha * vocabulary_size
count_words_train_cat = count_m.loc['cat'].sum() + alpha * vocabulary_size

# div() builds a new float table instead of writing float rows into integer
# columns, which newer pandas versions deprecate.
smoothed_totals = pd.Series({
    'dog': count_words_train_dog,
    'cat': count_words_train_cat,
})
count_m = (count_m + alpha).div(smoothed_totals, axis=0)

print('Our conditional probabilities:')
count_m.head()

Our conditional probabilities:


Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
dog,0.333333,0.066667,0.266667,0.066667,0.066667,0.066667,0.133333
cat,0.052632,0.210526,0.157895,0.157895,0.157895,0.210526,0.052632


Exercise: add smoothing with alpha to our computed conditional probabilities in Multinomial Naive Bayes so they match the conditional probabilities output by Sklearn!

## Naive Bayes to generate documents

In [None]:
# Generate cat document using Bernoulli Naive Bayes

# Under the Bernoulli model each vocabulary word is switched on
# independently: draw one uniform number per word and keep the word when
# the draw falls below our hand-computed P(word present | cat).
cat_word_probs = count_1.loc['cat']

# The columns of count_1 are the vocabulary words, in vocabulary order, so
# the random draws happen once per word in that order.
doc = ' '.join(
    word
    for word in count_1.columns
    if np.random.uniform() < cat_word_probs[word]
)

print(doc)

purr whiskers


## Gaussian Naive Bayes: continuous features

Exercise: calculate the mean and variance of each feature in each class to fit a Gaussian distribution!

Next exercise: use the Gaussian distributions to classify the test documents above!