# Week 3
# Rasika Bhalerao

# Agenda

- Inspect dataset
- Bernoulli Naive Bayes (binary features)
- Multinomial Naive Bayes (discrete features)
- Naive Bayes to generate documents
- Feature Engineering
  - CountVectorizer
  - fit vs. transform vs. fit_transform
  - ngrams
  - stop words
  - stemming and lemmatization

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# This tells matplolib not to try opening a new window for each plot
%matplotlib inline

In [29]:
# Inspect dataset


df = pd.DataFrame({
    'document': [
        'whiskers tail tail paw purr',
        'meow whiskers whiskers',
        'meow meow paw purr',
        'paw bark woof bark',
        'paw paw bark bark'
    ],
    'category': [
        'cat', 'cat', 'cat', 'dog', 'dog'
    ]
})

print(f'Shape: {df.shape}')
print(f'Categories: {df["category"].unique()}')
print()
df.head()

Shape: (5, 2)
Categories: ['cat' 'dog']



Unnamed: 0,document,category
0,whiskers tail tail paw purr,cat
1,meow whiskers whiskers,cat
2,meow meow paw purr,cat
3,paw bark woof bark,dog
4,paw paw bark bark,dog


In [30]:
# Get features

train_docs = np.array(df['document'])
count = CountVectorizer()
X_train = count.fit_transform(train_docs)

print(f'features: {count.get_feature_names()}')
print(f'X_train:\n{X_train.toarray()}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_train:
[[0 0 1 1 2 1 0]
 [0 1 0 0 0 2 0]
 [0 2 1 1 0 0 0]
 [2 0 1 0 0 0 1]
 [2 0 2 0 0 0 0]]




In [31]:
# Make training set

X_train = pd.DataFrame(X_train.toarray())
X_train.columns = count.get_feature_names()

y_train = df['category']

train_df = pd.concat((y_train, X_train), axis=1)
train_df

Unnamed: 0,category,bark,meow,paw,purr,tail,whiskers,woof
0,cat,0,0,1,1,2,1,0
1,cat,0,1,0,0,0,2,0
2,cat,0,2,1,1,0,0,0
3,dog,2,0,1,0,0,0,1
4,dog,2,0,2,0,0,0,0


## Bernoulli: all features are 0 or 1

In [32]:
# Let's make it so for each feature, it is 1 if the word is there, and 0 if not

X_bernoulli = X_train.copy()

for feature in X_train.columns:
    X_bernoulli[feature] = np.where(X_bernoulli[feature] >= 1, 1, 0)
X_bernoulli.head()

Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
0,0,0,1,1,1,1,0
1,0,1,0,0,0,1,0
2,0,1,1,1,0,0,0
3,1,0,1,0,0,0,1
4,1,0,1,0,0,0,0


In [33]:
# Test set

test_docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])

X_test = count.transform(test_docs).toarray()

print(f'features: {count.get_feature_names()}')
print(f'X_test before Bernoulli:\n{X_test}')

X_test = np.where(X_test > 0.5, 1, 0)

print(f'X_test after Bernoulli:\n{X_test}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_test before Bernoulli:
[[1 0 3 0 0 1 0]
 [0 0 0 1 0 0 0]]
X_test after Bernoulli:
[[1 0 1 0 0 1 0]
 [0 0 0 1 0 0 0]]




In [34]:
# Sklearn Bernoulli Naive Bayes

bnb = BernoulliNB(alpha=0)
bnb.fit(X_bernoulli, y_train)

print(f'Test set predictions: {bnb.predict(X_test)}')

Test set predictions: ['dog' 'cat']


  % _ALPHA_MIN
  "X does not have valid feature names, but"


In [35]:
# Explore priors

print(f'Sklearn\'s priors: {np.exp(bnb.class_log_prior_)}')

Sklearn's priors: [0.6 0.4]


In [36]:
count_y = len(train_df)
count_y_train_dog = len(train_df[train_df.category=='dog'])
count_y_train_cat = len(train_df[train_df.category=='cat'])

print(f'Our P(class=dog)=', count_y_train_dog/count_y)
print(f'Our P(class=cat)=', count_y_train_cat/count_y)

Our P(class=dog)= 0.4
Our P(class=cat)= 0.6


In [37]:
# Explore conditional probabilities

print(f'Sklearn\'s conditional probabilities:\n{np.exp(bnb.feature_log_prob_)}')
print()
print(f'features: {count.get_feature_names()}')

Sklearn's conditional probabilities:
[[3.33333333e-11 6.66666667e-01 6.66666667e-01 6.66666667e-01
  3.33333333e-01 6.66666667e-01 3.33333333e-11]
 [1.00000000e+00 5.00000000e-11 1.00000000e+00 5.00000000e-11
  5.00000000e-11 5.00000000e-11 5.00000000e-01]]

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']




In [38]:
count_1 = pd.DataFrame({
    feature: {
      category: np.sum(X_bernoulli[train_df['category']==category][feature])
      for category in ['cat', 'dog']
    }
    for feature in count.get_feature_names()
})

count_1

count_1.loc['dog'] = count_1.loc['dog']/count_y_train_dog
count_1.loc['cat'] = count_1.loc['cat']/count_y_train_cat

print('Our conditional probabilities:')
count_1.head()

Our conditional probabilities:


Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
cat,0,0.666667,0.666667,0.666667,0.333333,0.666667,0.0
dog,1,0.0,1.0,0.0,0.0,0.0,0.5


## Multinomial: discrete features (word counts)

In [39]:
# Features are just word counts

X_multinomial = X_train.copy()
X_multinomial

Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
0,0,0,1,1,2,1,0
1,0,1,0,0,0,2,0
2,0,2,1,1,0,0,0
3,2,0,1,0,0,0,1
4,2,0,2,0,0,0,0


In [40]:
# Test set

test_docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])

X_test = count.transform(test_docs).toarray()

print(f'features: {count.get_feature_names()}')
print(f'X_test:\n{X_test}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_test:
[[1 0 3 0 0 1 0]
 [0 0 0 1 0 0 0]]




In [41]:
# Sklearn Multinomial Naive Bayes

mnb = MultinomialNB()  # default alpha is 1.0
mnb.fit(X_multinomial, y_train)

print(f'Test set predictions: {mnb.predict(X_test)}')

Test set predictions: ['dog' 'cat']


  "X does not have valid feature names, but"


In [42]:
# Explore priors

print(f'Sklearn\'s priors: {np.exp(mnb.class_log_prior_)}')

Sklearn's priors: [0.6 0.4]


In [43]:
count_y = len(train_df)
count_y_train_dog = len(train_df[train_df.category=='dog'])
count_y_train_cat = len(train_df[train_df.category=='cat'])

print(f'Our P(class=dog)=', count_y_train_dog/count_y)
print(f'Our P(class=cat)=', count_y_train_cat/count_y)

Our P(class=dog)= 0.4
Our P(class=cat)= 0.6


In [44]:
# Explore conditional probabilities

print(f'Sklearn\'s conditional probabilities:\n{np.exp(mnb.feature_log_prob_)}')
print()
print(f'features: {count.get_feature_names()}')

Sklearn's conditional probabilities:
[[0.05263158 0.21052632 0.15789474 0.15789474 0.15789474 0.21052632
  0.05263158]
 [0.33333333 0.06666667 0.26666667 0.06666667 0.06666667 0.06666667
  0.13333333]]

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']




In [46]:
count_m = pd.DataFrame({
    feature: {
      category: np.sum(X_multinomial[train_df['category']==category][feature])
      for category in ['cat', 'dog']
    }
    for feature in count.get_feature_names()
})

count_m

# note the new denominators here!
count_words_train_dog = np.sum(count_m.loc['dog']) + 7
count_words_train_cat = np.sum(count_m.loc['cat']) + 7

count_m.loc['dog'] = (count_m.loc['dog'] +1)/count_words_train_dog
count_m.loc['cat'] = (count_m.loc['cat'] +1)/count_words_train_cat

print('Our conditional probabilities:')
count_m.head()

Our conditional probabilities:




Unnamed: 0,bark,meow,paw,purr,tail,whiskers,woof
cat,0.052632,0.210526,0.157895,0.157895,0.157895,0.210526,0.052632
dog,0.333333,0.066667,0.266667,0.066667,0.066667,0.066667,0.133333


## Naive Bayes to generate documents

In [47]:
# Generate cat document using Bernoulli Naive Bayes

doc = ''
for idx, word in enumerate(count.get_feature_names()):
  if np.random.uniform() < count_1.loc['cat'][word]:
    doc += word + ' '

print(doc.strip())

meow purr whiskers




## Feature Engineering!!

In [48]:
# Fresh look at dataset

df = pd.DataFrame({
    'document': [
        'whiskers tail tail paw purr',
        'meow whiskers whiskers',
        'meow meow paw purr',
        'paw bark woof bark',
        'paw paw bark bark'
    ],
    'category': [
        'cat', 'cat', 'cat', 'dog', 'dog'
    ]
})

df.head()

Unnamed: 0,document,category
0,whiskers tail tail paw purr,cat
1,meow whiskers whiskers,cat
2,meow meow paw purr,cat
3,paw bark woof bark,dog
4,paw paw bark bark,dog


In [49]:
vectorizer = CountVectorizer()
train_docs = np.array(df['document'])

vectorizer.fit(train_docs) # make the vectorizer learn the words as features

X_train = vectorizer.transform(train_docs) # make the vectorizer transform the training set into numbers

# tip: fit_transform(train_docs) = fit and then transform

print(f'features: {vectorizer.get_feature_names()}')
print(f'X_train:\n{X_train.toarray()}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_train:
[[0 0 1 1 2 1 0]
 [0 1 0 0 0 2 0]
 [0 2 1 1 0 0 0]
 [2 0 1 0 0 0 1]
 [2 0 2 0 0 0 0]]




In [50]:
# train a model, do whatever with those features above representing docs
# then consider this test set:

test_docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])

# What is wrong with the code below? What should we do instead?

X_test = vectorizer.fit_transform(test_docs).toarray()

print(f'features: {vectorizer.get_feature_names()}')
print(f'X:\n{X_test}')

features: ['bark', 'hello', 'paw', 'purr', 'whiskers']
X:
[[1 0 3 0 1]
 [0 1 0 1 0]]


In [51]:
# ngrams

vectorizer = CountVectorizer(
    ngram_range=(1,4),
    analyzer='word'
)

docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])
X = vectorizer.fit_transform(docs).toarray()

print(f'features: {vectorizer.get_feature_names()}')
print(f'X:\n{X}')

features: ['bark', 'bark paw', 'bark paw paw', 'bark paw paw paw', 'hello', 'paw', 'paw paw', 'paw paw paw', 'purr', 'purr hello', 'whiskers', 'whiskers bark', 'whiskers bark paw', 'whiskers bark paw paw']
X:
[[1 1 1 1 0 3 2 1 0 0 1 1 1 1]
 [0 0 0 0 1 0 0 0 1 1 0 0 0 0]]


In [52]:
# stop words

vectorizer = CountVectorizer(
    stop_words=['the', 'a', 'and'],
    max_df=1.0,
    min_df=0.1,
    max_features=None,

    lowercase=True,
    binary=False
)

docs = np.array([
  'the whiskers bark paw paw paw',
  'the purr hello'
])
X = vectorizer.fit_transform(docs).toarray()

print(f'features: {vectorizer.get_feature_names()}')
print(f'X:\n{X}')

features: ['bark', 'hello', 'paw', 'purr', 'whiskers']
X:
[[1 0 3 0 1]
 [0 1 0 1 0]]


Tip: stop_words='english' uses a default english stop word set! But it might not be exactly right for your dataset

In [55]:
# stemming

stemmer = nltk.stem.PorterStemmer()
print(stemmer.stem('cats'))
print(stemmer.stem('cat'))
print(stemmer.stem('purrs'))
print(stemmer.stem('purring'))
print(stemmer.stem('does'))

cat
cat
purr
pur
doe


In [60]:
# lemmatization

nltk.download('wordnet')
nltk.download('omw-1.4') # Open Multilingual Wordnet (as of Feb 2022)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [61]:
lemmatizer = nltk.stem.WordNetLemmatizer()
print(lemmatizer.lemmatize('cats', pos='n'))
print(lemmatizer.lemmatize('cat', pos='n'))
print(lemmatizer.lemmatize('purrs', pos='v'))
print(lemmatizer.lemmatize('purring', pos='v'))
print(lemmatizer.lemmatize('purred', pos='v'))
print(lemmatizer.lemmatize('does', pos='v'))
print(lemmatizer.lemmatize('is', pos='v'))
print(lemmatizer.lemmatize('friendlier', pos='a'))

cat
cat
purr
purr
purr
do
be
friendly
