# Week 7: Text and Natural Language Processing

# Rasika Bhalerao

# Agenda

- [TensorFlow playground](https://playground.tensorflow.org/)
- Intro to hw project 3

Specifically:
- CountVectorizer
- fit vs. transform vs. fit_transform
- ngrams
- stop words
- stemming and lemmatization
- TF-IDF

In [None]:
# Imports directly copied from hw

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

In [None]:
# Toy corpus carried over from week 3: three "cat" documents and two "dog" documents.
labeled_docs = [
    ('whiskers tail tail paw purr', 'cat'),
    ('meow whiskers whiskers', 'cat'),
    ('meow meow paw purr', 'cat'),
    ('paw bark woof bark', 'dog'),
    ('paw paw bark bark', 'dog'),
]

# One row per (document, category) pair; column order follows the tuple order.
df = pd.DataFrame(labeled_docs, columns=['document', 'category'])

df.head()

Unnamed: 0,document,category
0,whiskers tail tail paw purr,cat
1,meow whiskers whiskers,cat
2,meow meow paw purr,cat
3,paw bark woof bark,dog
4,paw paw bark bark,dog


### Note: CountVectorizer was imported in this line above:

`from sklearn.feature_extraction.text import *`

In [None]:
# Bag-of-words features: fit() learns the vocabulary (one column per distinct
# word), transform() turns each document into a row of word counts.
vectorizer = CountVectorizer()
train_docs = np.array(df['document'])

vectorizer.fit(train_docs) # make the vectorizer learn the words as features

X_train = vectorizer.transform(train_docs) # make the vectorizer transform the training set into numbers

# tip: fit_transform(train_docs) = fit and then transform

# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out(), returns a numpy array, so .tolist() is used to keep
# the printed output in the same plain-list format as before.
print(f'features: {vectorizer.get_feature_names_out().tolist()}')
print(f'X_train:\n{X_train.toarray()}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_train:
[[0 0 1 1 2 1 0]
 [0 1 0 0 0 2 0]
 [0 2 1 1 0 0 0]
 [2 0 1 0 0 0 1]
 [2 0 2 0 0 0 0]]


In [None]:
# train a model, do whatever with those features above representing docs
# then consider this test set:

test_docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])

# What is wrong with the code below? What should we do instead?
# (The call below is the discussion prompt and is intentionally left as written.)

X_test = vectorizer.fit_transform(test_docs).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a numpy array, so .tolist() keeps the printed list format unchanged.
print(f'features: {vectorizer.get_feature_names_out().tolist()}')
print(f'X:\n{X_test}')

features: ['bark', 'hello', 'paw', 'purr', 'whiskers']
X:
[[1 0 3 0 1]
 [0 1 0 1 0]]


In [None]:
# ngrams

# ngram_range=(1,4) with analyzer='word' makes every run of 1 to 4 consecutive
# words a feature, not just single words.
vectorizer = CountVectorizer(
    ngram_range=(1,4),
    analyzer='word'
)

docs = np.array([
  'whiskers bark paw paw paw',
  'purr hello'
])
X = vectorizer.fit_transform(docs).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a numpy array, so .tolist() keeps the printed list format unchanged.
print(f'features: {vectorizer.get_feature_names_out().tolist()}')
print(f'X:\n{X}')

features: ['bark', 'bark paw', 'bark paw paw', 'bark paw paw paw', 'hello', 'paw', 'paw paw', 'paw paw paw', 'purr', 'purr hello', 'whiskers', 'whiskers bark', 'whiskers bark paw', 'whiskers bark paw paw']
X:
[[1 1 1 1 0 3 2 1 0 0 1 1 1 1]
 [0 0 0 0 1 0 0 0 1 1 0 0 0 0]]


In [None]:
# stop words

vectorizer = CountVectorizer(
    stop_words=['the', 'a', 'and'],  # words dropped from the vocabulary entirely
    max_df=1.0,         # float = proportion of docs: drop words in more than 100% of docs (i.e. none)
    min_df=0.1,         # float = proportion of docs: drop words in fewer than 10% of docs
    max_features=None,  # no cap on vocabulary size

    lowercase=True,     # lowercase the text before tokenizing
    binary=False        # keep real counts rather than 0/1 presence flags
)

docs = np.array([
  'the whiskers bark paw paw paw',
  'the purr hello'
])
X = vectorizer.fit_transform(docs).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a numpy array, so .tolist() keeps the printed list format unchanged.
print(f'features: {vectorizer.get_feature_names_out().tolist()}')
print(f'X:\n{X}')

features: ['bark', 'hello', 'paw', 'purr', 'whiskers']
X:
[[1 0 3 0 1]
 [0 1 0 1 0]]


### Tip: `stop_words='english'` uses a default English stop word set! But it might not be exactly right for your dataset...

In [None]:
# stemming

# Print the Porter stem of each example word, one per line.
# Note that a stem is not always a dictionary word.
stemmer = nltk.stem.PorterStemmer()
for word in ('cats', 'cat', 'purrs', 'purring', 'does'):
    print(stemmer.stem(word))

cat
cat
purr
pur
doe


In [None]:
# lemmatization

# Download the 'wordnet' NLTK data package (a no-op if it is already up to date).
# Presumably this is the data WordNetLemmatizer relies on -- confirm against the NLTK docs.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()

# (word, part-of-speech) pairs to lemmatize: 'n' = noun, 'v' = verb, 'a' = adjective.
lemma_examples = [
    ('cats', 'n'),
    ('cat', 'n'),
    ('purrs', 'v'),
    ('purring', 'v'),
    ('purred', 'v'),
    ('does', 'v'),
    ('is', 'v'),
    ('friendlier', 'a'),
]

# Print one lemma per line, in the order listed above.
for word, pos_tag in lemma_examples:
    print(lemmatizer.lemmatize(word, pos=pos_tag))

cat
cat
purr
purr
purr
do
be
friendly


### TfidfVectorizer was imported in the same line as CountVectorizer!

TF = term frequency
- this is the same as the CountVectorizer word count

IDF = inverse document frequency
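- the reciprocal of the fraction of documents that have this word
- note: scikit-learn's `TfidfVectorizer` actually uses a smoothed, log-scaled version, `ln((1 + n) / (1 + df)) + 1` (n = number of documents, df = number of documents containing the word), and then L2-normalizes each document's vector, so the numbers below are not the raw TF × reciprocal products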

TF-IDF = TF * IDF for each word
- the elementwise product of the TF and IDF vectors (each word in the vocab is an element)

In [None]:
# TF-IDF weighted features for the same training documents used above.
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train_docs)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a numpy array, so .tolist() keeps the printed list format unchanged.
print(f'features: {vectorizer.get_feature_names_out().tolist()}')
# Printing the sparse matrix directly lists only the non-zero (row, column) entries.
print(f'X_train:\n{X_train}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_train:
  (0, 3)	0.3403486372984673
  (0, 2)	0.23766482637991668
  (0, 4)	0.843706726582144
  (0, 5)	0.3403486372984673
  (1, 1)	0.4472135954999579
  (1, 5)	0.8944271909999159
  (2, 1)	0.85376425497417
  (2, 3)	0.426882127487085
  (2, 2)	0.29809100315256176
  (3, 6)	0.5050077256599805
  (3, 0)	0.8148741065505222
  (3, 2)	0.2845125436553228
  (4, 0)	0.8198869039412204
  (4, 2)	0.5725255144931797


In [None]:
# make it a dense matrix (easier to look at)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns a numpy array, so .tolist() keeps the printed list format unchanged.
print(f'features: {vectorizer.get_feature_names_out().tolist()}')
# .toarray() on the sparse matrix gives a plain numpy array, matching the
# earlier cells; the extra scipy import is no longer needed for this.
print(f'X_train:\n{X_train.toarray()}')

features: ['bark', 'meow', 'paw', 'purr', 'tail', 'whiskers', 'woof']
X_train:
[[0.         0.         0.23766483 0.34034864 0.84370673 0.34034864
  0.        ]
 [0.         0.4472136  0.         0.         0.         0.89442719
  0.        ]
 [0.         0.85376425 0.298091   0.42688213 0.         0.
  0.        ]
 [0.81487411 0.         0.28451254 0.         0.         0.
  0.50500773]
 [0.8198869  0.         0.57252551 0.         0.         0.
  0.        ]]
