In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [2]:
# Two tiny documents that differ only in their final word.
corpus = [
    'This is document one',
    'This is document two',
]

# CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and encode each document as a row of
# token counts in a single step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# The result is a SciPy sparse matrix rather than a dense NumPy array.
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [5]:
# Convert the sparse result to a dense array so the individual counts can be read.
X.toarray()

array([[1, 1, 1, 1, 0],
       [1, 1, 0, 1, 1]])

In [6]:
# List the learned vocabulary; the entries label the columns of X in order.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
vectorizer.get_feature_names()

[u'document', u'is', u'one', u'this', u'two']

In [7]:
# Encode unseen documents with the already-fitted vectorizer. transform() does
# not refit, so the output keeps the columns learned from the original corpus.
x_new = vectorizer.transform(['another document', 'totally new words'])
x_new.toarray()

array([[1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [8]:
# Same corpus as before, except the first text now capitalises "Document".
corpus = [
    'This is Document one',
    'This is document two',
]

# lowercase=False keeps the original casing, so 'Document' and 'document'
# end up as two separate vocabulary entries.
vectorizer = CountVectorizer(lowercase=False)
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

[u'Document', u'This', u'document', u'is', u'one', u'two']


## Stop Words

In [9]:
# stop_words='english' filters out common English words before counting;
# for this corpus only 'document' is left in the vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

[u'document']


## n-grams

In [4]:
# ngram_range=(2,2): every feature is a sequence of exactly two consecutive words.
vectorizer = CountVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

[u'document one', u'document two', u'is document', u'this is']


In [5]:
# ngram_range=(1,2): features are single words as well as two-word sequences.
vectorizer = CountVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

[u'document', u'document one', u'document two', u'is', u'is document', u'one', u'this', u'this is', u'two']


In [6]:
# ngram_range=(2,3): features are two- and three-word sequences; single words
# are no longer part of the vocabulary.
vectorizer = CountVectorizer(ngram_range=(2,3))
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

[u'document one', u'document two', u'is document', u'is document one', u'is document two', u'this is', u'this is document']


In [7]:
# Dense view of the 2- and 3-gram counts: one row per document, one column per n-gram.
X.toarray()

array([[1, 0, 1, 1, 0, 1, 1],
       [0, 1, 1, 0, 1, 1, 1]])

## Binary

In [8]:
# 'document' now occurs twice in the first text.
corpus = [
    'This is document document one',
    'This is document two',
]

# Default settings count every occurrence, so the repeated word is recorded as 2.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.todense())

[u'document', u'is', u'one', u'this', u'two']
[[2 1 1 1 0]
 [1 1 0 1 1]]


In [9]:
# Same corpus as the previous cell, with 'document' repeated in the first text.
corpus = [
    'This is document document one',
    'This is document two',
]

# binary=True records presence/absence only: the repeated 'document' is
# reported as 1 instead of 2.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.todense())

[u'document', u'is', u'one', u'this', u'two']
[[1 1 1 1 0]
 [1 1 0 1 1]]


# Stemming

In [10]:
import nltk.stem
from nltk.stem.wordnet import WordNetLemmatizer

In [11]:
# Build one English Snowball stemmer and one WordNet lemmatizer so the cells
# below can compare what each returns for the same words.
stemmer = nltk.stem.SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [12]:
# Stem both forms first, printing one result per line.
for word in ('running', 'run'):
    print(stemmer.stem(word))

# Then lemmatize the same two forms.
# NOTE(review): lemmatize() is called without a part-of-speech argument, which
# presumably is why 'running' comes back unchanged -- confirm in the NLTK docs.
for word in ('running', 'run'):
    print(lemmatizer.lemmatize(word))

run
run
running
run


In [13]:
# Stem both forms first, printing one result per line.
for word in ('going', 'go'):
    print(stemmer.stem(word))

# Then lemmatize the same two forms.
# NOTE(review): lemmatize() is called without a part-of-speech argument, which
# presumably is why 'going' comes back unchanged -- confirm in the NLTK docs.
for word in ('going', 'go'):
    print(lemmatizer.lemmatize(word))

go
go
going
go


# NLTK Tokenizer

Demo of different tokenizers: http://text-processing.com/demo/tokenize/

In [15]:
from nltk import word_tokenize

# Sample text; it is reused by the part-of-speech tagging cells further down.
s = "I was watching TV. I liked it it was a lot of fun"
# Split the text into word-level tokens; the '.' comes out as its own token.
word_tokenize(s)

['I',
 'was',
 'watching',
 'TV',
 '.',
 'I',
 'liked',
 'it',
 'it',
 'was',
 'a',
 'lot',
 'of',
 'fun']

In [17]:
from nltk.tokenize import sent_tokenize

# Multi-line sample containing an abbreviation ("Mr.") and several kinds of
# sentence-ending punctuation.
text = """Hello. How are you, dear Mr. Sir? Are you well?
          Here: drink this! It will make you feel better.
          I mean, it won't make you feel worse!"""

# Split the text into sentences; in the printed result "Mr." is not treated
# as the end of a sentence.
sentences = sent_tokenize(text)
print(sentences)

['Hello.', 'How are you, dear Mr. Sir?', 'Are you well?', 'Here: drink this!', 'It will make you feel better.', "I mean, it won't make you feel worse!"]


# Part of Speech Tagging

Part-of-speech (POS) tagging refers to the process of assigning part-of-speech tags, such as “noun” or “verb,” to words in documents.

##### Some common POS tags:
WP: wh-pronoun ("who", "what")  
VBZ: verb, 3rd person sing. present ("takes")  
VBG: verb, gerund/present participle ("taking")  
VBD: verb, past tense ("took")  
PRP: personal pronoun ("I", "it")  
TO: to ("to go", "to him")  
IN: preposition/subordinating conjunction ("of", "in")  
DT: determiner ("the", "this")  
NN: noun, singular or mass ("door")  
.: Punctuation (".", "?")  

In [27]:
# Re-tokenize the sample text; these are the tokens handed to the tagger in the next cell.
word_tokenize(s)

['I',
 'was',
 'watching',
 'TV',
 '.',
 'I',
 'liked',
 'it',
 'it',
 'was',
 'a',
 'lot',
 'of',
 'fun']

In [18]:
# Tag each token with its part of speech; the result is a list of (token, tag) pairs.
nltk.pos_tag(word_tokenize(s))

[('I', 'PRP'),
 ('was', 'VBD'),
 ('watching', 'VBG'),
 ('TV', 'NN'),
 ('.', '.'),
 ('I', 'PRP'),
 ('liked', 'VBD'),
 ('it', 'PRP'),
 ('it', 'PRP'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('fun', 'NN')]

# TFIDF

In [19]:
# Three short documents; the third shares only 'this' and 'is' with the other two.
corpus = [
    'This is document one',
    'This is document two',
    'This is a third article',
]

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same fit_transform workflow as with CountVectorizer, but the resulting
# matrix holds TF-IDF weights instead of raw counts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Still a SciPy sparse matrix.
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [21]:
# Dense view of the TF-IDF matrix: one row per document, one column per vocabulary term.
X.toarray()

array([[ 0.        ,  0.50410689,  0.39148397,  0.66283998,  0.        ,
         0.39148397,  0.        ],
       [ 0.        ,  0.50410689,  0.39148397,  0.        ,  0.        ,
         0.39148397,  0.66283998],
       [ 0.6088451 ,  0.        ,  0.35959372,  0.        ,  0.6088451 ,
         0.35959372,  0.        ]])

In [22]:
# Vocabulary learned by the TF-IDF vectorizer; the entries label the columns of X in order.
vectorizer.get_feature_names()

[u'article', u'document', u'is', u'one', u'third', u'this', u'two']

In [23]:
# Given a new document, we can use the transform method to get its TF-IDF values.
# Of the words in this sentence only 'document' is in the fitted vocabulary,
# so it is the only non-zero entry in the result.
x_new = vectorizer.transform(['another document with some new words'])
x_new.toarray()

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.]])

In [24]:
# Learned inverse-document-frequency weight for each vocabulary term, in the
# same order as get_feature_names(). Terms present in every document
# ('is', 'this') have the lowest weight, 1.0.
vectorizer.idf_

array([ 1.69314718,  1.28768207,  1.        ,  1.69314718,  1.69314718,
        1.        ,  1.69314718])

In [25]:
# Show the notebook's current working directory (relies on IPython automagic for %pwd).
# NOTE(review): the saved output exposes an absolute local path; consider
# clearing it before sharing the notebook.
pwd

u'/Users/mike/Desktop/PythonWorkshopText/Notebooks'