In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [3]:
# Toy corpus: two sentences that differ only in their final word.
corpus = [
    'This is document one',
    'This is document two',
]

# CountVectorizer

In [4]:
# CountVectorizer is already imported in the notebook's import cell, so it is
# not re-imported here.
# ngram_range=(2, 2) builds the vocabulary from bigrams (pairs of adjacent
# tokens) only.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)

# Show the container type: the counts come back as a SciPy sparse matrix.
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [5]:
# Expand the sparse bigram count matrix into a dense array for display:
# one row per document in `corpus`, one column per bigram feature.
X.toarray()

array([[1, 0, 1, 1],
       [0, 1, 1, 1]])

In [6]:
# Column labels of the count matrix, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the displayed value a plain list of strings.
vectorizer.get_feature_names_out().tolist()

[u'document one', u'document two', u'is document', u'this is']

In [7]:
# Transform documents the vectorizer was not fitted on. The recorded output is
# all zeros: neither document contains a bigram from the fitted vocabulary.
unseen_docs = ['another document', 'totally new words']
x_new = vectorizer.transform(unseen_docs)
x_new.toarray()

array([[0, 0, 0, 0],
       [0, 0, 0, 0]])

In [8]:
# Same two sentences, but the first now spells 'Document' with a capital D.
corpus = ['This is Document one',
          'This is document two']

# lowercase=False keeps the original casing, so 'Document' and 'document'
# become two separate features (and 'This' stays capitalised).
vectorizer = CountVectorizer(lowercase=False)
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

[u'Document', u'This', u'document', u'is', u'one', u'two']


## Stop Words

In [9]:
# stop_words='english' filters tokens against scikit-learn's built-in English
# stop-word list. In the recorded output only 'document' survives.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

[u'document']


## n-grams

In [10]:
# ngram_range=(1, 2): the vocabulary contains both single words (unigrams)
# and pairs of adjacent words (bigrams).
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

[u'document', u'document one', u'document two', u'is', u'is document', u'one', u'this', u'this is', u'two']


In [11]:
# ngram_range=(2, 2): bigrams only, no single-word features.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

[u'document one', u'document two', u'is document', u'this is']


In [12]:
# ngram_range=(2, 3): bigrams and trigrams together.
vectorizer = CountVectorizer(ngram_range=(2, 3))
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

[u'document one', u'document two', u'is document', u'is document one', u'is document two', u'this is', u'this is document']


In [13]:
# Dense view of the counts for the (2, 3)-gram vocabulary fitted in the
# previous cell: one row per document, one column per feature.
X.toarray()

array([[1, 0, 1, 1, 0, 1, 1],
       [0, 1, 1, 0, 1, 1, 1]])

## Binary

In [14]:
# The first sentence repeats 'document' so that raw counts can exceed 1.
corpus = ['This is document document one',
          'This is document two']

# Default settings: each cell holds the number of times the term occurs.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

# toarray() returns a regular ndarray (todense() returns the discouraged
# np.matrix type) and matches the other cells; the printed text is the same.
print(X.toarray())

[u'document', u'is', u'one', u'this', u'two']
[[2 1 1 1 0]
 [1 1 0 1 1]]


In [15]:
# Same corpus as the raw-count example: 'document' appears twice in the
# first sentence.
corpus = ['This is document document one',
          'This is document two']

# binary=True records presence/absence only, so the repeated 'document' is
# stored as 1 instead of 2.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()`. `.tolist()` prints a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

# toarray() returns a regular ndarray (todense() returns the discouraged
# np.matrix type) and matches the other cells; the printed text is the same.
print(X.toarray())

[u'document', u'is', u'one', u'this', u'two']
[[1 1 1 1 0]
 [1 1 0 1 1]]


# Stemming

In [16]:
import nltk.stem

In [25]:
# Snowball stemmer for English. Each inflected form is printed next to its
# base form to show that both reduce to the same stem.
stemmer = nltk.stem.SnowballStemmer('english')
for word in ('running', 'run', 'going', 'go'):
    print(stemmer.stem(word))

run
run
go
go


# TFIDF

In [33]:
# Three-document corpus for the TF-IDF example: the third sentence shares
# only 'this' and 'is' with the first two.
corpus = [
    'This is document one',
    'This is document two',
    'This is a third article',
]

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same fit_transform call pattern as the CountVectorizer cells, but using
# TfidfVectorizer with its default settings on the three-document corpus.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Show the container type of the result (a SciPy sparse matrix in the
# recorded output).
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [35]:
# Dense view of the TF-IDF matrix: one row per document, one column per
# vocabulary term; entries are float weights rather than integer counts.
X.toarray()

array([[ 0.        ,  0.50410689,  0.39148397,  0.66283998,  0.        ,
         0.39148397,  0.        ],
       [ 0.        ,  0.50410689,  0.39148397,  0.        ,  0.        ,
         0.39148397,  0.66283998],
       [ 0.6088451 ,  0.        ,  0.35959372,  0.        ,  0.6088451 ,
         0.35959372,  0.        ]])

In [36]:
# Column labels of the TF-IDF matrix, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the displayed value a plain list of strings.
vectorizer.get_feature_names_out().tolist()

[u'article', u'document', u'is', u'one', u'third', u'this', u'two']

In [37]:
# A document that was not part of the fitted corpus is scored with
# transform(), which reuses the already-fitted vectorizer. In the recorded
# output 'document' is the only non-zero column.
new_docs = ['another document with some new words']
x_new = vectorizer.transform(new_docs)
x_new.toarray()

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.]])