# Converting raw text into a bag-of-words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

In [2]:
print vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [3]:
content = ["How to format my hard disk", " Hard disk format problems "]
X = vectorizer.fit_transform(content)
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'how', u'my', u'problems', u'to']

In [4]:
X.toarray().T

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]], dtype=int64)

# Counting words

In [5]:
import os

In [6]:
DIR = 'data/toy/'
# Read every file in DIR into memory, one string per post.
# os.listdir() returns names in arbitrary order, so sort them to keep the post
# indices reproducible between runs, and use a context manager so that each
# file handle is closed as soon as its contents have been read.
posts = []
for fname in sorted(os.listdir(DIR)):
    with open(os.path.join(DIR, fname)) as fh:
        posts.append(fh.read())

In [7]:
# same beginning as above section:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(min_df=1)

In [8]:
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 25


In [9]:
print vectorizer.get_feature_names()

[u'about', u'actually', u'capabilities', u'contains', u'data', u'databases', u'images', u'imaging', u'interesting', u'is', u'it', u'learning', u'machine', u'most', u'much', u'not', u'permanently', u'post', u'provide', u'safe', u'storage', u'store', u'stuff', u'this', u'toy']


In [10]:
# vectorize our new post as follows:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

# Note that the count vectors returned by the transform method are sparse. That is,
# each vector does not store one count value for each word, as most of those counts
# would be zero (the post does not contain the word). Instead, the result is a
# memory-efficient SciPy sparse matrix (a csr_matrix in recent scikit-learn
# releases) that stores only the non-zero entries. Our new post, for instance,
# actually contains only two elements:
print(new_post_vec)

# Via its member toarray(), we can again access full ndarray as follows:
print(new_post_vec.toarray())

  (0, 5)	1
  (0, 7)	1
[[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [11]:
# We need to use the full array if we want to use it as a vector for similarity
# calculations. For the similarity measurement (the naive one), we calculate the
# Euclidean distance between the count vectors of the new post and all the old
# posts as follows:
import scipy as sp

def dist_raw(v1, v2):
    """Return the norm of ``v1 - v2``, i.e. the distance between raw count vectors.

    No normalization is applied, so the vectors' magnitudes (for example how
    often a post repeats its words) directly affect the result.
    """
    return sp.linalg.norm(v1 - v2)

# The norm() function calculates the Euclidean norm (shortest distance).

In [12]:
post1 = X_train.toarray().T[:,0]
post2 = X_train.toarray().T[:,1]

print post1 - post2
print ''
print dist_raw(post1, post2)

[ 1  1 -1  1  0 -1  0 -1  1  1  1  1  1  0  1  1  0  1 -1  0 -1  0  1  1  1]

4.35889894354


In [13]:
# messing around with norm
def vec_norm(v):
    """Hand-rolled Euclidean norm: square root of the sum of squared components."""
    squared_total = sum(component ** 2 for component in v)
    return squared_total ** .5
                
vec_norm(post1-post2)

4.358898943540674

In [14]:
# Naive nearest-post search: compare the new post's count vector with the count
# vector of every training post and remember the one with the smallest distance.
best_post = None
best_score = float('inf')  # sentinel: any real distance is smaller than this

print("Post candidate:\n\t%s\n" % new_post)

# Densify both matrices once up front instead of on every loop iteration.
new_post_dense = new_post_vec.toarray()
train_dense = X_train.toarray()

for i, post in enumerate(posts):
    # Row i of the document-term matrix is the count vector of post i.
    dist = dist_raw(new_post_dense, train_dense[i])
    print("### Post %i distance == %f" % (i, dist))
    print(post)
    if dist < best_score:
        best_post = "Post %i" % i
        best_score = dist

print("\n%s is the best match with a score of %f" % (best_post, best_score))

Post candidate:
	imaging databases

### Post 0 distance == 4.000000
This is a toy post about machine learning. Actually, it contains not much interesting stuff.
### Post 1 distance == 1.732051
Imaging databases provide storage capabilities.
### Post 2 distance == 2.000000
Most imaging databases safe images permanently.
### Post 3 distance == 1.414214
Imaging databases store data.
### Post 4 distance == 5.099020
Imaging databases store data. Imaging databases store data. Imaging databases store data.

Post 3 is the best match with a score of 1.414214


In [15]:
# Looking at posts 3 and 4, however, the picture is not so clear any more. Post 4 is the
# same as Post 3, duplicated three times. So, it should also be of the same similarity to
# the new post as Post 3.
# Printing the corresponding feature vectors explains the reason:
print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())

# Obviously, using only the counts of the raw words is too simple. We will have to
# normalize them to get vectors of unit length.

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


## create function of above

In [16]:
def post_cluster(new_post, prev_posts, dist_measure=dist_raw):
    """Print the distance from ``new_post`` to each of ``prev_posts`` and the best match.

    The module-level ``vectorizer`` is re-fitted on ``prev_posts`` (so the
    vocabulary comes from those posts only) and then used to vectorize
    ``new_post``. Note that this re-fitting changes the state of the shared
    ``vectorizer`` object.

    Parameters
    ----------
    new_post : str
        Text of the post to find a match for.
    prev_posts : sequence of str
        Texts of the posts to compare against.
    dist_measure : callable, optional
        Called as ``dist_measure(new_vec, old_vec)`` with dense count vectors;
        must return a number where smaller means more similar. Defaults to
        ``dist_raw``.

    Returns
    -------
    None
        All results are printed.
    """
    best_post = None
    best_score = float('inf')  # sentinel: any real distance is smaller than this
    X_train = vectorizer.fit_transform(prev_posts)

    print("Post candidate:\n\t%s\n" % new_post)

    # Densify both matrices once instead of on every loop iteration.
    new_post_dense = vectorizer.transform([new_post]).toarray()
    train_dense = X_train.toarray()

    for i, post in enumerate(prev_posts):
        # Row i of the document-term matrix is the count vector of post i.
        dist = dist_measure(new_post_dense, train_dense[i])
        print("### Post %i distance == %.3f" % (i, dist))
        # Show the post that was actually compared (an entry of prev_posts),
        # not whatever happens to sit at the same index in the global `posts`.
        print(post)
        if dist < best_score:
            best_post = "Post %i" % i
            best_score = dist

    print("\n%s is the best match with a score of %.3f" % (best_post, best_score))
    
post_cluster('imagine imagining', posts[:3])

Post candidate:
	imagine imagining

### Post 0 distance == 3.742
This is a toy post about machine learning. Actually, it contains not much interesting stuff.
### Post 1 distance == 2.236
Imaging databases provide storage capabilities.
### Post 2 distance == 2.449
Most imaging databases safe images permanently.

Post 1 is the best match with a score of 2.236


# Normalizing the word count vectors

In [17]:
# Show that dividing a count vector by its norm removes the effect of
# repetition: posts 3 and 4 (post 4 is post 3 repeated three times) end up with
# the same unit-length vector.
# print() is always called with a single parenthesised argument so the cell
# behaves the same under Python 2 and Python 3 (`print (a)/b` would be parsed
# as `print(a) / b` by Python 3).
row3 = X_train.getrow(3).toarray()
row3_norm = sp.linalg.norm(row3)
print(row3)
print(row3_norm)
print(row3 / row3_norm)
print('')
row4 = X_train.getrow(4).toarray()
row4_norm = sp.linalg.norm(row4)
print(row4)
print(row4_norm)
print(row4 / row4_norm)

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
2.0
[[ 0.   0.   0.   0.   0.5  0.5  0.   0.5  0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.5  0.   0.   0. ]]

[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]
6.0
[[ 0.   0.   0.   0.   0.5  0.5  0.   0.5  0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.5  0.   0.   0. ]]


In [18]:
def dist_norm(v1, v2):
    """Return the distance between ``v1`` and ``v2`` after scaling each to unit length.

    Each vector is divided by its own norm first, so only the direction of the
    count vectors matters, not how often a post repeats its words.
    """
    unit_v1, unit_v2 = [vec / sp.linalg.norm(vec) for vec in (v1, v2)]
    return sp.linalg.norm(unit_v1 - unit_v2)

In [19]:
# use clustering function
post_cluster(new_post, posts, dist_norm)

Post candidate:
	imaging databases

### Post 0 distance == 1.414
This is a toy post about machine learning. Actually, it contains not much interesting stuff.
### Post 1 distance == 0.857
Imaging databases provide storage capabilities.
### Post 2 distance == 0.919
Most imaging databases safe images permanently.
### Post 3 distance == 0.765
Imaging databases store data.
### Post 4 distance == 0.765
Imaging databases store data. Imaging databases store data. Imaging databases store data.

Post 3 is the best match with a score of 0.765


# Removing less important words

Let us have another look at Post 2. Of its words that are not in the new post, we have
"most", "safe", "images", and "permanently". They differ considerably in their
overall importance to the post. Words such as "most" appear very often in all sorts of
different contexts; such words are called stop words. They do not carry
as much information, and thus should not be weighted as heavily as words such as
"images", which do not occur often in different contexts. The best option would be to
remove all words that are so frequent that they do not help to distinguish between
different texts, that is, the stop words.
As this is such a common step in text processing, there is a simple parameter in
CountVectorizer to achieve this, as follows:

In [20]:
vectorizer = CountVectorizer(min_df=1, stop_words='english')
print sorted(vectorizer.get_stop_words())[0:20]

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst']


In [21]:
# The new word list is seven words lighter:
X_train = vectorizer.fit_transform(posts)
print vectorizer.get_feature_names()

[u'actually', u'capabilities', u'contains', u'data', u'databases', u'images', u'imaging', u'interesting', u'learning', u'machine', u'permanently', u'post', u'provide', u'safe', u'storage', u'store', u'stuff', u'toy']


In [22]:
# Without stop words
post_cluster(new_post, posts, dist_norm)

Post candidate:
	imaging databases

### Post 0 distance == 1.414
This is a toy post about machine learning. Actually, it contains not much interesting stuff.
### Post 1 distance == 0.857
Imaging databases provide storage capabilities.
### Post 2 distance == 0.857
Most imaging databases safe images permanently.
### Post 3 distance == 0.765
Imaging databases store data.
### Post 4 distance == 0.765
Imaging databases store data. Imaging databases store data. Imaging databases store data.

Post 3 is the best match with a score of 0.765


# Stemming
## Installing and using NLTK
[already present with Anaconda; nice]

In [23]:
import nltk.stem
s= nltk.stem.SnowballStemmer('english')

In [24]:
[s.stem(x) for x in ["graphics", "imaging", "image", "imagination", "imagine"]]

[u'graphic', u'imag', u'imag', u'imagin', u'imagin']

In [25]:
[s.stem(x) for x in ["buys", "buying", "bought"]]

[u'buy', u'buy', u'bought']

## Extending the vectorizer with NLTK's stemmer
add'l source: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [26]:
# HELP: super

# Docstring:
# super(type, obj) -> bound super object; requires isinstance(obj, type)
# super(type) -> unbound super object
# super(type, type2) -> bound super object; requires issubclass(type2, type)

# Typical use to call a cooperative superclass method:
# class C(B):
#     def meth(self, arg):
#         super(C, self).meth(arg)
# Type:      type

In [27]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer additionally stems every token."""

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens.

        The parent class's analyzer is built first; each token it yields is
        then passed through ``english_stemmer.stem``.
        """
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_tokens(doc):
            # Lazily stem each token produced by the parent analyzer.
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens

vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [28]:
# we now have one feature less, because "images" and "imaging" collapsed to one
X_train = vectorizer.fit_transform(posts)
print vectorizer.get_feature_names()

[u'actual', u'capabl', u'contain', u'data', u'databas', u'imag', u'interest', u'learn', u'machin', u'perman', u'post', u'provid', u'safe', u'storag', u'store', u'stuff', u'toy']


In [29]:
post_cluster(new_post, posts, dist_norm)

Post candidate:
	imaging databases

### Post 0 distance == 1.414
This is a toy post about machine learning. Actually, it contains not much interesting stuff.
### Post 1 distance == 0.857
Imaging databases provide storage capabilities.
### Post 2 distance == 0.630
Most imaging databases safe images permanently.
### Post 3 distance == 0.765
Imaging databases store data.
### Post 4 distance == 0.765
Imaging databases store data. Imaging databases store data. Imaging databases store data.

Post 2 is the best match with a score of 0.630


# Stop words on steroids

In [30]:
# naive approach to tf-idf
import numpy as np

def tfidf(word, doc, doc_collection):
    """Return a naive tf-idf score of ``word`` in ``doc`` relative to ``doc_collection``.

    tf is the raw number of occurrences of ``word`` in ``doc``; it is
    deliberately NOT normalized by the length of ``doc``. idf is
    ``log(N / df)``, where ``N`` is the number of documents in
    ``doc_collection`` and ``df`` is the number of them that contain ``word``.

    Parameters
    ----------
    word : object
        The token to score.
    doc : iterable
        The tokens of the document being scored.
    doc_collection : sequence
        The tokenized documents that make up the corpus.

    Returns
    -------
    float
        ``tf * idf``; ``0.0`` when ``word`` does not occur in ``doc``.

    Raises
    ------
    ZeroDivisionError
        If ``word`` occurs in ``doc`` but in none of the documents of
        ``doc_collection`` (the idf is undefined in that case).
    """
    tf = sum(1 for token in doc if word == token)
    if tf == 0:
        # A word that is absent from the document scores zero whatever its idf
        # is; returning early also avoids dividing by a document frequency of
        # zero for words that appear nowhere in the collection.
        return 0.0

    doc_freq = sum(1 for candidate in doc_collection if word in candidate)
    if doc_freq == 0:
        raise ZeroDivisionError(
            "%r occurs in doc but in no document of doc_collection" % (word,))

    idf = np.log(len(doc_collection) / float(doc_freq))
    return tf * idf

In [31]:
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]
D, a

([['a'], ['a', 'b', 'b'], ['a', 'b', 'c']], ['a'])

In [32]:
print(tfidf("a", a, D))

print(tfidf("b", abb, D))

print(tfidf("a", abc, D))

print(tfidf("b", abc, D))

print(tfidf("c", abc, D))

0.0
0.810930216216
0.0
0.405465108108
1.09861228867
