# Converting raw text into a bag-of-words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
# Create the bag-of-words vectorizer that the later cells fit and reuse.
# min_df=1 is passed explicitly; the repr printed in the next cell shows it
# together with every other setting left at its default.
vectorizer = CountVectorizer(min_df=1)

In [2]:
# Show the vectorizer's full configuration.  Written as a print() call so it
# matches the print(...) style used by the other cells; with a single argument
# it behaves the same on Python 2 and Python 3.
print(vectorizer)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [3]:
# Two tiny example documents that share most of their words.
content = [
    "How to format my hard disk",
    " Hard disk format problems ",
]

# Learn the vocabulary and build the document/word count matrix in one step.
X = vectorizer.fit_transform(content)

# Last expression of the cell: displays the learned vocabulary.
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'how', u'my', u'problems', u'to']

In [7]:
# Dense counts, transposed so that each row is a vocabulary word and each
# column is one of the two example documents.
X.toarray().transpose()

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]], dtype=int64)

# Counting words

In [8]:
import os

In [16]:
# Directory holding the toy posts, one post per file.
DIR = 'data/toy/'

# Read every file in DIR into memory as one string per post.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the "Post i" numbering used in later cells reproducible across machines.
# - with-block: closes each file handle promptly instead of leaving it to the
#   garbage collector.
posts = []
for f in sorted(os.listdir(DIR)):
    with open(os.path.join(DIR, f)) as post_file:
        posts.append(post_file.read())

In [17]:
# same beginning as above section:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(min_df=1)

In [19]:
# Re-fit the vectorizer on the toy posts; this replaces the vocabulary learned
# from the two example sentences earlier in the notebook.
X_train = vectorizer.fit_transform(posts)

# The matrix has one row per post and one column per vocabulary word.
num_samples = X_train.shape[0]
num_features = X_train.shape[1]
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 25


In [25]:
# List the vocabulary learned from the toy posts.  Uses a print() call to stay
# consistent with the other cells; a single-argument print() behaves the same
# on Python 2 and Python 3.
print(vectorizer.get_feature_names())

[u'about', u'actually', u'capabilities', u'contains', u'data', u'databases', u'images', u'imaging', u'interesting', u'is', u'it', u'learning', u'machine', u'most', u'much', u'not', u'permanently', u'post', u'provide', u'safe', u'storage', u'store', u'stuff', u'this', u'toy']


In [26]:
# Vectorize a new, unseen post with the vocabulary learned from the toy posts.
# The post is wrapped in a one-element list, i.e. the same list-of-documents
# shape that was handed to fit_transform() earlier.
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

# The count vectors returned by transform() are sparse: they do not store one
# count per vocabulary word, because most of those counts would be zero (the
# post simply does not contain the word).  Only the non-zero entries are kept
# in a memory-efficient sparse matrix (described here as coo_matrix, for
# "COOrdinate").
# NOTE(review): the concrete sparse class may differ between scikit-learn
# versions - confirm with type(new_post_vec).
# For this post only two entries are stored:
print(new_post_vec)

# toarray() expands the sparse vector back into a full ndarray.
dense_new_post = new_post_vec.toarray()
print(dense_new_post)

  (0, 5)	1
  (0, 7)	1
[[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [38]:
# We need to use the full (dense) array if we want to use it as a vector for
# similarity calculations. For the naive similarity measurement we calculate
# the Euclidean distance between the count vectors of the new post and all the
# old posts as follows:
import scipy as sp
# Import the submodule explicitly: "import scipy" alone does not guarantee that
# sp.linalg is loaded on every SciPy version.
import scipy.linalg


def dist_raw(v1, v2):
    """Return the Euclidean distance between two dense count vectors.

    v1, v2 -- dense arrays that support subtraction with each other, for
              example the output of toarray().  A (1, n) array and an (n,)
              array broadcast against each other, which is how the best-match
              loop later in the notebook calls this function.

    The norm() function calculates the Euclidean norm (shortest distance) of
    the difference vector.  For a 2-D difference it returns the Frobenius norm,
    which is the same number for a single-row array.
    """
    delta = v1 - v2
    return sp.linalg.norm(delta)

In [39]:
# Dense count vectors of the first two training posts.  Row i of the dense
# matrix holds the same data as X_train.toarray().T[:, i], so neither the
# transpose nor a second toarray() call is needed.
X_train_dense = X_train.toarray()
post1 = X_train_dense[0]
post2 = X_train_dense[1]

# Element-wise difference, a blank separator line, then the distance.
# print() with a single argument behaves the same on Python 2 and Python 3 and
# matches the print(...) calls used in the other cells.
print(post1 - post2)
print('')
print(dist_raw(post1, post2))

[ 1  1 -1  1  0 -1  0 -1  1  1  1  1  1  0  1  1  0  1 -1  0 -1  0  1  1  1]

4.35889894354


In [49]:
# Experiment: a hand-rolled Euclidean norm in plain Python.
def vec_norm(v):
    """Return the square root of the sum of the squared components of v."""
    squared_total = 0
    for component in v:
        squared_total = squared_total + component ** 2
    return squared_total ** 0.5
                
# Cross-check: the hand-rolled norm of the difference vector should agree with
# the dist_raw(post1, post2) value printed in the previous cell.
vec_norm(post1-post2)

4.358898943540674

In [59]:
# Find the stored post whose raw count vector is closest (smallest Euclidean
# distance) to the new post.
best_post = None
# Start from +infinity so that any real distance qualifies as an improvement;
# a fixed sentinel would silently reject every post at or beyond that distance.
best_score = float('inf')
best_i = -1

print("Post candidate:\n\t%s\n" % (new_post))

# Densify once up front; these do not change between iterations.
new_post_dense = new_post_vec.toarray()
train_dense = X_train.toarray()

for i, post in enumerate(posts):
    # train_dense[i] is the count vector of post i (same data as
    # X_train.toarray().T[:, i]).
    dist = dist_raw(new_post_dense, train_dense[i])
    print("### Post %i distance == %f" % (i, dist))
    print(post)
    if dist < best_score:
        best_post = "Post %i" % (i)
        best_score = dist
        best_i = i  # remember the index of the winner as well as its label

print("\n%s is the best match with a score of %f" % (best_post, best_score))

Post candidate:
	imaging databases

### Post 0 distance == 4.000000
This is a toy post about machine learning. Actually, it contains not much interesting stuff.
### Post 1 distance == 1.732051
Imaging databases provide storage capabilities.
### Post 2 distance == 2.000000
Most imaging databases safe images permanently.
### Post 3 distance == 1.414214
Imaging databases store data.
### Post 4 distance == 5.099020
Imaging databases store data. Imaging databases store data. Imaging databases store data.

Post 3 is the best match with a score of 1.414214


In [60]:
# Posts 3 and 4 make the picture less clear.  Post 4 is just Post 3 repeated
# three times, so it ought to be exactly as similar to the new post as Post 3
# is - yet the distances above disagree.
# Printing the two feature vectors shows why:
for row_index in (3, 4):
    print(X_train.getrow(row_index).toarray())

# Evidently raw word counts alone are too simple a representation.  The count
# vectors have to be normalized to unit length before they are compared.

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


# Normalizing the word count vectors