# Count Vectorizer

Goal: Create an algo that computes vectors of counts of words for texts.

In [50]:
import numpy  as np
import scipy as sp
import pandas as pd
from sklearn.cluster import kmeans_plusplus
import glob

Get full vocabulary size from dataset

In [65]:
vocab = set()

def getTextVocabulary(filepath, vocab):
   vect = dict()
   with open(filepath) as file:
      for line in file:
         for word in line.lower().translate({ord(i): None for i in '.?!-,"()'}).split():
            if word in vect:
               vect[word]+=1
            else:
               vect[word]=1
   return vocab.union({*vect.keys()})

filepaths = glob.glob("../datasets/bbc/*/*.txt")
dataset_size = len(filepaths)

for file in filepaths:
   try:
      vocab = getTextVocabulary(file,vocab)
   except Exception:
      print(Exception)
   
vocab

<class 'Exception'>


{'airliner',
 'ellman',
 'daley',
 'challenger',
 'supports',
 '2628',
 "snow's",
 '51yearold',
 'efforts',
 '6600',
 'convert',
 'damming',
 "child's",
 'competitve',
 'near',
 'pitt',
 'henri',
 'spends',
 'blewitt',
 'cameroon',
 'hybrid',
 'easterby;',
 'algorithm',
 'andrew',
 'honestly',
 '$151m',
 'grab',
 'heir',
 'traviata',
 'toppling',
 'lovers',
 'intentions',
 'dimech',
 'interpreting',
 'seemed',
 'handcuffed',
 '£27473m',
 "offer'",
 'rockwool',
 'carries',
 'deficit',
 'vpcc4',
 'oil',
 'grants',
 'interactivity',
 'us$370m',
 'penalty',
 'nutritional',
 '1723',
 'clad',
 'viticulture',
 'yukos',
 'licences',
 'tours',
 'bbc',
 'sibneft',
 '51%',
 'flyhalf',
 '2101',
 'walker',
 'mbabane',
 'fm',
 'rossi',
 'rebound',
 'takeaway',
 'buyer',
 'relying',
 'quarrying',
 "jacob's",
 'highestcharting',
 'heroics',
 'camille',
 'thayer',
 'antiques',
 'arrangements',
 'coincided',
 'schwarzenegger',
 '311764',
 'inflated',
 'carter',
 "competitors'",
 'spikey',
 'hots',
 'pre

Then compute vectors for each text

In [89]:
vocabList = list(vocab)
vocabulary_size = len(vocabList)
print("vocab size: ", vocabulary_size)

def getTextVector(filepath):
  textVect = np.zeros((vocabulary_size,1))
  with open(filepath) as file:
      for line in file:
        for word in line.lower().translate({ord(i): None for i in '.?!-,"()'}).split():
          textVect[vocabList.index(word),0]+=1
  # normalize vector
  textVect = textVect/np.sum(textVect)
  return textVect

#initialization
X = getTextVector(filepaths[0])
print("single vector shape: ", X.shape) 
print(X)

for filepath in filepaths[1:]:
   try:
      X = np.c_[X, getTextVector(filepath)]
   except Exception:
      print(Exception)
  

X.shape

vocab size:  37124
single vector shape:  (37124, 1)
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
<class 'Exception'>


(37124, 2224)

In [94]:
# Calculate seeds from k-means++
centers_init, indices = kmeans_plusplus(X.T, n_clusters=5, random_state=0)

In [95]:
print(indices)
print(centers_init.shape)

[1653 1853  787  112 1744]
(5, 37124)


Computing distance between K-centers and texts

In [96]:
V_sport = getTextVector("../datasets/bbc/sport/001.txt")
V_business = getTextVector("../datasets/bbc/business/001.txt")
V_entertainment = getTextVector("../datasets/bbc/entertainment/001.txt")
V_politics = getTextVector("../datasets/bbc/politics/001.txt")
V_tech = getTextVector("../datasets/bbc/tech/001.txt")


In [98]:
def vector_distance(v1, v2):
    return np.sqrt(np.sum(np.square(v1 - v2)))

for i in range(5):
    print(vector_distance(V_sport, centers_init[i,:]))

31.648981274266795
29.99181733299662
32.14846102245641
32.37248943908978
32.321061723628375


Print the N most common words from the clusters

In [112]:
N = 30
ind_0 = np.argpartition(centers_init[4,:], -N)[-N:]

for i in range(N):
    print(vocabList[ind_0[i]])


### Results
# Cluster 5 seems to be politics
# Cluster 3 seems to be business
# Cluster 2 seems to tech
#

each
that
same
also
civil
as
agendas
chancellor
are
a
were
baume
government
in
blair
he
prime
minister
union
not
brown
on
said
was
and
the
of
between
mr
to
