In [1]:
import numpy as np
import pylab as pl
import pandas as pd
from sklearn.cluster import KMeans 

In [2]:
# Read the raw CSV; header=None means the first line is treated as data, not as column names,
# so the columns get integer labels.
Data = pd.read_csv('term-doc-mat.csv', header=None)
# Keep every column except the first one (the first column is pulled out separately as `terms`).
TD = Data.iloc[:,1:]
TD

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,24,32,12,6,43,2,0,3,1,6,4,0,0,0,0
1,9,5,5,2,20,0,1,0,0,0,27,14,3,2,11
2,0,3,0,0,3,7,12,4,27,4,0,1,0,0,0
3,3,0,0,0,0,16,0,2,25,23,7,12,21,3,2
4,1,0,0,0,0,33,2,0,7,12,14,5,12,4,0
5,12,2,0,0,27,0,0,0,0,22,9,4,0,5,3
6,0,0,0,0,0,18,32,22,34,17,0,0,0,0,0
7,1,0,0,0,2,0,0,0,3,9,27,7,5,4,4
8,21,10,16,7,31,0,0,0,0,0,0,0,0,1,0
9,2,0,0,2,0,27,4,2,11,8,33,16,14,7,3


In [3]:
# The first column of the CSV holds the term strings, one per row of the term-document matrix.
terms = Data.iloc[:,0]
terms

0      database
1         index
2    likelihood
3        linear
4        matrix
5         query
6    regression
7     retrieval
8           sql
9        vector
Name: 0, dtype: object

#### First, we want to do some document clustering. Since the data is in term-document format, we need to obtain the transpose of the TD matrix.

In [4]:
# Swap rows and columns so that each row describes one document.
DT = TD.transpose()

#### Now we have a document-term matrix:

In [5]:
# Display the transposed matrix: one row per document, one column per term index.
DT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,24,9,0,3,1,12,0,1,21,2
2,32,5,3,0,0,2,0,0,10,0
3,12,5,0,0,0,0,0,0,16,0
4,6,2,0,0,0,0,0,0,7,2
5,43,20,3,0,0,27,0,2,31,0
6,2,0,7,16,33,0,18,0,0,27
7,0,1,12,0,2,0,32,0,0,4
8,3,0,4,2,0,0,22,0,0,2
9,1,0,27,25,7,0,34,3,0,11
10,6,0,4,23,12,22,17,9,0,8


In [6]:
# Vocabulary size: `terms` has exactly one entry per term.
numTerms = terms.shape[0]
numTerms

10

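#### Next, we will transform the data to TFxIDF weights, where each raw term count is multiplied by IDF = log2(NDocs / DF), with DF being the number of documents that contain the term: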

In [7]:
# Find document frequencies for each term: the number of documents (rows of DT)
# in which the term's count is non-zero.
# Wrapping the per-term counts in a list yields a 1 x numTerms row vector, which
# later broadcasts against the full document-term matrix.
DF = np.array([(DT != 0).sum(axis=0)])
# Parenthesised print works on both Python 2 and Python 3.
print(DF)

[[10 11  8 10  9  8  5  9  6 12]]


In [8]:
# Number of documents = number of rows in the document-term matrix.
# Reading it from the shape avoids depending on a column labelled 0 existing in DT.
NDocs = DT.shape[0]
# Parenthesised print works on both Python 2 and Python 3.
print(NDocs)

15


In [9]:
# Build a float matrix with the same shape as DT in which every cell equals NDocs
NMatrix = np.full(np.shape(DT), NDocs, dtype=float)

In [10]:
# Turn every cell into an IDF value, log2(NDocs / DF).
# DF is a single row, so broadcasting repeats it down the rows of NMatrix; IDF
# depends only on the term (column), hence every row of the result is identical.
DivM = NMatrix / DF
IDF = np.log2(DivM)

In [11]:
# Show floats with two decimals and without scientific notation.
# NOTE: set_printoptions changes NumPy's global print settings, so arrays printed
# by later cells are affected as well.
np.set_printoptions(precision=2, suppress=True)
# Preview only the first two rows of IDF.
# Parenthesised print works on both Python 2 and Python 3.
print(IDF[0:2])

[[0.58 0.45 0.91 0.58 0.74 0.91 1.58 0.74 1.32 0.32]
 [0.58 0.45 0.91 0.58 0.74 0.91 1.58 0.74 1.32 0.32]]


In [12]:
# Finally compute the TFxIDF values for each document-term entry
# (element-wise product of the raw counts in DT and the IDF weights, which have the same shape)
DT_tfidf = DT * IDF

In [13]:
# Display the TFxIDF-weighted document-term matrix
DT_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,14.0391,4.027131,0.0,1.754888,0.736966,10.882687,0.0,0.736966,27.76049,0.643856
2,18.7188,2.237295,2.720672,0.0,0.0,1.813781,0.0,0.0,13.219281,0.0
3,7.01955,2.237295,0.0,0.0,0.0,0.0,0.0,0.0,21.15085,0.0
4,3.509775,0.894918,0.0,0.0,0.0,0.0,0.0,0.0,9.253497,0.643856
5,25.153388,8.94918,2.720672,0.0,0.0,24.486046,0.0,1.473931,40.979771,0.0
6,1.169925,0.0,6.348234,9.3594,24.319865,0.0,28.529325,0.0,0.0,8.692059
7,0.0,0.447459,10.882687,0.0,1.473931,0.0,50.7188,0.0,0.0,1.287712
8,1.754888,0.0,3.627562,1.169925,0.0,0.0,34.869175,0.0,0.0,0.643856
9,0.584963,0.0,24.486046,14.624063,5.158759,0.0,53.888725,2.210897,0.0,3.541209
10,3.509775,0.0,3.627562,13.454138,8.843587,19.951593,26.944363,6.63269,0.0,2.575425


#### Now we are ready for clustering

In [20]:
import kMeans

In [21]:
# Re-import the local kMeans module so that edits made to kMeans.py since the first import take effect.
# NOTE(review): the bare `reload` builtin exists only on Python 2.
reload(kMeans)

<module 'kMeans' from 'kMeans.py'>

In [23]:
# Convert the TFxIDF DataFrame to a plain NumPy array.
# This rebinds DT_tfidf, so the pandas row/column labels are no longer attached to it.
DT_tfidf = np.array(DT_tfidf)
# Run the local kMeans module on the documents, passing its distEuclid and randCent helpers.
# NOTE(review): 3 is presumably the number of clusters, and randCent presumably picks random
# starting centroids (results would then vary between runs unless a seed is fixed) — confirm in kMeans.py.
centroids_tfidf, clusters_tfidf = kMeans.kMeans(DT_tfidf, 3, kMeans.distEuclid, kMeans.randCent)

#### Let's take a look at the cluster centroids