# import libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.io
import math
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

# function for Kmeans

In [2]:
def kmeans(num_clusters,matrix):
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(matrix)
    clusters=kmeans.predict(matrix)
    return clusters

# function for NMF

In [3]:
def nmf(num_clusters,matrix):
    model = NMF(n_components=num_clusters,init='nndsvda', random_state=0)
    W = model.fit_transform(matrix)
    H = model.components_
    return W,H

# number of documents in each cluster

In [4]:
def num_of_docs_each_cluster(num_clusters,list):
    counter_clusters = np.zeros((num_clusters))
    for i in list:
        counter_clusters[i] = counter_clusters[i] + 1
    counter_clusters=np.array(counter_clusters)
    return counter_clusters

# function for Purity

In [5]:
def purity(num_clusters,list1,list2):
    sum=0
    for i in range(num_clusters):
        counter1 = np.zeros((num_clusters))
        w=0
        for j in range(len(list1)):
            k=int(list1[j])
            t = w + k
            for j2 in list2[w:t]:
                if(j2==i):
                    counter1[j]=counter1[j] + 1
            w=t 
        sum=sum+max(counter1)
    purity=sum/ int(np.sum(list1))
    return purity

# function for Entropy

In [6]:
def entropy(num_clusters,list1,list2):
    sum2=0
    for i in range(num_clusters):
        sum1=0
        w=0
        for j in range(len(list1)):
            counter2=0
            k=int(list1[j])
            t = w + k
            for j2 in list2[w:t]:
                if(j2==i):
                    counter2 +=1
            w=t
            if(counter2 !=0):
                sum1=sum1+ counter2* (math.log(counter2/(int(num_of_docs_each_cluster(num_clusters,list2)[i]))))
        sum2=sum2+sum1
    entropy= (-1)*sum2/ (int(np.sum(list1)*math.log(len(list1))))
    return entropy

# TDT2 dataset

In [7]:
mat_tdt2 = scipy.io.loadmat('TDT2.mat')
mat_tdt2

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN, Created on: Wed Apr 25 13:53:55 2007',
 '__version__': '1.0',
 '__globals__': [],
 'gnd': array([[ 1],
        [ 1],
        [ 1],
        ...,
        [30],
        [30],
        [30]], dtype=uint8),
 'fea': <9394x36771 sparse matrix of type '<class 'numpy.float64'>'
 	with 1224135 stored elements in Compressed Sparse Column format>}

In [8]:
fea_tdt2 = mat_tdt2.get('fea')
gnd_tdt2 = mat_tdt2.get('gnd')

## number of documents in each category

In [9]:
counter_tdt2 = np.zeros((30))
for i in gnd_tdt2:
    counter_tdt2[i-1] = counter_tdt2[i-1] + 1
counter_tdt2=np.array(counter_tdt2)
counter_tdt2

array([1844., 1828., 1222.,  811.,  441.,  407.,  272.,  238.,  226.,
        167.,  160.,  145.,  141.,  140.,  131.,  123.,  123.,  120.,
        104.,   98.,   76.,   74.,   72.,   71.,   66.,   65.,   63.,
         58.,   56.,   52.])

In [10]:
Top10_tdt2 = counter_tdt2[:10]
Last10_tdt2 =counter_tdt2[-10:]

In [11]:
Top10_tdt2

array([1844., 1828., 1222.,  811.,  441.,  407.,  272.,  238.,  226.,
        167.])

In [12]:
num_top10_tdt2=np.sum(Top10_tdt2)
num_top10_tdt2

7456.0

In [13]:
Top10_tdt2_10percent=[]
for a in Top10_tdt2:
    Top10_tdt2_10percent.append(math.ceil(a*0.1))
Top10_tdt2_10percent

[185, 183, 123, 82, 45, 41, 28, 24, 23, 17]

In [14]:
num_top10_tdt2_10percent=np.sum(Top10_tdt2_10percent)
num_top10_tdt2_10percent

751

In [15]:
Last10_tdt2

array([76., 74., 72., 71., 66., 65., 63., 58., 56., 52.])

In [16]:
num_last10_tdt2=np.sum(Last10_tdt2)
num_last10_tdt2

653.0

In [17]:
satr_tdt2=fea_tdt2.shape[0]
sotoon_tdt2=fea_tdt2.shape[1]
print(satr_tdt2)
print(sotoon_tdt2)

9394
36771


In [18]:
matrix_tdt2 = fea_tdt2.toarray()
matrix_tdt2

array([[2., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 2., 2., ..., 0., 0., 0.]])

In [19]:
np.shape(matrix_tdt2)

(9394, 36771)

## Top 10 of TDT2

In [20]:
matrix_tdt2_top10=matrix_tdt2[:int(num_top10_tdt2)]
matrix_tdt2_top10

array([[2., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 6., 0., ..., 0., 0., 0.]])

In [21]:
matrix_tdt2_top10.shape

(7456, 36771)

In [26]:
matrix_tdt2_top10_10percent=[]
e=0
for i in range(10):
    tp=Top10_tdt2_10percent[i]
    mp=matrix_tdt2_top10[e:e+tp]
    for j in mp:
        matrix_tdt2_top10_10percent.append(list(j))
    e+=int(Top10_tdt2[i])

In [27]:
np.shape(matrix_tdt2_top10_10percent)

(751, 36771)

### Kmeans for Top 10 of TDT2

In [28]:
clusters_tdt2_top10=kmeans(10,matrix_tdt2_top10_10percent)

#### clusters of documents with Kmeans for Top 10 of TDT2

In [29]:
clusters_tdt2_top10

array([2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 7, 1, 7, 1,
       8, 9, 1, 7, 1, 1, 7, 7, 7, 8, 8, 8, 8, 1, 7, 7, 1, 1, 1, 1, 7, 7,
       8, 1, 8, 8, 7, 7, 7, 8, 8, 1, 8, 7, 7, 1, 1, 7, 7, 7, 7, 1, 1, 7,
       7, 1, 1, 8, 1, 1, 7, 8, 7, 1, 7, 1, 1, 7, 1, 7, 7, 7, 1, 1, 8, 1,
       1, 7, 8, 1, 7, 7, 7, 7, 7, 8, 7, 7, 7, 1, 7, 7, 7, 7, 1, 8, 8, 1,
       1, 7, 7, 7, 7, 7, 3, 1, 3, 1, 7, 3, 7, 3, 3,

In [30]:
clusters_tdt2_top10.shape

(751,)

#### number of documents in each cluster

In [31]:
num_of_docs_each_cluster(10,clusters_tdt2_top10)

array([ 26., 553.,  26.,   7.,   6.,   1.,   5.,  89.,  37.,   1.])

#### purity of Kmeans clustering for Top 10 of TDT2

In [32]:
purity_kmeans_top10_tdt2=purity(10,Top10_tdt2_10percent,clusters_tdt2_top10)
purity_kmeans_top10_tdt2

0.4633821571238349

#### entropy of Kmeans clustering for Top 10 of TDT2

In [33]:
entropy_kmeans_top10_tdt2=entropy(10,Top10_tdt2_10percent,clusters_tdt2_top10)
entropy_kmeans_top10_tdt2

0.6693741797162261

### NMF for Top 10 of TDT2

In [34]:
W_tdt2_top10,H_tdt2_top10=nmf(10,matrix_tdt2_top10_10percent)

In [35]:
W_tdt2_top10.shape

(751, 10)

In [36]:
H_tdt2_top10.shape

(10, 36771)

#### clusters of documents with NMF for Top 10 of TDT2

In [37]:
max_index_tdt2_top10=[]
for document in W_tdt2_top10:
    max_index_tdt2_top10.append(np.argmax(document))
max_index_tdt2_top10

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 9,
 6,
 9,
 6,
 1,
 1,
 6,
 7,
 6,
 1,
 1,
 6,
 6,
 6,
 6,
 9,
 6,
 9,
 9,
 6,
 9,
 1,
 6,
 6,
 1,
 6,
 9,
 1,
 9,
 6,
 6,
 1,
 6,
 6,
 1,
 1,
 1,
 9,
 6,
 6,
 1,
 6,
 1,
 1,
 9,
 6,
 6,
 6,
 1,
 1,
 1,
 9,
 9,
 6,
 1,
 1,
 6,
 1,
 6,
 7,
 1,
 6,
 1,
 9,
 6,
 4,
 6,
 8,
 6,
 1,
 1,
 1,
 9,
 9,
 6,
 9,
 9,
 1,
 6,
 9,
 6,
 9,
 6,
 6,
 6,
 6,
 6,
 9,
 6,
 6,
 1,
 1,
 5,
 6,
 6,
 6,
 9,
 1,
 6,
 1,
 9,
 6,
 6,
 6,
 6,
 9,
 9,
 6,
 1,
 9,
 1,
 6,
 6,
 6,
 6,
 6,
 6,
 1,
 9,
 6,
 6,
 1,
 6,
 6,
 6,
 6,
 1,
 4,
 4,
 4,
 7,
 9,
 9,
 9,
 9,
 6,
 6,
 6,
 6,
 9,
 6,
 9,
 6,
 6,
 6,
 6,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 6,
 1,
 6,
 1,
 9,
 9,
 1,
 9,
 1,
 6,
 9,
 9,
 6,
 6,
 6,
 6,
 9,
 5,
 9,
 9,
 9,
 9,
 1,
 6,
 4,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 3,
 7,
 0,
 0,
 6,
 6,
 0,
 0,
 0,
 3,
 3,
 3,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 3,
 0,
 3,
 3,
 1,
 0,
 0,
 3,
 3,
 6,
 3,
 0,
 3,
 6,
 6,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 3,


In [38]:
np.shape(max_index_tdt2_top10)

(751,)

#### number of documents in each cluster

In [39]:
num_of_docs_each_cluster(10,max_index_tdt2_top10)

array([116.,  57.,  43.,  54.,  83.,  40., 149.,  10., 145.,  54.])

#### purity of NMF clustering for Top 10 of TDT2

In [40]:
purity_nmf_top10_tdt2=purity(10,Top10_tdt2_10percent,max_index_tdt2_top10)
purity_nmf_top10_tdt2

0.7190412782956058

#### entropy of NMF clustering for Top 10 of TDT2

In [41]:
entropy_nmf_top10_tdt2=entropy(10,Top10_tdt2_10percent,max_index_tdt2_top10)
entropy_nmf_top10_tdt2

0.36667244059857834

## Last 10 of TDT2

In [42]:
matrix_tdt2_last10=matrix_tdt2[-int(num_last10_tdt2):]
matrix_tdt2_last10

array([[8., 5., 1., ..., 0., 0., 0.],
       [9., 0., 1., ..., 0., 0., 0.],
       [4., 2., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 2., 2., ..., 0., 0., 0.]])

In [43]:
matrix_tdt2_last10.shape

(653, 36771)

### Kmeans for Last 10 of TDT2

In [44]:
clusters_tdt2_last10=kmeans(10,matrix_tdt2_last10)

#### clusters of documents with Kmeans for Last 10 of TDT2

In [45]:
clusters_tdt2_last10

array([7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, 7, 7, 7, 0, 7, 7,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 0, 7, 0, 0, 7, 0, 7, 0,
       7, 4, 4, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0,
       0, 0, 0, 7, 7, 0, 7, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0,

In [46]:
clusters_tdt2_last10.shape

(653,)

#### number of documents in each cluster

In [47]:
num_of_docs_each_cluster(10,clusters_tdt2_last10)

array([481.,  32.,   4.,  52.,   2.,  19.,   1.,  43.,  15.,   4.])

#### purity of Kmeans clustering for Last 10 of TDT2

In [48]:
purity_kmeans_last10_tdt2=purity(10,Last10_tdt2,clusters_tdt2_last10)
purity_kmeans_last10_tdt2

0.3767228177641654

#### entropy of Kmeans clustering for Top 10 of TDT2

In [49]:
entropy_kmeans_last10_tdt2=entropy(10,Last10_tdt2,clusters_tdt2_last10)
entropy_kmeans_last10_tdt2

0.7114184237947471

### NMF for Last 10 of TDT2

In [50]:
W_tdt2_last10,H_tdt2_last10=nmf(10,matrix_tdt2_last10)

In [51]:
W_tdt2_last10.shape

(653, 10)

In [52]:
H_tdt2_last10.shape

(10, 36771)

#### clusters of documents with NMF for Last 10 of TDT2

In [53]:
max_index_tdt2_last10=[]
for document in W_tdt2_last10:
    max_index_tdt2_last10.append(np.argmax(document))
max_index_tdt2_last10

[5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 5,
 0,
 5,
 5,
 0,
 5,
 5,
 0,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 5,
 5,
 5,
 8,
 7,
 9,
 9,
 5,
 8,
 1,
 1,
 5,
 5,
 8,
 5,
 5,
 5,
 5,
 3,
 3,
 3,
 5,
 5,
 5,
 5,
 5,
 1,
 1,
 6,
 3,
 1,
 3,
 5,
 7,
 6,
 3,
 9,
 3,
 5,
 0,
 4,
 9,
 1,
 7,
 6,
 9,
 9,
 0,
 9,
 8,
 9,
 9,
 3,
 3,
 5,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 9,
 9,
 1,
 5,
 1,
 5,
 5,
 5,
 5,
 7,
 5,
 8,
 8,
 5,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 5,
 5,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 5,
 2,
 2,
 8,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [54]:
np.shape(max_index_tdt2_last10)

(653,)

#### number of documents in each cluster

In [56]:
num_of_docs_each_cluster(10,max_index_tdt2_last10)

array([68., 63., 67., 79., 65., 86., 54., 22., 81., 68.])

#### purity of NMF clustering for Last 10 of TDT2

In [57]:
purity_nmf_last10_tdt2=purity(10,Last10_tdt2,max_index_tdt2_last10)
purity_nmf_last10_tdt2

0.8177641653905053

#### entropy of NMF clustering for Top 10 of TDT2

In [58]:
entropy_nmf_last10_tdt2=entropy(10,Last10_tdt2,max_index_tdt2_last10)
entropy_nmf_last10_tdt2

0.23441438872854614

# Reuters-21578 dataset

In [59]:
mat_reu = scipy.io.loadmat('Reuters21578.mat')
mat_reu

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN, Created on: Wed Jan 16 00:53:49 2008',
 '__version__': '1.0',
 '__globals__': [],
 'gnd': array([[ 1],
        [ 1],
        [ 1],
        ...,
        [63],
        [64],
        [65]], dtype=uint8),
 'trainIdx': array([[   1],
        [   2],
        [   3],
        ...,
        [8286],
        [8288],
        [8290]], dtype=uint16),
 'testIdx': array([[2674],
        [2675],
        [2676],
        ...,
        [8291],
        [8292],
        [8293]], dtype=uint16),
 'fea': <8293x18933 sparse matrix of type '<class 'numpy.float64'>'
 	with 389455 stored elements in Compressed Sparse Column format>}

In [60]:
fea_reu = mat_reu.get('fea')
gnd_reu = mat_reu.get('gnd')

## number of documents in each category

In [61]:
counter_reu = np.zeros((65))
for i in gnd_reu:
    counter_reu[i-1] = counter_reu[i-1] + 1
counter_reu=np.array(counter_reu)
counter_reu

array([3.713e+03, 2.055e+03, 3.210e+02, 2.980e+02, 2.450e+02, 1.970e+02,
       1.420e+02, 1.140e+02, 1.100e+02, 9.000e+01, 8.700e+01, 6.300e+01,
       6.000e+01, 5.300e+01, 4.500e+01, 4.500e+01, 4.400e+01, 4.200e+01,
       3.800e+01, 3.800e+01, 3.700e+01, 3.600e+01, 3.300e+01, 3.000e+01,
       2.700e+01, 2.400e+01, 2.300e+01, 2.000e+01, 1.900e+01, 1.800e+01,
       1.800e+01, 1.800e+01, 1.600e+01, 1.500e+01, 1.400e+01, 1.200e+01,
       1.100e+01, 1.100e+01, 1.100e+01, 1.000e+01, 1.000e+01, 9.000e+00,
       8.000e+00, 7.000e+00, 6.000e+00, 5.000e+00, 5.000e+00, 5.000e+00,
       4.000e+00, 4.000e+00, 4.000e+00, 3.000e+00, 3.000e+00, 3.000e+00,
       2.000e+00, 2.000e+00, 2.000e+00, 1.000e+00, 1.000e+00, 1.000e+00,
       1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00])

In [62]:
Top10_reu = counter_reu[:10]
Top20_reu = counter_reu[:20]

In [63]:
Top10_reu

array([3713., 2055.,  321.,  298.,  245.,  197.,  142.,  114.,  110.,
         90.])

In [64]:
num_top10_reu=np.sum(Top10_reu)
num_top10_reu

7285.0

In [65]:
Top10_reu_2=[]
Top10_reu_2.append(math.ceil(Top10_reu[0]*0.05))
Top10_reu_2.append(math.ceil(Top10_reu[1]*0.1))
for a1 in Top10_reu[2:]:
    Top10_reu_2.append(math.ceil(a1*0.4))
Top10_reu_2

[186, 206, 129, 120, 98, 79, 57, 46, 44, 36]

In [66]:
num_top10_reu_2=np.sum(Top10_reu_2)
num_top10_reu_2

1001

In [67]:
Top20_reu

array([3713., 2055.,  321.,  298.,  245.,  197.,  142.,  114.,  110.,
         90.,   87.,   63.,   60.,   53.,   45.,   45.,   44.,   42.,
         38.,   38.])

In [68]:
num_top20_reu=np.sum(Top20_reu)
num_top20_reu

7800.0

In [69]:
Top20_reu_2=[]
Top20_reu_2.append(math.ceil(Top20_reu[0]*0.05))
Top20_reu_2.append(math.ceil(Top20_reu[1]*0.1))
for a1 in Top20_reu[2:]:
    Top20_reu_2.append(math.ceil(a1))
Top20_reu_2

[186,
 206,
 321,
 298,
 245,
 197,
 142,
 114,
 110,
 90,
 87,
 63,
 60,
 53,
 45,
 45,
 44,
 42,
 38,
 38]

In [70]:
num_top20_reu_2=np.sum(Top20_reu_2)
num_top20_reu_2

2424

In [71]:
satr_reu=fea_reu.shape[0]
sotoon_reu=fea_reu.shape[1]
print(satr_reu)
print(sotoon_reu)

8293
18933


In [72]:
matrix_reu = fea_reu.toarray()
matrix_reu

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [73]:
np.shape(matrix_reu)

(8293, 18933)

## Top 10 of Reuters

In [74]:
matrix_reu_top10=matrix_reu[:int(num_top10_reu)]
matrix_reu_top10

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 2., 1., ..., 0., 0., 0.]])

In [75]:
matrix_reu_top10.shape

(7285, 18933)

In [76]:
matrix_reu_top10_2=[]
e=0
for i in range(10):
    tp=Top10_reu_2[i]
    mp=matrix_reu_top10[e:e+tp]
    for j in mp:
        matrix_reu_top10_2.append(list(j))
    e+=int(Top10_reu[i])

In [77]:
np.shape(matrix_reu_top10_2)

(1001, 18933)

### Kmeans for Top 10 of Reuters

In [78]:
clusters_reu_top10=kmeans(10,matrix_reu_top10_2)

#### clusters of documents with Kmeans for Top 10 of Reuters

In [79]:
clusters_reu_top10

array([1, 2, 2, ..., 2, 2, 2])

In [80]:
clusters_reu_top10.shape

(1001,)

#### number of documents in each cluster

In [81]:
num_of_docs_each_cluster(10,clusters_reu_top10)

array([ 29.,  89., 794.,   6.,   1.,   1.,  11.,   2.,  20.,  48.])

#### purity of Kmeans clustering for Top 10 of Reuters

In [82]:
purity_kmeans_top10_reu=purity(10,Top10_reu_2,clusters_reu_top10)
purity_kmeans_top10_reu

0.38161838161838163

#### entropy of Kmeans clustering for Top 10 of Reuters

In [83]:
entropy_kmeans_top10_reu=entropy(10,Top10_reu_2,clusters_reu_top10)
entropy_kmeans_top10_reu

0.7858856966447364

### NMF for Top 10 of Reuters

In [84]:
W_reu_top10,H_reu_top10=nmf(10,matrix_reu_top10_2)

In [85]:
W_reu_top10.shape

(1001, 10)

In [86]:
H_reu_top10.shape

(10, 18933)

#### clusters of documents with NMF for Top 10 of Reuters

In [87]:
max_index_reu_top10=[]
for document in W_reu_top10:
    max_index_reu_top10.append(np.argmax(document))
max_index_reu_top10

[2,
 2,
 0,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 4,
 2,
 2,
 8,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 0,
 4,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 2,
 4,
 2,
 2,
 4,
 3,
 2,
 2,
 8,
 4,
 2,
 4,
 4,
 2,
 8,
 8,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 2,
 4,
 0,
 4,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 8,
 2,
 2,
 4,
 2,
 4,
 0,
 8,
 4,
 2,
 7,
 2,
 2,
 2,
 9,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 0,
 0,
 8,
 4,
 2,
 2,
 2,
 3,
 4,
 2,
 2,
 2,
 4,
 2,
 0,
 2,
 0,
 2,
 0,
 4,
 2,
 2,
 2,
 2,
 0,
 4,
 2,
 2,
 2,
 4,
 3,
 8,
 4,
 4,
 2,
 0,
 2,
 2,
 4,
 2,
 4,
 2,
 9,
 2,
 4,
 2,
 2,
 8,
 2,
 4,
 4,
 8,
 2,
 2,
 2,
 2,
 0,
 2,
 4,
 2,
 2,
 0,
 4,
 2,
 3,
 2,
 4,
 2,
 2,
 0,
 0,
 4,
 4,
 0,
 4,
 4,
 4,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 4,
 0,
 4,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 9,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 4,
 0,
 0,
 3,
 4,
 0,
 0,
 3,
 0,
 4,
 8,
 4,
 0,
 0,
 0,
 3,


In [88]:
np.shape(max_index_reu_top10)

(1001,)

#### number of documents in each cluster

In [89]:
num_of_docs_each_cluster(10,max_index_reu_top10)

array([182., 102., 113., 146., 126.,  96.,  43.,  48.,  49.,  96.])

#### purity of NMF clustering for Top 10 of Reuters

In [90]:
purity_nmf_top10_reu=purity(10,Top10_reu_2,max_index_reu_top10)
purity_nmf_top10_reu

0.6653346653346653

#### entropy of NMF clustering for Top 10 of Reuters

In [91]:
entropy_nmf_top10_reu=entropy(10,Top10_reu_2,max_index_reu_top10)
entropy_nmf_top10_reu

0.44095378785685646

## Top 20 of Reuters

In [92]:
matrix_reu_top20=matrix_reu[:int(num_top20_reu)]
matrix_reu_top20

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 2., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [93]:
matrix_reu_top20.shape

(7800, 18933)

In [94]:
matrix_reu_top20_2=[]
e=0
for i in range(20):
    tp=Top20_reu_2[i]
    mp=matrix_reu_top20[e:e+tp]
    for j in mp:
        matrix_reu_top20_2.append(list(j))
    e+=int(Top20_reu[i])

In [95]:
np.shape(matrix_reu_top20_2)

(2424, 18933)

### Kmeans for Top 20 of Reuters

In [96]:
clusters_reu_top20=kmeans(20,matrix_reu_top20_2)

#### clusters of documents with Kmeans for Top 10 of Reuters

In [97]:
clusters_reu_top20

array([ 4, 11, 11, ...,  8, 11, 11])

In [98]:
clusters_reu_top20.shape

(2424,)

#### number of documents in each cluster

In [99]:
num_of_docs_each_cluster(20,clusters_reu_top20)

array([1.520e+02, 6.000e+01, 1.340e+02, 1.000e+00, 7.000e+01, 1.130e+02,
       8.000e+01, 6.000e+01, 2.300e+01, 2.300e+01, 2.000e+00, 1.381e+03,
       6.800e+01, 1.820e+02, 1.000e+00, 2.000e+01, 3.000e+00, 1.000e+00,
       4.400e+01, 6.000e+00])

#### purity of Kmeans clustering for Top 20 of Reuters

In [100]:
purity_kmeans_top20_reu=purity(20,Top20_reu_2,clusters_reu_top20)
purity_kmeans_top20_reu

0.33663366336633666

#### entropy of Kmeans clustering for Top 20 of Reuters

In [101]:
entropy_kmeans_top20_reu=entropy(20,Top20_reu_2,clusters_reu_top20)
entropy_kmeans_top20_reu

0.6765307064747804

### NMF for Top 20 of Reuters

In [102]:
W_reu_top20,H_reu_top20=nmf(20,matrix_reu_top20_2)

In [103]:
W_reu_top20.shape

(2424, 20)

In [104]:
H_reu_top20.shape

(20, 18933)

#### clusters of documents with NMF for Top 20 of Reuters

In [105]:
max_index_reu_top20=[]
for document in W_reu_top20:
    max_index_reu_top20.append(np.argmax(document))
max_index_reu_top20

[10,
 10,
 17,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 3,
 10,
 10,
 3,
 10,
 10,
 10,
 10,
 10,
 3,
 17,
 3,
 10,
 10,
 10,
 10,
 10,
 3,
 10,
 17,
 3,
 10,
 17,
 10,
 10,
 10,
 17,
 17,
 10,
 10,
 10,
 10,
 3,
 1,
 10,
 10,
 7,
 17,
 10,
 10,
 17,
 10,
 7,
 7,
 10,
 17,
 10,
 10,
 10,
 10,
 10,
 10,
 1,
 10,
 10,
 10,
 17,
 10,
 10,
 10,
 17,
 10,
 17,
 10,
 10,
 17,
 17,
 3,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 17,
 17,
 10,
 10,
 10,
 10,
 10,
 10,
 17,
 10,
 10,
 10,
 17,
 10,
 10,
 10,
 18,
 10,
 3,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 17,
 17,
 7,
 17,
 10,
 10,
 10,
 1,
 10,
 10,
 10,
 10,
 3,
 10,
 17,
 10,
 17,
 17,
 3,
 3,
 10,
 10,
 10,
 10,
 17,
 10,
 10,
 10,
 10,
 10,
 1,
 7,
 10,
 17,
 10,
 17,
 10,
 10,
 17,
 10,
 3,
 10,
 18,
 10,
 3,
 10,
 10,
 7,
 10,
 17,
 3,
 7,
 10,
 10,
 10,
 10,
 3,
 10,
 10,
 10,
 10,
 17,
 17,
 10,
 1,
 10,
 3,
 10,
 10,
 17,
 17,
 3,
 17,
 17,
 17,
 17,
 17,
 12,
 17,
 17,
 17,
 17,
 17,
 17,
 1,
 17,
 3,
 17,
 17,
 1

In [106]:
np.shape(max_index_reu_top20)

(2424,)

#### number of documents in each cluster

In [107]:
num_of_docs_each_cluster(20,max_index_reu_top20)

array([ 19., 288., 158., 139.,  53.,  57., 113., 112., 142., 110., 126.,
       163., 114.,  52.,  49.,  12.,  37., 403., 147., 130.])

#### purity of NMF clustering for Top 20 of Reuters

In [108]:
purity_nmf_top20_reu=purity(20,Top20_reu_2,max_index_reu_top20)
purity_nmf_top20_reu

0.5878712871287128

#### entropy of NMF clustering for Top 20 of Reuters

In [109]:
entropy_nmf_top20_reu=entropy(20,Top20_reu_2,max_index_reu_top20)
entropy_nmf_top20_reu

0.4372482361822522

In [110]:
compare = pd.DataFrame({
    "Data set": ["TDT2_Top10","TDT2_Last10","Reuters_Top10","Reuters_Top20"],
    "Kmeans_purity": [purity_kmeans_top10_tdt2,purity_kmeans_last10_tdt2,purity_kmeans_top10_reu,purity_kmeans_top20_reu],
    "Kmeans_entropy": [entropy_kmeans_top10_tdt2,entropy_kmeans_last10_tdt2,entropy_kmeans_top10_reu,entropy_kmeans_top20_reu],
    "NMF_purity": [purity_nmf_top10_tdt2,purity_nmf_last10_tdt2,purity_nmf_top10_reu,purity_nmf_top20_reu],
    "NMF_entropy": [entropy_nmf_top10_tdt2,entropy_nmf_last10_tdt2,entropy_nmf_top10_reu,entropy_nmf_top20_reu]
})
compare

Unnamed: 0,Data set,Kmeans_purity,Kmeans_entropy,NMF_purity,NMF_entropy
0,TDT2_Top10,0.463382,0.669374,0.719041,0.366672
1,TDT2_Last10,0.376723,0.711418,0.817764,0.234414
2,Reuters_Top10,0.381618,0.785886,0.665335,0.440954
3,Reuters_Top20,0.336634,0.676531,0.587871,0.437248
