In [1]:
import os
from pprint import pprint
import pandas as pd 
import numpy as np
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
from nltk.corpus import stopwords
from gensim.test.utils import datapath
from nltk import WordPunctTokenizer
import enchant
import math
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
from sklearn.metrics.cluster import *
from collections import Counter
import random
from tqdm import tqdm
from tqdm import tqdm_notebook
import seaborn as sns
import copy
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
random.seed(1)
%matplotlib inline
# plt.rcParams['figure.figsize'] = [8, 8]
sns.set(style = "darkgrid")

%run fns.ipynb



In [2]:
# en_US spell-checking dictionary (pyenchant). Not referenced by the cells shown
# here -- presumably read as a global by helpers from fns.ipynb (TODO confirm).
d = enchant.Dict("en_US")
# NLTK English stop words as a set; likewise presumably consumed by the
# fns.ipynb helpers (TODO confirm).
stop_ws = set(stopwords.words('english'))
# Relative directory handed to collect_documents() in the next cell.
path = "dataset/"

In [3]:
# collect_documents is not defined in this notebook (presumably loaded from
# fns.ipynb). It returns three values: `docs` (becomes the 'text' column below),
# `docs_label` (becomes the 'class' column) and `label_mapped`.
# NOTE(review): the printed dict looks like a newsgroup-name -> integer id
# mapping; confirm whether that is `label_mapped`.
docs, docs_label, label_mapped = collect_documents(path)
# Presumably the number of classes/clusters (five newsgroups appear in the
# printed mapping) -- TODO confirm; later k_means calls pass a literal 5.
C = 5

{'rec.sport.hockey': 0, 'sci.space': 1, 'comp.graphics': 2, 'sci.med': 3, 'talk.politics.misc': 4}


In [4]:
# One row per document: keys of `docs` become the index, values the 'text' column.
data = pd.DataFrame.from_dict(docs, orient = 'index', columns = ['text'])
# Keep the document key available as an ordinary column too.
data['doc_id'] = data.index
# NOTE(review): a plain list is assigned by position, whereas a pandas Series
# would be aligned on the index -- confirm what collect_documents returns.
data['class'] = docs_label
data.head()

Unnamed: 0,text,doc_id,class
415,b'Newsgroups: comp.graphics\nPath: cantaloupe....,415,2
926,b'Newsgroups: comp.graphics\nPath: cantaloupe....,926,2
511,"b""Xref: cantaloupe.srv.cs.cmu.edu comp.sys.hp:...",511,2
492,b'Newsgroups: comp.graphics\nPath: cantaloupe....,492,2
141,b'Newsgroups: comp.graphics\nPath: cantaloupe....,141,2


In [5]:
def joinList(inputList):
    """Return the strings of ``inputList`` concatenated with single spaces."""
    separator = " "
    joined = separator.join(inputList)
    return joined

In [6]:
def applyPreprocessing(data, col_name):
    """Tokenise one text column and store it back as space-joined strings.

    The column ``col_name`` of ``data`` is overwritten in place: every entry
    is first passed through ``tokenizeDocument`` (defined outside this
    notebook) and the result is then glued back together with ``joinList``.

    Returns the rewritten column.
    """
    # Step 1: raw text -> whatever tokenizeDocument yields for each entry.
    tokenised = data[col_name].apply(tokenizeDocument)
    data[col_name] = tokenised

    # Step 2: per-entry result -> single space-separated string.
    rejoined = data[col_name].apply(joinList)
    data[col_name] = rejoined

    return data[col_name]

In [7]:
def getBagofwords(data, col_name):
    """Fit a word-level ``CountVectorizer`` on ``data[col_name]``.

    Returns a ``(vectorizer, counts)`` pair: the fitted vectorizer and the
    document-term matrix produced by ``fit_transform`` on that column.
    """
    word_counter = CountVectorizer(analyzer = 'word')
    term_counts = word_counter.fit_transform(data[col_name])
    return word_counter, term_counts


def featureNameToIndex(B, feat):
    """Look up ``feat`` in ``B.vocabulary_``; returns ``None`` when it is absent."""
    vocabulary = B.vocabulary_
    column = vocabulary.get(feat)
    return column

In [8]:
# Tokenise/clean the raw text in place, then count word occurrences.
data['text'] = applyPreprocessing(data, 'text')
# The sparse count matrix gets its own name so that `bag_of_words` only ever
# refers to the dense, normalised array that is clustered below.
vectorizer_content, bow_counts = getBagofwords(data, 'text')

print('Total no. of documents : ', bow_counts.shape[0])
print('Total no. of features (full text) : ', bow_counts.shape[1])

bag_of_words = bow_counts.toarray()
# Scale every document vector to unit L2 length. A document left with no
# tokens has norm 0; dividing by it would yield inf and turn the whole row
# into NaN (0 * inf), so such rows use a divisor of 1 and stay all-zero.
row_norms = np.linalg.norm(bag_of_words, axis = 1)
row_norms[row_norms == 0] = 1.0
bag_of_words = bag_of_words * (1/row_norms)[:, np.newaxis]

Total no. of documents :  5000
Total no. of features (full text) :  19714


## KMeans

In [9]:
def claculateMetrics(final_assignments, true_labels):
    """Print external clustering-quality metrics for a cluster assignment.

    Every cluster is labelled with the most frequent true class among its
    members. Those per-point predicted labels are then compared with the
    true labels using homogeneity, completeness, purity and the adjusted
    Rand index.

    Parameters
    ----------
    final_assignments : dict
        Values are the members of each cluster, used to index a NumPy array
        of length ``len(true_labels)`` -- i.e. 0-based row positions (they
        index ``pred_labels`` directly). Keys are not used.
    true_labels : array-like
        True class of every point, in row order. A pandas Series is accepted;
        only its values (not its index) are used.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Use a plain array so members are looked up by POSITION. Indexing a
    # Series that carries a non-default integer index (e.g. document ids)
    # with `series[positions]` is label-based and would pair points with the
    # wrong classes, while pred_labels and the sklearn scorers are positional.
    labels = np.asarray(true_labels)
    n_points = labels.shape[0]
    # Sized from the argument rather than from the notebook-global `data`.
    pred_labels = np.zeros((n_points,))

    purity = 0
    for members in final_assignments.values():
        # An empty cluster has no majority class; np.max would raise on it.
        if len(members) == 0:
            continue
        classes, counts = np.unique(labels[members], return_counts = True)
        purity += np.max(counts)
        pred_labels[members] = classes[np.argmax(counts)]

    # A clustering result satisfies homogeneity if all of its clusters
    # contain only data points which are members of a single class
    print('Homogeneity Score: ', homogeneity_score(labels, pred_labels))
    print('Completeness Score: ', completeness_score(labels, pred_labels))
    print('Purity :', purity/n_points )
    print('ARI: ', adjusted_rand_score(labels, pred_labels))
    return

In [10]:
# Cluster the normalised bag-of-words rows. k_means is not defined in this
# notebook (presumably loaded from fns.ipynb).
# NOTE(review): 5 looks like the number of clusters and 0.001 like a
# convergence tolerance -- confirm against k_means' signature.
final_assignments = k_means(bag_of_words, 5, 0.001)
# Score the resulting clusters against the true class ids.
claculateMetrics(final_assignments, data['class'])

Intitial Centroids:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Iteration number:  1
cluster no.,  0  # points  561
cluster no.,  1  # points  3479
cluster no.,  2  # points  81
cluster no.,  3  # points  548
cluster no.,  4  # points  331
SSE- 5970.409958420469
Iteration number:  2
cluster no.,  0  # points  486
cluster no.,  1  # points  2724
cluster no.,  2  # points  519
cluster no.,  3  # points  750
cluster no.,  4  # points  521
SSE- 4483.061052322285
Iteration number:  3
cluster no.,  0  # points  438
cluster no.,  1  # points  2344
cluster no.,  2  # points  914
cluster no.,  3  # points  725
cluster no.,  4  # points  579
SSE- 4452.638915669482
Iteration number:  4
cluster no.,  0  # points  424
cluster no.,  1  # points  2024
cluster no.,  2  # points  1253
cluster no.,  3  # points  702
cluster no.,  4  # points  597
SSE- 4439.513883877621
Iteration number:  5
cluster no.,  0  # points  419
c

## Word2Vec

In [11]:
# Load pre-trained word vectors from a binary word2vec-format file, looked up
# relative to the working directory. The file name suggests the 300-dimensional
# GoogleNews vectors; loading a file of this size is slow and memory-hungry.
w2v_model =  KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [12]:
# gensim >= 4.0 removed KeyedVectors.vocab in favour of key_to_index; use the
# new attribute when it exists and fall back to the old one otherwise, so the
# cell runs on either major version.
w2v_vocab = getattr(w2v_model, 'key_to_index', None)
if w2v_vocab is None:
    w2v_vocab = w2v_model.vocab
print('total vocab', len(w2v_vocab))

total vocab 3000000


In [13]:
# One row per document. The width is taken from the loaded model's embedding
# size (300 for the GoogleNews vectors) instead of a hard-coded literal, so
# swapping in a different embedding file cannot cause a shape mismatch.
w2v_data = np.zeros((data.shape[0], w2v_model.vector_size))

In [14]:
# Row r of w2v_data receives whatever sentenceVector() returns for the text of
# the r-th document (documents are visited in DataFrame row order).
for row_pos, doc_text in enumerate(data['text']):
    w2v_data[row_pos, :] = sentenceVector(w2v_model, doc_text)

In [15]:
# Same clustering and scoring as for the bag-of-words features, this time on
# the per-document word2vec vectors.
# NOTE(review): 5 / 0.001 are presumably cluster count / tolerance -- confirm
# against k_means' signature.
final_assignments_w2v = k_means(w2v_data, 5, 0.001)
claculateMetrics(final_assignments_w2v, data['class'])

Intitial Centroids:  [[ 0.01692458  0.03138805  0.03153076 ... -0.06549072 -0.02737968
  -0.01486005]
 [-0.01915077  0.05320601  0.00872223 ... -0.03938987 -0.02153669
   0.02047918]
 [ 0.00499488  0.03233916 -0.03454566 ... -0.00706422 -0.00031306
  -0.03999811]
 [-0.05045553  0.05799685  0.01443399 ... -0.03261097 -0.00331161
   0.04978638]
 [ 0.0421592   0.03538631  0.0506137  ... -0.04778818 -0.03675092
  -0.01349187]]
Iteration number:  1
cluster no.,  0  # points  1931
cluster no.,  1  # points  1488
cluster no.,  2  # points  727
cluster no.,  3  # points  724
cluster no.,  4  # points  130
SSE- 2769.629772199673
Iteration number:  2
cluster no.,  0  # points  1402
cluster no.,  1  # points  1308
cluster no.,  2  # points  1087
cluster no.,  3  # points  801
cluster no.,  4  # points  402
SSE- 2124.2396316456334
Iteration number:  3
cluster no.,  0  # points  1189
cluster no.,  1  # points  1252
cluster no.,  2  # points  1168
cluster no.,  3  # points  750
cluster no.,  4  # po