In [30]:
import os
texts=set()
for item in os.listdir('/home/meeka/Desktop/NU/453/assn1/philosophy/corpus'):
    texts.add(item.split('_', 1)[0])

In [51]:
import re
import string
from nltk.corpus import stopwords

def loadtxt(filename):
    file=open(filename, 'r')
    text=file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens=doc.split()
    tokens=[word.lower() for word in tokens]
    re_punc=re.compile('[%s]' % re.escape(string.punctuation))
    tokens=[re_punc.sub('',w) for w in tokens]
    tokens=[word for word in tokens if word.isalpha()]
    stop_words=set(stopwords.words('english'))
    tokens=[word for word in tokens if not word in stop_words]
    tokens=[word for word in tokens if len(word)>1]
    return tokens

def doc_to_line(filename):
    doc=loadtxt(filename)
    tokens=clean_doc(doc)
    #tokens=[w for w in tokens if w in vocab]
    return ' '.join(tokens)

def process_docs(directory):
    lines=list()
    titles=list()
    for filename in os.listdir(directory):
        path=directory + '/' + filename
        line=doc_to_line(path)
        lines.append(line)
        titles.append(filename.strip('.txt'))
    return lines, titles


directory='/home/meeka/Desktop/NU/453/assn1/philosophy/corpus'
docs, labels=process_docs(directory)

vocab_filename='/home/meeka/Desktop/NU/453/assn1/philosophy/vocab.txt'
vocab=loadtxt(vocab_filename)

In [124]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
import numpy as np

vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000, min_df=2, stop_words='english', use_idf=True)
X=vectorizer.fit_transform(docs)
terms=vectorizer.get_feature_names()

print("Performing dimensionality reduction using LSA")
# Vectorizer results are normalized, which makes KMeans behave as spherical k-means 
#for better results. Since LSA/SVD results are not normalized, we have to redo the normalization.
svd = TruncatedSVD()
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))
print()

# #############################################################################
# Do the actual clustering

true_k = np.unique(labels).shape[0]
clust_nums=4

#km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000)
km = KMeans(n_clusters=clust_nums, init='k-means++', max_iter=100, n_init=1)
print("Clustering sparse data with %s" % km, '\n')
km.fit(X)

# homogeneity: each cluster contains only members of a single class.
# completeness: all members of a given class are assigned to the same cluster.
# v = (1 + beta) * homogeneity * completeness / (beta * homogeneity + completeness)
# Silhouette Coefficient: (b - a) / max(a, b)
# a: The mean distance between a sample and all other points in the same class.
# b: The mean distance between a sample and all other points in the next nearest cluster.                                     
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))
print()

print("Top terms per cluster:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
#order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(clust_nums):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

clusters = km.labels_.tolist()

Performing dimensionality reduction using LSA
Explained variance of the SVD step: 2%

Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1) 

Homogeneity: 0.196
Completeness: 1.000
V-measure: 0.328
Adjusted Rand-Index: 0.000
Silhouette Coefficient: 0.562

Top terms per cluster:
Cluster 0: logic logical set propositions russell truth numbers secondorder mathematics language
Cluster 1: mental properties experience content states moral knowledge truth world objects
Cluster 2: logic logical propositions truth russell properties language proposition set sentences
Cluster 3: mental experience content states properties phenomenal consciousness moral intentionality experiences


In [140]:
import pandas as pd
import re

subject=[]
topic=[]
for x in labels:
    subject.append(x.split('_')[0])
    topic.append(re.sub('[0-9]+','', x.split('_')[1]).lstrip())
    
dfc=pd.DataFrame()
dfc['cluster']=clusters
dfc['subject']=subject
dfc['topic']=topic

In [160]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

dist = 1 - cosine_similarity(X)

mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
xs, ys = pos[:, 0], pos[:, 1]

dfc['x']=xs
dfc['y']=ys
dfc.head()

Unnamed: 0,cluster,subject,topic,x,y
0,1,Neutral Monism,Neutrality,0.131076,0.140197
1,3,The Contents of Perception,Varieties of Conten,0.225731,0.241463
2,2,Vienna Circle,The Basics People Activities and Overview of D...,-0.061857,-0.06614
3,1,Plato,Can we know Platos mind,0.021418,0.022922
4,2,Medieval Theories: Properties of Terms,Appellation,-0.041804,-0.044694


In [159]:
from sklearn.metrics.pairwise import linear_kernel

cosine_similarities = linear_kernel(X[0:1], X).flatten()
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
print(related_docs_indices)
print(cosine_similarities[related_docs_indices])
print()
print(dfc.loc[[0]])
print(dfc.loc[[918]])
print(dfc.loc[[650]])

[  0 918 650 162]
[1.         0.99999995 0.99999949 0.99999791]

   cluster         subject       topic         x         y
0        1  Neutral Monism  Neutrality  0.131076  0.140197
     cluster      subject                      topic         x        y
918        1  Behaviorism  Popularity of Behaviorism  0.130977  0.14009
     cluster                                 subject               topic  \
650        1  Moral Psychology: Empirical Approaches  Egoism vs Altruism   

            x         y  
650  0.131406  0.140549  
