# Clustering the gutenberg corpus

In [12]:
import gensim
import logging, bz2, os
from corpushash import CorpusHash
from nltk.corpus import gutenberg
import numpy as np
import string
from gensim import corpora, models, similarities

In [13]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [14]:
np.random.seed(42)

In [15]:
%%time
decoded_gutencorpus = []
for document_name in gutenberg.fileids():
    document = [word.lower() for word in gutenberg.words(document_name) if word not in string.punctuation and not word.isdigit()]
    decoded_gutencorpus.append(document)

CPU times: user 19.2 s, sys: 392 ms, total: 19.6 s
Wall time: 19.6 s


In [16]:
id2word = gensim.corpora.Dictionary(decoded_gutencorpus)

2017-05-22 21:37:39,599 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-05-22 21:37:43,520 : INFO : built Dictionary(42020 unique tokens: ['gothic', 'covering', 'crypts', 'unrolling', 'willes']...) from 18 documents (total 2161659 corpus positions)


In [17]:
id2word[0]

'covering'

In [18]:
mm = [id2word.doc2bow(text) for text in decoded_gutencorpus]
gensim.corpora.MmCorpus.serialize('wiki_pt_tfidf.mm', mm)

2017-05-22 21:37:46,589 : INFO : storing corpus in Matrix Market format to wiki_pt_tfidf.mm
2017-05-22 21:37:46,595 : INFO : saving sparse matrix to wiki_pt_tfidf.mm
2017-05-22 21:37:46,602 : INFO : PROGRESS: saving document #0
2017-05-22 21:37:47,893 : INFO : saved 18x42020 matrix, density=16.126% (121967/756360)
2017-05-22 21:37:47,898 : INFO : saving MmCorpus index to wiki_pt_tfidf.mm.index


In [19]:
%%time
if os.path.exists('wiki_tfidf_model'):
    tfidf = models.TfidfModel.load('wiki_tfidf_model')
else:
    tfidf = models.TfidfModel(mm)

2017-05-22 21:37:47,930 : INFO : collecting document frequencies
2017-05-22 21:37:47,936 : INFO : PROGRESS: processing document #0
2017-05-22 21:37:48,104 : INFO : calculating IDF weights for 18 documents and 42019 features (121967 matrix non-zeros)


CPU times: user 320 ms, sys: 16 ms, total: 336 ms
Wall time: 345 ms


In [20]:
tfidf.save('wiki_tfidf_model')

2017-05-22 21:37:48,295 : INFO : saving TfidfModel object under wiki_tfidf_model, separately None
2017-05-22 21:37:48,335 : INFO : saved wiki_tfidf_model


The next step is to train the LSI model with a tfidf transformed corpus. So we will need yet another generator to yield the transformed corpus.

In [21]:
def tfidf_corpus_stream(corpus):
    for doc in corpus:
        yield tfidf[doc]

In [22]:
tfidf_corpus_s = tfidf_corpus_stream(mm)

## Calculating the LSI model

In [23]:
if os.path.exists('wiki_lsi_model'):
    lsi = gensim.models.LsiModel.load('wiki_lsi_model')
else:
    lsi = gensim.models.lsimodel.LsiModel(corpus=tfidf_corpus_s, id2word=id2word, num_topics=18)
    lsi.save('wiki_lsi_model')

2017-05-22 21:37:48,439 : INFO : loading LsiModel object from wiki_lsi_model
2017-05-22 21:37:48,579 : INFO : loading id2word recursively from wiki_lsi_model.id2word.* with mmap=None
2017-05-22 21:37:48,585 : INFO : setting ignored attribute dispatcher to None
2017-05-22 21:37:48,589 : INFO : setting ignored attribute projection to None
2017-05-22 21:37:48,594 : INFO : loaded wiki_lsi_model
2017-05-22 21:37:48,598 : INFO : loading LsiModel object from wiki_lsi_model.projection
2017-05-22 21:37:48,739 : INFO : loaded wiki_lsi_model.projection


In [24]:
for n in range(10):
    print("====================")
    print("Topic {}:".format(n))
    print("Coef.\t Token")
    print("--------------------")
    for tok,coef in lsi.show_topic(n):
        print("{:.3}\t{}".format(coef,tok))

Topic 0:
Coef.	 Token
--------------------
0.22	thee
0.176	--
0.165	thou
0.158	."
0.146	,"
0.143	unto
0.143	have
0.125	haue
0.12	flambeau
0.114	thy
Topic 1:
Coef.	 Token
--------------------
0.422	haue
0.253	macb
0.195	ham
0.19	bru
0.174	vs
0.157	vpon
0.153	brutus
0.135	selfe
0.133	cassi
0.122	heere
Topic 2:
Coef.	 Token
--------------------
-0.236	thee
-0.221	unto
-0.164	thou
0.16	elinor
0.153	,"
0.152	mrs
0.147	."
0.144	mr
-0.143	thel
0.129	haue
Topic 3:
Coef.	 Token
--------------------
0.32	syme
-0.278	elinor
0.253	turnbull
0.237	buster
0.198	macian
-0.187	elliot
-0.183	marianne
-0.182	emma
-0.174	mrs
0.151	alice
Topic 4:
Coef.	 Token
--------------------
0.505	buster
0.443	alice
-0.361	syme
-0.287	turnbull
-0.224	macian
0.169	joe
0.151	blacky
0.12	,'
0.119	!'
0.113	billy
Topic 5:
Coef.	 Token
--------------------
-0.639	alice
0.45	buster
-0.176	,'
-0.174	!'
0.152	joe
0.135	blacky
-0.112	whale
-0.109	duchess
-0.108	gryphon
-0.104	dormouse
Topic 6:
Coef.	 Token
--------------------


# LSI on the hashed corpus

Now all of the original documents have been hashed, and we can run the same analysis we ran with the plain corpus. 

In [25]:
np.random.seed(42)

In [26]:
%%time
hashed = CorpusHash(decoded_gutencorpus, 'wiki')

2017-05-22 21:40:34,387 - corpushash.hashers - INFO - 18 documents hashed and saved to wiki/public/2017-05-22_21-40-04-186540.
2017-05-22 21:40:34,387 : INFO : 18 documents hashed and saved to wiki/public/2017-05-22_21-40-04-186540.


CPU times: user 29.3 s, sys: 856 ms, total: 30.2 s
Wall time: 30.2 s


In [27]:
id2word = gensim.corpora.Dictionary(hashed.read_hashed_corpus())

2017-05-22 21:40:34,578 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-05-22 21:40:39,434 : INFO : built Dictionary(42020 unique tokens: ['<_h#q((ev+VAnbMpr7Y7rt1q3wkNGs`}k7*wq1Gq', 'fiKFE)v7e~#`YH)c&mEuf}PJKFKc%&5YPz&!920c', 'CVS4!Nuh;nfCVK?2k2qKbcau~kf*i&Gi5g^&Z4WD', 'v+;cf6@KaG&VKt5FZn(cvD95`EuTlF0E6iiJGN1l', 'J3WGBj7?nK#itn#Q1t0fNS@nu5NPO^wrd5-s3#|l']...) from 18 documents (total 2161659 corpus positions)


In [28]:
id2word[0]

'fiKFE)v7e~#`YH)c&mEuf}PJKFKc%&5YPz&!920c'

In [29]:
mm = [id2word.doc2bow(text) for text in hashed.read_hashed_corpus()]
gensim.corpora.MmCorpus.serialize('hashed_wiki_pt_tfidf.mm', mm)

2017-05-22 21:40:44,196 : INFO : storing corpus in Matrix Market format to hashed_wiki_pt_tfidf.mm
2017-05-22 21:40:44,200 : INFO : saving sparse matrix to hashed_wiki_pt_tfidf.mm
2017-05-22 21:40:44,203 : INFO : PROGRESS: saving document #0
2017-05-22 21:40:45,397 : INFO : saved 18x42020 matrix, density=16.126% (121967/756360)
2017-05-22 21:40:45,400 : INFO : saving MmCorpus index to hashed_wiki_pt_tfidf.mm.index


In [30]:
%%time
if os.path.exists('hashed_wiki_tfidf_model'):
    tfidf = models.TfidfModel.load('hashed_wiki_tfidf_model')
else:
    tfidf = models.TfidfModel(mm)

2017-05-22 21:40:45,428 : INFO : collecting document frequencies
2017-05-22 21:40:45,446 : INFO : PROGRESS: processing document #0
2017-05-22 21:40:45,613 : INFO : calculating IDF weights for 18 documents and 42019 features (121967 matrix non-zeros)


CPU times: user 344 ms, sys: 4 ms, total: 348 ms
Wall time: 363 ms


In [31]:
tfidf.save('hashed_wiki_tfidf_model')

2017-05-22 21:40:45,807 : INFO : saving TfidfModel object under hashed_wiki_tfidf_model, separately None
2017-05-22 21:40:45,845 : INFO : saved hashed_wiki_tfidf_model


The next step is to train the LSI model with a tfidf transformed corpus. So we will need yet another generator to yield the transformed corpus.

In [32]:
def tfidf_corpus_stream(corpus):
    for doc in corpus:
        yield tfidf[doc]

In [33]:
tfidf_corpus_s = tfidf_corpus_stream(mm)

## Calculating the LSI model

In [34]:
if os.path.exists('hashed_wiki_lsi_model'):
    lsi = gensim.models.LsiModel.load('hashed_wiki_lsi_model')
else:
    lsi = gensim.models.lsimodel.LsiModel(corpus=tfidf_corpus_s, id2word=id2word, num_topics=18)
    lsi.save('hashed_wiki_lsi_model')

2017-05-22 21:40:46,265 : INFO : using serial LSI version on this node
2017-05-22 21:40:46,268 : INFO : updating model with new documents
2017-05-22 21:40:46,985 : INFO : preparing a new chunk of documents
2017-05-22 21:40:47,096 : INFO : using 100 extra samples and 2 power iterations
2017-05-22 21:40:47,099 : INFO : 1st phase: constructing (42020, 118) action matrix
2017-05-22 21:40:47,226 : INFO : orthonormalizing (42020, 118) action matrix
2017-05-22 21:41:01,902 : INFO : 2nd phase: running dense svd on (118, 18) matrix
2017-05-22 21:41:01,977 : INFO : computing the final decomposition
2017-05-22 21:41:01,983 : INFO : keeping 18 factors (discarding 0.000% of energy spectrum)
2017-05-22 21:41:02,297 : INFO : processed documents up to #18
2017-05-22 21:41:02,306 : INFO : topic #0(1.365): -0.220*"kn!QR@4!l<6q%I9=Wh1gzS5BGk9pkVCO#u=Zc%xL" + -0.176*"9JtE1@db$^p@;&;tgoK@FfZkBVn<7!NhNrhLaI{h" + -0.165*"c~k-@SxB5VlPj?88N%8YDJk^uk(~<QgbwvN#Tl1C" + -0.158*"nFV+mj3yuyH;LPpsXmyoWu&|%M?hv+7pereF

Let now look at the topics generated, decoding the hashed tokens using the `decode_dictionary`.

In [36]:
for n in range(17):
    print("====================")
    print("Topic {}:".format(n))
    print("Coef.\t Token")
    print("--------------------")
    for tok,coef in lsi.show_topic(n):
        tok = hashed.decode_dictionary[tok.strip()][0]
        print("{:.3}\t{}".format(coef,tok))

Topic 0:
Coef.	 Token
--------------------
-0.22	thee
-0.176	--
-0.165	thou
-0.158	."
-0.146	,"
-0.143	unto
-0.143	have
-0.125	haue
-0.12	flambeau
-0.114	thy
Topic 1:
Coef.	 Token
--------------------
-0.422	haue
-0.253	macb
-0.195	ham
-0.19	bru
-0.174	vs
-0.157	vpon
-0.153	brutus
-0.135	selfe
-0.133	cassi
-0.122	heere
Topic 2:
Coef.	 Token
--------------------
0.236	thee
0.221	unto
0.164	thou
-0.16	elinor
-0.153	,"
-0.152	mrs
-0.147	."
-0.144	mr
0.143	thel
-0.129	haue
Topic 3:
Coef.	 Token
--------------------
-0.32	syme
0.278	elinor
-0.253	turnbull
-0.237	buster
-0.198	macian
0.187	elliot
0.183	marianne
0.182	emma
0.174	mrs
-0.151	alice
Topic 4:
Coef.	 Token
--------------------
-0.505	buster
-0.443	alice
0.361	syme
0.287	turnbull
0.224	macian
-0.169	joe
-0.151	blacky
-0.12	,'
-0.119	!'
-0.113	billy
Topic 5:
Coef.	 Token
--------------------
-0.639	alice
0.45	buster
-0.176	,'
-0.174	!'
0.152	joe
0.135	blacky
-0.112	whale
-0.109	duchess
-0.108	gryphon
-0.104	dormouse
Topic 6:
Coef.	 T