In [180]:
import numpy as np
import scipy.stats as stats
import pandas as pd

from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [257]:
# Read each book, split it into sections on its chapter/act marker, and keep
# the distinct tokens of every section. The text preceding the first marker
# becomes the first section, because str.split keeps the leading chunk.
#
# `word_tokenize` is imported by name in the import cell; the bare `nltk`
# module is not, so the call must not be spelled `nltk.tokenize.word_tokenize`.
# Files are opened with a context manager so the handles are closed promptly.
with open('study-in-scarlet.txt', 'r') as fh:
    book_scarlet = fh.read()
chapters_scarlet = book_scarlet.split('CHAPTER')
words_scarlet = [list(set(word_tokenize(chapter))) for chapter in chapters_scarlet]

with open('merchant_venice.txt', 'r') as fh:
    book_venice = fh.read()
chapters_venice = book_venice.split('Actus')
words_venice = [list(set(word_tokenize(chapter))) for chapter in chapters_venice]

In [273]:
# Read the book, split it into sections on the 'CHAPTER' marker, and keep the
# distinct tokens of every section (the text before the first marker is the
# first section).
#
# `word_tokenize` is imported by name in the import cell; the bare `nltk`
# module is not, so the call must not be spelled `nltk.tokenize.word_tokenize`.
# The file is opened with a context manager so the handle is closed promptly.
with open('kamasutra.txt', 'r') as fh:
    book_kamasutra = fh.read()
chapters_kamasutra = book_kamasutra.split('CHAPTER')
words_kamasutra = [list(set(word_tokenize(chapter))) for chapter in chapters_kamasutra]

In [265]:
def topic_identifier(words_from_chapters, n_topics=10, top_topics=10, random_state=None):
    """Fit an LDA topic model on per-section token lists and print each topic's top words.

    Parameters
    ----------
    words_from_chapters : iterable of iterables of str
        One collection of tokens per chapter/section. Each collection is
        joined with spaces into a single document and vectorized with
        ``CountVectorizer`` (English stop words removed, terms kept only if
        they appear in at least 2 documents and at most 95% of them).
    n_topics : int, default 10
        Number of LDA components (topics) to fit.
    top_topics : int, default 10
        Number of highest-weighted words printed for each topic.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``LatentDirichletAllocation``. Pass an int to make the
        printed topics reproducible between runs; ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    None
        The topics are printed to stdout as ``Topic: <i>`` followed by a
        comma-separated word list.
    """
    vec = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vec.fit_transform([" ".join(w) for w in words_from_chapters])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older installations.
    if hasattr(vec, 'get_feature_names_out'):
        vocab = vec.get_feature_names_out()
    else:
        vocab = vec.get_feature_names()

    # LDA accepts the sparse count matrix directly, so there is no need to
    # densify it into a DataFrame first.
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(X)

    topic_words = {}

    for topic, comp in enumerate(lda.components_):
        # np.argsort(arr) returns the indices that would sort arr in ASCENDING
        # order, e.g. arr = [3, 7, 1, 0, 3, 6] -> [3, 2, 0, 4, 5, 1].
        # Reversing that gives indices from largest to smallest weight, and the
        # slice keeps the `top_topics` most heavily weighted words of the topic.
        word_idx = np.argsort(comp)[::-1][:top_topics]

        # store the words most relevant to the topic
        topic_words[topic] = [vocab[i] for i in word_idx]

    for topic, words in topic_words.items():
        print('Topic: %d' % topic)
        print('  %s' % ', '.join(words))

In [266]:
# Fit 10 topics on the `words_scarlet` token sets and print the 10
# highest-weighted words of each topic.
topic_identifier(
    words_scarlet,
    n_topics=10,
    top_topics=10,
)

Topic: 0
  answered, just, morning, did, hardly, situation, pipe, little, leather, dead
Topic: 1
  ebook, gutenberg, study, use, character, note, scarlet, original, transcriber, project
Topic: 2
  like, good, life, shall, let, right, far, long, room, fellow
Topic: 3
  track, troubled, yes, stangerson, impossible, night, days, fell, seat, men
Topic: 4
  good, just, white, john, far, long, new, small, lay, hope
Topic: 5
  house, good, looking, long, half, day, hope, fierce, city, thank
Topic: 6
  shall, let, stangerson, afternoon, circumstances, mystery, work, avenging, did, time
Topic: 7
  road, know, young, fellow, white, hard, just, room, holmes, night
Topic: 8
  come, don, having, night, good, let, room, eyes, just, morning
Topic: 9
  road, yes, mystery, looking, street, affair, key, house, day, let




In [270]:
# Fit 20 topics on the `words_venice` token sets and print the 20
# highest-weighted words of each topic.
topic_identifier(
    words_venice,
    n_topics=20,
    top_topics=20,
)

Topic: 0
  st, husband, neere, teaches, blisse, lady, colour, barre, sweet, offend, haire, daie, sacrifice, bloud, seene, bosome, motion, merchant, eyed, alcides
Topic: 1
  search, ll, dwell, present, table, returne, clowne, conuerse, sand, allay, hopes, pound, caskets, example, prouided, fathers, loose, behinde, condition, father
Topic: 2
  sand, teach, mexico, tis, colour, siluer, cheere, offices, ventures, merrie, fie, sonne, saying, bed, extend, porke, purpose, raise, trouble, marrie
Topic: 3
  light, boy, sola, sweet, master, beg, silence, gold, lie, pardon, sit, deseru, fee, methinkes, musicke, deere, vntill, ring, steale, waies
Topic: 4
  st, weigh, gentlemen, attempt, pay, sweete, marrie, ground, bound, great, lor, backe, letter, vnderstand, satisfied, wilt, end, meete, nay, common
Topic: 5
  fals, bars, notwithstanding, foure, girle, post, plea, parts, bankrout, haste, sommer, argosies, merchants, manly, keene, argosie, beg, things, twill, nose
Topic: 6
  wine, marry, betweene



In [280]:
# Fit 5 topics on the `words_kamasutra` token sets and print the 10
# highest-weighted words of each topic.
topic_identifier(
    words_kamasutra,
    n_topics=5,
    top_topics=10,
)



Topic: 0
  men, man, women, footnote, having, love, means, good, woman, union
Topic: 1
  viz, various, man, way, kinds, body, woman, pressing, different, nails
Topic: 2
  women, things, union, men, time, people, cause, vatsyayana, hand, place
Topic: 3
  women, wife, woman, love, footnote, husband, place, known, according, time
Topic: 4
  woman, love, women, man, way, time, place, footnote, work, follows
