In [1]:
import nltk
import parse_corpus
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import numpy as np

import lda

# Split into sections

In [2]:
# Input files, resolved relative to the notebook's working directory.
filename = "preprocessed_shakespeare.txt"
castfile = "curated_cast.txt"

# Split the corpus file into sections with the project-local parser.
# NOTE(review): each section is later passed to nltk.word_tokenize, so
# `sections` is presumably an iterable of strings -- confirm in parse_corpus.
sections = parse_corpus.process(filename)

# Lower-cased tokens found in the cast file, stored as dict keys so that
# later `word in castdict` membership tests are constant time.
castdict = {}
with open(castfile) as cast_fh:  # context manager closes the handle when done
    for line in cast_fh:
        for word in nltk.word_tokenize(line.strip()):
            castdict[word.lower()] = True

In [3]:
# with open("/Users/markmartinez/Desktop/shakespeare.txt","r") as F:
#     allText = F.read().lower()

# Remove stopwords, tokenize, stem (takes about 30 seconds)

In [4]:
# English stopwords, lower-cased and stored as dict keys for constant-time
# membership tests (only the keys matter; the values are unused).
stops = stopwords.words("english")
stopdict = dict((s.lower(), None) for s in stops)

ps = PorterStemmer()

clean_sections = []

# NOTE(tfs): I am using tokenization now. Mainly it really cleans up punctuation and handles contractions well
for section in sections:
    stemmed = []
    for token in nltk.word_tokenize(section):
        lowered = token.lower()  # computed once, reused for filtering and stemming
        # Drop stopwords and cast tokens *before* stemming, so the lookups
        # are made against the unstemmed surface form.
        if lowered in stopdict or lowered in castdict:
            continue
        # Keep purely alphanumeric tokens only (discards punctuation tokens).
        if token.isalnum():
            stemmed.append(ps.stem(lowered))
    # Note this does not preserve structure,
    #      but all words are now present in the section string
    clean_sections.append(" ".join(stemmed))

# Preview the first cleaned section. The call form of print works on both
# Python 2 and Python 3 for a single argument.
print(clean_sections[0][:500])

palac enter black deliv buri husband go madam weep death anew must attend majesti command ward evermor subject shall find husband madam sir gener time good must necess hold virtu whose worthi would stir want rather lack abund hope majesti amend hath abandon physician madam whose practic hath persecut time hope find advantag process lose hope time sad passag skill almost great honesti stretch far would made natur immort death play lack work would sake live think would death diseas call speak mada


In [5]:
# stops = set(stopwords.words("english"))
# words = allText.split()
# meaningful_words = [w for w in words if not w in stops]
# cleanedUp =  " ".join( meaningful_words )

In [6]:
# cleanedUp[0:500]

In [7]:
#now split on 'scene'
# scenes = [x.strip() for x in cleanedUp.split("scene")]

In [8]:
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word")

# Learn the vocabulary and count term occurrences per cleaned section.
# The result is a scipy sparse matrix (one row per entry of clean_sections).
data = vectorizer.fit_transform(clean_sections)

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = data.toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method when it exists and fall
# back to the old one so older installations keep working. Either way
# `vocab` ends up as a plain list of terms.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

In [9]:
# Quick sanity checks on the bag-of-words output. The call form of print
# works on both Python 2 and Python 3 for a single argument.
print(train_data_features[300].sum())  # total term count in row 300
print(len(vocab))                      # vocabulary size
print(vocab[100:300])                  # a sample slice of the vocabulary

402
12740
[u'acquit', u'acquitt', u'acr', u'across', u'act', u'actaeon', u'action', u'actium', u'activ', u'actor', u'actual', u'acut', u'ad', u'adag', u'adalla', u'adam', u'add', u'adder', u'addict', u'addit', u'addl', u'address', u'addrest', u'adher', u'adieu', u'adjac', u'adjoin', u'adjourn', u'adjudg', u'adjunct', u'administ', u'administr', u'admir', u'admiringli', u'admit', u'admitt', u'admonish', u'admonit', u'ado', u'adoni', u'adopt', u'adoptedli', u'adopti', u'ador', u'adorest', u'adoreth', u'adorn', u'adramadio', u'adriat', u'adsum', u'adul', u'adulter', u'adulteress', u'adulteri', u'adultress', u'advanc', u'advantag', u'adventur', u'advers', u'adversari', u'adverti', u'advertis', u'advi', u'advic', u'advis', u'advisedli', u'advoc', u'aeacid', u'aeacida', u'aedil', u'aegl', u'aeolu', u'aer', u'aeri', u'aerial', u'aesculapiu', u'aeson', u'aesop', u'aetna', u'afar', u'afear', u'afeard', u'affabl', u'affair', u'affect', u'affecteth', u'affection', u'affeer', u'affi', u'affianc', u

In [10]:
# vocab = vectorizer.get_feature_names()
# print(len(vocab))

# Show the container type returned by fit_transform (call form of print
# works on both Python 2 and Python 3).
print(type(data))

<class 'scipy.sparse.csr.csr_matrix'>


In [11]:
# # import numpy as np

# # Sum up the counts of each vocabulary word
# dist = np.sum(data, axis=0)

# # For each, print the vocabulary word and the number of times it 
# # appears in the training set
# for tag, count in zip(vocab, dist):
#     print(count, tag)

# Run LDA (takes a bit of time)

In [12]:
# Settings for the LDA run and for the topic report printed afterwards
n_top_words = 5    # number of highest-weighted words shown per topic
num_iter = 500     # forwarded to lda.LDA as n_iter
num_topics = 20    # forwarded to lda.LDA as n_topics

In [13]:
# Fit the topic model. A fixed seed is passed so that re-running the
# notebook yields the same topics; change lda_seed to explore other runs.
lda_seed = 1
model = lda.LDA(n_topics=num_topics, n_iter=num_iter, random_state=lda_seed)
model.fit(data)

# One row of word weights per topic, indexed in the same order as `vocab`.
topic_word = model.topic_word_

# Convert the vocabulary once instead of on every loop iteration.
vocab_array = np.array(vocab)

for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the first n_top_words
    # indices to get the highest-weighted words first.
    top_indices = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[top_indices]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 643
INFO:lda:vocab_size: 12740
INFO:lda:n_words: 317704
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -3615314
INFO:lda:<10> log likelihood: -2962327
INFO:lda:<20> log likelihood: -2843815
INFO:lda:<30> log likelihood: -2793382
INFO:lda:<40> log likelihood: -2765225
INFO:lda:<50> log likelihood: -2748100
INFO:lda:<60> log likelihood: -2735338
INFO:lda:<70> log likelihood: -2723951
INFO:lda:<80> log likelihood: -2714338
INFO:lda:<90> log likelihood: -2706120
INFO:lda:<100> log likelihood: -2701096
INFO:lda:<110> log likelihood: -2694974
INFO:lda:<120> log likelihood: -2691444
INFO:lda:<130> log likelihood: -2687876
INFO:lda:<140> log likelihood: -2683818
INFO:lda:<150> log likelihood: -2680089
INFO:lda:<160> log likelihood: -2676093
INFO:lda:<170> log likelihood: -2674596
INFO:lda:<180> log likelihood: -2669693
INFO:lda:<190> log likelihood: -2666825
INFO:lda:<200> log likelihood: -2664069
INFO:lda:<210> log likelihood: -2662800
INFO:lda:<

Topic 0: rome first coriolanu nobl third
Topic 1: enter us shall fight soldier
Topic 2: thou thi thee art dost
Topic 3: thi thou thee life death
Topic 4: kate fluellen katherin william alic
Topic 5: come let may shall must
Topic 6: sir hath well signior thee
Topic 7: sing sweet play enter night
Topic 8: murder tell go kiss troy
Topic 9: come sleep go enter good
Topic 10: england crown unto peac men
Topic 11: shall let hath come us
Topic 12: th sir us made follow
Topic 13: first hath life natur good
Topic 14: sister ham law hold mad
Topic 15: love shall fair see madam
Topic 16: grace god high brother day
Topic 17: sir come good go shall
Topic 18: like may god hot cousin
Topic 19: know make would yet one


# TODO

* Look for more lines to remove while parsing corpus (e.g. Enter [NAME], ...)
* Downweight words by frequency? tf-idf for example?
