In [7]:
import os
import nltk

Set the path to the directory that contains all of the documents in the corpus.

In [8]:
# Corpus directory, given relative to the notebook's working directory.
path = "corpus/williamsburg_strollers"


List the files in the corpus directory so that each document can be opened, and create an empty list, "raw", to hold the text of the documents.

In [9]:
# Names of the entries in the corpus directory. os.listdir() returns them in arbitrary
# order, so sort them to keep the document order identical from run to run.
myfolder = sorted(os.listdir(path))
# Will hold the text of each document, one string per file.
raw = []

Loop through all documents and append the text of each ".txt" file to the list object called "raw".

In [10]:
# Separator placed between a directory and a file name when a path is built by concatenation.
slash = "/"

In [13]:
# Start from an empty list so that re-running this cell rebuilds the corpus instead of
# appending every document a second time.
raw = []
for filename in myfolder:
    if not filename.endswith('.txt'):
        continue  # ignore directory entries whose name does not end in .txt
    # os.path.join puts the platform's separator between the directory and the file name.
    with open(os.path.join(path, filename)) as fh:
        raw.append(fh.read())

# Show a count and a short excerpt instead of dumping the entire corpus into the output.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Loaded %d documents" % len(raw))
if raw:
    print(raw[0][:500])



Preprocessing the corpus for analysis. As you can see, the corpus contains the text taken directly from the articles themselves. To prepare it for analysis, we need to tokenize the text, convert it to lowercase, stem the words, and lemmatize the text. Tokenization is carried out below by TfidfVectorizer (for NMF) and CountVectorizer (for LDA), both of which also convert the text to lowercase by default.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Function to display the topics and the terms that make up each topic for both models --
# will be called at the end.
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating over ``components_`` must yield one array of term weights per topic;
        each array needs an ``argsort()`` method.
    feature_names : sequence of str
        The term that corresponds to each position in a topic's weight array.
    no_top_words : int
        Number of terms to print for each topic.

    Returns
    -------
    None
        For each topic, a ``Topic <index>:`` line and a line of space-separated terms
        are written to standard output.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() orders indices by ascending weight, so step backwards from the end
        # to get the `no_top_words` heaviest terms, largest first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        # Single-argument print(...) behaves the same under Python 2 and Python 3.
        print("Topic %d:" % topic_idx)
        print(" ".join(feature_names[i] for i in top_indices))


# Documents come from the list of article texts created above.
documents = raw

# Upper bound on the number of terms each vectorizer keeps.
no_features = 1000

# NMF is able to use tf-idf. Terms found in more than 95% of the documents or in fewer
# than 2 documents are dropped, along with English stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); update this call when the environment is upgraded.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts because it is a probabilistic graphical model.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Specify the number of topics each model should extract.
no_topics = 20

# Run NMF.
# NOTE(review): scikit-learn 1.2 removed NMF's `alpha` in favour of `alpha_W`/`alpha_H`;
# confirm the equivalent regularisation strength before upgrading.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA. The topic-count argument is `n_components` from scikit-learn 0.19 on; the
# original `n_topics` spelling was removed in 0.21, so it is only used as a fallback
# for releases that do not know the newer name.
lda_params = dict(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
try:
    lda_model = LatentDirichletAllocation(n_components=no_topics, **lda_params)
except TypeError:
    lda_model = LatentDirichletAllocation(n_topics=no_topics, **lda_params)
lda = lda_model.fit(tf)

# Specify the number of words shown for each topic.
no_top_words = 10

# Label each listing: both models number their topics from 0, so without a heading the
# two lists cannot be told apart in the output.
print("NMF topics")
display_topics(nmf, tfidf_feature_names, no_top_words)
print("LDA topics")
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
said children mr store parents new people like kids babies
Topic 1:
sundays admission tickets saturdays 30 973 fridays jersey free tuesdays
Topic 2:
police firefighters yesterday engine firehouse protesters department said firehouses company
Topic 3:
marathon runners said race year mile zinser lynn pack gebrselassie
Topic 4:
kiryas joel blooming grove village villages families growth land communities
Topic 5:
brooklyn jazz park borough friend west upper museum poor shop
Topic 6:
fortuna island couple apartment ms bedroom helped white bridge arrived
Topic 7:
shin ms museum work history painting said exhibition people art
Topic 8:
ossining hernández mayor wants white like villages mr river 000
Topic 9:
coffee espresso street com 212 718 beans avenue baristas roasters
Topic 10:
tamarkin blog photographs mr said started began message new york
Topic 11:
bushwick theater says space neighborhood williamsburg le estate real new
Topic 12:
gay chelsea straight said families west residen