In [18]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import datetime

# NLTK
import nltk
from nltk.corpus import stopwords

# Gensim
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases # TODO: to create bigrams with

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt 

  and should_run_async(code)


In [2]:
# NLTK's English stop-word list plus corpus-specific noise tokens
# (currently just the honorific "mr").
stop_words = stopwords.words('english') + ['mr']

  and should_run_async(code)


In [3]:
# Optional lemmatization helper (disabled; kept for later experiments).
# def lemmatize(content, tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     nlp = spacy.load('en', disable=['parser', 'ner'])
#     texts_out = []
#     for sent in content:
#         doc = nlp(" ".join(sent))
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in tags])
#     return texts_out

# Tokenize and remove stop words from content
def tokenize(content, lemmatize=False):
    """Split raw text into a list of tokens.

    Delegates to gensim's ``simple_preprocess`` with ``deacc=True``.

    Args:
        content: Raw text to tokenize.
        lemmatize: Accepted for future use; this function does not read it.

    Returns:
        The token list produced by ``simple_preprocess``.
    """
    return gensim.utils.simple_preprocess(content, deacc=True)

def remove_stopwords(content, stopword_list=None):
    """Return the tokens of ``content`` that are not stop words.

    Args:
        content: Iterable of token strings.
        stopword_list: Optional iterable of words to drop. When omitted, the
            notebook-level ``stop_words`` list is used.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Build a set once per call so each membership test is a hash lookup
    # rather than a linear scan over the stop-word list for every token.
    blocked = set(stopword_list)
    return [word for word in content if word not in blocked]

  and should_run_async(code)


I don't think we should lemmatize, but the commented-out helper is kept in case we want to test it later.
I'm not considering stemming at all because it strips away a lot of content/context, which may not be a good thing.

However, I'm increasingly considering bigrams, because many words should stay together in the topic models (e.g. George Bush, Al Gore, vice president, New York).

In [5]:
# Load the monthly NYTimes files (5.txt through 10.txt). Each non-blank line
# is "<timestamp>,<article text>"; only the first comma is a separator.
rows = []
dates = []
articles = []
for month in range(5, 11):
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used -- confirm the encoding of the data files.
    with open(f"Data/NYTimes/{month}.txt") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) has no
            # comma and would make the two-way unpack below raise ValueError.
            if not line.strip():
                continue
            date, article = line.split(",", 1)
            timestamp = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")
            tokenized = tokenize(article)
            destopped = remove_stopwords(tokenized)

            articles.append(destopped)
            dates.append(timestamp)
            rows.append([timestamp, destopped])

# One row per article: parsed timestamp and its stop-word-free token list.
df = pd.DataFrame(rows, columns=["date", "content"])
df

  and should_run_async(code)


Unnamed: 0,date,content
0,2000-05-03 05:00:00+00:00,"[two, years, ago, homer, bush, came, yankee, b..."
1,2000-05-02 05:00:00+00:00,"[texas, record, tell, op, ed, april, paul, bur..."
2,2000-05-01 05:00:00+00:00,"[top, foreign, policy, adviser, gov, george, b..."
3,2000-05-03 05:00:00+00:00,"[aides, gov, george, bush, fought, back, today..."
4,2000-05-03 05:00:00+00:00,"[gov, tommy, thompson, wisconsin, named, chair..."
...,...,...
5801,2000-10-31 05:00:00+00:00,"[new, york, times, cbs, news, poll, var, strin..."
5802,2000-10-31 05:00:00+00:00,"[tick, tock, diner, ted, friedrich, stockbroke..."
5803,2000-11-01 05:00:00+00:00,"[difference, us, vital, issue, would, go, wash..."
5804,2000-11-01 05:00:00+00:00,"[bush, administration, wanted, overturn, would..."


In [23]:
# Map every distinct token across the articles to an integer id.
id2word = corpora.Dictionary(articles)

# Optional pruning of very rare / very frequent tokens (currently disabled):
# id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# id2word.filter_extremes(no_above=0.5)

# Bag-of-words vector per article, then re-weighted through a TF-IDF model
# fitted on those same vectors.
corpus = list(map(id2word.doc2bow, articles))
tfidf = models.TfidfModel(corpus)
corpus = tfidf[corpus]

# Human-readable view of the first document: (token, TF-IDF weight) pairs.
[[(id2word[token_id], weight) for token_id, weight in doc] for doc in corpus[:1]]

  and should_run_async(code)


[[('ago', 0.07712049873418031),
  ('awesome', 0.23220574510227418),
  ('backup', 0.2198823985449398),
  ('backups', 0.2515408271170864),
  ('bases', 0.19264069440348208),
  ('bellinger', 0.27548950382241366),
  ('bench', 0.1896343958919212),
  ('bush', 0.007894722475376273),
  ('came', 0.08612993720379283),
  ('catcher', 0.26148042600790294),
  ('clay', 0.2135830725972484),
  ('games', 0.1562902360625982),
  ('girardi', 0.27548950382241366),
  ('homer', 0.21658937110880933),
  ('jim', 0.1245222966630543),
  ('joe', 0.1146922085996351),
  ('leyritz', 0.27548950382241366),
  ('speed', 0.17969479700110466),
  ('stole', 0.20587332073042908),
  ('strength', 0.13402729061444735),
  ('turner', 0.2108175460504276),
  ('two', 0.04788545375938528),
  ('versatility', 0.27548950382241366),
  ('whose', 0.0887458288821732),
  ('yankee', 0.20825706839694694),
  ('yankees', 0.19264069440348208),
  ('years', 0.05159983565074285)]]

In [24]:
# Fixed seed so the trained topics (and the scores below) are reproducible
# across kernel restarts; without it every run yields a different model.
LDA_RANDOM_SEED = 42

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           minimum_phi_value=0.2, # min threshold for word probabilities
#                                            passes=10,
                                           alpha='auto',  # learn an asymmetric topic prior: not all topics are equally represented in the corpus
                                           update_every=1,
                                           random_state=LDA_RANDOM_SEED,
                                           per_word_topics=True)

# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself (perplexity = 2 ** -bound), so a value closer to zero is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score (c_v) against the tokenized articles.
coherence_model_lda = CoherenceModel(model=lda_model, texts=articles, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -14.246194089109071

Coherence Score:  0.4558530233958457


In [25]:
# Render the interactive pyLDAvis topic explorer inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

  and should_run_async(code)


Initial thoughts:

We need to de-pluralize the words (governments vs government).
Getting the coherence score above 0.5 would probably be a good start.

Need to extend stop words to include mr.

But topic coherence is still very low.

Also, we can double check our topic coherence by comparing with Wikipedia (and other checks the paper did)

In [17]:
# Never truncate cell text, so each topic's full term string stays visible.
pd.set_option("display.max_colwidth", None)

# Tabulate what print_topics() returns for the trained model.
# NOTE(review): this rebinds `df`, which the article-loading cell also uses
# for the articles frame -- re-run that cell before relying on `df` again.
df = pd.DataFrame(lda_model.print_topics())
display(df)

  and should_run_async(code)


Unnamed: 0,0,1
0,26,"0.027*""brennan"" + 0.017*""standardized"" + 0.008*""monroe"" + 0.008*""survived"" + 0.006*""lifestyle"" + 0.005*""rightly"" + 0.005*""tests"" + 0.005*""salaries"" + 0.004*""childless"" + 0.004*""middletown"""
1,4,"0.028*""ellis"" + 0.014*""jonathan"" + 0.013*""crash"" + 0.009*""school"" + 0.009*""pursuit"" + 0.008*""berkeley"" + 0.008*""professor"" + 0.007*""networking"" + 0.006*""museum"" + 0.006*""tow"""
2,17,"0.022*""magazine"" + 0.020*""executives"" + 0.018*""san"" + 0.017*""technology"" + 0.011*""information"" + 0.009*""cookies"" + 0.008*""diego"" + 0.008*""favored"" + 0.008*""sponsored"" + 0.007*""nurtured"""
3,24,"0.051*""misstated"" + 0.018*""button"" + 0.016*""fragile"" + 0.014*""gorelick"" + 0.011*""alan"" + 0.009*""column"" + 0.009*""day"" + 0.008*""keyes"" + 0.007*""yesterday"" + 0.007*""restricting"""
4,22,"0.011*""abm"" + 0.010*""assembly"" + 0.007*""herbert"" + 0.007*""shocking"" + 0.006*""dumping"" + 0.006*""creek"" + 0.006*""dubious"" + 0.006*""forecasts"" + 0.005*""site"" + 0.005*""methodically"""
5,15,"0.025*""collins"" + 0.020*""jews"" + 0.015*""stewart"" + 0.012*""gingrich"" + 0.010*""production"" + 0.010*""director"" + 0.010*""joke"" + 0.009*""goldstein"" + 0.008*""onstage"" + 0.008*""kelly"""
6,14,"0.068*""sept"" + 0.067*""entertainment"" + 0.061*""industry"" + 0.043*""hollywood"" + 0.025*""marketing"" + 0.014*""movie"" + 0.012*""music"" + 0.011*""material"" + 0.010*""violence"" + 0.009*""city"""
7,0,"0.061*""bushnell"" + 0.047*""ms"" + 0.026*""madison"" + 0.017*""candace"" + 0.012*""film"" + 0.011*""hartford"" + 0.010*""women"" + 0.009*""smith"" + 0.007*""star"" + 0.007*""urban"""
8,5,"0.062*""web"" + 0.048*""site"" + 0.042*""com"" + 0.027*""sites"" + 0.019*""www"" + 0.013*""characters"" + 0.012*""internet"" + 0.007*""cultural"" + 0.006*""asian"" + 0.006*""art"""
9,28,"0.029*""gorey"" + 0.015*""black"" + 0.015*""music"" + 0.013*""avenue"" + 0.009*""brown"" + 0.009*""broadway"" + 0.009*""glenn"" + 0.009*""theater"" + 0.008*""village"" + 0.008*""white"""
