# Data from New York Times
Author: Zhifeng Fu  
Date: 2019/12/29  
1109 articles were crawled and extracted from the New York Times website. They span 2019/11/30 to 2019/12/27.

In [1]:
import numpy as np
import pandas as pd

nyt = pd.read_csv('nyt_df_12.csv', sep='\t', header=0, index_col=0)
display(nyt)

Unnamed: 0,title,timestamp,content,media,link
0,Trump’s Intervention in SEALs Case Tests Penta...,2019-11-30T20:47:50-05:00,"He was limp and dusty from an explosion, consc...",New York Times,https://www.nytimes.com/2019/11/30/us/politics...
1,Prime Mover: How Amazon Wove Itself Into the L...,2019-11-30T06:00:18-05:00,Amazon EverywherePrime Mover: How Amazon Wove ...,New York Times,https://www.nytimes.com/2019/11/30/business/am...
2,Students Fainting From Hunger in Venezuela’s F...,2019-11-30T05:00:23-05:00,"BOCA DE UCHIRE, Venezuela — Hundreds of childr...",New York Times,https://www.nytimes.com/2019/11/30/world/ameri...
3,2020 Democratic Candidates Wage Escalating Fig...,2019-11-30T05:00:15-05:00,"WALPOLE, N.H. — Pete Buttigieg has a nifty pol...",New York Times,https://www.nytimes.com/2019/11/30/us/politics...
4,Latin Dictionary’s Journey: A to Zythum in 125...,2019-11-30T12:34:51-05:00,MUNICH — When German researchers began working...,New York Times,https://www.nytimes.com/2019/11/30/arts/latin-...
...,...,...,...,...,...
1104,L.S.U. and Oklahoma Both Seek to End Title Gam...,2019-12-27T16:25:58-05:00,ATLANTA — The right hand of Lee Morris’s fathe...,New York Times,https://www.nytimes.com/2019/12/27/sports/ncaa...
1105,Can They Kick It? Yes They Can. Ohio State and...,2019-12-27T15:36:57-05:00,"SCOTTSDALE, Ariz. — Pity the plight of college...",New York Times,https://www.nytimes.com/2019/12/27/sports/ncaa...
1106,Drawing the Guantánamo Bay War Court,2019-12-27T18:26:08-05:00,Times Insider explains who we are and what we ...,New York Times,https://www.nytimes.com/2019/12/27/reader-cent...
1107,"Quotation of the Day: Jerry Herman, 88, Compos...",2019-12-27T23:34:59-05:00,“There are only a couple of us who care about ...,New York Times,https://www.nytimes.com/2019/12/27/todayspaper...


## Preprocessing

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# Stop words removal: stop words are the most common words in a language; they are removed before processing.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# Lemmatization: removes inflectional endings/suffixes and returns the base (dictionary) form of a word,
# which is called the lemma.
# Stemming: refers to the process of reducing inflection in words to their root forms,
# such as mapping a group of words to the same stem even if the stem itself is not a valid word in the language.

In [3]:
SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [4]:
stemmer = SnowballStemmer('english')

In [98]:
# def lemmatize_stemming(text):  # lemmatize and stems the text
#     return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# try no stemming:
def lemmatize(text):
    """Return the WordNet lemma of ``text``, treating the token as a verb (``pos='v'``)."""
    wordnet = WordNetLemmatizer()
    verb_lemma = wordnet.lemmatize(text, pos='v')
    return verb_lemma

def preprocess(text):
    """Tokenize ``text`` and return its filtered, lemmatized tokens as a list.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than 3 characters and is not in ``STOPWORDS``; each
    kept token is then passed through ``lemmatize``.
    """
    return [
        lemmatize(word)
        for word in gensim.utils.simple_preprocess(text)
        if word not in STOPWORDS and len(word) > 3
    ]

In [6]:
# Walk through the preprocessing on a single article (row 0).
doc_sample = nyt.loc[0, 'content']
print('Original document:')
print(doc_sample)

# Plain split on single spaces, printed for comparison with the tokenizer output below.
sample_words = doc_sample.split(' ')
print('Sample words:')
print(sample_words)
print('\n\nTokenized and lemmatized document: ')
print(preprocess(doc_sample))

Original document:


Tokenized and lemmatized document: 


['limp', 'dusty', 'explosion', 'conscious', 'barely', 'fierce', 'mask', 'islamic', 'state', 'fighters', 'seize', 'vast', 'swaths', 'iraq', 'syria', 'captive', 'scraggly', 'teenager', 'tank', 'limbs', 'watch', 'slide', 'easily', 'wrist', 'chief', 'petty', 'officer', 'edward', 'gallagher', 'navy', 'seal', 'give', 'young', 'captive', 'medical', 'iraq', 'sedate', 'cut', 'airway', 'throat', 'help', 'breathe', 'warn', 'accord', 'colleagues', 'chief', 'gallagher', 'pull', 'small', 'hunt', 'knife', 'sheath', 'stab', 'sedate', 'captive', 'neck', 'chief', 'gallagher', 'later', 'pose', 'photograph', 'hold', 'dead', 'captive', 'hair', 'celebrate', 'campaign', 'trail', 'president', 'trump', 'upend', 'military', 'code', 'justice', 'protect', 'punishment', 'result', 'episode', 'prod', 'news', 'trump', 'chief', 'gallagher', 'cause', 'célèbre', 'trumpet', 'argument', 'election', 'violent', 'encounter', 'faraway', 'land', 'open', 'year', 'affair', 'pentagon', 'hierarchy', 'wed', 'longstanding', 'rule', 

In [7]:
# Since there are several video articles, their content entries are NaN.
content_is_missing = nyt['content'].isna()
if content_is_missing.any():
    # Positional row numbers of the articles without content.
    (r,) = np.where(content_is_missing)
    print(r.tolist(), 'has NaN values in content')
    # The rows are kept and given an empty string instead of being dropped
    # (the alternative would be nyt.dropna(subset=['content'])).
    nyt['content'] = nyt['content'].fillna('').astype(str)

In [8]:
processed_nyt = nyt['content'].map(preprocess)
processed_nyt[:10]

0    [limp, dusty, explosion, conscious, barely, fi...
1    [amazon, everywhereprime, mover, amazon, weave...
2    [boca, uchire, venezuela, hundreds, children, ...
3    [walpole, pete, buttigieg, nifty, politician, ...
4    [munich, german, researchers, begin, work, lat...
5    [aboard, cyprus, north, pacific, ocean, voyage...
6    [things, wrong, power, promise, right, series,...
7    [najaf, iraq, prime, minister, adel, abdul, ma...
8    [london, midafternoon, mike, finnerty, sell, c...
9    [london, labour, party, canvassers, gather, da...
Name: content, dtype: object

## Building LDA Model

In [9]:
# Build the vocabulary: a mapping between each distinct token and an integer id.
dictionary_nyt = gensim.corpora.Dictionary(processed_nyt)

# Preview the first 11 (id, token) pairs of the mapping.
for count, (k, v) in enumerate(dictionary_nyt.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

0 abilities
1 accept
2 accord
3 account
4 accountable
5 accurate
6 accusations
7 accuse
8 acquit
9 acquittal
10 act


In [11]:
# Filter out extreme tokens: those that appear in fewer than 2 articles or in more than half of the articles. Keep at most the 100000 most frequent remaining tokens.
dictionary_nyt.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)

In [12]:
bow_corpus_nyt = [dictionary_nyt.doc2bow(doc) for doc in processed_nyt]
# doc2bow is a function of corpora.dictionary in gensim:
# Convert document into the bag-of-words (BoW) format, i.e. list of (token_id, token_count) tuples.
bow_corpus_nyt[99][:10]  # what is the meaning of the output:
# The index of each word in this document, with its number of appearance in this document.

[(1, 1),
 (2, 5),
 (10, 1),
 (13, 1),
 (27, 1),
 (28, 1),
 (45, 2),
 (51, 1),
 (61, 1),
 (87, 1)]

In [13]:
# To further illustrate: spell out the first (up to) 10 bag-of-words entries of document 99.
# Iterating over the slice instead of indexing with range(10) avoids an IndexError
# when the document has fewer than 10 distinct tokens.
for token_id, token_count in bow_corpus_nyt[99][:10]:
    print(
          "Word {} (\"{}\") appears {} time.".format(token_id, dictionary_nyt[token_id], token_count)
         )

Word 1 ("accept") appears 1 time.
Word 2 ("accord") appears 5 time.
Word 10 ("act") appears 1 time.
Word 13 ("add") appears 1 time.
Word 27 ("aide") appears 1 time.
Word 28 ("allow") appears 1 time.
Word 45 ("ask") appears 2 time.
Word 51 ("avoid") appears 1 time.
Word 61 ("base") appears 1 time.
Word 87 ("bush") appears 1 time.


### TF-IDF
Combine the Term Frequency - Inverse Document Frequency with the model. The TF-IDF replaces the previous frequency count of each word.

In [14]:
tfidf = gensim.models.TfidfModel(bow_corpus_nyt)
tfidf_corpus_nyt = tfidf[bow_corpus_nyt]

In [15]:
# to illustrate:
tfidf_corpus_nyt[99][:10]

[(1, 0.011808785088316712),
 (2, 0.028848727760795466),
 (10, 0.012455807469639393),
 (13, 0.00562978369257338),
 (27, 0.01998397103381067),
 (28, 0.006751292498707385),
 (45, 0.011739356900826748),
 (51, 0.011657275218202395),
 (61, 0.006954442665643752),
 (87, 0.01928664178805415)]

### Training the model

In [53]:
# LDA using Bag of Words and normal frequency counts
# workers: number of worker processes used for parallelization (commonly number of cores - 1).
# passes: number of passes through the corpus during training.
lda_nyt_bow = gensim.models.LdaMulticore(bow_corpus_nyt, num_topics=10, id2word=dictionary_nyt, passes=10, workers=3, random_state=100)

for idx, topic in lda_nyt_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx,topic))

# NOTE: log_perplexity() returns the per-word likelihood bound (perplexity = 2^(-bound)),
# so the printed value is negative and a value closer to zero is better.
print('Perplexity of baseline BoW LDA Model: ', lda_nyt_bow.log_perplexity(bow_corpus_nyt))

from gensim.models import CoherenceModel

# The coherence model needs the tokenized documents as a list of token lists.
# It is defined here so this cell also runs on a fresh kernel; previously
# `processed_text` was only created in a later cell, which raised a NameError.
processed_text = processed_nyt.tolist()
coherence_nyt_bow = CoherenceModel(model=lda_nyt_bow, texts=processed_text, dictionary=dictionary_nyt, coherence='c_v')
print('\nCoherence Score of baseline BoW LDA Model: ', coherence_nyt_bow.get_coherence())

Topic: 0 
Words: 0.009*"company" + 0.006*"china" + 0.006*"government" + 0.005*"amazon" + 0.005*"trade" + 0.004*"police" + 0.004*"unite" + 0.004*"country" + 0.003*"american" + 0.003*"kill"
Topic: 1 
Words: 0.004*"women" + 0.004*"france" + 0.003*"italian" + 0.003*"bloomberg" + 0.003*"macron" + 0.003*"paris" + 0.003*"world" + 0.003*"unite" + 0.003*"start" + 0.003*"city"
Topic: 2 
Words: 0.006*"court" + 0.004*"case" + 0.004*"unite" + 0.003*"chinese" + 0.003*"officials" + 0.003*"report" + 0.003*"city" + 0.003*"water" + 0.003*"home" + 0.003*"judge"
Topic: 3 
Words: 0.020*"trump" + 0.018*"president" + 0.013*"house" + 0.013*"impeachment" + 0.009*"vote" + 0.008*"democrats" + 0.008*"party" + 0.007*"election" + 0.006*"committee" + 0.005*"political"
Topic: 4 
Words: 0.004*"live" + 0.003*"city" + 0.003*"play" + 0.003*"write" + 0.003*"home" + 0.003*"music" + 0.003*"black" + 0.003*"love" + 0.003*"world" + 0.003*"family"
Topic: 5 
Words: 0.012*"game" + 0.011*"team" + 0.009*"season" + 0.007*"play" + 0.

In [54]:
import pyLDAvis.gensim
#import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared_bow = pyLDAvis.gensim.prepare(lda_nyt_bow, bow_corpus_nyt, dictionary_nyt)
LDAvis_prepared_bow

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# LDA using TF-IDF weights instead of raw counts (same settings as the BoW baseline).
lda_nyt_tfidf_base = gensim.models.LdaMulticore(tfidf_corpus_nyt, num_topics=10, id2word=dictionary_nyt, passes=10, workers=3, random_state=100)

for idx, topic in lda_nyt_tfidf_base.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))


# NOTE: log_perplexity() returns the per-word likelihood bound (perplexity = 2^(-bound)),
# so the printed value is negative and a value closer to zero is better.
print('Perplexity of baseline TF-IDF LDA Model: ', lda_nyt_tfidf_base.log_perplexity(tfidf_corpus_nyt))

from gensim.models import CoherenceModel

# The coherence model needs the tokenized documents as a list of token lists.
# It is defined here so this cell also runs on a fresh kernel; previously
# `processed_text` was only created in a later cell, which raised a NameError.
processed_text = processed_nyt.tolist()
coherence_nyt_tfidf_base = CoherenceModel(model=lda_nyt_tfidf_base, texts=processed_text, dictionary=dictionary_nyt, coherence='c_v')
print('\nCoherence Score of baseline TF-IDF LDA Model: ', coherence_nyt_tfidf_base.get_coherence())

In [52]:
# import pyLDAvis.gensim
# import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_nyt_tfidf_base, tfidf_corpus_nyt, dictionary_nyt)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Tuning hyperparameter on the TF-IDF model

In [35]:
# supporting function
from gensim.models import CoherenceModel

processed_text = processed_nyt.tolist()
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus : the corpus the model is trained on (passed to ``LdaMulticore``).
    dictionary : the gensim dictionary used as ``id2word`` and by the coherence model.
    k : number of topics.
    a : value passed as ``alpha`` (a float, 'symmetric' or 'asymmetric' in this notebook).
    b : value passed as ``eta`` (a float or 'symmetric' in this notebook).
    texts : tokenized documents (list of token lists) used to compute coherence.
        Defaults to the notebook-level ``processed_text`` so existing calls keep working.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    if texts is None:
        # Backward-compatible fallback to the notebook-level variable.
        texts = processed_text

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           num_topics=k,
                                           id2word=dictionary,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           workers=3,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=False)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()

In [27]:
# test:
print(compute_coherence_values(corpus=tfidf_corpus_nyt, dictionary=dictionary_nyt, k=10, a='asymmetric', b='symmetric'))

0.44458724026108226


In [38]:
# test:
num_of_docs = len(tfidf_corpus_nyt)
print(num_of_docs)

test = gensim.utils.ClippedCorpus(tfidf_corpus_nyt, int(num_of_docs*0.75))
print(len(test))

1109
831


In [40]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: only the full corpus is evaluated.
# To also evaluate on a subset, add e.g.
#   gensim.utils.ClippedCorpus(tfidf_corpus_nyt, int(num_of_docs*0.75))
# (the length must be an int) together with a matching entry in corpus_title.
num_of_docs = len(tfidf_corpus_nyt)
corpus_sets = [tfidf_corpus_nyt]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# One model is trained per (corpus, topics, alpha, beta) combination. The total is
# derived from the grid so the progress bar stays correct when the grid changes.
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

# Can take a long time to run (the recorded run needed more than two hours for 270 models).
# The context manager closes the progress bar even if a run fails.
with tqdm.tqdm(total=total_runs) as pbar:
    # iterate through validation corpuses
    for title, corpus in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus, dictionary=dictionary_nyt, k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(title)
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)

pd.DataFrame(model_results).to_csv('lda_tuning_results_nyt.csv', index=False)





  0%|          | 0/270 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/270 [00:31<2:22:25, 31.77s/it][A[A[A[A



  1%|          | 2/270 [01:03<2:22:06, 31.81s/it][A[A[A[A



  1%|          | 3/270 [01:36<2:22:42, 32.07s/it][A[A[A[A



  1%|▏         | 4/270 [02:08<2:22:52, 32.23s/it][A[A[A[A



  2%|▏         | 5/270 [02:41<2:23:01, 32.38s/it][A[A[A[A



  2%|▏         | 6/270 [03:13<2:21:46, 32.22s/it][A[A[A[A



  3%|▎         | 7/270 [03:45<2:20:18, 32.01s/it][A[A[A[A



  3%|▎         | 8/270 [04:17<2:19:42, 31.99s/it][A[A[A[A



  3%|▎         | 9/270 [04:48<2:18:38, 31.87s/it][A[A[A[A



  4%|▎         | 10/270 [05:21<2:19:18, 32.15s/it][A[A[A[A



  4%|▍         | 11/270 [05:54<2:19:35, 32.34s/it][A[A[A[A



  4%|▍         | 12/270 [06:27<2:19:45, 32.50s/it][A[A[A[A



  5%|▍         | 13/270 [06:59<2:18:42, 32.38s/it][A[A[A[A



  5%|▌         | 14/270 [07:31<2:18:05, 32.37s/it][A[A[A[A



  6%|▌         | 15/270 [0

 90%|█████████ | 244/270 [2:17:11<14:59, 34.59s/it][A[A[A[A



 91%|█████████ | 245/270 [2:17:46<14:23, 34.54s/it][A[A[A[A



 91%|█████████ | 246/270 [2:18:22<14:00, 35.00s/it][A[A[A[A



 91%|█████████▏| 247/270 [2:18:55<13:15, 34.57s/it][A[A[A[A



 92%|█████████▏| 248/270 [2:19:29<12:32, 34.22s/it][A[A[A[A



 92%|█████████▏| 249/270 [2:20:02<11:54, 34.03s/it][A[A[A[A



 93%|█████████▎| 250/270 [2:20:36<11:20, 34.03s/it][A[A[A[A



 93%|█████████▎| 251/270 [2:21:13<11:02, 34.85s/it][A[A[A[A



 93%|█████████▎| 252/270 [2:21:47<10:20, 34.47s/it][A[A[A[A



 94%|█████████▎| 253/270 [2:22:21<09:44, 34.35s/it][A[A[A[A



 94%|█████████▍| 254/270 [2:22:55<09:08, 34.28s/it][A[A[A[A



 94%|█████████▍| 255/270 [2:23:28<08:29, 33.95s/it][A[A[A[A



 95%|█████████▍| 256/270 [2:24:06<08:13, 35.24s/it][A[A[A[A



 95%|█████████▌| 257/270 [2:24:40<07:32, 34.80s/it][A[A[A[A



 96%|█████████▌| 258/270 [2:25:14<06:53, 34.47s/it][A[A[A[

In [41]:
model_results

{'Validation_Set': ['100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpus',
  '100% Corpu

In [42]:
result = pd.DataFrame(model_results)
display(result)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,100% Corpus,2,0.01,0.01,0.350234
1,100% Corpus,2,0.01,0.31,0.324638
2,100% Corpus,2,0.01,0.61,0.300359
3,100% Corpus,2,0.01,0.91,0.297559
4,100% Corpus,2,0.01,symmetric,0.306848
...,...,...,...,...,...
265,100% Corpus,10,asymmetric,0.01,0.467507
266,100% Corpus,10,asymmetric,0.31,0.456387
267,100% Corpus,10,asymmetric,0.61,0.451884
268,100% Corpus,10,asymmetric,0.91,0.433051


In [43]:
result.loc[result['Coherence'].idxmax()]

Validation_Set    100% Corpus
Topics                     10
Alpha                    0.61
Beta                symmetric
Coherence               0.625
Name: 254, dtype: object

In [49]:
# Retrain on the TF-IDF corpus with the setting that scored the highest coherence
# in the grid search (Topics=10, Alpha=0.61, Beta='symmetric').
best_lda_settings = dict(
    corpus=tfidf_corpus_nyt,
    num_topics=10,
    id2word=dictionary_nyt,
    random_state=100,
    chunksize=100,
    passes=10,
    workers=3,
    alpha=0.61,
    eta='symmetric',
    per_word_topics=False,
)
lda_nyt_tfidf = gensim.models.LdaMulticore(**best_lda_settings)

# Show the top words of every topic.
for idx, topic in lda_nyt_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

# c_v coherence of the retrained model, computed against the tokenized documents.
coherence_nyt_tfidf = CoherenceModel(model=lda_nyt_tfidf, texts=processed_text, dictionary=dictionary_nyt, coherence='c_v')
tfidf_coherence_score = coherence_nyt_tfidf.get_coherence()
print('\nCoherence Score of TF-IDF LDA Model: ', tfidf_coherence_score)

Topic: 0 
Word: 0.002*"misstate" + 0.002*"obituary" + 0.002*"farm" + 0.002*"birth" + 0.001*"anderson" + 0.001*"error" + 0.001*"errors" + 0.001*"complaints" + 0.001*"minor" + 0.001*"subway"
Topic: 1 
Word: 0.001*"lifetime" + 0.001*"pierson" + 0.000*"wausau" + 0.000*"mchenry" + 0.000*"wooller" + 0.000*"ariel" + 0.000*"folles" + 0.000*"advent" + 0.000*"lyricist" + 0.000*"wiseman"
Topic: 2 
Word: 0.002*"weinstein" + 0.001*"cream" + 0.001*"kabul" + 0.001*"borrow" + 0.001*"agreements" + 0.001*"sewage" + 0.001*"selection" + 0.000*"manchester" + 0.000*"kendra" + 0.000*"skateboard"
Topic: 3 
Word: 0.000*"wausau" + 0.000*"wooller" + 0.000*"ariel" + 0.000*"folles" + 0.000*"advent" + 0.000*"lyricist" + 0.000*"wiseman" + 0.000*"deplorable" + 0.000*"hofmann" + 0.000*"santas"
Topic: 4 
Word: 0.001*"decorate" + 0.001*"dedicate" + 0.001*"pastry" + 0.000*"metz" + 0.000*"wausau" + 0.000*"wooller" + 0.000*"ariel" + 0.000*"folles" + 0.000*"advent" + 0.000*"cinema"
Topic: 5 
Word: 0.001*"roof" + 0.001*"rape

In [50]:
import pyLDAvis.gensim
#import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_nyt_tfidf, tfidf_corpus_nyt, dictionary_nyt)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Evaluation on both models

In [17]:
# mdiff, annotation = lda_nyt_bow.diff(lda_nyt_tfidf)

In [19]:
# Compute Perplexity
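# Perplexity is an intrinsic evaluation metric of how well the model predicts the corpus; lower perplexity is better.
# Note: gensim's log_perplexity() returns the per-word likelihood bound, not the perplexity itself
# (perplexity = 2^(-bound)), so for the values printed below a number closer to zero is better.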
print('Perplexity of Bag-of-Words LDA Model: ', lda_nyt_bow.log_perplexity(bow_corpus_nyt))
print('Perplexity of TF-IDF LDA Model: ', lda_nyt_tfidf.log_perplexity(tfidf_corpus_nyt))

Perplexity of Bag-of-Words LDA Model:  -8.45080225482349
Perplexity of TF-IDF LDA Model:  -12.880247882878505


In [20]:
# Coherence Measure
from gensim.models import CoherenceModel

processed_text = processed_nyt.tolist()
coherence_nyt_bow = CoherenceModel(model=lda_nyt_bow, texts=processed_text, dictionary=dictionary_nyt, coherence='c_v')
print('\nCoherence Score of Bag-of-Words LDA Model: ', coherence_nyt_bow.get_coherence())

coherence_nyt_tfidf = CoherenceModel(model=lda_nyt_tfidf, texts=processed_text, dictionary=dictionary_nyt, coherence='c_v')
print('\nCoherence Score of TF-IDF LDA Model: ', coherence_nyt_tfidf.get_coherence())


Coherence Score of Bag-of-Words LDA Model:  0.38306805363179003

Coherence Score of TF-IDF LDA Model:  0.528381057354139


## Summary
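According to the coherence score, the TF-IDF LDA model performs better than the Bag-of-Words one (about 0.53 vs. 0.38). The perplexity figures do not support the same conclusion: `log_perplexity` reports a per-word likelihood bound (perplexity = 2^(-bound)), so the lower value of the TF-IDF model (-12.88 vs. -8.45) corresponds to a higher perplexity, and because the two models are evaluated on differently weighted corpora (TF-IDF weights vs. raw counts) the two values are hard to compare directly. It is likely that the topics generated by the TF-IDF LDA model are more comprehensive and diverse.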

## Visualization

In [21]:
import pyLDAvis.gensim
#import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_nyt_tfidf, tfidf_corpus_nyt, dictionary_nyt)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Directions for further research
1. Scrape and crawl more articles from more media outlets, and compare them with each other to gain more insights.  
2. Further improve the performance of the LDA model, for example by tuning more hyperparameters.

In [95]:
# queries on some words:
nyt[nyt['content'].str.contains("Turley") == True]

Unnamed: 0,title,timestamp,content,media,link
98,Democrats Ready Impeachment Report as Republic...,2019-12-02T15:30:10-05:00,WASHINGTON — House Democrats pressed forward o...,New York Times,https://www.nytimes.com/2019/12/02/us/politics...
174,Scholars Call Trump’s Actions on Ukraine an Im...,2019-12-04T14:20:51-05:00,WASHINGTON — The House of Representatives on W...,New York Times,https://www.nytimes.com/2019/12/04/us/politics...
175,Trump Blocked Key Impeachment Witnesses. Shoul...,2019-12-04T13:49:57-05:00,WASHINGTON — Are House Democrats making a mist...,New York Times,https://www.nytimes.com/2019/12/04/us/politics...
811,A Law Professor’s Provocative Argument: Trump ...,2019-12-20T19:06:42-05:00,WASHINGTON — Maybe President Trump has not bee...,New York Times,https://www.nytimes.com/2019/12/20/us/trump-fe...
882,Donald Trump Has Drained the Swamp (of Good Ch...,2019-12-20T05:00:10-05:00,WASHINGTON — It would have been a swell party....,New York Times,https://www.nytimes.com/2019/12/20/style/white...


In [65]:
nyt[nyt['content'].str.contains("Hoffman") == True]

Unnamed: 0,title,timestamp,content,media,link
30,Muhammad Ali in a Broadway Musical? It Happened,2019-11-28T10:00:08-05:00,"So, as the sportscaster Howard Cosell used to ...",New York Times,https://www.nytimes.com/2019/11/28/theater/muh...
117,"Perry Hoffman, 75, Dies; Saw Family Support as...",2019-12-02T19:03:43-05:00,"Perry Hoffman, a former elementary-school teac...",New York Times,https://www.nytimes.com/2019/12/02/health/perr...
286,Video Games and Online Chats Are ‘Hunting Grou...,2019-12-07,By NELLIE BOWLES and \n \nMICHAEL H. KELLER...,New York Times,https://www.nytimes.com/interactive/2019/12/07...
851,Men Are in Trouble and Hollywood Wants to Help,2019-12-19T05:00:19-05:00,Near the end of Martin Scorsese’s “The Irishma...,New York Times,https://www.nytimes.com/2019/12/19/movies/men-...
883,Lights. Camera. Senior Center?,2019-12-22T05:00:11-05:00,"CULVER CITY, Calif. — A speeding Ford Fiesta p...",New York Times,https://www.nytimes.com/2019/12/22/style/senio...
909,‘Marriage Story’: Noah Baumbach’s First Feel-G...,2019-12-13T10:00:12-05:00,This article contains spoilers for “Marriage S...,New York Times,https://www.nytimes.com/2019/12/13/movies/marr...
983,Democrats Who Flipped Seats in 2018 Have a 202...,2019-12-24T11:15:45-05:00,WASHINGTON — The high costs of health care are...,New York Times,https://www.nytimes.com/2019/12/24/us/politics...
1033,Why Stephen Curry (Not LeBron) Is the N.B.A. P...,2019-12-25T18:48:10-05:00,The New York Times is reflecting on the past d...,New York Times,https://www.nytimes.com/2019/12/25/sports/bask...


In [76]:
nyt.loc[286, 'content']

"By NELLIE BOWLES and \n    \nMICHAEL H. KELLER\n\nDEC. 7, 2019\nCriminals are making virtual connections with children through gaming and social media platforms. One popular site warns visitors, “Please be careful.”\nCriminals are making virtual connections with children through gaming and social media platforms. One popular site warns visitors, “Please be careful.”\nBy NELLIE BOWLES and MICHAEL H. KELLER DEC. 7, 2019\n\nYou have 5 seconds to take each pic\nTake your pj's off. Just pull them down\nShow your entire stomach. Retake pic.\nim going to kill you\nI'm about to post your pics If you don't chat me back in 10 minutes.\nReply now\nEither you talk to me or you can suffer\nYou're so pretty :) How old are you\nYour time is close to running out, too\nyou boys did good\nDO you want your mom seeing your pics?\nShow full body. Not just half. Retake pic\nYou have 1 minute. Hurry up\nI mean you can trust me right\nShow us that you’re crying\n\nConversations excerpted from court documents

In [69]:
display(nyt[nyt['content'].str.contains("Bombshell") == True])
print(nyt.loc[943, 'title'])

Unnamed: 0,title,timestamp,content,media,link
94,Did ‘1917’ Just Enter the Best-Picture Battle?,2019-11-25T14:17:13-05:00,The best-picture race is currently dominated b...,New York Times,https://www.nytimes.com/2019/11/25/movies/1917...
189,"Charlize Theron on ‘Bombshell,’ Fox News and H...",2019-12-04T09:00:07-05:00,Charlize Theron knew it wouldn’t be easy to ma...,New York Times,https://www.nytimes.com/2019/12/04/movies/char...
530,Women Made Them. Viewers and Critics Liked The...,2019-12-12T15:17:07-05:00,LOS ANGELES — Elizabeth Cantillon has worked i...,New York Times,https://www.nytimes.com/2019/12/12/movies/fema...
642,‘Bombshell’ Raises a Question: What’s Megyn Ke...,2019-12-13T13:45:47-05:00,Megyn Kelly last appeared on NBC more than a y...,New York Times,https://www.nytimes.com/2019/12/13/business/me...
943,The Sexual Predator’s TV Wife,2019-12-23T12:30:06-05:00,“The Morning Show” begins with a wake-up call....,New York Times,https://www.nytimes.com/2019/12/23/arts/televi...


The Sexual Predator’s TV Wife


In [71]:
display(nyt[nyt['content'].str.contains("bombshell") == True])
print(nyt.loc[491, 'title'])

Unnamed: 0,title,timestamp,content,media,link
491,I Feel Weird Lying About Santa,2019-12-04T08:37:48-05:00,ImageCredit...Verónica GrechI was out last wee...,New York Times,https://www.nytimes.com/2019/12/04/parenting/i...


I Feel Weird Lying About Santa


In [72]:
# Articles whose content contains the lowercase word "burrow".
# na=False treats missing content as "no match" (replaces the `== True` comparison).
burrow_matches = nyt[nyt['content'].str.contains("burrow", na=False)]
display(burrow_matches)
# Print the full titles straight from the filtered rows instead of hard-coding
# their row labels; this also avoids shadowing the builtin `id`.
for title in burrow_matches['title']:
    print(title)

Unnamed: 0,title,timestamp,content,media,link
316,Best Theater of 2019,2019-12-03T05:00:27-05:00,Ben Brantley’s Best | Jesse Green’s Best | Mem...,New York Times,https://www.nytimes.com/2019/12/03/arts/best-b...
329,A Season on the Rez,2019-12-06T07:00:09-05:00,The sky was milk white and vaulted. A squall t...,New York Times,https://www.nytimes.com/2019/12/06/sports/chin...
591,How McKinsey Makes Its Own Rules,2019-12-14T10:38:50-05:00,"This article is copublished with ProPublica, t...",New York Times,https://www.nytimes.com/2019/12/14/sunday-revi...
720,The Wild Woman Awakens,2019-12-17T13:30:18-05:00,"I first spied the book last spring, in the lap...",New York Times,https://www.nytimes.com/2019/12/17/arts/Women-...
969,A Trump Policy ‘Clarification’ All but Ends Pu...,2019-12-24T05:01:10-05:00,WASHINGTON — As the state of Virginia prepared...,New York Times,https://www.nytimes.com/2019/12/24/climate/tru...


Best Theater of 2019
A Season on the Rez
How McKinsey Makes Its Own Rules
The Wild Woman Awakens
A Trump Policy ‘Clarification’ All but Ends Punishment for Bird Deaths


In [74]:
# Articles whose content contains the capitalized name "Burrow".
# na=False treats missing content as "no match" (replaces the `== True` comparison).
Burrow_matches = nyt[nyt['content'].str.contains("Burrow", na=False)]
display(Burrow_matches)
# Print the full titles straight from the filtered rows instead of hard-coding
# their row labels; this also avoids shadowing the builtin `id`.
for title in Burrow_matches['title']:
    print(title)
# Inspect row 762 in full: its text matches through the surname "Burrows".
print(nyt.loc[762, 'content'])

Unnamed: 0,title,timestamp,content,media,link
301,L.S.U. Makes Case for Top Seed in Playoff With...,2019-12-07T20:13:50-05:00,ATLANTA — The Southeastern Conference title ga...,New York Times,https://www.nytimes.com/2019/12/07/sports/ncaa...
574,Joe Burrow Wins the 2019 Heisman Trophy in a R...,2019-12-14T20:53:42-05:00,If the three quarterbacks who arrived in New Y...,New York Times,https://www.nytimes.com/2019/12/14/sports/heis...
762,"200 Years of Experience, and Still Learning On...",2019-12-18T13:21:56-05:00,"“I am rarely cast as an ingénue anymore,” Lois...",New York Times,https://www.nytimes.com/2019/12/18/theater/loi...
924,Giants’ Daniel Jones Throws 5 Touchdown Passes...,2019-12-22T17:18:59-05:00,"LANDOVER, Md. — Giants Coach Pat Shurmur thoug...",New York Times,https://www.nytimes.com/2019/12/22/sports/foot...
1104,L.S.U. and Oklahoma Both Seek to End Title Gam...,2019-12-27T16:25:58-05:00,ATLANTA — The right hand of Lee Morris’s fathe...,New York Times,https://www.nytimes.com/2019/12/27/sports/ncaa...


L.S.U. Makes Case for Top Seed in Playoff With SEC Win
Joe Burrow Wins the 2019 Heisman Trophy in a Record-Breaking Landslide
200 Years of Experience, and Still Learning Onstage
Giants’ Daniel Jones Throws 5 Touchdown Passes to Beat Washington
L.S.U. and Oklahoma Both Seek to End Title Game Droughts
“I am rarely cast as an ingénue anymore,” Lois Smith was saying on Monday afternoon. It was a joke, obviously, and her fellow actresses — Estelle Parsons, 92, and Vinie Burrows, who recently turned 95 but rounds that up to 96 — burst into laughter.At 89, Smith was the baby of this bunch. Between them, they have more than 200 years of performance experience, including the film “Lady Bird” and the title role in “Marjorie Prime” (Smith), the movie “Bonnie and Clyde” and the sitcom “Roseanne” (Parsons), the American premiere of Jean Genet’s “The Blacks” and experimental work with the director Rachel Chavkin (Burrows).They’re still busy adding to their résumés: Parsons currently at the Public Th

## Implement bigrams, trigrams

In [3]:
import numpy as np
import pandas as pd

# Reload the crawled articles (tab-separated file; its first column is the row index).
nyt = pd.read_csv(
    'nyt_df_12.csv',
    sep='\t',
    header=0,
    index_col=0,
)
display(nyt)

Unnamed: 0,title,timestamp,content,media,link
0,Trump’s Intervention in SEALs Case Tests Penta...,2019-11-30T20:47:50-05:00,"He was limp and dusty from an explosion, consc...",New York Times,https://www.nytimes.com/2019/11/30/us/politics...
1,Prime Mover: How Amazon Wove Itself Into the L...,2019-11-30T06:00:18-05:00,Amazon EverywherePrime Mover: How Amazon Wove ...,New York Times,https://www.nytimes.com/2019/11/30/business/am...
2,Students Fainting From Hunger in Venezuela’s F...,2019-11-30T05:00:23-05:00,"BOCA DE UCHIRE, Venezuela — Hundreds of childr...",New York Times,https://www.nytimes.com/2019/11/30/world/ameri...
3,2020 Democratic Candidates Wage Escalating Fig...,2019-11-30T05:00:15-05:00,"WALPOLE, N.H. — Pete Buttigieg has a nifty pol...",New York Times,https://www.nytimes.com/2019/11/30/us/politics...
4,Latin Dictionary’s Journey: A to Zythum in 125...,2019-11-30T12:34:51-05:00,MUNICH — When German researchers began working...,New York Times,https://www.nytimes.com/2019/11/30/arts/latin-...
...,...,...,...,...,...
1104,L.S.U. and Oklahoma Both Seek to End Title Gam...,2019-12-27T16:25:58-05:00,ATLANTA — The right hand of Lee Morris’s fathe...,New York Times,https://www.nytimes.com/2019/12/27/sports/ncaa...
1105,Can They Kick It? Yes They Can. Ohio State and...,2019-12-27T15:36:57-05:00,"SCOTTSDALE, Ariz. — Pity the plight of college...",New York Times,https://www.nytimes.com/2019/12/27/sports/ncaa...
1106,Drawing the Guantánamo Bay War Court,2019-12-27T18:26:08-05:00,Times Insider explains who we are and what we ...,New York Times,https://www.nytimes.com/2019/12/27/reader-cent...
1107,"Quotation of the Day: Jerry Herman, 88, Compos...",2019-12-27T23:34:59-05:00,“There are only a couple of us who care about ...,New York Times,https://www.nytimes.com/2019/12/27/todayspaper...


In [103]:
# # Load the regular expression library
# import re

# # Remove punctuation
# data_preprocess = nyt['content'].map(lambda x: re.sub('[,.!?]', '', x))

# # Convert the article text to lowercase
# data_preprocess = data_preprocess.map(lambda x: x.lower())

# # Print out the first rows of the preprocessed articles
# display(data_preprocess)
# print(data_preprocess.loc[0])

0       he was limp and dusty from an explosion consci...
1       amazon everywhereprime mover: how amazon wove ...
2       boca de uchire venezuela — hundreds of childre...
3       walpole nh — pete buttigieg has a nifty politi...
4       munich — when german researchers began working...
                              ...                        
1104    atlanta — the right hand of lee morris’s fathe...
1105    scottsdale ariz — pity the plight of college f...
1106    times insider explains who we are and what we ...
1107    “there are only a couple of us who care about ...
1108    front pagebecause of an editing error an artic...
Name: content, Length: 1109, dtype: object



In [5]:
import gensim

# Tokenize every article body with gensim's simple_preprocess; the result is a
# Series holding one token list per article.
data = nyt['content'].map(lambda article: gensim.utils.simple_preprocess(article))

# Preview the first ten tokenized articles.
data.head(10)

0    [he, was, limp, and, dusty, from, an, explosio...
1    [amazon, everywhereprime, mover, how, amazon, ...
2    [boca, de, uchire, venezuela, hundreds, of, ch...
3    [walpole, pete, buttigieg, has, nifty, politic...
4    [munich, when, german, researchers, began, wor...
5    [aboard, the, ubc, cyprus, in, the, north, pac...
6    [when, things, go, wrong, those, in, power, of...
7    [najaf, iraq, prime, minister, adel, abdul, ma...
8    [london, it, was, midafternoon, when, mike, fi...
9    [london, the, labour, party, canvassers, gathe...
Name: content, dtype: object

In [105]:
# # Convert to list
# data = nyt.content.values.tolist()

# # Remove Emails
# data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# # Remove new line characters
# data = [re.sub('\s+', ' ', sent) for sent in data]

# # Remove distracting single quotes
# data = [re.sub("\'", "", sent) for sent in data]

# print(data[:1])

  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]




In [6]:
def sent_to_words(sentences):
    """Yield one list of lowercase, de-accented tokens per document.

    Parameters
    ----------
    sentences : iterable
        Documents to tokenize. Each item may be a raw string or an
        already-tokenized list/tuple of tokens.

    Yields
    ------
    list of str
        Tokens returned by ``gensim.utils.simple_preprocess(..., deacc=True)``.
    """
    for sentence in sentences:  # each "sentence" here is actually a whole document
        if isinstance(sentence, (list, tuple)):
            # Join pre-tokenized documents with spaces rather than calling str() on
            # the list, so the tokenizer never sees the list's repr (quotes, commas,
            # backslash escapes).
            text = " ".join(map(str, sentence))
        else:
            text = str(sentence)
        # deacc=True strips accent marks; punctuation is dropped by the tokenizer itself.
        yield gensim.utils.simple_preprocess(text, deacc=True)

# Materialize the generator so the tokenized corpus can be indexed and reused.
data_words = [tokens for tokens in sent_to_words(data)]

# Inspect the tokens of the first document.
print(data_words[0])



In [7]:
# Train the phrase detectors: bigrams on the plain tokens, trigrams on the
# bigram-merged tokens. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(
    sentences=data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    sentences=bigram[data_words],
    threshold=100,
)

# Wrap each trained model in a Phraser, the faster way to merge phrases in a document.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Trigram example: the first document after bigram and then trigram merging.
print(trigram_mod[bigram_mod[data_words[0]]])



In [None]:
# Remove Stopwords, Make Bigrams and Lemmatize:

In [39]:
# NLTK stop words: fetch the 'stopwords' corpus, then load the English list.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list consulted by remove_stopwords() below.
stop_words = stopwords.words('english')
# Optional extra stop words (currently disabled):
#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# remove short words:


# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop the tokens listed in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents, each either a raw string or a list/tuple of tokens.

    Returns
    -------
    list of list of str
        One token list per document, with stop words removed.
    """
    # Build the lookup set once: testing membership in the ``stop_words`` list is a
    # linear scan repeated for every single token.
    stop_set = set(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Join pre-tokenized documents instead of tokenizing the list's repr.
            text = " ".join(map(str, doc))
        else:
            text = str(doc)
        # To also drop very short tokens, add `and len(word) >= 3` to the filter.
        cleaned.append(
            [word for word in gensim.utils.simple_preprocess(text) if word not in stop_set]
        )
    return cleaned

def make_bigrams(texts):
    """Apply the trained bigram phraser to every document in ``texts``.

    Returns a list with one phrase-merged token sequence per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every document.

    Returns a list with one phrase-merged token sequence per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

def lemmatization(texts, allowed_postags=None):
    """Return the WordNet lemma of a single token, treating it as a verb.

    Parameters
    ----------
    texts : str
        One token (despite the plural name); it is passed straight to
        ``WordNetLemmatizer().lemmatize``.
    allowed_postags : list of str, optional
        Accepted so existing call sites keep working, but ignored: every token
        is lemmatized with ``pos='v'`` regardless of this value.

    Returns
    -------
    str
        The value returned by ``WordNetLemmatizer().lemmatize(texts, pos='v')``.

    Notes
    -----
    Relies on ``WordNetLemmatizer`` already being in scope; it is not imported
    in this cell.
    """
    # The default is None rather than a shared mutable list; the argument is unused.
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(texts, pos='v')

[nltk_data] Downloading package stopwords to /Users/zf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
#import spacy
#import en_core_web_sm

# Strip English stop words from every tokenized document.
data_words_nostops = remove_stopwords(texts=data_words)

# Merge detected two-word phrases into single tokens with the trained bigram model.
data_words_bigrams = make_bigrams(texts=data_words_nostops)

#data_words_trigrams = make_trigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
#nlp = en_core_web_sm.load()

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])  # so no names?
 

#print(data_lemmatized[:1])

In [47]:
# Lemmatize token by token, keeping the per-document nesting intact.
data_lemmatized = [
    [
        lemmatization(word, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        for word in doc_tokens
    ]
    for doc_tokens in data_words_bigrams
]

#print(data_lemmatized[:1])

How many words are in the corpus? What should the filtering threshold be, and how many words should be kept in the vocabulary?

797911

In [48]:
import gensim.corpora as corpora

# Map every distinct token in the lemmatized corpus to an integer id.
id2word = corpora.Dictionary(documents=data_lemmatized)

# Vocabulary size before pruning.
print(len(id2word))

# Prune the vocabulary: drop tokens found in fewer than 4 documents or in more
# than half of them, then keep at most 70% of the original vocabulary size.
id2word.filter_extremes(
    no_below=4,
    no_above=0.5,
    keep_n=int(0.7 * len(id2word)),
)

# Vocabulary size after pruning.
print(len(id2word))

# Documents to vectorize.
texts = data_lemmatized

# Bag-of-words vectors: (token id, count) pairs for each document.
corpus = list(map(id2word.doc2bow, texts))

39231
13519


In [49]:
# Fit TF-IDF weights on the bag-of-words corpus, then re-weight every document.
tfidf = gensim.models.TfidfModel(corpus=corpus)
tfidf_corpus = tfidf[corpus]

In [50]:
from gensim.models import CoherenceModel

# Train a 10-topic LDA model on the TF-IDF weighted corpus (random_state fixed at 100).
lda = gensim.models.LdaMulticore(tfidf_corpus, num_topics=10, id2word=id2word, passes=10, workers=3, random_state=100)

# Print every topic (-1 = all topics) with its highest-weighted words.
for idx, topic in lda.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))


# log_perplexity() returns the per-word likelihood bound, not the perplexity itself
# (perplexity = 2 ** (-bound)), so the number is labelled as the bound.
print('Per-word likelihood bound of bigram TF-IDF LDA Model: ', lda.log_perplexity(tfidf_corpus))

# Topic coherence (c_v) measured against the lemmatized documents.
coherence = CoherenceModel(model=lda, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
print('\nCoherence Score of bigram TF-IDF LDA Model: ', coherence.get_coherence())

Topic: 0 
Word: 0.001*"huawei" + 0.001*"anti_semitic" + 0.001*"anti_semitism" + 0.001*"jersey_city" + 0.001*"ferencz" + 0.001*"synagogue" + 0.001*"kosher_market" + 0.001*"grewal" + 0.001*"garbage" + 0.001*"rabbi"
Topic: 1 
Word: 0.001*"broadway" + 0.001*"sandler" + 0.001*"songs" + 0.001*"kessler" + 0.001*"baldwin" + 0.001*"herman" + 0.001*"sewage" + 0.001*"coleman" + 0.001*"musicals" + 0.001*"meat"
Topic: 2 
Word: 0.003*"buttigieg" + 0.002*"abortion" + 0.002*"baseball" + 0.001*"mets" + 0.001*"yankees" + 0.001*"cole" + 0.001*"ms_klobuchar" + 0.001*"sanders" + 0.001*"hockey" + 0.001*"abortions"
Topic: 3 
Word: 0.001*"hoffman" + 0.001*"evangelical" + 0.001*"christianity_today" + 0.001*"peters" + 0.001*"vikings" + 0.001*"transgender" + 0.001*"raven" + 0.001*"jackson" + 0.001*"evangelicals" + 0.001*"seahawks"
Topic: 4 
Word: 0.002*"modi" + 0.002*"india" + 0.001*"census" + 0.001*"hindu" + 0.001*"citizenship" + 0.001*"muslims" + 0.001*"khan" + 0.001*"elliott" + 0.001*"assam" + 0.001*"indians"

In [52]:
import pyLDAvis.gensim
import pyLDAvis
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualization from the model, its corpus and vocabulary.
LDAvis_prepared = pyLDAvis.gensim.prepare(
    topic_model=lda,
    corpus=tfidf_corpus,
    dictionary=id2word,
)
LDAvis_prepared

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
