# LDA on CNN Corpus #

In this notebook, I will run Latent Dirichalet Allocation on the CNN sentences to model topics.

Experiment: LDA on snippets vs. sentences

In [61]:
import re, numpy as np, pandas as pd
import pickle

from nltk.corpus import stopwords

import gensim, spacy
from gensim.utils import lemmatize, simple_preprocess

from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [44]:
cnn_df = pd.read_csv('../data/interim/cnn-last-year-sent-comb.csv')
cnn_df, _ = train_test_split(cnn_df.drop(columns=['Unnamed: 0', 
                                                 'Unnamed: 0.1',
                                                'Unnamed: 0.1.1']).dropna(), test_size=0.9, random_state=18)
len(cnn_df)

346045

In [45]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could',
                           '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many',
                           'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
                           'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
                           'line', 'even', 'also', 'may', 'take', 'come', 'hi', 'ha', 'le', 'u', 'wa', 'thi',
                           'to', 'one'])

In [46]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = re.sub("([\d,\,\./!#$%&\'\":;>\?@\[\]`)(\+])+", "", sent) # remove digits and remove punctuation
        sent = re.sub("([-])+", " ", sent)
        sent = simple_preprocess(str(sent), deacc=True) 
        yield(sent)  

In [47]:
cnn_df = cnn_df.dropna()
cnn_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346045 entries, 348928 to 2450552
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   sentence     346045 non-null  object
 1   start_snip   346045 non-null  int64 
 2   end_snip     346045 non-null  int64 
 3   contributor  346045 non-null  object
 4   runtime      346045 non-null  object
 5   start_time   346045 non-null  object
 6   stop_time    346045 non-null  object
 7   identifier   346045 non-null  object
 8   subjects     346045 non-null  object
dtypes: int64(2), object(7)
memory usage: 26.4+ MB


In [50]:
data = cnn_df.sentence.values.tolist()

In [51]:
data_words = list(sent_to_words(data))

In [52]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [53]:
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!
len(data_ready)

346045

In [54]:
X_corpus = [' '.join(data_ready[i]) for i in range(len(data_ready))]
X_corpus[99]

'probably high'

In [55]:
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words=stop_words, min_df=2, max_df=0.3, ngram_range=(1,2))
X = vectorizer.fit_transform(X_corpus)

In [56]:
# Tweak the two parameters below
number_topics = 75
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(X)

LatentDirichletAllocation(n_components=75, n_jobs=-1)

In [57]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [58]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, vectorizer, number_words)

Topics found via LDA:

Topic #0:
turn news scary credit mind distance unlimited tremendous stake forever

Topic #1:
agree thought election testify terrible represent allergic healthy situation excellent

Topic #2:
truth move different viewer strong switch street charge miss debate

Topic #3:
win minute conversation able best stock policy consistent day able help

Topic #4:
deal month course protein negative last month administration virus last month ago

Topic #5:
trust accept huge bottom racist white criticize severe people intend

Topic #6:
watch stay everywhere elect touch look closely awful music fall

Topic #7:
way vote plan describe apologize audience effective holiday guilty decline

Topic #8:
next tonight protect like concern perfect write next week coverage set

Topic #9:
love occur car bus work sanction really build body assume

Topic #10:
time care bad appreciate first time information stuff spend arrest second

Topic #11:
end expect suppose fact study recover sensitive spee

In [65]:
results = lda.transform(X)

In [66]:
pickle.dump(lda, open('../models/cnn_lda.p', 'wb'))

In [67]:
results_df = pd.DataFrame(results)
results_df.columns = ['topic_' + str(i) for i in range(75)]
results_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
0,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,...,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764
1,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,...,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601
2,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,...,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968
3,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
4,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,...,0.372137,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563


In [68]:
dom_topic_list = []
for i, row in enumerate(results):
    dom_topic = -1
    topic_pct = 0
    for j, val in enumerate(row):
        if val > topic_pct:
            dom_topic = j
            topic_pct = val
    dom_topic_list.append({'dom_topic':dom_topic, 'topic_pct': topic_pct})

dom_topic_df = pd.DataFrame(dom_topic_list)
dom_topic_df.head()

Unnamed: 0,dom_topic,topic_pct
0,18,0.721435
1,35,0.659506
2,35,0.483123
3,0,0.013333
4,65,0.372137


In [69]:
cnn_results = cnn_df.reset_index().join(dom_topic_df)
cnn_results = cnn_results.join(results_df)
cnn_results.head()

Unnamed: 0,index,sentence,start_snip,end_snip,contributor,runtime,start_time,stop_time,identifier,subjects,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
0,348928,"before taking ibrance, tell your doctor if you...",780,840,CNNW,01:00:58,2019-06-30 19:00:00,2019-06-30 20:00:58,CNNW_20190630_190000_CNN_Newsroom_With_Fredric...,"['trump', 'north korea', 'humira', 'harris', '...",...,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764,0.003764
1,2915667,the army core ils ves very prepared to do as ...,540,600,CNNW,01:00:58,2020-03-17 16:00:00,2020-03-17 17:00:58,CNNW_20200317_160000_Inside_Politics,"['china', 'boeing', 'burke', 'new york', 'fauc...",...,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601,0.004601
2,1633374,but thanks to congress permanently extending t...,1920,1980,CNNW,01:00:58,2019-11-03 16:00:00,2019-11-03 17:00:59,CNNW_20191103_160000_Reliable_Sources,"['trump', 'brian', 'dovato', 'mark zuckerberg'...",...,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968,0.003968
3,2976157,"senator, so glad you can be with me right now.",1320,1380,CNNW,01:00:58,2020-03-22 20:00:00,2020-03-22 21:00:59,CNNW_20200322_200000_CNN_Newsroom_With_Fredric...,"['paul', 'new york', 'd.c.', 'germany', 'safel...",...,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333,0.013333
4,492159,reporter: democratic leaders deliberately tai...,960,1020,CNNW,01:00:59,2019-07-16 20:00:00,2019-07-16 21:00:59,CNNW_20190716_200000_The_Lead_With_Jake_Tapper,"['trump', 'usaa', 'humira', 'white house', 'ir...",...,0.372137,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563,0.003563


In [70]:
cnn_results[['sentence', 'dom_topic']].groupby('dom_topic').count()

Unnamed: 0_level_0,sentence
dom_topic,Unnamed: 1_level_1
0,58594
1,4130
2,3911
3,3527
4,4182
...,...
70,3858
71,3972
72,3794
73,3423


In [71]:
cnn_results.to_csv('../data/interim/cnn_lda_results_rand_sent.csv')