### Import modules

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import datetime

In [2]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    display(x)
    pd.set_option('display.max_row', 50)
    pd.set_option('display.max_column', 100)
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')

### Load data

In [17]:
# Load politico data set
politico = pd.read_csv("data/politicopolitics.csv")

# Add column that indicates news provider
politico["NewsProvider"] = "Politico"

# Drop articles that have no title
politico = politico[~politico["Title"].isnull()]

# Drop articles that have no content
politico = politico[~politico["Content"].isnull()]

# Drop articles with the same header
politico = politico.drop_duplicates(subset='Title', keep="first")

# Create new year, day, and month columns
politico['Date'] = politico.apply(lambda x: x["Date"][:10], axis = "columns")
politico['Year'] = politico.apply(lambda x: x["Date"][6:], axis = "columns")
politico['Day'] = politico.apply(lambda x: x["Date"][3:5], axis = "columns")
politico['Month'] = politico.apply(lambda x: x["Date"][:2], axis = "columns")

# Create clean date column
politico['Date'] = politico.apply(lambda x: x["Year"] + "-" + x["Month"] + "-" + x["Day"], axis = "columns")

# Only keep articles from 2020
politico = politico[politico["Year"] == "2020"].reset_index(drop = True)

del politico["Year"]
del politico["Day"]
del politico["Month"]

# Only capitalize first letter of author string
politico['Author'] = politico.apply(lambda x: str(x["Author"]).title(), axis = "columns")

# Quick overview
display(politico.tail(1))
display(politico.count())

# Load fivethirtyeight
fivethirty = pd.read_csv("data/fivethiryeightpolitics.csv")

# Add column that indicates news provider
fivethirty["NewsProvider"] = "FiveThirtyEight" 

# Drop articles with the same header
fivethirty = fivethirty.drop_duplicates(subset='Title', keep="first")

# Drop articles that have no content
fivethirty = fivethirty[~fivethirty["Content"].isnull()]

# Quick overview
display(fivethirty.head(1))
display(fivethirty.count())

# Load axios
axios = pd.read_csv("data/axiospolitics.csv")

# Add column that indicates news provider
axios["NewsProvider"] = "Axios" 

# Quick overview
display(axios.tail(1))
display(axios.count())

# Load Breitbart
breitbart = pd.read_csv("data/breitbartpolitics.csv")

# Add column that indicates news provider
breitbart["NewsProvider"] = "Breitbart"

# Drop articles with the same header
breitbart = breitbart.drop_duplicates(subset='Title', keep="first")

# Drop articles that have no content
breitbart = breitbart[~breitbart["Content"].isnull()]

# Drop articles that have no date
breitbart = breitbart[~breitbart["Date"].isnull()]

# Quick overview
display(breitbart.tail(1))
display(breitbart.count())

# Merge as one object
usnews = axios.append([breitbart, fivethirty, politico])

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider
7813,Growing role for AI to combat space threats,Retired Air Force Lt. Gen. Chris Bogdan sees t...,If Russia or China were to launch a missile at...,https://www.politico.com/news/2020/01/10/artif...,2020-01-10,Jacqueline Feldscher,Space,Politico


Title           7814
Description     7814
Content         7814
URL             7814
Date            7814
Author          7814
Category        7555
NewsProvider    7814
dtype: int64

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider
0,Why It Took So Long For Politicians To Treat T...,America is a little matryoshka doll of panic r...,America is a little doll of panic right now; p...,https://fivethirtyeight.com/features/why-it-to...,2020-07-16T13:00:27+00:00,Clare Malone,Coronavirus,FiveThirtyEight


Title           502
Description     502
Content         502
URL             502
Date            502
Author          476
Category        501
NewsProvider    502
dtype: int64

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider
3719,Trump says Wisconsin governor to allow federal...,The governor already activated the Wisconsin N...,President Trump tweeted on Wednesday that Wisc...,https://www.axios.com/jacob-blake-wisconsin-ke...,2020-08-26T18:55:48.417161Z,Ursula Perano,Politics & Policy,Axios


Title           3720
Description     3720
Content         3720
URL             3720
Date            3720
Author          3720
Category        3720
NewsProvider    3720
dtype: int64

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider
20389,22 Years Later: How China Destroyed Legacy of ...,2019 was the year that the People’s Republic o...,2019 was the year that the People’s Republic o...,https://www.breitbart.com/europe/2020/01/01/22...,2020-01-01T11:20:30+00:00,Kurt Zindulka,"Politics,London / Europe,Asia",Breitbart


Title           20214
Description     20214
Content         20214
URL             20214
Date            20214
Author          20214
Category        20214
NewsProvider    20214
dtype: int64

### Data wrangling

In [6]:
# Transform to right data type
column_names = ["Title", "Description", "Content", "Date"]
for column_name in column_names:
    usnews[column_name] = usnews.apply(lambda x: str(x[column_name]), axis = 1)
    
# Clean date column and sort by date
usnews['Date'] = usnews.apply(lambda x: x["Date"][:10], axis = "columns")
usnews['Year'] = usnews.apply(lambda x: x["Date"][:4], axis = "columns")
usnews['Month'] = usnews.apply(lambda x: x["Date"][5:7], axis = "columns")
usnews['Day'] = usnews.apply(lambda x: x["Date"][8:], axis = "columns")
usnews['CurrentDate'] = datetime.datetime(2020, 8, 27)
usnews['Datetime'] = usnews.apply(lambda x: pd.to_datetime(x["Date"], format = '%Y-%m-%d'), axis = "columns")
usnews['Week'] = usnews.apply(lambda x: 35 + int((x["Datetime"] - x["CurrentDate"]).total_seconds()/604800), axis = 1)
usnews = usnews.sort_values(by=["Date"])

# Only keep articles from 2020
usnews = usnews[usnews["Year"] == "2020"].reset_index(drop = True)
usnews

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider,Year,Day,Month,CurrentDate,Datetime,Week
0,What we're watching in 2020,"Trump's post-impeachment future, Big Tech regu...",This is going to be a momentous presidential e...,https://www.axios.com/what-we-are-watching-202...,2020-01-01,"Jim VandeHei,Mike Allen",Politics & Policy,Axios,2020,01,01,2020-08-27,2020-01-01,1
1,"Chief Justice Roberts says Americans may ""take...",It reads as a mission statement ahead of Presi...,Supreme Court Chief Justice John Roberts warne...,https://www.axios.com/chief-justice-john-rober...,2020-01-01,Jacob Knutson,Politics & Policy,Axios,2020,01,01,2020-08-27,2020-01-01,1
2,Trump's twin war threats,They will truly test his diplomatic mix of blu...,President Trump suddenly faces two global cris...,https://www.axios.com/north-korea-iran-trump-w...,2020-01-01,"Mike Allen,Jim VandeHei",Politics & Policy,Axios,2020,01,01,2020-08-27,2020-01-01,1
3,"Pollak: Welcome to 2020, the Year of the Good ...",The year 2020 begins with incredible potential...,The year 2020 begins with incredible potential...,https://www.breitbart.com/2020-election/2019/1...,2020-01-01,Joel B. Pollak,"Politics,Media,2020 Election",Breitbart,2020,01,01,2020-08-27,2020-01-01,1
4,Warren Blames Unrest on Trump's 'Recklessness'...,Sen. Elizabeth Warren blamed President Trump f...,Sen. Elizabeth Warren (D-MA) on Tuesday blamed...,https://www.breitbart.com/middle-east/2019/12/...,2020-01-01,Hannah Bleau,"Politics,National Security,Israel / Middle East",Breitbart,2020,01,01,2020-08-27,2020-01-01,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434,A Well-Connected GOP Strategist Is Helping Kan...,Lane Ruhland filed West’s filing paperwork and...,Kanye West has officially submitted signature...,https://www.vice.com/en_us/article/akzy3b/a-we...,2020-08-5,Cameron Joseph,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,5,08,2020-08-27,2020-08-05,32
32435,Kanye West Has a Senior GOP Strategist Helping...,“Would you help me get Kanye West on the ballo...,A top Colorado GOP strategist is helping Kanye...,https://www.vice.com/en_us/article/889kv5/kany...,2020-08-5,Cameron Joseph,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,5,08,2020-08-27,2020-08-05,32
32436,A Malaysian Minister Was Caught Vaping in Parl...,Malaysia’s Foreign Affairs Minister tried to s...,A Malaysian politician has apologized for vapi...,https://www.vice.com/en_us/article/bv8pad/mala...,2020-08-7,Heather Chen,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,7,08,2020-08-27,2020-08-07,33
32437,Hackers Hijack Reddit Mod Accounts to Post Pro...,Hackers took control and defaced several large...,Hackers took control of more than a dozen subr...,https://www.vice.com/en_us/article/y3zx7g/hack...,2020-08-7,Lorenzo Franceschi-Bicchierai,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,7,08,2020-08-27,2020-08-07,33


### Vader Sentiment

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Negative scores
usnews["negSentiment"] = usnews.apply(lambda x: analyzer.polarity_scores(x["Content"])["neg"], axis="columns")
 
# Neutral scores
usnews["neuSentiment"] = usnews.apply(lambda x: analyzer.polarity_scores(x["Content"])["neu"], axis="columns")
 
# Positive scores
usnews["posSentiment"] = usnews.apply(lambda x: analyzer.polarity_scores(x["Content"])["pos"], axis="columns")

# Compound scores
usnews["compoundSentiment"] = usnews.apply(lambda x: analyzer.polarity_scores(x["Content"])["compound"], axis="columns")

### Textblob

In [8]:
from textblob import TextBlob
usnews["polarity"] = usnews['Content'].apply(lambda x: TextBlob(x).sentiment[0])
usnews["subjectivity"] = usnews['Content'].apply(lambda x: TextBlob(x).sentiment[1])

### Text preparation for LDA

In [9]:
# Lowercase
usnews["Content_clean"] = usnews.apply(lambda x: x["Content"].lower(), axis = "columns")

# Delete whitespace
whitespace_regex = r"[\t\r\n]"
usnews["Content_clean"] = usnews.apply(lambda x: re.sub(whitespace_regex, "", x["Content_clean"]), axis="columns")

# Tokenize words
non_characters_regex = r"[^a-zA-Z0-9 -]"
usnews["Content_words"] = usnews.apply(lambda x: nltk.word_tokenize(re.sub(non_characters_regex, "", x["Content_clean"])), axis = "columns")

## Delete stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# stop_words.append()
# print(stop_words)
usnews["Content_filteredwords"] = usnews.apply(lambda x: [w for w in x["Content_words"] if not w in stop_words], axis = "columns")

# Word stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
def stemWords(words):
    stemmed_words = []
    for word in words:
        stemmed_words.append(ps.stem(word))
    return stemmed_words
usnews["Content_stemmedwords"] = usnews.apply(lambda x: stemWords(x["Content_filteredwords"]), axis = "columns")

# Count words
usnews['Totalwords'] = [len(x) for x in usnews['Content_stemmedwords']]
usnews

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider,Year,Day,...,neuSentiment,posSentiment,compoundSentiment,polarity,subjectivity,Content_clean,Content_words,Content_filteredwords,Content_stemmedwords,Totalwords
0,What we're watching in 2020,"Trump's post-impeachment future, Big Tech regu...",This is going to be a momentous presidential e...,https://www.axios.com/what-we-are-watching-202...,2020-01-01,"Jim VandeHei,Mike Allen",Politics & Policy,Axios,2020,01,...,0.858,0.091,0.9834,0.168711,0.358021,this is going to be a momentous presidential e...,"[this, is, going, to, be, a, momentous, presid...","[going, momentous, presidential, election, yea...","[go, moment, presidenti, elect, year, face, re...",393
1,"Chief Justice Roberts says Americans may ""take...",It reads as a mission statement ahead of Presi...,Supreme Court Chief Justice John Roberts warne...,https://www.axios.com/chief-justice-john-rober...,2020-01-01,Jacob Knutson,Politics & Policy,Axios,2020,01,...,0.796,0.149,0.9779,0.144646,0.355859,supreme court chief justice john roberts warne...,"[supreme, court, chief, justice, john, roberts...","[supreme, court, chief, justice, john, roberts...","[suprem, court, chief, justic, john, robert, w...",145
2,Trump's twin war threats,They will truly test his diplomatic mix of blu...,President Trump suddenly faces two global cris...,https://www.axios.com/north-korea-iran-trump-w...,2020-01-01,"Mike Allen,Jim VandeHei",Politics & Policy,Axios,2020,01,...,0.840,0.040,-0.9804,0.029773,0.298242,president trump suddenly faces two global cris...,"[president, trump, suddenly, faces, two, globa...","[president, trump, suddenly, faces, two, globa...","[presid, trump, suddenli, face, two, global, c...",169
3,"Pollak: Welcome to 2020, the Year of the Good ...",The year 2020 begins with incredible potential...,The year 2020 begins with incredible potential...,https://www.breitbart.com/2020-election/2019/1...,2020-01-01,Joel B. Pollak,"Politics,Media,2020 Election",Breitbart,2020,01,...,0.760,0.110,-0.9320,0.052025,0.445746,the year 2020 begins with incredible potential...,"[the, year, 2020, begins, with, incredible, po...","[year, 2020, begins, incredible, potential, su...","[year, 2020, begin, incred, potenti, support, ...",240
4,Warren Blames Unrest on Trump's 'Recklessness'...,Sen. Elizabeth Warren blamed President Trump f...,Sen. Elizabeth Warren (D-MA) on Tuesday blamed...,https://www.breitbart.com/middle-east/2019/12/...,2020-01-01,Hannah Bleau,"Politics,National Security,Israel / Middle East",Breitbart,2020,01,...,0.800,0.056,-0.9968,-0.020271,0.459366,sen. elizabeth warren (d-ma) on tuesday blamed...,"[sen, elizabeth, warren, d-ma, on, tuesday, bl...","[sen, elizabeth, warren, d-ma, tuesday, blamed...","[sen, elizabeth, warren, d-ma, tuesday, blame,...",390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434,A Well-Connected GOP Strategist Is Helping Kan...,Lane Ruhland filed West’s filing paperwork and...,Kanye West has officially submitted signature...,https://www.vice.com/en_us/article/akzy3b/a-we...,2020-08-5,Cameron Joseph,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,5,...,0.886,0.093,0.9900,0.153348,0.426474,kanye west has officially submitted signature...,"[kanye, west, has, officially, submitted, sign...","[kanye, west, officially, submitted, signature...","[kany, west, offici, submit, signatur, appear,...",335
32435,Kanye West Has a Senior GOP Strategist Helping...,“Would you help me get Kanye West on the ballo...,A top Colorado GOP strategist is helping Kanye...,https://www.vice.com/en_us/article/889kv5/kany...,2020-08-5,Cameron Joseph,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,5,...,0.906,0.067,0.9526,0.096014,0.450302,a top colorado gop strategist is helping kanye...,"[a, top, colorado, gop, strategist, is, helpin...","[top, colorado, gop, strategist, helping, kany...","[top, colorado, gop, strategist, help, kany, w...",322
32436,A Malaysian Minister Was Caught Vaping in Parl...,Malaysia’s Foreign Affairs Minister tried to s...,A Malaysian politician has apologized for vapi...,https://www.vice.com/en_us/article/bv8pad/mala...,2020-08-7,Heather Chen,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,7,...,0.920,0.034,-0.7635,0.041335,0.332055,a malaysian politician has apologized for vapi...,"[a, malaysian, politician, has, apologized, fo...","[malaysian, politician, apologized, vaping, in...","[malaysian, politician, apolog, vape, insid, p...",146
32437,Hackers Hijack Reddit Mod Accounts to Post Pro...,Hackers took control and defaced several large...,Hackers took control of more than a dozen subr...,https://www.vice.com/en_us/article/y3zx7g/hack...,2020-08-7,Lorenzo Franceschi-Bicchierai,"culture, news, lgbtq, politics, journalism, vi...",Vice,2020,7,...,0.846,0.128,0.9782,0.356296,0.512593,hackers took control of more than a dozen subr...,"[hackers, took, control, of, more, than, a, do...","[hackers, took, control, dozen, subreddits, in...","[hacker, took, control, dozen, subreddit, incl...",120


### LDA

In [14]:
# Prepare input for LDA model
import scipy
from gensim.corpora import Dictionary
texts = usnews["Content_stemmedwords"].to_list()
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Compute LDA model
from gensim.models import LdaModel
ldamodel = LdaModel(corpus=corpus, num_topics= 20, id2word=dictionary, alpha = 'auto')
ldamodel.show_topics()

  and should_run_async(code)


[(6,
  '0.042*"israel" + 0.024*"bass" + 0.023*"west" + 0.021*"antifa" + 0.016*"isra" + 0.014*"palestinian" + 0.011*"said" + 0.011*"netanyahu" + 0.011*"peac" + 0.009*"jewish"'),
 (2,
  '0.036*"hous" + 0.019*"senat" + 0.015*"committe" + 0.015*"democrat" + 0.015*"bill" + 0.012*"lawmak" + 0.012*"member" + 0.010*"republican" + 0.010*"would" + 0.010*"said"'),
 (1,
  '0.012*"eu" + 0.011*"govern" + 0.010*"uk" + 0.010*"london" + 0.010*"said" + 0.010*"countri" + 0.010*"minist" + 0.009*"european" + 0.007*"british" + 0.007*"would"'),
 (4,
  '0.029*"rep" + 0.022*"district" + 0.018*"republican" + 0.016*"trump" + 0.015*"pm" + 0.014*"congression" + 0.013*"hous" + 0.010*"seat" + 0.010*"democrat" + 0.010*"say"'),
 (9,
  '0.034*"china" + 0.024*"us" + 0.020*"chines" + 0.015*"state" + 0.012*"said" + 0.011*"unit" + 0.010*"countri" + 0.010*"world" + 0.009*"pompeo" + 0.008*"communist"'),
 (10,
  '0.043*"biden" + 0.019*"campaign" + 0.018*"democrat" + 0.016*"trump" + 0.011*"joe" + 0.011*"presid" + 0.009*"parti"

In [9]:
# Compute Coherence Score
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5015015778174381


In [9]:
# Save lda model
#ldamodel.save('ldamodel/lda.model')

# Load lda model
#from gensim.models import LdaModel
#ldamodel =  LdaModel.load('lda.model')

In [None]:
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# Find optimal numbers of topics (highest coherence scores)
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=texts, start=2, limit=40, step=6)

# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [11]:
# Assign main topic to each document
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,10.0,0.1909,"said, health, test, china, viru, us, mask, sta...","[go, moment, presidenti, elect, year, face, re..."
1,1,8.0,0.2915,"court, attorney, justic, case, judg, feder, in...","[suprem, court, chief, justic, john, robert, w..."
2,2,5.0,0.3441,"protest, us, peac, hong, iran, kong, china, at...","[presid, trump, suddenli, face, two, global, c..."
3,3,3.0,0.3016,"trump, presid, said, white, administr, hous, a...","[year, 2020, begin, incred, potenti, support, ..."
4,4,5.0,0.4748,"protest, us, peac, hong, iran, kong, china, at...","[sen, elizabeth, warren, d-ma, tuesday, blame,..."
5,5,5.0,0.5361,"protest, us, peac, hong, iran, kong, china, at...","[depart, defens, secretari, mark, esper, annou..."
6,6,18.0,0.6971,"polic, citi, offic, said, protest, enforc, new...","[polic, lancast, ohio, releas, video, moment, ..."
7,7,15.0,0.6289,"senat, trump, said, committe, hous, presid, re...","[speak, report, tuesday, trump, attorney, rudi..."
8,8,19.0,0.459,"said, offici, us, secur, nation, depart, state...","[tuesday, broadcast, fox, news, channel, speci..."
9,9,14.0,0.5691,"live, black, peopl, right, american, one, said...","[greyhound, partnership, nation, runaway, safe..."


In [12]:
usnews_final = pd.concat([usnews.reset_index(drop=True), df_dominant_topic],  axis = 1, sort = False)
usnews_final

Unnamed: 0,Title,Description,Content,URL,Date,Author,Category,NewsProvider,Year,Day,...,Text,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text.1,Document_No,Dominant_Topic.1,Topic_Perc_Contrib.1,Keywords.1,Text.2
0,What we're watching in 2020,"Trump's post-impeachment future, Big Tech regu...",This is going to be a momentous presidential e...,https://www.axios.com/what-we-are-watching-202...,2020-01-01,"Jim VandeHei,Mike Allen",Politics & Policy,Axios,2020,01,...,"[go, moment, presidenti, elect, year, face, re...",3.0,0.2675,"said, health, vaccin, like, use, work, would, ...","[go, moment, presidenti, elect, year, face, re...",0,10.0,0.1909,"said, health, test, china, viru, us, mask, sta...","[go, moment, presidenti, elect, year, face, re..."
1,"Chief Justice Roberts says Americans may ""take...",It reads as a mission statement ahead of Presi...,Supreme Court Chief Justice John Roberts warne...,https://www.axios.com/chief-justice-john-rober...,2020-01-01,Jacob Knutson,Politics & Policy,Axios,2020,01,...,"[suprem, court, chief, justic, john, robert, w...",6.0,0.5649,"court, justic, feder, attorney, law, case, sai...","[suprem, court, chief, justic, john, robert, w...",1,8.0,0.2915,"court, attorney, justic, case, judg, feder, in...","[suprem, court, chief, justic, john, robert, w..."
2,Trump's twin war threats,They will truly test his diplomatic mix of blu...,President Trump suddenly faces two global cris...,https://www.axios.com/north-korea-iran-trump-w...,2020-01-01,"Mike Allen,Jim VandeHei",Politics & Policy,Axios,2020,01,...,"[presid, trump, suddenli, face, two, global, c...",18.0,0.4016,"polic, protest, offic, said, citi, peopl, fire...","[presid, trump, suddenli, face, two, global, c...",2,5.0,0.3441,"protest, us, peac, hong, iran, kong, china, at...","[presid, trump, suddenli, face, two, global, c..."
3,"Pollak: Welcome to 2020, the Year of the Good ...",The year 2020 begins with incredible potential...,The year 2020 begins with incredible potential...,https://www.breitbart.com/2020-election/2019/1...,2020-01-01,Joel B. Pollak,"Politics,Media,2020 Election",Breitbart,2020,01,...,"[year, 2020, begin, incred, potenti, support, ...",7.0,0.2426,"peopl, polit, countri, american, us, one, part...","[year, 2020, begin, incred, potenti, support, ...",3,3.0,0.3016,"trump, presid, said, white, administr, hous, a...","[year, 2020, begin, incred, potenti, support, ..."
4,Warren Blames Unrest on Trump's 'Recklessness'...,Sen. Elizabeth Warren blamed President Trump f...,Sen. Elizabeth Warren (D-MA) on Tuesday blamed...,https://www.breitbart.com/middle-east/2019/12/...,2020-01-01,Hannah Bleau,"Politics,National Security,Israel / Middle East",Breitbart,2020,01,...,"[sen, elizabeth, warren, d-ma, tuesday, blame,...",18.0,0.4976,"polic, protest, offic, said, citi, peopl, fire...","[sen, elizabeth, warren, d-ma, tuesday, blame,...",4,5.0,0.4748,"protest, us, peac, hong, iran, kong, china, at...","[sen, elizabeth, warren, d-ma, tuesday, blame,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31985,The Best Way to Help Schools Reopen,Educators aren’t public health experts. If we ...,The unfolding complexity of school reopening r...,https://www.politico.com/news/agenda/2020/08/2...,2020-08-27,MARIO RAMIREZ,OPINION,Politico,2020,27,...,"[unfold, complex, school, reopen, reflect, sim...",3.0,0.3998,"said, health, vaccin, like, use, work, would, ...","[unfold, complex, school, reopen, reflect, sim...",31985,17.0,0.4629,"state, school, vote, elect, ballot, said, stud...","[unfold, complex, school, reopen, reflect, sim..."
31986,How Trump Mastered the Art of Telling History ...,The Republican convention has glossed over the...,"Donald Trump, according to the first three nig...",https://www.politico.com/news/magazine/2020/08...,2020-08-27,MICHAEL KRUSE,OPINION,Politico,2020,27,...,"[donald, trump, accord, first, three, night, r...",10.0,0.2054,"presid, trump, said, go, think, say, peopl, kn...","[donald, trump, accord, first, three, night, r...",31986,3.0,0.3932,"trump, presid, said, white, administr, hous, a...","[donald, trump, accord, first, three, night, r..."
31987,"Bush, McCain and Romney presidential staffers ...",“We know Mitt and Joe share those same essenti...,Support for Joe Biden's White House bid is gro...,https://www.politico.com/news/2020/08/27/mitt-...,2020-08-27,MAX COHEN,2020 Elections,Politico,2020,27,...,"[support, joe, biden, white, hous, bid, grow, ...",11.0,0.4915,"biden, campaign, democrat, said, joe, former, ...","[support, joe, biden, white, hous, bid, grow, ...",31987,6.0,0.5023,"biden, democrat, campaign, elect, joe, parti, ...","[support, joe, biden, white, hous, bid, grow, ..."
31988,GOP takes a third run at defining Republicans ...,After two days of dueling and at-time contradi...,"On Monday, the only Black GOP senator sought t...",https://www.politico.com/news/2020/08/27/gop-h...,2020-08-27,GABBY ORR,2020 Conventions,Politico,2020,27,...,"[monday, black, gop, senat, sought, convinc, a...",0.0,0.2615,"trump, presid, white, hous, said, donald, amer...","[monday, black, gop, senat, sought, convinc, a...",31988,3.0,0.3846,"trump, presid, said, white, administr, hous, a...","[monday, black, gop, senat, sought, convinc, a..."


In [13]:
del usnews_final["Document_No"]

In [14]:
usnews_final.to_pickle("data/usnews.pkl")

In [16]:
#import pyLDAvis.gensim
#pyLDAvis.enable_notebook()
#p = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
#pyLDAvis.save_html(p, r'C:\Users\Jonathan Ratschat\Google Drive\Colab Notebooks\Scraping\usnews\pyLDAvis.html')
#pyLDAvis.display(p)

  and should_run_async(code)


### Not used in final analysis