## References
* https://medium.com/@nicharuch/collocations-identifying-phrases-that-act-like-individual-words-in-nlp-f58a93a2f84a
* Sentiment Lexicons: https://sites.google.com/site/datascienceslab/projects/multilingualsentiment
* https://stackoverflow.com/questions/49088978/how-to-create-corpus-from-pandas-data-frame-to-operate-with-nltk/49104725
* https://stackoverflow.com/questions/26091769/finding-trigrams-for-entire-corpus-with-nltk

In [1]:
# Dependencies
import pymongo
import os, os.path
from dotenv import load_dotenv
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords
import json
import pandas as pd
from nltk import FreqDist

In [54]:
# Load in database access info
load_dotenv()
username = os.getenv("db_username")
password = os.getenv("db_password")

In [55]:
# Connect to database
conn = f"mongodb+srv://{username}:{password}@clusterprime.mpaq0.mongodb.net/ETL?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn)

# Define the 'classDB' database in Mongo
db = client.ETL

In [25]:
# Pull all data from collection
news_data = db.NFTA.find({})

## Analyze Bigrams of Headlines

In [5]:
# Store headlines in list
headlines = []
for article in news_data:
    headlines.append(article['title'])

In [6]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

# Function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub("", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

# Apply function to clean and lemmatize comments
lemmatized = clean_comments(headlines)

# Make all words lowercase
lemmatized_headlines_1 = [word.lower() for word in lemmatized]

In [46]:
# Lemmatize word "aliens" - missed by spacy
headlines_clean = []
for word in lemmatized_headlines_1:
    if word == 'aliens':
        headlines_clean.append('alien')
    else:
        headlines_clean.append(word)

In [47]:
# Calculate bigrams
bigrams = nltk.collocations.BigramAssocMeasures()

bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(headlines_clean)

In [48]:
# Limit bigrams to at least 10 occurrences
bigramFinder.apply_freq_filter(10)

# Pull out bigrams and PMI scores
headline_bigrams = list(bigramFinder.score_ngrams(bigrams.pmi))

In [69]:
# Get english stopwords
en_stopwords = set(stopwords.words('english'))

# Function to remove function words from bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    # Why is this not iterating through the tuple?
    for word in ngram:
        if word in en_stopwords or word.isspace():
            return False
        elif word in ('5', "'", '’s'):
            return False
    types_avoid = ('CC', 'DT', 'IN', 'POS', 'TO', 'SYM')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in types_avoid or tags[1][1] in types_avoid:
        return False
    else:
        return True

# Filter bigrams
filtered_headlines = [bigram for bigram in headline_bigrams if rightTypes(bigram[0])]

In [70]:
# Review filtered bigrams
filtered_headlines

[(('ilhan', 'omar'), 11.245711148410681),
 (('hong', 'kong'), 11.152601744019199),
 (('ruth', 'bader'), 10.982676742576889),
 (('north', 'carolina'), 10.84274648143241),
 (('town', 'hall'), 10.387066997656222),
 (('fact', 'check'), 10.36600538212839),
 (('pope', 'francis'), 10.294620748891628),
 (('bader', 'ginsburg'), 10.134679836021936),
 (('fox', 'news'), 10.108207624660746),
 (('amy', 'coney'), 9.760284321240439),
 (('coney', 'barrett'), 9.612185682251306),
 (('washington', 'post'), 9.608281227795388),
 (('wall', 'street'), 9.567639243298043),
 (('terror', 'attack'), 8.720249659438187),
 (('health', 'care'), 8.69590509310314),
 (('undocumented', 'immigrant'), 8.644807103820504),
 (('kamala', 'harris'), 8.545271430269587),
 (('illegal', 'alien'), 8.38504783001957),
 (('supreme', 'court'), 8.167452134490183),
 (('new', 'york'), 8.054569661058673),
 (('vp', 'debate'), 8.013050391620405),
 (('border', 'patrol'), 7.9826767425768885),
 (('white', 'house'), 7.9153917832641785),
 (('presid

In [71]:
# Read in pos/negative word lists

pos_words = []
with open('positive_words_en.txt') as p:
    for row in p:
        word = p.readline()
        pos_words.append(word.rstrip())

neg_words = [] 
with open('negative_words_en.txt') as n:
    for row in n:
        word = n.readline()
        neg_words.append(word.rstrip())

In [72]:
# Create list of words w/ sentiment from bigrams
words_sentiment = []
word_check = []
for bigram in filtered_headlines:
    word_dict = {}
    sentiment = 'not labelled'
    first_word = bigram[0][0]
    second_word = bigram[0][1]

    if first_word not in word_check:
        word_check.append(first_word)
        if first_word in pos_words:
            sentiment = 1
        if first_word in neg_words:
            sentiment = -1
        word_dict = {'id': first_word, 'group': sentiment}
        words_sentiment.append(word_dict)
        
        
    if second_word not in word_check:
        word_check.append(second_word)
        if second_word in pos_words:
            sentiment = 1
        if second_word in neg_words:
            sentiment = -1
        word_dict = {'id': second_word, 'group': sentiment}
        words_sentiment.append(word_dict)

In [73]:
# Check sentiment labels for accuracy
for word in words_sentiment:
    print(word)

{'id': 'ilhan', 'group': 'not labelled'}
{'id': 'omar', 'group': 'not labelled'}
{'id': 'hong', 'group': 'not labelled'}
{'id': 'kong', 'group': 'not labelled'}
{'id': 'ruth', 'group': 'not labelled'}
{'id': 'bader', 'group': 'not labelled'}
{'id': 'north', 'group': 'not labelled'}
{'id': 'carolina', 'group': 'not labelled'}
{'id': 'town', 'group': 'not labelled'}
{'id': 'hall', 'group': 'not labelled'}
{'id': 'fact', 'group': 'not labelled'}
{'id': 'check', 'group': 'not labelled'}
{'id': 'pope', 'group': 'not labelled'}
{'id': 'francis', 'group': 'not labelled'}
{'id': 'ginsburg', 'group': 'not labelled'}
{'id': 'fox', 'group': 'not labelled'}
{'id': 'news', 'group': 'not labelled'}
{'id': 'amy', 'group': 'not labelled'}
{'id': 'coney', 'group': 'not labelled'}
{'id': 'barrett', 'group': 'not labelled'}
{'id': 'washington', 'group': 'not labelled'}
{'id': 'post', 'group': 'not labelled'}
{'id': 'wall', 'group': 'not labelled'}
{'id': 'street', 'group': 'not labelled'}
{'id': 'terror'

In [74]:
# Revise sentiment labels
for word in words_sentiment:
    if word['group'] == 'not labelled':
        if word['id'] in ['terror', 'attack', 'battleground', 'illegal']:
            word['group'] = -1
        else:
            word['group'] = 0


In [75]:
# Check sentiment labels after editing
for word in words_sentiment:
    print(word)

{'id': 'ilhan', 'group': 0}
{'id': 'omar', 'group': 0}
{'id': 'hong', 'group': 0}
{'id': 'kong', 'group': 0}
{'id': 'ruth', 'group': 0}
{'id': 'bader', 'group': 0}
{'id': 'north', 'group': 0}
{'id': 'carolina', 'group': 0}
{'id': 'town', 'group': 0}
{'id': 'hall', 'group': 0}
{'id': 'fact', 'group': 0}
{'id': 'check', 'group': 0}
{'id': 'pope', 'group': 0}
{'id': 'francis', 'group': 0}
{'id': 'ginsburg', 'group': 0}
{'id': 'fox', 'group': 0}
{'id': 'news', 'group': 0}
{'id': 'amy', 'group': 0}
{'id': 'coney', 'group': 0}
{'id': 'barrett', 'group': 0}
{'id': 'washington', 'group': 0}
{'id': 'post', 'group': 0}
{'id': 'wall', 'group': 0}
{'id': 'street', 'group': 0}
{'id': 'terror', 'group': -1}
{'id': 'attack', 'group': -1}
{'id': 'health', 'group': 0}
{'id': 'care', 'group': 0}
{'id': 'undocumented', 'group': -1}
{'id': 'immigrant', 'group': -1}
{'id': 'kamala', 'group': 0}
{'id': 'harris', 'group': 0}
{'id': 'illegal', 'group': -1}
{'id': 'alien', 'group': 0}
{'id': 'supreme', 'group'

In [76]:
# Format bigrams
links = []
for bigram in filtered_headlines:
    word_dict = {
        'source': bigram[0][0],
        'target': bigram[0][1],
        'value': round(bigram[1])
    }
    links.append(word_dict)    

In [81]:
# Format data for arc diagram
headline_bigrams_formatted = {'nodes': words_sentiment, 'links': links}
headline_bigrams_formatted

{'nodes': [{'id': 'ilhan', 'group': 0},
  {'id': 'omar', 'group': 0},
  {'id': 'hong', 'group': 0},
  {'id': 'kong', 'group': 0},
  {'id': 'ruth', 'group': 0},
  {'id': 'bader', 'group': 0},
  {'id': 'north', 'group': 0},
  {'id': 'carolina', 'group': 0},
  {'id': 'town', 'group': 0},
  {'id': 'hall', 'group': 0},
  {'id': 'fact', 'group': 0},
  {'id': 'check', 'group': 0},
  {'id': 'pope', 'group': 0},
  {'id': 'francis', 'group': 0},
  {'id': 'ginsburg', 'group': 0},
  {'id': 'fox', 'group': 0},
  {'id': 'news', 'group': 0},
  {'id': 'amy', 'group': 0},
  {'id': 'coney', 'group': 0},
  {'id': 'barrett', 'group': 0},
  {'id': 'washington', 'group': 0},
  {'id': 'post', 'group': 0},
  {'id': 'wall', 'group': 0},
  {'id': 'street', 'group': 0},
  {'id': 'terror', 'group': -1},
  {'id': 'attack', 'group': -1},
  {'id': 'health', 'group': 0},
  {'id': 'care', 'group': 0},
  {'id': 'undocumented', 'group': -1},
  {'id': 'immigrant', 'group': -1},
  {'id': 'kamala', 'group': 0},
  {'id': 'h

In [88]:
# Remove words that only occur in one bigram
words_keep = []
for word in word_check:
    word_count = 0
    for bi_dict in links:
        if word == bi_dict['source'] or word == bi_dict['target']:
            word_count += 1
    if word_count > 1:
        words_keep.append(word)
        
links_filtered = []
words_sentiment_filtered = []
for word in words_keep:
    for bi_dict in links:
        if word == bi_dict['source'] or word == bi_dict['target']:
            links_filtered.append(bi_dict)

words_filtered = []
for bigram in links_filtered:
    if bigram['source'] not in words_filtered:
        words_filtered.append(bigram['source'])
    if bigram['target'] not in words_filtered:
        words_filtered.append(bigram['target'])

words_sentiment_filtered = []
for word in words_filtered:
    for sent_dict in words_sentiment:
        if word == sent_dict['id']:
            words_sentiment_filtered.append(sent_dict)

In [90]:
# Format data for arc diagram
headline_bigrams_filtered = {'nodes': words_sentiment_filtered, 'links': links_filtered}
headline_bigrams_filtered

{'nodes': [{'id': 'ruth', 'group': 0},
  {'id': 'bader', 'group': 0},
  {'id': 'ginsburg', 'group': 0},
  {'id': 'amy', 'group': 0},
  {'id': 'coney', 'group': 0},
  {'id': 'barrett', 'group': 0},
  {'id': 'wall', 'group': 0},
  {'id': 'street', 'group': 0},
  {'id': 'border', 'group': 0},
  {'id': 'undocumented', 'group': -1},
  {'id': 'immigrant', 'group': -1},
  {'id': 'illegal', 'group': -1},
  {'id': 'alien', 'group': 0},
  {'id': 'immigration', 'group': 0},
  {'id': 'migrant', 'group': 0},
  {'id': 'vp', 'group': 0},
  {'id': 'debate', 'group': 0},
  {'id': 'presidential', 'group': 0},
  {'id': 'patrol', 'group': 0},
  {'id': 'texas', 'group': 0},
  {'id': 'election', 'group': 0},
  {'id': 'asian', 'group': 0},
  {'id': 'american', 'group': 0},
  {'id': 'americans', 'group': 0},
  {'id': 'voter', 'group': 0},
  {'id': 'refugee', 'group': 0},
  {'id': 'camp', 'group': 0},
  {'id': 'caravan', 'group': 0},
  {'id': 'child', 'group': 0},
  {'id': 'latino', 'group': 0},
  {'id': 'poli

In [91]:
# Save as JSON file
with open('headline_bigrams.json', 'w') as outfile:
    json.dump(headline_bigrams_filtered, outfile)

## Analyze Bigrams of Full Text

In [26]:
# Store data in dataframe
df = pd.DataFrame(news_data)
df.head()

Unnamed: 0,_id,keyword,source,author,title,url,published,compound_score,negative_score,positive_score,neutral_score,text_excerpt,text_complete,sentiment_category
0,5ff0c0da250c080ba07a092c,immigration,NBC News,Dip Patel,I could be deported because my parents came he...,https://www.nbcnews.com/think/opinion/biden-s-...,2020-12-04T09:32:20Z,0.1027,0.0,0.104,0.896,"As an inpatient hospital pharmacist, I am part...","As an inpatient hospital pharmacist, I am part...",neutral
1,5ff0c0da250c080ba07a092d,immigration,NBC News,Suzanne Gamboa,First Latino tapped to head DHS signals shift ...,https://www.nbcnews.com/news/latino/first-lati...,2020-11-23T22:41:00Z,0.0,0.0,0.0,1.0,"Alejandro Mayorkas, the first Latino chosen fo...","Alejandro Mayorkas, the first Latino chosen fo...",neutral
2,5ff0c0da250c080ba07a092e,immigration,NBC News,Cynthia Silva,"Tony Pham, interim director of Immigration and...",https://www.nbcnews.com/news/asian-america/ton...,2020-12-14T22:28:21Z,0.0,0.0,0.0,1.0,Tony Pham will be stepping down as the acting ...,Tony Pham will be stepping down as the acting ...,neutral
3,5ff0c0da250c080ba07a092f,immigration,NBC News,Julia Ainsley and Didi Martinez,More than two-thirds of undocumented immigrant...,https://www.nbcnews.com/politics/immigration/s...,2020-12-16T22:46:58Z,-0.3818,0.157,0.0,0.843,WASHINGTON More than two-thirds of undocumente...,WASHINGTON — More than two-thirds of undocumen...,negative
4,5ff0c0da250c080ba07a0930,immigration,NBC News,"Rebecca Shabad, Adam Edelman","Biden to meet with struggling workers, small-b...",https://www.nbcnews.com/politics/politics-news...,2020-12-02T21:56:00Z,-0.7845,0.365,0.0,0.635,President-elect Joe Biden on Wednesday used a ...,President-elect Joe Biden on Wednesday used a ...,negative


In [27]:
# Add ID column
df['id'] = df['source'] + " " + df['published']

In [28]:
# Check how many unique ID's there are
df['id'].nunique()

3090

In [44]:
# Write full text of articles to txt files
for index, row in df.iterrows():
    _id = row['id']
    corp_id = _id.replace(" ", "").replace(':', '-')
    title = row['title']
    body = row['text_complete']
    file_name = 'full_news_text/' + str(corp_id) + '.txt'
    with open(file_name, 'a+', encoding='utf-8') as f:
        f.write(str(body) + ' ' + str(title))

In [55]:
# Create corpus
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
news_corpus = CategorizedPlaintextCorpusReader('full_news_text/', r'.*', cat_pattern=r'(.*) .*')

In [56]:
news_corpus.words()

['Over', '12', ',', '000', 'men', ',', 'women', 'and', ...]

In [57]:
news_corpus.sents()

[['Over', '12', ',', '000', 'men', ',', 'women', 'and', 'children', 'ran', 'out', 'of', 'tents', 'in', 'panic', 'as', 'the', 'fire', 'destroyed', 'most', 'the', 'camp', '.'], ['European', 'officials', 'said', 'the', 'fate', 'of', '12', ',', '600', 'refugees', 'and', 'migrants', 'now', 'homeless', 'after', 'a', 'fire', 'at', 'their', 'camp', 'on', 'a', 'Greek', 'island', 'needs', 'to', 'be', 'addressed', 'as', 'a', 'top', 'priority', '.'], ...]

To remove wonky characters: https://www.browserling.com/tools/utf8-encode

In [71]:
fdist = nltk.FreqDist()  # Empty distribution

for filename in news_corpus.fileids():
    try:
        fdist.update(nltk.bigrams(news_corpus.words(filename)))
    except:
        print(filename)

In [62]:
fdist.N()

26647756

In [None]:
from nltk.corpus import stopwords

In [108]:
en_stopwords = set(stopwords.words('english'))
text_list = []
for k,v in fdist.items():
    if v > 500:
        first = k[0].lower()
        second = k[1].lower()
        if first not in en_stopwords and second not in en_stopwords:
            if first not in string.punctuation and second not in string.punctuation:
                text_list.append(((first, second), v))

In [160]:
# Function to remove function words from bigrams
def rightTypes(ngram):
    for word in ngram:
        if word in en_stopwords or word.isspace():
            return False
        if word in ('5', "'", '’s', "\\'", '.-'):
            return False
        if len(word) <3:
            return False
        for char in word:
            if char in string.punctuation:
                return False
        if 'x' == word[0]:
            return False
        types_avoid = ('CC', 'DT', 'IN', 'POS', 'TO', 'SYM')
        tags = nltk.pos_tag(ngram)
    if tags[0][1] in types_avoid or tags[1][1] in types_avoid:
            return False
    else:
        return True

# Filter bigrams
filtered_text = [bigram for bigram in text_list if rightTypes(bigram[0])]

In [161]:
len(filtered_text)

323

In [162]:
text_list

[(('officials', 'said'), 1044),
 (('also', 'said'), 1056),
 (('refugee', 'camp'), 580),
 (('prime', 'minister'), 1784),
 (('public', 'health'), 2288),
 (('national', 'security'), 1914),
 (('last', 'week'), 3202),
 (('news', 'agency'), 647),
 ((',”', 'said'), 2134),
 (('human', 'rights'), 2279),
 (('european', 'union'), 1056),
 (('country', '’'), 608),
 (('asylum', 'seekers'), 1673),
 ((',\\', 'xe2'), 28374),
 (('x9d', 'said'), 3318),
 ((".'", 'b'), 7004),
 (('civil', 'rights'), 1268),
 (('getty', 'images'), 1050),
 (('social', 'media'), 2863),
 (('wall', 'street'), 1347),
 (('street', 'journal'), 539),
 (('last', 'week'), 561),
 (('last', 'month'), 1850),
 (('first', 'time'), 2160),
 (('.\\', 'xe2'), 24177),
 (('000', 'people'), 1721),
 (('tuesday', 'night'), 642),
 (('associated', 'press'), 1332),
 (('al', 'jazeera'), 2543),
 (('police', 'officers'), 853),
 (('two', 'years'), 1573),
 (('last', 'year'), 2719),
 (('donald', 'trump'), 11745),
 (('joe', 'biden'), 13471),
 (('trump', '’'),

In [164]:
# Save as JSON file
with open('intermediate_full_text.json', 'w') as outfile:
    json.dump(text_list, outfile)

This file was manually cleaned to remove junk words.

In [9]:
# Reopen cleaned file
with open('intermediate_full_text.json') as f:
      bigrams_cleaned = json.load(f)

print(bigrams_cleaned)

[[['officials', 'said'], 1044], [['also', 'said'], 1056], [['refugee', 'camp'], 580], [['prime', 'minister'], 1784], [['public', 'health'], 2288], [['national', 'security'], 1914], [['last', 'week'], 3202], [['news', 'agency'], 647], [['human', 'rights'], 2279], [['european', 'union'], 1056], [['country', '2019'], 608], [['asylum', 'seekers'], 1673], [['civil', 'rights'], 1268], [['getty', 'images'], 1050], [['social', 'media'], 2863], [['wall', 'street'], 1347], [['street', 'journal'], 539], [['last', 'week'], 561], [['last', 'month'], 1850], [['first', 'time'], 2160], [['tuesday', 'night'], 642], [['associated', 'press'], 1332], [['al', 'jazeera'], 2543], [['police', 'officers'], 853], [['two', 'years'], 1573], [['last', 'year'], 2719], [['donald', 'trump'], 11745], [['joe', 'biden'], 13471], [['trump', '2019'], 3854], [['us', 'president'], 678], [['president', 'donald'], 6731], [['white', 'house'], 9617], [['vice', 'president'], 3885], [['president', 'joe'], 1224], [['presidential',

In [39]:
len(bigrams_cleaned)

357

In [40]:
bigrams_clean = []

for bigram in bigrams_cleaned:
    if bigram[1] > 1700:
        bigrams_clean.append(bigram)

In [45]:
# Create list of all words in bigrams
all_words = []

for bigram in bigrams_clean:
    if bigram[0][0] not in all_words:
        all_words.append(bigram[0][0])
    if bigram[0][1] not in all_words:
        all_words.append(bigram[0][1])

# Remove the word 'news' which is a self-reference
all_words.remove('news')
all_words.remove('breitbart')
all_words.remove('fox')
all_words.remove('york')

# Determine which words occur more than once
duplicate_words = []

for word in all_words:
    word_count = 0
    for bigram in bigrams_clean:
        if word == bigram[0][0] or word == bigram[0][1]:
            word_count += 1
    if word_count > 1:
        duplicate_words.append(word)

# Only keep bigrams that contain these words
links = []

for bigram in bigrams_clean:
    if bigram[0][0] in duplicate_words or bigram[0][1] in duplicate_words:
        bigram_dict = {'source': bigram[0][0], 'target': bigram[0][1], 'frequency': bigram[1]}
        links.append(bigram_dict)

len(links)

34

In [46]:
links

[{'source': 'public', 'target': 'health', 'frequency': 2288},
 {'source': 'national', 'target': 'security', 'frequency': 1914},
 {'source': 'last', 'target': 'week', 'frequency': 3202},
 {'source': 'last', 'target': 'month', 'frequency': 1850},
 {'source': 'last', 'target': 'year', 'frequency': 2719},
 {'source': 'donald', 'target': 'trump', 'frequency': 11745},
 {'source': 'joe', 'target': 'biden', 'frequency': 13471},
 {'source': 'trump', 'target': '2019', 'frequency': 3854},
 {'source': 'president', 'target': 'donald', 'frequency': 6731},
 {'source': 'vice', 'target': 'president', 'frequency': 3885},
 {'source': 'presidential', 'target': 'election', 'frequency': 2659},
 {'source': 'coronavirus', 'target': 'pandemic', 'frequency': 3447},
 {'source': 'trump', 'target': 'administration', 'frequency': 6659},
 {'source': 'president', 'target': 'barack', 'frequency': 1758},
 {'source': 'barack', 'target': 'obama', 'frequency': 3212},
 {'source': 'biden', 'target': 'said', 'frequency': 226

In [47]:
# Read in pos/negative word lists
pos_words = []
with open('positive_words_en.txt') as p:
    for row in p:
        word = p.readline()
        pos_words.append(word.rstrip())

neg_words = [] 
with open('negative_words_en.txt') as n:
    for row in n:
        word = n.readline()
        neg_words.append(word.rstrip())

In [52]:
# Create list of words w/ sentiment from bigrams
nodes = []
word_check = []
for bigram in links:
    word_dict = {}
    sentiment = 0
    first_word = bigram['source']
    second_word = bigram['target']

    if first_word not in word_check:
        word_check.append(first_word)
        if first_word in pos_words:
            sentiment = 1
        if first_word in neg_words:
            sentiment = -1
        word_dict = {'id': first_word, 'group': sentiment}
        nodes.append(word_dict)
        
        
    if second_word not in word_check:
        word_check.append(second_word)
        if second_word in pos_words:
            sentiment = 1
        if second_word in neg_words:
            sentiment = -1
        word_dict = {'id': second_word, 'group': sentiment}
        nodes.append(word_dict)

In [53]:
# Check sentiment labels for accuracy
for word in nodes:
    print(word)

{'id': 'public', 'group': 0}
{'id': 'health', 'group': 0}
{'id': 'national', 'group': 0}
{'id': 'security', 'group': 0}
{'id': 'last', 'group': 0}
{'id': 'week', 'group': 0}
{'id': 'month', 'group': 0}
{'id': 'year', 'group': 0}
{'id': 'donald', 'group': 0}
{'id': 'trump', 'group': 0}
{'id': 'joe', 'group': 0}
{'id': 'biden', 'group': 0}
{'id': '2019', 'group': 0}
{'id': 'president', 'group': 0}
{'id': 'vice', 'group': -1}
{'id': 'presidential', 'group': 0}
{'id': 'election', 'group': 0}
{'id': 'coronavirus', 'group': 0}
{'id': 'pandemic', 'group': 0}
{'id': 'administration', 'group': 0}
{'id': 'barack', 'group': 0}
{'id': 'obama', 'group': 0}
{'id': 'said', 'group': 0}
{'id': 'homeland', 'group': 0}
{'id': 'asian', 'group': 0}
{'id': 'american', 'group': 0}
{'id': 'four', 'group': 0}
{'id': 'years', 'group': 0}
{'id': 'campaign', 'group': 0}
{'id': 'outbreak', 'group': -1}
{'id': 'novel', 'group': 0}
{'id': 'ago', 'group': 0}
{'id': 'democratic', 'group': 0}
{'id': 'party', 'group': 0

In [57]:
# Format data for arc diagram
full_text_bigrams = {'nodes': nodes, 'links': links, 'text_source': 'full_text'}


In [67]:
# Read in headline bigrams
bigrams_data = db.headline_bigrams.find_one({})

bigrams_data.pop('_id')
bigrams_data['text_source'] = 'headlines'

In [68]:
all_bigrams = [full_text_bigrams, bigrams_data]

In [69]:
all_bigrams

[{'nodes': [{'id': 'public', 'group': 0},
   {'id': 'health', 'group': 0},
   {'id': 'national', 'group': 0},
   {'id': 'security', 'group': 0},
   {'id': 'last', 'group': 0},
   {'id': 'week', 'group': 0},
   {'id': 'month', 'group': 0},
   {'id': 'year', 'group': 0},
   {'id': 'donald', 'group': 0},
   {'id': 'trump', 'group': 0},
   {'id': 'joe', 'group': 0},
   {'id': 'biden', 'group': 0},
   {'id': '2019', 'group': 0},
   {'id': 'president', 'group': 0},
   {'id': 'vice', 'group': -1},
   {'id': 'presidential', 'group': 0},
   {'id': 'election', 'group': 0},
   {'id': 'coronavirus', 'group': 0},
   {'id': 'pandemic', 'group': 0},
   {'id': 'administration', 'group': 0},
   {'id': 'barack', 'group': 0},
   {'id': 'obama', 'group': 0},
   {'id': 'said', 'group': 0},
   {'id': 'homeland', 'group': 0},
   {'id': 'asian', 'group': 0},
   {'id': 'american', 'group': 0},
   {'id': 'four', 'group': 0},
   {'id': 'years', 'group': 0},
   {'id': 'campaign', 'group': 0},
   {'id': 'outbreak'

In [70]:
# Save as JSON file
with open('all_bigrams.json', 'w') as outfile:
    json.dump(all_bigrams, outfile)