## References
* https://medium.com/@nicharuch/collocations-identifying-phrases-that-act-like-individual-words-in-nlp-f58a93a2f84a
* Sentiment Lexicons: https://sites.google.com/site/datascienceslab/projects/multilingualsentiment

In [1]:
# Dependencies
import pymongo
import os
from dotenv import load_dotenv
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords
import json

In [2]:
# Load in database access info
load_dotenv()
username = os.getenv("db_username")
password = os.getenv("db_password")

In [3]:
# Connect to database
conn = f"mongodb+srv://{username}:{password}@clusterprime.mpaq0.mongodb.net/ETL?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn)

# Define the 'classDB' database in Mongo
db = client.ETL

In [4]:
# Pull all data from collection
news_data = db.NFTA.find({})

## Analyze Bigrams of Headlines

In [5]:
# Store headlines in list
headlines = []
for article in news_data:
    headlines.append(article['title'])

In [6]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

# Function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub("", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

# Apply function to clean and lemmatize comments
lemmatized = clean_comments(headlines)

# Make all words lowercase
lemmatized_headlines_1 = [word.lower() for word in lemmatized]

In [46]:
# Lemmatize word "aliens" - missed by spacy
headlines_clean = []
for word in lemmatized_headlines_1:
    if word == 'aliens':
        headlines_clean.append('alien')
    else:
        headlines_clean.append(word)

In [47]:
# Calculate bigrams
bigrams = nltk.collocations.BigramAssocMeasures()

bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(headlines_clean)

In [48]:
# Limit bigrams to at least 10 occurrences
bigramFinder.apply_freq_filter(10)

# Pull out bigrams and PMI scores
headline_bigrams = list(bigramFinder.score_ngrams(bigrams.pmi))

In [69]:
# Get english stopwords
en_stopwords = set(stopwords.words('english'))

# Function to remove function words from bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    # Why is this not iterating through the tuple?
    for word in ngram:
        if word in en_stopwords or word.isspace():
            return False
        elif word in ('5', "'", '’s'):
            return False
    types_avoid = ('CC', 'DT', 'IN', 'POS', 'TO', 'SYM')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in types_avoid or tags[1][1] in types_avoid:
        return False
    else:
        return True

# Filter bigrams
filtered_headlines = [bigram for bigram in headline_bigrams if rightTypes(bigram[0])]

In [70]:
# Review filtered bigrams
filtered_headlines

[(('ilhan', 'omar'), 11.245711148410681),
 (('hong', 'kong'), 11.152601744019199),
 (('ruth', 'bader'), 10.982676742576889),
 (('north', 'carolina'), 10.84274648143241),
 (('town', 'hall'), 10.387066997656222),
 (('fact', 'check'), 10.36600538212839),
 (('pope', 'francis'), 10.294620748891628),
 (('bader', 'ginsburg'), 10.134679836021936),
 (('fox', 'news'), 10.108207624660746),
 (('amy', 'coney'), 9.760284321240439),
 (('coney', 'barrett'), 9.612185682251306),
 (('washington', 'post'), 9.608281227795388),
 (('wall', 'street'), 9.567639243298043),
 (('terror', 'attack'), 8.720249659438187),
 (('health', 'care'), 8.69590509310314),
 (('undocumented', 'immigrant'), 8.644807103820504),
 (('kamala', 'harris'), 8.545271430269587),
 (('illegal', 'alien'), 8.38504783001957),
 (('supreme', 'court'), 8.167452134490183),
 (('new', 'york'), 8.054569661058673),
 (('vp', 'debate'), 8.013050391620405),
 (('border', 'patrol'), 7.9826767425768885),
 (('white', 'house'), 7.9153917832641785),
 (('presid

In [71]:
# Read in pos/negative word lists

pos_words = []
with open('positive_words_en.txt') as p:
    for row in p:
        word = p.readline()
        pos_words.append(word.rstrip())

neg_words = [] 
with open('negative_words_en.txt') as n:
    for row in n:
        word = n.readline()
        neg_words.append(word.rstrip())

In [72]:
# Create list of words w/ sentiment from bigrams
words_sentiment = []
word_check = []
for bigram in filtered_headlines:
    word_dict = {}
    sentiment = 'not labelled'
    first_word = bigram[0][0]
    second_word = bigram[0][1]

    if first_word not in word_check:
        word_check.append(first_word)
        if first_word in pos_words:
            sentiment = 1
        if first_word in neg_words:
            sentiment = -1
        word_dict = {'id': first_word, 'group': sentiment}
        words_sentiment.append(word_dict)
        
        
    if second_word not in word_check:
        word_check.append(second_word)
        if second_word in pos_words:
            sentiment = 1
        if second_word in neg_words:
            sentiment = -1
        word_dict = {'id': second_word, 'group': sentiment}
        words_sentiment.append(word_dict)

In [73]:
# Check sentiment labels for accuracy
for word in words_sentiment:
    print(word)

{'id': 'ilhan', 'group': 'not labelled'}
{'id': 'omar', 'group': 'not labelled'}
{'id': 'hong', 'group': 'not labelled'}
{'id': 'kong', 'group': 'not labelled'}
{'id': 'ruth', 'group': 'not labelled'}
{'id': 'bader', 'group': 'not labelled'}
{'id': 'north', 'group': 'not labelled'}
{'id': 'carolina', 'group': 'not labelled'}
{'id': 'town', 'group': 'not labelled'}
{'id': 'hall', 'group': 'not labelled'}
{'id': 'fact', 'group': 'not labelled'}
{'id': 'check', 'group': 'not labelled'}
{'id': 'pope', 'group': 'not labelled'}
{'id': 'francis', 'group': 'not labelled'}
{'id': 'ginsburg', 'group': 'not labelled'}
{'id': 'fox', 'group': 'not labelled'}
{'id': 'news', 'group': 'not labelled'}
{'id': 'amy', 'group': 'not labelled'}
{'id': 'coney', 'group': 'not labelled'}
{'id': 'barrett', 'group': 'not labelled'}
{'id': 'washington', 'group': 'not labelled'}
{'id': 'post', 'group': 'not labelled'}
{'id': 'wall', 'group': 'not labelled'}
{'id': 'street', 'group': 'not labelled'}
{'id': 'terror'

In [74]:
# Revise sentiment labels
for word in words_sentiment:
    if word['group'] == 'not labelled':
        if word['id'] in ['terror', 'attack', 'battleground', 'illegal']:
            word['group'] = -1
        else:
            word['group'] = 0


In [75]:
# Check sentiment labels after editing
for word in words_sentiment:
    print(word)

{'id': 'ilhan', 'group': 0}
{'id': 'omar', 'group': 0}
{'id': 'hong', 'group': 0}
{'id': 'kong', 'group': 0}
{'id': 'ruth', 'group': 0}
{'id': 'bader', 'group': 0}
{'id': 'north', 'group': 0}
{'id': 'carolina', 'group': 0}
{'id': 'town', 'group': 0}
{'id': 'hall', 'group': 0}
{'id': 'fact', 'group': 0}
{'id': 'check', 'group': 0}
{'id': 'pope', 'group': 0}
{'id': 'francis', 'group': 0}
{'id': 'ginsburg', 'group': 0}
{'id': 'fox', 'group': 0}
{'id': 'news', 'group': 0}
{'id': 'amy', 'group': 0}
{'id': 'coney', 'group': 0}
{'id': 'barrett', 'group': 0}
{'id': 'washington', 'group': 0}
{'id': 'post', 'group': 0}
{'id': 'wall', 'group': 0}
{'id': 'street', 'group': 0}
{'id': 'terror', 'group': -1}
{'id': 'attack', 'group': -1}
{'id': 'health', 'group': 0}
{'id': 'care', 'group': 0}
{'id': 'undocumented', 'group': -1}
{'id': 'immigrant', 'group': -1}
{'id': 'kamala', 'group': 0}
{'id': 'harris', 'group': 0}
{'id': 'illegal', 'group': -1}
{'id': 'alien', 'group': 0}
{'id': 'supreme', 'group'

In [76]:
# Format bigrams
links = []
for bigram in filtered_headlines:
    word_dict = {
        'source': bigram[0][0],
        'target': bigram[0][1],
        'value': round(bigram[1])
    }
    links.append(word_dict)    

In [81]:
# Format data for arc diagram
headline_bigrams_formatted = {'nodes': words_sentiment, 'links': links}
headline_bigrams_formatted

{'nodes': [{'id': 'ilhan', 'group': 0},
  {'id': 'omar', 'group': 0},
  {'id': 'hong', 'group': 0},
  {'id': 'kong', 'group': 0},
  {'id': 'ruth', 'group': 0},
  {'id': 'bader', 'group': 0},
  {'id': 'north', 'group': 0},
  {'id': 'carolina', 'group': 0},
  {'id': 'town', 'group': 0},
  {'id': 'hall', 'group': 0},
  {'id': 'fact', 'group': 0},
  {'id': 'check', 'group': 0},
  {'id': 'pope', 'group': 0},
  {'id': 'francis', 'group': 0},
  {'id': 'ginsburg', 'group': 0},
  {'id': 'fox', 'group': 0},
  {'id': 'news', 'group': 0},
  {'id': 'amy', 'group': 0},
  {'id': 'coney', 'group': 0},
  {'id': 'barrett', 'group': 0},
  {'id': 'washington', 'group': 0},
  {'id': 'post', 'group': 0},
  {'id': 'wall', 'group': 0},
  {'id': 'street', 'group': 0},
  {'id': 'terror', 'group': -1},
  {'id': 'attack', 'group': -1},
  {'id': 'health', 'group': 0},
  {'id': 'care', 'group': 0},
  {'id': 'undocumented', 'group': -1},
  {'id': 'immigrant', 'group': -1},
  {'id': 'kamala', 'group': 0},
  {'id': 'h

In [88]:
# Remove words that only occur in one bigram
words_keep = []
for word in word_check:
    word_count = 0
    for bi_dict in links:
        if word == bi_dict['source'] or word == bi_dict['target']:
            word_count += 1
    if word_count > 1:
        words_keep.append(word)
        
links_filtered = []
words_sentiment_filtered = []
for word in words_keep:
    for bi_dict in links:
        if word == bi_dict['source'] or word == bi_dict['target']:
            links_filtered.append(bi_dict)

words_filtered = []
for bigram in links_filtered:
    if bigram['source'] not in words_filtered:
        words_filtered.append(bigram['source'])
    if bigram['target'] not in words_filtered:
        words_filtered.append(bigram['target'])

words_sentiment_filtered = []
for word in words_filtered:
    for sent_dict in words_sentiment:
        if word == sent_dict['id']:
            words_sentiment_filtered.append(sent_dict)

In [None]:
# Format data for arc diagram
headline_bigrams_filtered = {'nodes': words_sentiment, 'links': links}
headline_bigrams_filtered

In [79]:
# Save as JSON file
with open('headline_bigrams.json', 'w') as outfile:
    json.dump(headline_bigrams_formatted, outfile)

## Analyze Bigrams of Full Text

In [6]:
# Store source text in list
text = []
for article in news_data:
    text.append(article['text_complete'])


In [9]:
len(text)

3094

In [13]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

# Function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub("", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

# Apply function to clean and lemmatize comments

text = clean_comments(text[0:150])

# Make all words lowercase
lemmatized_text_1 = [word.lower() for word in lemmatized_text]

In [17]:
text2 = clean_comments(text[151:302])

In [19]:
text1_1 = [word.lower() for word in text]
text2_1 = [word.lower() for word in text2]

In [28]:
text_f = text1_1 + text2_1

In [35]:
# Calculate bigrams
bigrams = nltk.collocations.BigramAssocMeasures()

bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(text_f)

In [42]:
# Limit bigrams to at least 10 occurrences
bigramFinder.apply_freq_filter(30)

# Pull out bigrams and PMI scores
text_bigrams = list(bigramFinder.score_ngrams(bigrams.pmi))
print(text_bigrams)

[(('hong', 'kong'), 11.901203256196636), (('john', 'binder'), 11.13337669832567), (('video', 'clip'), 10.921872593131958), (('emphasis', 'add'), 10.226692423066577), (('green', 'card'), 10.062158104134394), (('supreme', 'court'), 9.940731620383271), (('homeland', 'security'), 9.932147376173404), (('united', 'states'), 9.55978731977284), (('transition', 'team'), 9.481300001745977), (('illegal', 'alien'), 9.184225839821426), (('breitbart', 'news'), 9.058910205351642), (('presidentelect', 'joe'), 9.029882066740631), (('new', 'york'), 8.901203256196634), (('law', 'enforcement'), 8.878835443168182), (('white', 'house'), 8.78146401192254), (('vice', 'president'), 8.543252492488733), (('donald', 'trump'), 8.305027893537783), (('news', 'follow'), 8.249115752629187), (('president', 'donald'), 8.23721250931021), (('tell', 'cnn'), 8.22504268584748), (('joe', 'biden'), 8.20009362249201), (('at', 'least'), 8.043456113651475), (('at', 'jxhnbinder'), 8.034954645085465), (('national', 'security'), 7.8

In [45]:
# Get english stopwords
en_stopwords = set(stopwords.words('english'))

# Function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords or word.isspace():
            return False
        else:
            return True
#     acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
#     second_type = ('NN', 'NNS', 'NNP', 'NNPS')
#     tags = nltk.pos_tag(ngram)
#     if tags[0][1] in acceptable_types and tags[1][1] in second_type:
#         return True
#     else:
#         return False

# Filter bigrams
filtered_headlines = [bigram for bigram in text_bigrams if rightTypes(bigram[0])]

In [46]:
filtered_headlines

[(('hong', 'kong'), 11.901203256196636),
 (('john', 'binder'), 11.13337669832567),
 (('video', 'clip'), 10.921872593131958),
 (('emphasis', 'add'), 10.226692423066577),
 (('green', 'card'), 10.062158104134394),
 (('supreme', 'court'), 9.940731620383271),
 (('homeland', 'security'), 9.932147376173404),
 (('united', 'states'), 9.55978731977284),
 (('transition', 'team'), 9.481300001745977),
 (('illegal', 'alien'), 9.184225839821426),
 (('breitbart', 'news'), 9.058910205351642),
 (('presidentelect', 'joe'), 9.029882066740631),
 (('new', 'york'), 8.901203256196634),
 (('law', 'enforcement'), 8.878835443168182),
 (('white', 'house'), 8.78146401192254),
 (('vice', 'president'), 8.543252492488733),
 (('donald', 'trump'), 8.305027893537783),
 (('news', 'follow'), 8.249115752629187),
 (('president', 'donald'), 8.23721250931021),
 (('tell', 'cnn'), 8.22504268584748),
 (('joe', 'biden'), 8.20009362249201),
 (('national', 'security'), 7.8902416491376375),
 (('four', 'year'), 7.628673769519974),
 (