## References
* https://medium.com/@nicharuch/collocations-identifying-phrases-that-act-like-individual-words-in-nlp-f58a93a2f84a
* Sentiment Lexicons: https://sites.google.com/site/datascienceslab/projects/multilingualsentiment

In [1]:
# Dependencies
import pymongo
import os
from dotenv import load_dotenv
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords

In [2]:
# Load in database access info
load_dotenv()
username = os.getenv("db_username")
password = os.getenv("db_password")

In [3]:
# Connect to database
conn = f"mongodb+srv://{username}:{password}@clusterprime.mpaq0.mongodb.net/ETL?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn)

# Define the 'classDB' database in Mongo
db = client.ETL

In [4]:
# Pull all data from collection
news_data = db.NFTA.find({})

In [5]:
# Store headlines in list
headlines = []
for article in news_data:
    headlines.append(article['title'])
print(headlines) 



In [6]:
# Load spacy
nlp = spacy.load('en_core_web_sm')

# Function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub("", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

# Apply function to clean and lemmatize comments
lemmatized = clean_comments(headlines)

# Make all words lowercase
lemmatized_headlines_1 = [word.lower() for word in lemmatized]

In [8]:
# Lemmatize word "aliens" - missed by spacy
headlines_clean = []
for word in lemmatized_headlines_1:
    if word == 'aliens':
        headlines_clean.append('alien')
    else:
        headlines_clean.append(word)
headlines_clean

['-pron-',
 'could',
 'be',
 'deport',
 'because',
 '-pron-',
 'parent',
 'come',
 'here',
 'legally',
 'biden',
 'can',
 'change',
 'that',
 'first',
 'latino',
 'tap',
 'to',
 'head',
 'dhs',
 'signal',
 'shift',
 'from',
 'trumps',
 'hardline',
 'immigration',
 'policy',
 'tony',
 'pham',
 'interim',
 'director',
 'of',
 'immigration',
 'and',
 'customs',
 'enforcement',
 'to',
 'step',
 'down',
 'more',
 'than',
 'twothird',
 'of',
 'undocumented',
 'immigrant',
 'worker',
 'have',
 'job',
 'essential',
 'to',
 'covid',
 'fight',
 'say',
 'study',
 'biden',
 'to',
 'meet',
 'with',
 'struggle',
 'worker',
 'smallbusiness',
 'owner',
 'about',
 'the',
 'economic',
 'crisis',
 ' ',
 'nbc',
 'news',
 'accused',
 'hate',
 'group',
 'receive',
 'pandemic',
 'aid',
 'lawyer',
 'say',
 'trump',
 'admin',
 'have',
 'hand',
 'over',
 'datum',
 'that',
 'will',
 'help',
 'reunite',
 'separate',
 'migrant',
 'family',
 '28',
 'migrant',
 'child',
 'and',
 '-pron-',
 'parent',
 'face',
 'depor

In [9]:
# Calculate bigrams
bigrams = nltk.collocations.BigramAssocMeasures()

bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(headlines_clean)

In [10]:
# Limit bigrams to at least 10 occurrences
bigramFinder.apply_freq_filter(10)

# Pull out bigrams and PMI scores
headline_bigrams = list(bigramFinder.score_ngrams(bigrams.pmi))
print(headline_bigrams)

[(('ilhan', 'omar'), 11.245711148410681), (('hong', 'kong'), 11.152601744019199), (('ruth', 'bader'), 10.982676742576889), (('north', 'carolina'), 10.84274648143241), (('town', 'hall'), 10.387066997656222), (('fact', 'check'), 10.36600538212839), (('pope', 'francis'), 10.294620748891628), (('bader', 'ginsburg'), 10.134679836021936), (('fox', 'news'), 10.108207624660746), (('amy', 'coney'), 9.760284321240439), (('coney', 'barrett'), 9.612185682251306), (('washington', 'post'), 9.608281227795388), (('wall', 'street'), 9.567639243298043), (('5', 'thing'), 9.446623842336676), (('terror', 'attack'), 8.720249659438187), (('health', 'care'), 8.69590509310314), (('undocumented', 'immigrant'), 8.644807103820504), (('kamala', 'harris'), 8.545271430269587), (('illegal', 'alien'), 8.38504783001957), (('supreme', 'court'), 8.167452134490183), (('new', 'york'), 8.054569661058673), (('vp', 'debate'), 8.013050391620405), (('border', 'patrol'), 7.9826767425768885), (('white', 'house'), 7.91539178326417

In [11]:
# Get english stopwords
en_stopwords = set(stopwords.words('english'))

# Function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords or word.isspace():
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

# Filter bigrams
filtered_headlines = [bigram for bigram in headline_bigrams if rightTypes(bigram[0])]

In [12]:
# Review filtered bigrams
filtered_headlines

[(('ilhan', 'omar'), 11.245711148410681),
 (('hong', 'kong'), 11.152601744019199),
 (('ruth', 'bader'), 10.982676742576889),
 (('north', 'carolina'), 10.84274648143241),
 (('town', 'hall'), 10.387066997656222),
 (('pope', 'francis'), 10.294620748891628),
 (('bader', 'ginsburg'), 10.134679836021936),
 (('fox', 'news'), 10.108207624660746),
 (('amy', 'coney'), 9.760284321240439),
 (('coney', 'barrett'), 9.612185682251306),
 (('washington', 'post'), 9.608281227795388),
 (('wall', 'street'), 9.567639243298043),
 (('terror', 'attack'), 8.720249659438187),
 (('health', 'care'), 8.69590509310314),
 (('undocumented', 'immigrant'), 8.644807103820504),
 (('illegal', 'alien'), 8.38504783001957),
 (('supreme', 'court'), 8.167452134490183),
 (('new', 'york'), 8.054569661058673),
 (('vp', 'debate'), 8.013050391620405),
 (('border', 'patrol'), 7.9826767425768885),
 (('white', 'house'), 7.9153917832641785),
 (('presidential', 'debate'), 7.783568545497637),
 (('battleground', 'state'), 7.75457067004604

In [13]:
# Read in pos/negative word lists

pos_words = []
with open('positive_words_en.txt') as p:
    for row in p:
        word = p.readline()
        pos_words.append(word.rstrip())

neg_words = [] 
with open('negative_words_en.txt') as n:
    for row in n:
        word = n.readline()
        neg_words.append(word.rstrip())

['death', 'issue', 'killed', 'problems', 'problem', 'lies', 'loss', 'fall', 'lack', 'attacks', 'dead', 'break', 'fell', 'critical', 'unknown', 'fiction', 'enemy', 'opposition', 'losing', 'broke', 'bad', 'crime', 'kill', 'failure', 'critics', 'cancer', 'broken', 'wrong', 'concerns', 'controversial', 'fictional', 'criticism', 'limit', 'strike', 'vice', 'die', 'concerned', 'damaged', 'unusual', 'slow', 'defensive', 'struck', 'cold', 'rival', 'wild', 'losses', 'suicide', 'emergency', 'destruction', 'struggle', 'lose', 'absence', 'limits', 'impossible', 'fear', 'destroy', 'assault', 'missed', 'confused', 'falling', 'protest', 'unsuccessful', 'pain', 'slowly', 'lacking', 'trouble', 'waste', 'debt', 'worst', 'isolated', 'difficulty', 'difficulties', 'lying', 'collapse', 'radical', 'disaster', 'outbreak', 'dies', 'noise', 'protests', 'breaks', 'disputed', 'conflicts', 'lie', 'resignation', 'exile', 'surrender', 'retreat', 'cave', 'unclear', 'prisoner', 'burning', 'disorder', 'allegations', 'ki

In [14]:
# Create list of words w/ sentiment from bigrams
words_sentiment = []
word_check = []
for bigram in filtered_headlines:
    word_dict = {}
    sentiment = 'not labelled'
    first_word = bigram[0][0]
    second_word = bigram[0][1]

    if first_word not in word_check:
        word_check.append(first_word)
        if first_word in pos_words:
            sentiment = 1
        if first_word in neg_words:
            sentiment = -1
        word_dict = {'id': first_word, 'group': sentiment}
        words_sentiment.append(word_dict)
        
        
    if second_word not in word_check:
        word_check.append(second_word)
        if second_word in pos_words:
            sentiment = 1
        if second_word in neg_words:
            sentiment = -1
        word_dict = {'id': second_word, 'group': sentiment}
        words_sentiment.append(word_dict)

In [None]:
# Check sentiment labels for accuracy
for word in words_sentiment:
    print(word)

In [15]:
# Revise sentiment labels
for word in words_sentiment:
    if word['group'] == 'not labelled':
        if word['id'] in ['terror', 'attack', 'battleground', 'illegal']:
            word['group'] = -1
        else:
            word['group'] = 0


In [16]:
# Check sentiment labels after editing
for word in words_sentiment:
    print(word)

{'id': 'ilhan', 'group': 0}
{'id': 'omar', 'group': 0}
{'id': 'hong', 'group': 0}
{'id': 'kong', 'group': 0}
{'id': 'ruth', 'group': 0}
{'id': 'bader', 'group': 0}
{'id': 'north', 'group': 0}
{'id': 'carolina', 'group': 0}
{'id': 'town', 'group': 0}
{'id': 'hall', 'group': 0}
{'id': 'pope', 'group': 0}
{'id': 'francis', 'group': 0}
{'id': 'ginsburg', 'group': 0}
{'id': 'fox', 'group': 0}
{'id': 'news', 'group': 0}
{'id': 'amy', 'group': 0}
{'id': 'coney', 'group': 0}
{'id': 'barrett', 'group': 0}
{'id': 'washington', 'group': 0}
{'id': 'post', 'group': 0}
{'id': 'wall', 'group': 0}
{'id': 'street', 'group': 0}
{'id': 'terror', 'group': -1}
{'id': 'attack', 'group': -1}
{'id': 'health', 'group': 0}
{'id': 'care', 'group': 0}
{'id': 'undocumented', 'group': -1}
{'id': 'immigrant', 'group': -1}
{'id': 'illegal', 'group': -1}
{'id': 'alien', 'group': 0}
{'id': 'supreme', 'group': 0}
{'id': 'court', 'group': 0}
{'id': 'new', 'group': 0}
{'id': 'york', 'group': 0}
{'id': 'vp', 'group': 0}
{'

In [20]:
# Format bigrams
links = []
for bigram in filtered_headlines:
    word_dict = {
        'source': bigram[0][0],
        'target': bigram[0][1],
        'value': round(bigram[1])
    }
    links.append(word_dict)    

[{'source': 'ilhan', 'target': 'omar', 'value': 11}, {'source': 'hong', 'target': 'kong', 'value': 11}, {'source': 'ruth', 'target': 'bader', 'value': 11}, {'source': 'north', 'target': 'carolina', 'value': 11}, {'source': 'town', 'target': 'hall', 'value': 10}, {'source': 'pope', 'target': 'francis', 'value': 10}, {'source': 'bader', 'target': 'ginsburg', 'value': 10}, {'source': 'fox', 'target': 'news', 'value': 10}, {'source': 'amy', 'target': 'coney', 'value': 10}, {'source': 'coney', 'target': 'barrett', 'value': 10}, {'source': 'washington', 'target': 'post', 'value': 10}, {'source': 'wall', 'target': 'street', 'value': 10}, {'source': 'terror', 'target': 'attack', 'value': 9}, {'source': 'health', 'target': 'care', 'value': 9}, {'source': 'undocumented', 'target': 'immigrant', 'value': 9}, {'source': 'illegal', 'target': 'alien', 'value': 8}, {'source': 'supreme', 'target': 'court', 'value': 8}, {'source': 'new', 'target': 'york', 'value': 8}, {'source': 'vp', 'target': 'debate'

In [21]:
# Format data for arc diagram

headline_bigrams_formatted = {'nodes': words_sentiment, 'links': links}


{'nodes': [{'id': 'ilhan', 'group': 0}, {'id': 'omar', 'group': 0}, {'id': 'hong', 'group': 0}, {'id': 'kong', 'group': 0}, {'id': 'ruth', 'group': 0}, {'id': 'bader', 'group': 0}, {'id': 'north', 'group': 0}, {'id': 'carolina', 'group': 0}, {'id': 'town', 'group': 0}, {'id': 'hall', 'group': 0}, {'id': 'pope', 'group': 0}, {'id': 'francis', 'group': 0}, {'id': 'ginsburg', 'group': 0}, {'id': 'fox', 'group': 0}, {'id': 'news', 'group': 0}, {'id': 'amy', 'group': 0}, {'id': 'coney', 'group': 0}, {'id': 'barrett', 'group': 0}, {'id': 'washington', 'group': 0}, {'id': 'post', 'group': 0}, {'id': 'wall', 'group': 0}, {'id': 'street', 'group': 0}, {'id': 'terror', 'group': -1}, {'id': 'attack', 'group': -1}, {'id': 'health', 'group': 0}, {'id': 'care', 'group': 0}, {'id': 'undocumented', 'group': -1}, {'id': 'immigrant', 'group': -1}, {'id': 'illegal', 'group': -1}, {'id': 'alien', 'group': 0}, {'id': 'supreme', 'group': 0}, {'id': 'court', 'group': 0}, {'id': 'new', 'group': 0}, {'id': 'yo

In [None]:
# TODO: Upload to MongoDB

In [None]:
# TODO: Repeat this process for full text of articles