# _Natural Language Processing of Economic News Articles_
## Post-Metis Analysis - Token Creation and Visualization

(Jupyter Notebook 2 of ??)

#### ------ Section 2: Tokenize the cleaned articles -----------

In [1]:
import numpy as np
import spacy
import nltk
from nltk.corpus import wordnet
from gensim import corpora

nlp = spacy.load('en_core_web_md')
# nltk.download('stopwords')

# y_relevance = list(dfnews['relevance'])
# y_positivity = list(dfnews['positivity'])

In [2]:
# Define typical stop words plus those likely common to economic articles, without impacting positivity #
# Note: I shouldn't remove these for bi-gram analysis.
STOPWORDS_EXTRA = ['economic', 'finance', 'monetary', 'government', 'bank', 'money', 'amount', 'market']

en_stop = set(nltk.corpus.stopwords.words('english'))
for entry in STOPWORDS_EXTRA:    # could also use spacy's built-in stopword system
    en_stop.add(entry)             # (e.g. nlp.vocab['market'].is_stop = True), except doesn't work for _md model
    en_stop.add(entry+'s')

In [3]:
# Load up data version with common headers (e.g. "NEW YORK --") removed in NewsClassifier_RemoveHeaders.ipynb #
import pickle

with open('./saved_files/clean_NewsEcon2_a.pkl', 'rb') as picklefile: 
    clean_articles = pickle.load(picklefile)

##### STEPS
-- remove stopwords  
-- x_bow = bag of words  
-- x_tfidf = tf-idf  
-- Naive Bayes  


In [100]:
# Function adapted from original in NLP_Analysis.py #
def tokenize(document, nlp, stopwords, stoppos = ['PROPN','DET','PUNCT','NUM','PRON','SYM'],
             NER_tags = ['GRE','LOC','ORG','NORP','FAC','LANGUAGE','MONEY','LAW']):
    """ Tokenize items in string 'document', suitable for news articles.
        nlp: standard SpaCy object
        stopwords, stoppos: list of words (before lemmatizing) and parts-of-speech to exclude
    """
    doc_tokens = []
    
    # Analyze all words, rejecting unwanted parts of speech #
    tokens = nlp(document)  # nlp() = standard SpaCy processing tool
    for token in tokens:
        if token.is_space or token.is_punct or token.like_url:
            continue
        if (token.text not in stopwords) and (token.pos_ not in stoppos):
            lemma = token.lemma_     # note: this makes the word lower case
            if lemma and lemma!='-PRON-':
                doc_tokens.append(lemma)
                
    for entity in tokens.ents:
        if entity.label_ in NER_tags: # first remove '99' (only nec. to assure NER worked)
            ent_text = entity.text.upper()
            ent_text = re.sub(r'([ ]*)99[ ]*', r'\1', ent_text)
            
            if re.match(r'[a-zA-Z]', ent_text):  # reject orphaned entities
                doc_tokens.append(ent_text + ' : ' + entity.label_)
            
            for ent_lemma in nlp(entity.text):
                ent_lemma = ent_lemma.lemma_
                if doc_tokens.count(ent_lemma):  # remove lemma version if present
                    doc_tokens.remove(ent_lemma)

    return doc_tokens

In [101]:
# Look at a few tokens in an example article #
# Note the named entity "G7 Group" appears in all caps with a tag.
tokens = tokenize(clean_articles[19], nlp, en_stop)
tokens[::10]

['outlook',
 'end',
 'reading',
 'third',
 'contract',
 'historical',
 'minus',
 'annualiz',
 'would',
 'gain',
 'enough',
 'order',
 'G7 GROUP : ORG']

#### ------ Section 3: Create a bag of words from the tokens -----------

In [102]:
# from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

def my_tokenizer(documents, my_nlp=nlp, my_stopwords=en_stop):
#     func = tokenize(documents, nlp = my_nlp, stopwords = my_stopwords)
    return tokenize(documents, nlp=my_nlp, stopwords=my_stopwords)

cv_object = CountVectorizer(tokenizer=my_tokenizer, preprocessor=None, lowercase=False,
                            analyzer='word')
bow_array = cv_object.fit_transform(clean_articles[:5])

In [103]:
len(cv_object.get_feature_names())

443

In [104]:
cv_object.get_feature_names()

['ABOVE CENTS : MONEY',
 'AMERICANS : NORP',
 'AP : ORG',
 'AT LEAST $BILLION : MONEY',
 'BANXQUOTE : ORG',
 'BANXQUOTE MONEY MARKETS : ORG',
 'CENT : MONEY',
 'CENTS : MONEY',
 'CITIBANK : ORG',
 'CONGRESS : ORG',
 'CORESTATES : ORG',
 'D. : NORP',
 'DEMOCRATS : NORP',
 'EAP : ORG',
 'FAMILIES USA : ORG',
 'FRANCS : MONEY',
 'HOUSE : ORG',
 'JAPANESE : NORP',
 'MEDICAID : ORG',
 'REPUBLICANS : NORP',
 'SCHIP : ORG',
 'SENATE : ORG',
 'SWISS : NORP',
 'THE ASSOCIATED PRESS : ORG',
 'THE CENTERS FOR DISEASE CONTROL : ORG',
 'THE FEDERAL RESERVE BOARDS : ORG',
 'THE HOUSE LABOR SUBCOMMITTEE ON HEALTH AND SAFETY : ORG',
 'THE NEW YORK TIMES : ORG',
 'THE SENATE BANKING COMMITTEE : ORG',
 'THE STATE CHILDRENS HEALTH INSURANCE PROGRAM : ORG',
 'THE WASHINGTON POST : ORG',
 'THE WHITE HOUSE : ORG',
 'TIMES : ORG',
 'TREASURY : ORG',
 'YEN : MONEY',
 'ability',
 'abuse',
 'accord',
 'accuracy',
 'achieve',
 'administration',
 'adopt',
 'advantage',
 'advocacy',
 'afford',
 'afternoon',
 'agai

In [75]:
bow_array.toarray().shape

(5, 446)