# _Natural Language Processing of Economic News Articles_
## Post-Metis Analysis - Token Creation and Visualization

(Jupyter Notebook 2 of 3)

#### ------ Section 2: Tokenize the cleaned articles -----------

In [200]:
import numpy as np
import spacy
import nltk
from nltk.corpus import wordnet
from gensim import corpora

nlp = spacy.load('en_core_web_md')
# nltk.download('stopwords')

# y_relevance = list(dfnews['relevance'])
# y_positivity = list(dfnews['positivity'])

In [201]:
# Define typical stop words plus those likely common to economic articles, without impacting positivity #
# Note: I shouldn't remove these for bi-gram analysis.
STOPWORDS_EXTRA = ['economic', 'finance', 'monetary', 'government', 'bank', 'money', 'amount', 'market']

en_stop = set(nltk.corpus.stopwords.words('english'))
for entry in STOPWORDS_EXTRA:    # could also use spacy's built-in stopword system
    en_stop.add(entry)             # (e.g. nlp.vocab['market'].is_stop = True), except doesn't work for _md model
    en_stop.add(entry+'s')

In [202]:
# Load up data version with common headers (e.g. "NEW YORK --") removed in NewsClassifier_RemoveHeaders.ipynb #
import pickle

with open('./saved_files/clean_NewsEcon2_a.pkl', 'rb') as picklefile: 
    clean_articles = pickle.load(picklefile)

##### STEPS
-- remove stopwords  
-- x_bow = bag of words  
-- x_tfidf = tf-idf  
-- Naive Bayes  


In [205]:
# Function adapted from original in NLP_Analysis.py #
def tokenize(document, nlp, stopwords, stoppos = ['PROPN','DET','PUNCT','NUM'],
             NER_tags = ['GRE','LOC','ORG','NORP','PERSON','FAC','LANGUAGE','MONEY','LAW']):
    """ Tokenize items in string 'document', suitable for news articles.
        nlp: standard SpaCy object
        stopwords, stoppos: list of words (before lemmatizing) and parts-of-speech to exclude
    """
    doc_tokens = []
    
    # Analyze all words, rejecting unwanted parts of speech #
    tokens = nlp(document)  # nlp() = standard SpaCy processing tool
    for token in tokens:
        if token.is_space or token.is_punct or token.like_url:
            continue
        if (token.text not in stopwords) and (token.pos_ not in stoppos):
            lemma = token.lemma_     # note: this makes the word lower case
            if lemma:
                doc_tokens.append(lemma)
                
    for entity in tokens.ents:
        if entity.label_ in NER_tags:
            doc_tokens.append(entity.text.upper() + ' : ' + entity.label_)
            for ent_lemma in nlp(entity.text):
                ent_lemma = ent_lemma.lemma_
                if doc_tokens.count(ent_lemma):  # remove lemma version if present
                    doc_tokens.remove(ent_lemma)

    return doc_tokens

In [207]:
# Look at a few tokens in an example article #
tokens = tokenize(clean_articles[19], nlp, en_stop)
tokens[::10]

['outlook',
 'end',
 'reading',
 'third',
 'contract',
 'historical',
 'minus',
 'annualiz',
 'would',
 'gain',
 'enough',
 'order',
 'G7 GROUP : ORG']

In [208]:
# Create a bag of words vector for all the documents #
test_list = [tokenize(clean_articles[19], nlp, en_stop), tokenize(clean_articles[2], nlp, en_stop)]

from sklearn.feature_extraction.text import CountVectorizer
cv_object = CountVectorizer()
bow_array = cv_object.fit_transform(test_list)

AttributeError: 'list' object has no attribute 'lower'

In [276]:
# from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

def my_tokenizer(documents, my_nlp=nlp, my_stopwords=en_stop):
#     func = tokenize(documents, nlp = my_nlp, stopwords = my_stopwords)
    return tokenize(documents, nlp=my_nlp, stopwords=my_stopwords)

cv_object = CountVectorizer(tokenizer=my_tokenizer, preprocessor=None, lowercase=False, analyzer='word')
bow_array = cv_object.fit_transform(clean_articles[:5])

In [266]:
cv_object.get_feature_names()[::50]

['$',
 'administration',
 'children',
 'employee',
 'important',
 'mid',
 'political',
 'sale',
 'there',
 'yen']

In [279]:
bow_array.toarray()

array([[0, 0, 0, ..., 0, 1, 4],
       [2, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 2, 0, ..., 0, 0, 0],
       [2, 0, 2, ..., 3, 1, 0]], dtype=int64)