# _Natural Language Processing of Economic News Articles_
## Post-Metis Analysis - Removal of Headers
### (Headers are common text at the beginning of articles, like "Your Morning News Brief".)

(Jupyter Notebook 1 of 2)

#### ------ Section 1:  Load the data as a dataframe and clean up the text -----------

In [1]:
import numpy as np
import pandas as pd
import os, re

# Use the fully-qualified option key: abbreviated keys are resolved by pattern
# matching and stop working if another option name ever matches the same text.
pd.set_option('display.max_colwidth', 80)

# Folder holding the raw CSV; '~' is expanded at the point where the path is used.
DATAPATH = '~/Data/Economic_News'

In [2]:
# Read the raw CSV, then narrow the frame to the five columns used in this analysis #
filename = os.path.join(os.path.expanduser(DATAPATH), 'NewsEcon2.csv')
dfnews = pd.read_csv(filename, encoding ='latin1').loc[
    :, ['articleid','headline','text','relevance','positivity']
]

In [3]:
# Look at the first couple of articles #
# Note that positivity is NaN when relevance = 'no'.
dfnews.head(2)   # size is 8000 rows x 5 columns

Unnamed: 0,articleid,headline,text,relevance,positivity
0,wsj_398217788,Yields on CDs Fell in the Latest Week,NEW YORK -- Yields on most certificates of deposit offered by major banks dr...,yes,3.0
1,wsj_399019502,The Morning Brief: White House Seeks to Limit Child Insurance Program,"The Wall Street Journal Online</br></br>The Morning Brief, a look at the day...",no,


In [4]:
# Count the articles marked relevant, cross-checked two independent ways #
sum1 = dfnews['positivity'].count()            # rows that carry a (non-null) positivity score
sum2 = (dfnews['relevance']=='yes').sum()      # rows explicitly labelled 'yes'

print(f'US relevant article counts = {sum1} and {sum2}.')    # 1420, by both methods

US relevant article counts = 1420 and 1420.


In [5]:
# Clean documents #
def clean_string(document, replace_dict):
    """Run a series of regex substitutions over one document.

    Each key of `replace_dict` is a regular-expression pattern and each value
    its replacement. The substitutions are applied one after another in the
    dict's iteration order, so later patterns see the result of earlier ones.

    `document` must be exactly a `str`; anything else fails the assertion.
    Returns the substituted string.
    """
    assert type(document) is str

    cleaned = document
    for pattern in replace_dict:
        cleaned = re.sub(pattern, replace_dict[pattern], cleaned)
    return cleaned
    
# Patterns are applied in the order listed, so earlier entries affect what later ones see.
# The abbreviation and am/pm patterns carry a word boundary (\b) so that they do not
# fire inside longer words (e.g. " amount" or "Mexico.").
REPLACE_TEXT = {r"</br></br>" : ". ",  # define non-text characters for replacement
                r"\'" : "",
                r":" : ". ",
                r" --" : ".", r"--" : ".", r"-" : " ",
                r"a\.m\." : "", r"p\.m\." : "", r" am\b" : " ", r" pm\b":  " ",
                r"A\.M\." : "", r"P\.M\." : "", r" AM\b" : " ", r" PM\b":  " ",
                r"U\.S\. " : "USA ", r"U\.S\.A\." : "USA",
                r"D\.C\." : "DC", r"J\.P\." : "JP",
                r"\bCo\." : "Company", r"\bCorp\." : "Corporation",
                r"\bJan\." : "January", r"\bFeb\." : "February", r"\bMar\." : "March",
                r"\bApr\." : "April", r"\bJun\." : "June",
                r"\bJul\." : "July", r"\bAug\." : "August", r"\bSep\." : "September",
                r"\bOct\." : "October", r"\bNov\." : "November", r"\bDec\." : "December",
                r"%" : ""}

# Every number collapses to the same token so that numeric values compare equal across articles.
# The '^' alternative lets a number at the very start of a document be normalised as well.
REPLACE_NUMBER = {r"\d+\.\d+" : "99",   # setting to arbitrary number will allow capturing of monetary values
                   r"(^|[^a-zA-Z])(\d+)" : r"\g<1>99"}

# Two passes: text patterns first, then number normalisation on the result.
clean_articles = [clean_string(doc, REPLACE_TEXT) for doc in dfnews['text']]
clean_articles = [clean_string(doc, REPLACE_NUMBER) for doc in clean_articles]

In [6]:
# Get first several lines (sentences) from a single document #
# Acquired text will be assessed for repetitions across documents below.
def get_firstlines(document, num_lines, min_char=5, max_char=40):
    """Return the first `num_lines` period-delimited pieces of `document`.

    The document is split on '.'; the pieces are not stripped, so any leading
    space is kept. A position is filled with '' when the document has no piece
    there, or when the piece's length is outside [min_char, max_char]
    (pieces are rejected, not truncated).

    Always returns a tuple of exactly `num_lines` strings.
    """
    pieces = document.split('.')
    available = len(pieces)

    return tuple(
        pieces[pos]
        if pos < available and min_char <= len(pieces[pos]) <= max_char
        else ''
        for pos in range(num_lines)
    )

In [7]:
# Find lines that are shared across documents #
from scipy import sparse

def find_commonlines(lines_ByDoc, verbose=True):
    """Flag documents whose leading lines are identical to another document's.

    Parameters
    ----------
    lines_ByDoc : sequence of tuples of str
        One tuple per document holding its first N lines. An empty string
        means the document has no usable line at that depth and is skipped.
    verbose : bool
        If True, print the document counter every 1000 documents together
        with the most recent match found, as (leader, line, follower, line).

    Returns
    -------
    tuple of N scipy.sparse.lil_matrix (ndoc x ndoc, dtype bool)
        One matrix per line depth. For every group of documents sharing the
        same line, the lowest-indexed document is the "leader": entry
        [leader, leader] and [leader, follower] are True for each follower.
        Only documents that appeared in a match at the previous depth are
        considered at the next depth. An empty input returns an empty tuple.
    """
    ndoc = len(lines_ByDoc)
    if ndoc == 0:   # nothing to compare (indexing lines_ByDoc[0] would raise IndexError)
        print('Done')
        return ()

    ndepth = len(lines_ByDoc[0])
    headeridx_ByDepth = tuple(sparse.lil_matrix((ndoc,ndoc),dtype=bool) for i in range(ndepth))

    track_str = ''  # for looking at examples of repeated headers
    keepind = np.ones((ndoc,),dtype=bool)   # documents still eligible at the current depth

    for ld in range(ndepth):
        IdxMat = headeridx_ByDepth[ld]  # alias for index matrix for current depth

        # Bucket the eligible documents by their line at this depth. A single
        # pass replaces the all-pairs comparison; document order is preserved
        # inside each bucket, so the first entry is the lowest index.
        docs_ByLine = {}
        for i in range(ndoc):
            if keepind[i]:
                line = lines_ByDoc[i][ld]
                if len(line):
                    docs_ByLine.setdefault(line, []).append(i)
        followers_ByLeader = {docs[0]: docs[1:]
                              for docs in docs_ByLine.values() if len(docs) > 1}

        for i in range(ndoc):
            if verbose and i%1000==0:  # report before handling doc i, whatever its status
                print(i, track_str)
                track_str = ''

            followers = followers_ByLeader.get(i)
            if not followers:
                continue

            line = lines_ByDoc[i][ld]
            IdxMat[i,i] = True
            for k in followers:
                IdxMat[i,k] = True
            track_str = (i, line, followers[-1], line)

        nzvec = IdxMat.nonzero()
        nzidx = np.unique(np.concatenate((nzvec[0],nzvec[1]),axis=0))
        keepind = np.zeros((ndoc,),dtype=bool)
        keepind[nzidx] = True           # reset index to only those matching in last round

        if verbose:
            print()

    print('Done')
    return headeridx_ByDepth

In [8]:
# Look at documents w/ first 3 lines that may be similar to others #
# Note that leading spaces for lines after the first are retained.
first_Lines = [get_firstlines(doc, 3, max_char=150) for doc in clean_articles]
commonIdx_ByDepth = find_commonlines(first_Lines, verbose=True)

import pickle
with open('./saved_files/commonIdx.pkl', 'wb') as picklefile:
    pickle.dump(commonIdx_ByDepth, picklefile)

commonIdx_ByDepth  # outcome was 1762 articles with a common header at line (depth) 1, 114 at line 2, 26 at line 3;
                     # but only 127, 29, and 7 of these are unique! (for a total of 163)

0 
1000 (991, 'THIS WEEK', 3669, 'THIS WEEK')
2000 (1888, 'December 99, 99', 2597, 'December 99, 99')
3000 (2861, 'SAO PAULO', 3586, 'SAO PAULO')
4000 (3307, 'MADRID', 3456, 'MADRID')
5000 (4987, 'NEW YORK, January 99 UP)', 6068, 'NEW YORK, January 99 UP)')
6000 (5612, 'NEW YORK, June 99 (', 7085, 'NEW YORK, June 99 (')
7000 (6861, 'The following is a report of how some major bills fared recently in Congress and a record of how local members of Congress voted', 7934, 'The following is a report of how some major bills fared recently in Congress and a record of how local members of Congress voted')

0 (7447, 'ANNAPOLIS', 7688, 'ANNAPOLIS')
1000 (922, '  Brett Arends', 1988, '  Brett Arends')
2000 (1595, '  patent production', 2036, '  patent production')
3000 (2416, '  Jonathan Cheng; Kristina Peterson', 2848, '  Jonathan Cheng; Kristina Peterson')
4000 (3246, ' The Federal Reserve Board added 99 stocks to the list of over the counter securities falling under the boards margin, or credit

(<8000x8000 sparse matrix of type '<class 'numpy.bool_'>'
 	with 1760 stored elements in LInked List format>,
 <8000x8000 sparse matrix of type '<class 'numpy.bool_'>'
 	with 114 stored elements in LInked List format>,
 <8000x8000 sparse matrix of type '<class 'numpy.bool_'>'
 	with 26 stored elements in LInked List format>)

In [11]:
# Clean articles of common headers, based on the find_commonlines() indices; also save the headers #
def remove_commonlines(headeridx_ByDepth, lines_ByDoc, docs_in):
    """Strip shared header lines from the start of each document.

    Parameters
    ----------
    headeridx_ByDepth : tuple of scipy sparse matrices (ndocs x ndocs)
        One matrix per line depth. A nonzero entry [row, col] marks document
        `col` as sharing its line at that depth with document `row`.
    lines_ByDoc : sequence of tuples of str
        The first lines of every document, indexed as [document][depth].
    docs_in : list of str
        The documents to clean. Copied with `.copy()`, not modified.

    Returns
    -------
    (docs_out, common_headers)
        docs_out is the copy of `docs_in` with each document's header prefix
        cut off; common_headers lists each distinct header text once, in
        order of first appearance.

    Raises
    ------
    ValueError
        If the index matrices and `docs_in` disagree on the document count.
    """
    assert type(headeridx_ByDepth)==tuple
    ndepth = len(headeridx_ByDepth)
    ndocs = len(docs_in)

    # `ndepth and` guards the [0] lookup so an empty tuple does not raise IndexError
    if ndepth and headeridx_ByDepth[0].shape[0] != ndocs:
        raise ValueError("The index matrices and document list don't match in length.")

    # One pass per depth: map each flagged document (column) to the document
    # it matches (row). Should a column hold several rows, keep the lowest.
    leader_ByDepth = []
    for IdxMat in headeridx_ByDepth:
        rows, cols = IdxMat.nonzero()
        leaders = {}
        for r, c in zip(rows.tolist(), cols.tolist()):
            if c not in leaders or r < leaders[c]:
                leaders[c] = r
        leader_ByDepth.append(leaders)

    common_dict = {}
    docs_out = docs_in.copy()

    for i in range(ndocs):
        full_header = ''
        code = []

        for ld in range(ndepth):
            leader = leader_ByDepth[ld].get(i)
            if leader is not None:
                # "." is re-appended because the stored lines come from splitting on "."
                # (their leading spaces are kept, so the lengths line up with the document)
                full_header = full_header + lines_ByDoc[leader][ld] + '.'
                code.append(leader)

        docs_out[i] = docs_out[i][len(full_header):]  # remove all of the header lines

        code = tuple(code)  # the matched documents per depth identify a distinct header
        if len(code) and code not in common_dict:
            common_dict[code] = full_header

        if i%1000==0:
            print(i, end=' ')

    common_headers = list(common_dict.values())

    print('\nDone')
    return docs_out, common_headers

In [12]:
# Strip the shared header lines from every article and collect the distinct header texts #
# There are 156 unique headers (from 1-3 line depths).
clean_articles2, common_headers = remove_commonlines(commonIdx_ByDepth, first_Lines, clean_articles)

0 1000 2000 3000 4000 5000 6000 7000 
Done


In [14]:
# Now clean up a bit more and compare the before/after for one article. Cool! #
# Patterns run in the order listed. '^' anchors to the start of the document only.
REPLACE_TEXT = {r"\*\*\*" : "", r"\.[\s]*\." : ".", r",[\s]*," : ",",
                r"^\s+" : "",      # drop ALL leading whitespace (header removal can leave more than one space)
                r"^\.\s*" : ""}    # then drop a leftover leading period and the spaces after it
clean_articles3 = [clean_string(doc, REPLACE_TEXT) for doc in clean_articles2]

print(clean_articles[1][:350])
print('   --------')
print(clean_articles3[1][:350])

The Wall Street Journal Online. The Morning Brief, a look at the days biggest news, is emailed to subscribers by 99  every business day. Sign up for the e mail here.. On Friday evening, with Congress out of town on its summer recess and Americans heading into a mid August weekend, the Bush administration sent a message to the states.  The federal g
   --------
On Friday evening, with Congress out of town on its summer recess and Americans heading into a mid August weekend, the Bush administration sent a message to the states.  The federal government will make it tougher for a national childrens insurance program to cover the offspring of middle income families. The State Childrens Health Insurance Progra


In [15]:
# Pickle-save the results #
import pickle
os.makedirs('./saved_files', exist_ok=True)   # open() below fails if the folder is missing
with open('./saved_files/clean_NewsEcon2_a.pkl', 'wb') as picklefile:
    pickle.dump(clean_articles3, picklefile)