# This notebook is for testing the functions and examples

In [1]:
import pandas as pd
import os
import sys

In [2]:
# Add ../src to the Python path so the project packages (utils, engine) can be imported.
# A notebook has no __file__, so the location is resolved explicitly against the
# current working directory (i.e. the folder the notebook is run from).
project_root = os.path.abspath(os.path.join(os.getcwd(), '../src'))
# Guard the insert so that re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import the parse config function to parse the .toml file
from utils.config_tool import parse_config
from utils.logging_tool import initialize_logger

from engine.clean import clean_All_news as cc


In [3]:
# Parse the project configuration, then read the example "all news" CSV it points to
config_file = "../config/predict_stock_w_news.toml"
config = parse_config(config_file)

# <local_data_path>/data_raw/<all_news_eg1_file>, both taken from the config
example_file_name = config['news_ingestion']['input']["all_news_eg1_file"]
local_data_path = os.path.join(
    config['info']['local_data_path'],
    'data_raw',
    example_file_name,
)
df = pd.read_csv(local_data_path)

df.head(2)

Unnamed: 0.1,Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider


## Check that the preprocessing functions work

## 1: Get the global parameters

In [4]:
# Pull the preprocessing parameters out of the [news_ingestion.input] config section
ingestion_input = config['news_ingestion']['input']
columns_to_drop = ingestion_input['columns_to_drop']
publisher = ingestion_input['publisher']
subsets_dropna = ingestion_input['subsets_dropna']
num_words = ingestion_input['num_words']

# Echo each parameter so the values used below are visible in the notebook
for label, value in (
    ("colums to drop", columns_to_drop),
    ("publisher", publisher),
    ("subsets_dropna", subsets_dropna),
    ("num_words", num_words),
):
    print(f"{label}: {value}")

colums to drop: ['Unnamed: 0', 'author', 'year', 'month', 'day', 'url']
publisher: {}
subsets_dropna: ['article']
num_words: 100


## 2: Drop columns

In [5]:
# Drop the columns listed in the config's `columns_to_drop`, then preview the result.
# Note: `df` is rebound to the returned frame, so re-running this cell operates on
# the already-trimmed frame rather than on the freshly loaded data.

df = cc.drop_columns(df, columns_to_drop)
df.head(2)

Unnamed: 0,date,title,article,section,publication
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",,Vox
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,,Business Insider


## 3: Drop null articles: 

In [6]:
# Drop null articles (checked on the `subsets_dropna` columns, here ['article']).
# NOTE(review): the return value is discarded, so this call only has an effect if
# drop_null_article mutates `df` in place — confirm against engine.clean.
cc.drop_null_article(df, subsets_dropna)
df.head(2)

Unnamed: 0,date,title,article,section,publication
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",,Vox
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,,Business Insider


## 4: Select publisher:

In [7]:
# Select the publisher
print(cc.select_publishers(df, publisher))
print(cc.select_publishers(df, {'Vox'}))

Empty DataFrame
Columns: [date, title, article, section, publication]
Index: []
                     date                                              title  \
0     2016-12-09 18:31:00  We should take concerns about the health of li...   
8     2016-05-18 13:00:06           How to watch the Google I/O keynote live   
10    2019-05-22 20:10:00  “Elizabeth Warren called me!” is turning into ...   
15    2016-09-08 15:40:02  Nancy Pelosi says a Clinton landslide could pu...   
16    2017-01-11 23:10:01  The government official in charge of ethics ju...   
...                   ...                                                ...   
9984  2019-04-30 13:00:00  How Occupy Wall Street animated Bernie Sanders...   
9988  2017-05-11 17:00:02  The past 24 hours since Trump fired Comey were...   
9990  2016-03-08 12:00:36               NCAA March Madness Comes to Apple TV   
9993  2016-10-20 19:25:02  Good times, bad times: #TrumpBookReport's hila...   
9998  2017-04-17 11:11:55  Recode Daily:

## 5: Get the first $ x $ words

In [8]:
# Build a 'summary' column from each article using the configured `num_words`
def _first_words(text):
    return cc.get_first_words(text, num_words)


df['summary'] = df['article'].apply(_first_words)
df.head(2)

Unnamed: 0,date,title,article,section,publication,summary
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",,Vox,"This post is part of Polyarchy, an independent..."
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,,Business Insider,The Indianapolis Colts made Andrew Luck the hi...


## 6: Tokenization

In [9]:
# Tokenize every article and store the result in a new 'token' column.
# (The recorded output below shows lists of lower-cased words without punctuation.)
df['token'] = df['article'].apply(cc.get_tokenized_words_with_no_punctuation)

In [10]:
# Preview the first two rows, now including the 'token' column
df.head(2)

Unnamed: 0,date,title,article,section,publication,summary,token
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",,Vox,"This post is part of Polyarchy, an independent...","[this, post, is, part, of, polyarchy, an, inde..."
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,,Business Insider,The Indianapolis Colts made Andrew Luck the hi...,"[the, indianapolis, colts, made, andrew, luck,..."


## 7: Remove stop words

In [11]:
# Overwrite 'token' with the stop-word-filtered token lists, then preview the result
df['token'] = df['token'].apply(cc.remove_stop_words)
df.head(2)

Unnamed: 0,date,title,article,section,publication,summary,token
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",,Vox,"This post is part of Polyarchy, an independent...","[post, part, polyarchy, independent, blog, pro..."
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,,Business Insider,The Indianapolis Colts made Andrew Luck the hi...,"[indianapolis, colts, made, andrew, luck, play..."


## 8: Lemmatization

In [12]:
# Overwrite 'token' with the lemmatized tokens, then preview the result
# (e.g. "colts" becomes "colt" in the recorded output for the second row).
df['token'] = df['token'].apply(cc.lemmatize_words)
df.head(2)

Unnamed: 0,date,title,article,section,publication,summary,token
0,2016-12-09 18:31:00,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",,Vox,"This post is part of Polyarchy, an independent...","[post, part, polyarchy, independent, blog, pro..."
1,2016-10-07 21:26:46,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,,Business Insider,The Indianapolis Colts made Andrew Luck the hi...,"[indianapolis, colt, made, andrew, luck, playe..."


## 9: The whole preprocessing pipeline

In [13]:
# Start again from the raw CSV so the pipeline runs on an untouched frame
raw_news = pd.read_csv(local_data_path)

# For now only a fixed set of publishers is selected
df = cc.preprocess_all_news(
    raw_news,
    columns_to_drop,
    subsets_dropna,
    {'Vox'},
    num_words,
)

In [14]:
# Preview the pipeline output (the recorded output no longer has 'article' and has
# gained 'summary', 'token' and 'word_count').
df.head(2)

Unnamed: 0,date,title,section,publication,summary,token,word_count
8,2016-05-18 13:00:06,How to watch the Google I/O keynote live,,Vox,"Google I/O, the company's big developer confer...","[google, company, big, developer, conference, ...",147
16,2017-01-11 23:10:01,The government official in charge of ethics ju...,,Vox,The nonpartisan director of the federal Office...,"[nonpartisan, director, federal, office, gover...",756


## 10: The same functions can be used to clean the headline dataset

In [15]:
# Load the headline dataset: <local_data_path>/data_raw/<news_head_line>,
# read as line-delimited JSON (one record per line)

headline_file_name = config['news_ingestion']['input']["news_head_line"]
local_data_path2 = os.path.join(
    config['info']['local_data_path'],
    'data_raw',
    headline_file_name,
)
df2 = pd.read_json(local_data_path2, lines=True)

df2.head(2)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23


In [16]:
# Local parameters for the headline dataset, whose columns differ from the all-news file
columns_to_drop_new = ['link']
subsets_dropna_new = ['headline', 'short_description']

# Drop unused columns, then drop null descriptions
df2 = cc.drop_columns(df2, columns_to_drop_new)
cc.drop_null_article(df2, subsets_dropna_new)

# Token column: tokenize -> remove stop words -> lemmatize
df2['token'] = (
    df2['short_description']
    .apply(cc.get_tokenized_words_with_no_punctuation)
    .apply(cc.remove_stop_words)
    .apply(cc.lemmatize_words)
)

# NER: re-join each token list into a string, extract the features,
# and append them to the frame as extra columns
ner_features = df2['token'].apply(
    lambda tokens: cc.extract_ner_features(' '.join(tokens))
)
df2 = pd.concat([df2, ner_features], axis=1)

In [17]:
# Preview the cleaned headline frame, including the appended NER feature columns
# (PERSON, ORG, GPE, EVENT, PRODUCT in the recorded output).
df2.head(2)

Unnamed: 0,headline,category,short_description,authors,date,token,PERSON,ORG,GPE,EVENT,PRODUCT
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,"[health, expert, said, early, predict, whether...",0,0,0,0,0
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"[subdued, passenger, crew, fled, back, aircraf...",0,0,1,0,0


In [19]:
# Write the cleaned headline data to <local_data_path>/data_clean/<news_head_line_cleaned>.
# The output path gets its own variable so that `local_data_path` (the raw all-news
# input that the pipeline cell reads with pd.read_csv) is not overwritten; otherwise
# re-running that earlier cell after this one would load the wrong file.
cleaned_headline_path = os.path.join(
    config['info']['local_data_path'],
    'data_clean',
    config['news_preprocessing']['output']["news_head_line_cleaned"],
)
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(cleaned_headline_path), exist_ok=True)
df2.to_csv(cleaned_headline_path, index=False)