In [1]:
import pandas as pd
import os
import sys
import logging

In [2]:
# Add ../src to the Python path so the project packages imported below
# (utils, engine) can be resolved.
# Notebooks do not define __file__, so the location is anchored on the
# kernel's working directory. (os.path.dirname('__file__') on the quoted
# literal always returned '', so the path was already resolved this way.)
project_root = os.path.abspath(os.path.join(os.getcwd(), '../src'))
# Guard the insert so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import the parse config function to parse the .toml file
from utils.config_tool import parse_config
from utils.logging_tool import initialize_logger

from engine.clean import clean_All_news as cc
from engine.ingestion import ingest_All_news as ii


In [3]:
# Parse the project's TOML settings. The path is relative to the notebook's
# working directory. The resulting `config` is indexed like a nested mapping
# by the cells below (e.g. config['info']['local_data_path']).
config_file = "../config/predict_stock_w_news.toml"
config = parse_config(config_file)

In [4]:
def setup_logger(log_file='Ingestion.log', level=logging.INFO):
    """Configure and return this module's logger with console and file output.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers attached by a previous call are detached and closed
    before new ones are added, so messages are not duplicated and the log
    file handle from the earlier call is not leaked.

    Args:
        log_file: Path of the file the file handler writes to. Defaults to
            'Ingestion.log'.
        level: Logging level applied to the logger and to both handlers.
            Defaults to ``logging.INFO``.

    Returns:
        The ``logging.Logger`` named after ``__name__`` with exactly two
        handlers attached: a ``StreamHandler`` and a ``FileHandler``.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(level)

    # Detach AND close handlers left over from a previous call; simply
    # dropping the list would leave the old log file handle open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Both destinations share one timestamped layout.
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')

    console_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_file)

    for handler in (console_handler, file_handler):
        handler.setLevel(level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger

# Build the notebook-wide logger; it is passed to every ii.ingest_* call below.
logger = setup_logger()

In [None]:
# Run the ingest_example step from engine.ingestion.ingest_All_news with the
# parsed config and the notebook logger.
# NOTE(review): presumably this produces the "all_news_eg1_file" CSV that the
# next cell reads back - confirm against the module.
ii.ingest_example(config, logger)

In [None]:
# Build <local_data_path>/data_raw/<all_news_eg1_file> from the config,
# load that CSV and report its (rows, columns) shape.
df1_path = os.path.join(
    config['info']['local_data_path'],
    'data_raw',
    config['news_ingestion']['input']["all_news_eg1_file"],
)
df1 = pd.read_csv(df1_path)
print(df1.shape)

In [None]:
# Run the ingest_k_example step from engine.ingestion.ingest_All_news with the
# parsed config and the notebook logger.
# NOTE(review): presumably this produces the "all_news_egk_file" CSV that the
# next cell reads back - confirm against the module.
ii.ingest_k_example(config, logger)

In [None]:
# Build <local_data_path>/data_raw/<all_news_egk_file> from the config,
# load that CSV and report its (rows, columns) shape.
df2_path = os.path.join(
    config['info']['local_data_path'],
    'data_raw',
    config['news_ingestion']['input']["all_news_egk_file"],
)
df2 = pd.read_csv(df2_path)
print(df2.shape)

In [None]:
# Run the ingest_EDA step from engine.ingestion.ingest_All_news with the
# parsed config and the notebook logger.
# NOTE(review): presumably this produces the "eda_all_news_file" CSV that the
# next cell reads back - confirm against the module.
ii.ingest_EDA(config, logger)

In [None]:
# Build <local_data_path>/data_raw/<eda_all_news_file> from the config,
# load that CSV and report its (rows, columns) shape.
df3_path = os.path.join(
    config['info']['local_data_path'],
    'data_raw',
    config['news_ingestion']['input']["eda_all_news_file"],
)
df3 = pd.read_csv(df3_path)
print(df3.shape)

In [5]:
# Run the ingest_politics step from engine.ingestion.ingest_All_news with the
# parsed config and the notebook logger. As the captured output shows, it logs
# one "Processed N chunks in T seconds" line per chunk.
# NOTE(review): presumably this produces the "all_news_politic_file" CSV that
# is read back further down - confirm against the module.
ii.ingest_politics(config, logger)

2024-11-11 22:54:21 - INFO - Processed 0 chunks in 0.32308101654052734 seconds
2024-11-11 22:54:21 - INFO - Processed 1 chunks in 0.30470991134643555 seconds
2024-11-11 22:54:22 - INFO - Processed 2 chunks in 0.2986888885498047 seconds
2024-11-11 22:54:22 - INFO - Processed 3 chunks in 0.30864906311035156 seconds
2024-11-11 22:54:22 - INFO - Processed 4 chunks in 0.29891300201416016 seconds
2024-11-11 22:54:23 - INFO - Processed 5 chunks in 0.2913639545440674 seconds
2024-11-11 22:54:23 - INFO - Processed 6 chunks in 0.28990817070007324 seconds
2024-11-11 22:54:23 - INFO - Processed 7 chunks in 0.2862229347229004 seconds
2024-11-11 22:54:23 - INFO - Processed 8 chunks in 0.29031896591186523 seconds
2024-11-11 22:54:24 - INFO - Processed 9 chunks in 0.27758312225341797 seconds
2024-11-11 22:54:24 - INFO - Processed 10 chunks in 0.2965970039367676 seconds
2024-11-11 22:54:24 - INFO - Processed 11 chunks in 0.27808690071105957 seconds
2024-11-11 22:54:25 - INFO - Processed 12 chunks in 0.

In [7]:
# Build <local_data_path>/data_raw/<all_news_politic_file> from the config,
# load that CSV and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means
# the CSV was written with its index; consider index_col=0 here or
# index=False at write time - confirm before changing.
df4_path = os.path.join(
    config['info']['local_data_path'],
    'data_raw',
    config['news_ingestion']['input']["all_news_politic_file"],
)
df4 = pd.read_csv(df4_path)
df4.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2017-04-07 00:00:00,2017,4.0,7,Dustin Volz,U.S. lawmakers ask for disclosure of number of...,WASHINGTON (Reuters) - A U.S. congressional co...,https://www.reuters.com/article/us-usa-cyber-s...,Politics,Reuters
1,2017-10-16 00:00:00,2017,10.0,16,"James Oliphant, Roberta Rampton",Trump keeping options open as Republican feud ...,WASHINGTON (Reuters) - Like the deal-maker he ...,https://www.reuters.com/article/us-usa-trump-m...,Politics,Reuters
2,2019-01-18 00:00:00,2019,1.0,18,Katharine Jackson,Trump tells anti-abortion marchers he will sup...,WASHINGTON (Reuters) - U.S. President Donald T...,https://www.reuters.com/article/us-usa-abortio...,Politics,Reuters
3,2016-07-21 00:00:00,2016,7.0,21,Michelle Conlin,Exclusive: Trump considering fracking mogul Ha...,CLEVELAND (Reuters) - Republican presidential ...,https://www.reuters.com/article/us-usa-electio...,Politics,Reuters
4,2017-10-06 00:00:00,2017,10.0,6,,White House chief of staff's personal cellphon...,WASHINGTON (Reuters) - White House officials b...,https://www.reuters.com/article/us-usa-trump-k...,Politics,Reuters


In [8]:
# Sanity check: count the rows per value of the "section" column in df4.
df4['section'].value_counts()

section
Politics    33875
Name: count, dtype: int64