<a href="https://colab.research.google.com/github/Gayeon6423/BusinessAI-Capston/blob/main/Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Timeline

### 1. Import library
### 2. Load Data
### 3. Filtering News Data
### 4. Preprocessing News Data
### 5. Preprocessing Numeric Data

### 1. Import library

In [4]:
# Data-processing modules
# !pip install konlpy
import pandas as pd
import copy
import re
import math
import json
import requests
import urllib.request
from tqdm import tqdm
import datetime
import time
import os
# Text-processing modules
from konlpy.tag import Okt
# Module-level Okt tagger instance
okt = Okt()
from konlpy.tag import *
import nltk
# Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### 2. Load Data

In [6]:
class LoadGoogleDriveData():
  """Thin wrapper around the pandas readers used to load files in this notebook.

  Every loader builds the file location by concatenating ``file_path`` and
  ``file_name_extension`` as plain strings, so ``file_path`` must end with a
  path separator (e.g. ``'/content/drive/MyDrive/data/'``). The most recently
  loaded frame is also stored on ``self.data``.
  """

  def __init__(self, data = None):
    # Most recently loaded DataFrame (or the initial value supplied here).
    self.data = data

  @staticmethod
  def _fullPath(file_path: str, file_name_extension: str) -> str:
    """Return ``file_path`` immediately followed by ``file_name_extension``."""
    # A separator-aware os.path.join(file_path, file_name_extension) is
    # intentionally NOT used: it would change the result for callers that pass
    # a path prefix (".../report" + ".csv") or a name starting with a separator.
    return file_path + file_name_extension

  def _readDelimited(self, file_path: str, file_name_extension: str,
                     columnTF: bool, unicode: str, sep: str, chunksize = None):
    """Shared ``pd.read_csv`` call behind all delimited-text loaders.

    Returns a DataFrame, or pandas' chunk iterator when ``chunksize`` is given.
    """
    return pd.read_csv(self._fullPath(file_path, file_name_extension),
                       chunksize = chunksize,
                       index_col = columnTF,
                       sep = sep,
                       na_values = "NaN",
                       encoding = unicode)

  def loadData(self, file_path: str, file_name_extension,
               columnTF: bool, unicode: str) -> pd.DataFrame:
    """Load a comma-separated file.

    Args:
      file_path: Directory part of the location, ending with a separator.
      file_name_extension: File name including its extension.
      columnTF: Forwarded to pandas as ``index_col``.
      unicode: Text encoding of the file, forwarded as ``encoding``.

    Returns:
      The loaded DataFrame (also stored on ``self.data``).
    """
    self.data = self._readDelimited(file_path, file_name_extension,
                                    columnTF, unicode, sep = ",")
    return self.data

  def loadTxTData(self, file_path: str, file_name_extension,
               columnTF: bool, unicode: str) -> pd.DataFrame:
    """Load a pipe-separated (``|``) text file; same arguments as ``loadData``."""
    self.data = self._readDelimited(file_path, file_name_extension,
                                    columnTF, unicode, sep = "|")
    return self.data

  def loadExcelData(self, file_path: str, file_name_extension,
               columnTF: bool) -> pd.DataFrame:
    """Load an Excel workbook via ``pd.read_excel``.

    ``columnTF`` is forwarded to pandas as ``index_col``.
    """
    self.data = pd.read_excel(self._fullPath(file_path, file_name_extension),
                              index_col = columnTF)
    return self.data

  # Read a large CSV file in pieces (similar in spirit to fopen/fread).
  def loadDataWithChunking(self, file_path: str, file_name_extension,
                           chunking_row_num: int, columnTF: bool, unicode: str) -> pd.DataFrame:
    """Load a comma-separated file ``chunking_row_num`` rows at a time.

    The chunks are concatenated, so the return value is a single DataFrame
    just like ``loadData``; ``self.data`` is only assigned the final frame.
    """
    reader = self._readDelimited(file_path, file_name_extension,
                                 columnTF, unicode, sep = ",",
                                 chunksize = chunking_row_num)
    chunks = list(reader)
    self.data = pd.concat(chunks)

    return self.data

In [7]:
# Loader instance used by the cells below to read files from the mounted Drive.
mountInstance = LoadGoogleDriveData()

- load stopwords data

In [10]:
# Read the stop-word list: one entry per line of the file, with the trailing
# newline character removed from each entry.
with open('/content/drive/MyDrive/산업 AI 캡스톤/DATA/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.rstrip('\n') for line in f]

- load numeric data

In [None]:
# Load the two CSV files from the 'Stock Index Data' folder into DataFrames
# (total_kospi.csv and total_kosdaq.csv), read as UTF-8 with BOM.
total_kospi = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Stock Index Data/',
            file_name_extension = "total_kospi.csv",
            unicode = 'utf-8-sig', columnTF = False)
total_kosdaq = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Stock Index Data/',
            file_name_extension = "total_kosdaq.csv",
            unicode = 'utf-8-sig', columnTF = False)

### 3. Filtering News Data

In [21]:
def filtering_news_data(start_date, end_date, keyword, pattern=None):
    """Keep only the news articles that mention the requested stock index.

    For every calendar month from ``start_date`` through ``end_date`` (both
    months inclusive) the raw monthly news CSV is loaded, rows containing any
    null value are dropped, rows whose ``content`` column matches the search
    pattern are kept, and the result is written to
    ``Filtering_News_Data/{keyword}_Filtering_News_Data/news_YYYYMM_{keyword}.csv``
    (the folder that ``preprocess_news_data`` reads from).

    Args:
        start_date: First month to process, e.g. ``'2022-01'``.
        end_date: Last month to process (inclusive), e.g. ``'2022-03'``.
        keyword: Index name, ``'Kospi'`` or ``'Kosdaq'``. Selects the search
            pattern and names the output folder and files.
        pattern: Optional regular expression that overrides the built-in
            pattern for ``keyword`` (required for any other keyword).

    Raises:
        ValueError: If ``keyword`` has no built-in pattern and ``pattern`` is
            not given.
    """
    # Search patterns per index. Matching is by substring, so the short Kosdaq
    # list also covers longer spellings that contain one of these terms.
    index_patterns = {
        'Kospi': '코스피|코스피지수|KOSPI|kospi|Kospi|코스피200|Kospi지수|KOSPI지수|KOSPI200|kospi200',
        'Kosdaq': '코스닥|KOSDAQ|kosdaq|Kosdaq',
    }
    if pattern is None:
        if keyword not in index_patterns:
            raise ValueError(
                f"No built-in search pattern for keyword {keyword!r}; "
                f"use one of {sorted(index_patterns)} or pass pattern=...")
        pattern = index_patterns[keyword]

    # Output goes to the per-keyword folder, created on first use.
    output_dir = f"/content/drive/MyDrive/산업 AI 캡스톤/DATA/Filtering_News_Data/{keyword}_Filtering_News_Data/"
    os.makedirs(output_dir, exist_ok=True)

    # period_range iterates whole months and includes the end month;
    # date_range(freq='M') yields month-end timestamps and therefore skipped
    # the final month when end_date was given as 'YYYY-MM'.
    for month in pd.period_range(start_date, end_date, freq='M'):
        yyyymm = month.strftime('%Y%m')

        # Read the CSV file
        news_data = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Original_News_Data/',
            file_name_extension = f"경제면_금융섹터_기사({yyyymm}).csv",
            unicode = 'utf-8-sig', columnTF = False)

        # Drop rows with any null value
        news_data = news_data.dropna()

        # Keep only rows whose article text mentions the index
        news_data = news_data[news_data['content'].str.contains(pattern)]

        # Generate the file name
        file_name = f"{output_dir}news_{yyyymm}_{keyword}.csv"

        # Save the filtered data to a new CSV file
        news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

        # Report how many rows were kept for this month
        print(f"First few rows of {file_name}, " "Rows number : ", len(news_data) )
        print()

- Filtering : Kospi

In [None]:
# Run the news-filtering step with the 'Kospi' keyword for the given date range.
start_date = '2022-01'
end_date = '2022-03'
keyword = 'Kospi'
filtering_news_data(start_date, end_date, keyword)

- Filtering : Kosdaq

In [None]:
# Run the news-filtering step with the 'Kosdaq' keyword for the given date range.
start_date = '2022-01'
end_date = '2022-03'
keyword = 'Kosdaq'
filtering_news_data(start_date, end_date, keyword)

### 4. Preprocessing News Data

In [19]:
def preprocess_news_data(start_date, end_date, keyword):
    """Clean, tokenize and stopword-filter each month's filtered news CSV.

    For every calendar month from ``start_date`` through ``end_date`` (both
    months inclusive) the file
    ``Filtering_News_Data/{keyword}_Filtering_News_Data/news_YYYYMM_{keyword}.csv``
    is loaded and two columns are added:

    * ``clean_content``: ``content`` with every character that is neither a
      word character nor whitespace removed.
    * ``preprocess_context``: the space-joined Okt tokens of ``clean_content``
      after dropping tokens tagged ``Josa`` or ``Suffix`` and tokens found in
      the notebook-level ``stopwords`` list.

    The result is written to
    ``Preprocessing_News_Data/{keyword}_Preprocessing_News_Data/``.

    Args:
        start_date: First month to process, e.g. ``'2020-01'``.
        end_date: Last month to process (inclusive), e.g. ``'2023-07'``.
        keyword: Index name used in the input/output folder and file names.
    """
    punctuation = re.compile(r'[^\w\s]')
    dropped_tags = {'Josa', 'Suffix'}
    # Set membership is O(1); the stopword list would be rescanned per token.
    stopword_set = set(stopwords)

    # period_range iterates whole months and includes the end month;
    # date_range(freq='M') yields month-end timestamps and therefore skipped
    # the final month when end_date was given as 'YYYY-MM'.
    for month in pd.period_range(start_date, end_date, freq='M'):
        yyyymm = month.strftime('%Y%m')

        # Read the CSV file
        file_path = f'/content/drive/MyDrive/산업 AI 캡스톤/DATA/Filtering_News_Data/{keyword}_Filtering_News_Data/'
        file_name = f"news_{yyyymm}_{keyword}.csv"
        news_data = mountInstance.loadData(file_path=file_path, file_name_extension=file_name, unicode='utf-8-sig', columnTF=False)

        # Cleaning
        news_data['clean_content'] = news_data['content'].apply(lambda x: punctuation.sub('', str(x)))

        # Tokenization & POS tagging, normalization and stopword removal in a
        # single pass. Uses the module-level `okt` tagger (the name `t` that
        # was used here before is not defined anywhere in the notebook).
        preprocessed = []
        for news_text in tqdm(news_data['clean_content']):
            kept = [word for word, tag in okt.pos(news_text)
                    if tag not in dropped_tags and word not in stopword_set]
            preprocessed.append(' '.join(kept))

        # Assigning the column directly also works for a month with no rows.
        news_data['preprocess_context'] = preprocessed

        # Generate the file name
        # NOTE(review): the '_kospi' suffix is hard-coded, so Kosdaq output is
        # also named '..._kospi.csv' (inside the Kosdaq folder). Left unchanged
        # because downstream readers may depend on this name; confirm and
        # switch to the keyword if they do not.
        out_name = f"/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_News_Data/{keyword}_Preprocessing_News_Data/news_preprocess_{yyyymm}_kospi.csv"

        # Save the preprocessed data to a new CSV file
        news_data.to_csv(out_name, encoding='utf-8-sig', index=False)

- 4.1 Preprocessing : Kospi

In [20]:
# Run the news-preprocessing step with the 'Kospi' keyword for the given date range.
start_date = '2020-01'
end_date = '2023-07'
keyword = 'Kospi'
preprocess_news_data(start_date, end_date, keyword)

606it [01:03,  9.61it/s]
387it [00:38,  9.94it/s]


- 4.2 Preprocessing : Kosdaq

In [None]:
# Run the news-preprocessing step with the 'Kosdaq' keyword for the given date range.
start_date = '2020-01'
end_date = '2023-07'
keyword = "Kosdaq"
preprocess_news_data(start_date, end_date, keyword)

### 5. Preprocessing Numeric Data

In [24]:
# Preview the first three rows of the Kospi numeric data.
total_kospi.head(3)

Unnamed: 0,Date,Kospi_open,Kospi_high,Kospi_low,Kospi_close,Kospi_vol,dji_open,dji_high,dji_low,dji_close,...,ex_AM_high,ex_AM_low,ex_AM_close,ex_JP_open,ex_JP_high,ex_JP_low,ex_JP_close,ko_interest,ko_consumer,ko_real_estate
0,2020-01-02,2201.209961,2202.320068,2171.840088,2175.169922,494700,28638.970703,28872.800781,28627.769531,28868.800781,...,1160.189941,1145.199951,1153.969971,10.613463,10.709266,10.533481,10.615074,1.25,100.09,0.457
1,2020-01-03,2192.580078,2203.379883,2165.389893,2176.459961,631600,28553.330078,28716.310547,28500.359375,28634.880859,...,1168.069946,1155.310059,1157.150024,10.659868,10.815873,10.650669,10.659966,1.25,100.09,0.457
2,2020-01-06,2154.969971,2164.419922,2149.949951,2155.070068,592700,28465.5,28708.019531,28418.630859,28703.380859,...,1172.48999,1165.640015,1165.849976,10.796457,10.859074,10.762263,10.798657,1.25,100.09,0.457


In [27]:
# Preview the first three rows of the Kosdaq numeric data.
total_kosdaq.head(3)

Unnamed: 0,Date,kosdaq_open,kosdaq_high,kosdaq_low,kosdaq_close,kosdaq_vol,dji_open,dji_high,dji_low,dji_close,...,ex_AM_high,ex_AM_low,ex_AM_close,ex_JP_open,ex_JP_high,ex_JP_low,ex_JP_close,ko_interest,ko_consumer,ko_real_estate
0,2020-01-02,672.530029,674.299988,666.619995,674.02002,800.0,28638.970703,28872.800781,28627.769531,28868.800781,...,1160.189941,1145.199951,1153.969971,10.613463,10.709266,10.533481,10.615074,1.25,100.09,0.457
1,2020-01-03,677.570007,679.369995,668.099976,669.929993,900.0,28553.330078,28716.310547,28500.359375,28634.880859,...,1168.069946,1155.310059,1157.150024,10.659868,10.815873,10.650669,10.659966,1.25,100.09,0.457
2,2020-01-06,660.080017,663.099976,653.609985,655.309998,900.0,28465.5,28708.019531,28418.630859,28703.380859,...,1172.48999,1165.640015,1165.849976,10.796457,10.859074,10.762263,10.798657,1.25,100.09,0.457
