<a href="https://colab.research.google.com/github/Gayeon6423/BusinessAI-Capston/blob/main/Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Timeline

### 1. Import library
### 2. Load Data
### 3. Filtering News Data
### 4. Preprocessing News Data
### 5. Preprocessing Numeric Data

### 1. Import library

In [1]:
# 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")

# 모듈 설치
!pip install konlpy
# 데이터 처리 모듈
import pandas as pd
import copy
import re
import math
import json
import requests
import urllib.request
from tqdm import tqdm
import datetime
import time
import os
# 텍스트 관련 모듈
from konlpy.tag import Okt
okt = Okt()
from konlpy.tag import *
import nltk
# 전처리 모듈
from sklearn.preprocessing import MinMaxScaler
#시각화 모듈
import matplotlib.pyplot as plt
import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 2. Load Data

In [3]:
class LoadGoogleDriveData():
  """Small helper around the pandas readers that keeps the last loaded frame.

  Each loader builds the target path from ``file_path`` and
  ``file_name_extension``, reads the file, stores the result in ``self.data``
  and returns it. ``columnTF`` is forwarded unchanged as pandas' ``index_col``.

  NOTE(review): pandas.read_csv appears to reject ``index_col=True``, so
  ``False`` looks like the only boolean that actually works here - confirm.
  """

  def __init__(self, data = None):
    # Result of the most recent load call (or the value supplied by the caller).
    self.data = data

  @staticmethod
  def _resolve_path(file_path: str, file_name_extension: str) -> str:
    """Join directory and file name; the directory may or may not end with a separator."""
    return os.path.join(file_path, file_name_extension)

  def loadData(self, file_path: str, file_name_extension: str,
               columnTF: bool, unicode: str) -> pd.DataFrame:
    """Read a comma-separated file.

    Args:
      file_path: Directory that contains the file.
      file_name_extension: File name including its extension.
      columnTF: Passed to ``pd.read_csv`` as ``index_col``.
      unicode: Text encoding of the file (e.g. ``'utf-8-sig'``).

    Returns:
      The loaded DataFrame (also stored in ``self.data``).
    """
    self.data = pd.read_csv(self._resolve_path(file_path, file_name_extension),
                            index_col = columnTF,
                            sep = ",",
                            na_values = "NaN",
                            encoding = unicode)
    return self.data

  def loadTxTData(self, file_path: str, file_name_extension: str,
               columnTF: bool, unicode: str) -> pd.DataFrame:
    """Read a pipe-separated (``|``) text file; arguments as in ``loadData``."""
    self.data = pd.read_csv(self._resolve_path(file_path, file_name_extension),
                            index_col = columnTF,
                            sep = "|",
                            na_values = "NaN",
                            encoding = unicode)
    return self.data

  def loadExcelData(self, file_path: str, file_name_extension: str,
               columnTF: bool) -> pd.DataFrame:
    """Read an Excel workbook with ``pd.read_excel``; ``columnTF`` is its ``index_col``."""
    self.data = pd.read_excel(self._resolve_path(file_path, file_name_extension),
                              index_col = columnTF)
    return self.data

  # Read a large CSV file in chunks (similar to an fopen/fread loop).
  def loadDataWithChunking(self, file_path: str, file_name_extension: str,
                           chunking_row_num: int, columnTF: bool, unicode: str) -> pd.DataFrame:
    """Read a comma-separated file ``chunking_row_num`` rows at a time and concatenate the chunks.

    Args:
      file_path: Directory that contains the file.
      file_name_extension: File name including its extension.
      chunking_row_num: Number of rows per chunk (``chunksize`` of ``pd.read_csv``).
      columnTF: Passed to ``pd.read_csv`` as ``index_col``.
      unicode: Text encoding of the file.

    Returns:
      One DataFrame holding all chunks (also stored in ``self.data``).
    """
    chunk_reader = pd.read_csv(self._resolve_path(file_path, file_name_extension),
                               chunksize = chunking_row_num,
                               index_col = columnTF,
                               sep = ",",
                               na_values = "NaN",
                               encoding = unicode)
    # Materialise every chunk, then stitch them back into a single frame.
    self.data = pd.concat(list(chunk_reader))

    return self.data

In [4]:
# Shared loader instance used by the load calls in the cells below.
mountInstance = LoadGoogleDriveData()

- Load stopwords data

In [5]:
# Read the stopword file: one list entry per line, with the newline characters removed.
with open('/content/drive/MyDrive/산업 AI 캡스톤/DATA/stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = [raw_line.replace('\n', '') for raw_line in stopword_file]

- load numeric data

In [6]:
# Load the numeric inputs: the two stock-index tables and the table of stock-market open dates.
# columnTF=False is forwarded as index_col=False, so no CSV column is used as the index.
total_kospi = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Stock Index Data/',
            file_name_extension = "total_kospi.csv",
            unicode = 'utf-8-sig', columnTF = False)
total_kosdaq = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Stock Index Data/',
            file_name_extension = "total_kosdaq.csv",
            unicode = 'utf-8-sig', columnTF = False)
# Its 'Date' column is used later to keep only news published on trading days.
Stock_Open_Date = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/',
            file_name_extension = "Stock Open Date.csv",
            unicode = 'utf-8-sig', columnTF = False)

### 3. Filtering News Data

In [20]:
def filtering_news_data(start_date, end_date, keyword):
    """Filter each month's raw news CSV down to the articles that mention the given index.

    For every month-end produced by ``pd.date_range(start_date, end_date, freq='M')``
    the matching raw file is loaded, rows containing nulls are dropped, rows whose
    ``content`` matches the keyword's pattern are kept, and the result is written to
    the ``{keyword}_Filtering_News_Data`` folder. The row count is printed per month.

    Args:
        start_date: Start of the range (e.g. ``'2020-01'``).
        end_date: End of the range. Only month-ends on or before this date are
            generated, so ``'2023-06'`` stops at May 2023.
        keyword: ``'Kospi'`` or ``'Kosdaq'`` (matched case-insensitively). Selects the
            content filter and is used verbatim in the output folder and file names.

    Raises:
        ValueError: If ``keyword`` is neither Kospi nor Kosdaq.
    """
    # The filter must follow the keyword; previously the Kosdaq pattern was applied
    # no matter which keyword (and therefore output folder) was requested.
    content_patterns = {
        'kospi': '코스피|코스피지수|KOSPI|kospi|Kospi|코스피200|Kospi지수|KOSPI지수|KOSPI200|kospi200',
        'kosdaq': '코스닥|코스닥지수|KOSDAQ|kosdaq|Kosdaq|코스닥|Kosdaq지수|KOSDAQ지수',
    }
    pattern = content_patterns.get(str(keyword).lower())
    if pattern is None:
        raise ValueError(
            f"Unsupported keyword {keyword!r}; expected one of: Kospi, Kosdaq")

    for date in pd.date_range(start_date, end_date, freq='M'):
        month = date.strftime('%Y%m')

        # Read the raw monthly CSV file
        news_data = mountInstance.loadData(
            file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Original_News_Data/',
            file_name_extension = f"경제면_금융섹터_기사({month}).csv",
            unicode = 'utf-8-sig', columnTF = False)

        # Drop rows with null values
        news_data = news_data.dropna()

        # Row filter: keep only articles whose content mentions the requested index
        news_data = news_data[news_data['content'].str.contains(pattern)]

        # Generate the output file name
        file_name = f"/content/drive/MyDrive/산업 AI 캡스톤/DATA/Filtering_News_Data/{keyword}_Filtering_News_Data/news_{month}_{keyword}.csv"

        # Save the filtered data to a new CSV file
        news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

        # Report how many rows were kept for this month
        print(f"First few rows of {file_name}, " "Rows number : ", len(news_data) )
        print()

- Filtering : Kospi

In [None]:
# Run the filter with the Kospi keyword. date_range(..., freq='M') only yields month-ends
# on or before end_date, so with end_date '2023-06' the last month processed is 2023-05.
start_date = '2020-01'
end_date = '2023-06'
keyword = 'Kospi'
filtering_news_data(start_date, end_date, keyword)

- Filtering : Kosdaq

In [None]:
# Run the filter with the Kosdaq keyword. date_range(..., freq='M') only yields month-ends
# on or before end_date, so with end_date '2023-06' the last month processed is 2023-05.
start_date = '2020-01'
end_date = '2023-06'
keyword = 'Kosdaq'
filtering_news_data(start_date, end_date, keyword)

In [24]:
# Number of Kospi rows: hard-coded sum of counts
# (presumably the per-month row counts printed by the filtering runs - TODO confirm).
# NOTE(review): this sum has 40 terms while the Kosdaq sum below has 41 - check whether a month is missing.
kospi_data_len = 329+388+1293+265+215+269+282+436+340+347+535+498+997+597+544+434+479+557+390+469+425+492+584+508+606+387+397+424+593+563+445+586+408+405+423+302+349+458+346+302
# Number of Kosdaq rows: hard-coded sum of counts (41 terms).
kosdaq_data_len = 244+303+1031+177+180+216+220+318+306+246+365+338+521+435+373+272+373+365+256+284+314+321+440+399+510+266+275+248+274+374+344+308+403+332+363+329+252+279+399+331+307
print('kospi_data_len : ', kospi_data_len)
print('kosdaq_data_len : ', kosdaq_data_len)

kospi_data_len :  18667
kosdaq_data_len :  13891


### 4. Preprocessing News Data

In [None]:
def _tokenize_news_text(news_text, stopword_set):
    """POS-tag one cleaned article with Okt and return the kept tokens joined by spaces."""
    kept_tokens = [
        word
        for word, tag in okt.pos(news_text)
        # Normalization: skip 'Josa' and 'Suffix' tags.
        # Stopword removal: skip one-character tokens and stopwords.
        if tag not in ('Josa', 'Suffix') and len(word) > 1 and word not in stopword_set
    ]
    return ' '.join(kept_tokens)


def preprocess_news_data(start_date, end_date, keyword):
    """Clean and tokenize each month's filtered news file and save the result.

    For every month-end produced by ``pd.date_range(start_date, end_date, freq='M')``:
    load ``news_{YYYYMM}_{keyword}.csv``, keep only rows whose ``pubdate`` appears in
    ``Stock_Open_Date['Date']``, strip every character that is neither a word
    character nor whitespace into ``clean_content``, tokenize with Okt into
    ``preprocess_context``, and write the frame to the preprocessing folder.

    Args:
        start_date: Start of the range (e.g. ``'2020-01'``).
        end_date: End of the range. Only month-ends on or before this date are
            generated, so ``'2023-07'`` stops at June 2023.
        keyword: Used verbatim in the input and output folder names and in the
            input file name (e.g. ``'Kospi'`` or ``'Kosdaq'``).
    """
    file_path = f'/content/drive/MyDrive/산업 AI 캡스톤/DATA/Filtering_News_Data/{keyword}_Filtering_News_Data/'
    punctuation = re.compile(r'[^\w\s]')
    # Loop-invariant lookups: trading days as strings, stopwords as a set for O(1) membership.
    open_dates = Stock_Open_Date['Date'].astype(str)
    stopword_set = set(stopwords)

    for date in pd.date_range(start_date, end_date, freq='M'):
        month = date.strftime('%Y%m')

        # Read the filtered CSV file for this month
        file_name = f"news_{month}_{keyword}.csv"
        news_data = mountInstance.loadData(file_path=file_path, file_name_extension=file_name, unicode='utf-8-sig', columnTF=False)

        # Use only the dates on which the stock market was open.
        # .copy() makes the filtered frame independent, so the column assignments
        # below do not write into a slice of the loaded frame (SettingWithCopyWarning).
        is_open_day = news_data['pubdate'].astype(str).isin(open_dates)
        news_data = news_data[is_open_day].copy()

        # Cleaning: remove everything except word characters and whitespace
        news_data['clean_content'] = news_data['content'].apply(
            lambda text: punctuation.sub('', str(text)))

        # Tokenization, POS tagging, normalization and stopword removal in one pass per article
        news_data['preprocess_context'] = [
            _tokenize_news_text(text, stopword_set)
            for text in tqdm(news_data['clean_content'])
        ]

        # Generate the output file name.
        # NOTE(review): the suffix is hard-coded to '_kospi' even when keyword is 'Kosdaq'.
        # It is kept as-is because readers of these files may rely on the name - confirm and rename together.
        file_name = f"/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_Data/{keyword}_Preprocessing_News_Data/news_preprocess_{month}_kospi.csv"

        # Save the preprocessed data to a new CSV file
        news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

- 4-1. Preprocessing : Kospi news data

In [None]:
# With freq='M' this range yields a single month-end (2023-06-30),
# so only the June 2023 Kospi file is processed by this call.
start_date = '2023-06'
end_date = '2023-07'
keyword = 'Kospi'
preprocess_news_data(start_date, end_date, keyword)

- 4-2. Preprocessing : Kosdaq news data

In [None]:
# With freq='M' this range yields the month-ends from 2020-01 through 2023-06,
# so the Kosdaq files for January 2020 to June 2023 are processed.
start_date = '2020-01'
end_date = '2023-07'
keyword = "Kosdaq"
preprocess_news_data(start_date, end_date, keyword)

In [7]:
# Spot check: load the preprocessed Kospi file for January 2020 and show its first three rows.
file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_Data/Kospi_Preprocessing_News_Data/'
file_name = "news_preprocess_202001_kospi.csv"
news_preprocess_202001_kospi = mountInstance.loadData(file_path = file_path, file_name_extension = file_name, unicode='utf-8-sig', columnTF=False)
news_preprocess_202001_kospi.head(3)

Unnamed: 0,title,url,pubdate,content,clean_content,preprocess_context
0,새해 첫날 코스피 1%대 하락..2180선 무너져,https://n.news.naver.com/mnews/article/469/000...,20200102,경자년(庚子年) 새해 첫 거래일인 2일 오후 서울 중구 KEB 하나은행 딜링룸에서 ...,경자년庚子年 새해 첫 거래일인 2일 오후 서울 중구 KEB 하나은행 딜링룸에서 딜러...,경자년 庚子年 새해 거래 일인 2일 오후 서울 중구 KEB 하나은행 딜링룸 딜러 업...
1,"[시황종합] 첫날 코스피 2170선 후퇴…""기관 차익실현""",https://n.news.naver.com/mnews/article/421/000...,20200102,코스피 1.02% 내린 2175.17…코스닥 0.63% 오른 674.02원달러 환율...,코스피 102 내린 217517코스닥 063 오른 67402원달러 환율 17원 오른...,코스피 102 내린 217517 코스닥 063 오른 67402원 달러 환율 17원 ...
2,"새해 첫 주식·환율시장, 코스피 나홀로 하락",https://n.news.naver.com/mnews/article/421/000...,20200102,(서울=뉴스1) 민경석 기자 = 2일 오후 서울 중구 KEB 하나은행 딜링룸 전광판...,서울뉴스1 민경석 기자 2일 오후 서울 중구 KEB 하나은행 딜링룸 전광판에 경자...,서울 뉴스 민경석 기자 2일 오후 서울 중구 KEB 하나은행 딜링룸 전광판 경자년 ...


### 5. Preprocessing Numeric Data

In [8]:
def preprocess_numeric_data(df):
    """Round a stock-index table to 3 decimals and min-max scale its feature columns.

    The ``Date`` column and any closing-price column (a name containing
    ``kospi_close`` or ``kosdaq_close``, compared case-insensitively) are left
    unscaled; every other column is passed through ``MinMaxScaler``.

    Args:
        df: Input DataFrame. It is not modified, because ``round`` returns a new frame.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    # Keep values to three decimal places; this also gives us a frame we can safely assign into.
    scaled = df.round(3)

    # Columns that must stay unscaled: the date and the closing prices.
    # The match is case-insensitive so that both 'Kosdaq_close' and 'kosdaq_close' are protected.
    close_markers = ('kospi_close', 'kosdaq_close')
    columns_to_scale = [
        col for col in scaled.columns
        if col != 'Date'
        and not any(marker in str(col).lower() for marker in close_markers)
    ]

    # Apply MinMax scaling; skipped when no feature column remains, since the scaler cannot fit zero columns.
    if columns_to_scale:
        scaled[columns_to_scale] = MinMaxScaler().fit_transform(scaled[columns_to_scale])

    return scaled



```
#데이터들 소수점 3자리까지 조절
total_kospi = total_kospi.round(3)
total_kosdaq = total_kosdaq.round(3)

#종가들을 제외하고 스케일링을 진행하기위해서 우선 칼럼들의 이름을 뽑아내기
kospi_names = total_kospi.columns.tolist()
kosdaq_names = total_kosdaq.columns.tolist()

#각각의 종가 열을 리스트에서 제외
kospi_names.remove('Kospi_close')
kosdaq_names.remove('kosdaq_close')
kospi_names.remove('Date')
kosdaq_names.remove('Date')

#kospi 데이터 프레임 스케일링 적용
kospi_scaled = total_kospi.copy()
columns_to_scale = kospi_names  # Min-Max 스케일링을 적용할 열 목록
kospi_scaled[columns_to_scale] = MinMaxScaler().fit_transform(kospi_scaled[columns_to_scale])

#kosdaq 데이터 프레임 스케일링 적용
kosdaq_scaled = total_kosdaq.copy()
columns_to_scale = kosdaq_names  # Min-Max 스케일링을 적용할 열 목록
kosdaq_scaled[columns_to_scale] = MinMaxScaler().fit_transform(kosdaq_scaled[columns_to_scale])
```



- 5-1. Preprocessing : Kospi numeric data

In [9]:
# Round and scale the Kospi table; the CSV export below is currently commented out.
neumeric_preprocess_kospi = preprocess_numeric_data(total_kospi)
# neumeric_preprocess_kospi.to_csv("/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_Data/Preprocessing_Neumeric_Data/neumeric_preprocess_kospi.csv", encoding='utf-8-sig', index=False)

- 5-2. Preprocessing : Kosdaq numeric data

In [10]:
# Round and scale the Kosdaq table; the CSV export below is currently commented out.
neumeric_preprocess_kosdaq = preprocess_numeric_data(total_kosdaq)
# neumeric_preprocess_kosdaq.to_csv("/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_Data/Preprocessing_Neumeric_Data/neumeric_preprocess_kosdaq.csv", encoding='utf-8-sig', index=False)

In [11]:
# Preview the first three rows of the preprocessed Kospi table.
neumeric_preprocess_kospi.head(3)

Unnamed: 0,Date,Kospi_open,Kospi_high,Kospi_low,Kospi_close,Kospi_vol,dji_open,dji_high,dji_low,dji_close,...,gold_close,gold_vol,oil_open,oil_high,oil_low,oil_close,oil_vol,ko_interest,ko_consumer,ko_real_estate
0,20200102,0.396918,0.381014,0.394615,2175.17,0.063453,0.543149,0.546881,0.565298,0.564424,...,0.310484,0.034935,0.357855,0.353594,0.370475,0.360653,0.000267,0.25,0.055603,1.0
1,20200103,0.392204,0.381603,0.39114,2176.46,0.106757,0.538309,0.538105,0.558382,0.551577,...,0.315092,0.141863,0.377182,0.37125,0.390883,0.378227,3e-06,0.25,0.055603,1.0
2,20200106,0.371664,0.359951,0.382821,2155.07,0.094452,0.533345,0.53764,0.553946,0.555339,...,0.304724,0.150331,0.381234,0.375312,0.388716,0.376081,0.000202,0.25,0.055603,1.0


In [None]:
# Kospi scatter-plot matrix (pairwise plots of total_kospi, KDE on the diagonal)
sns.set(style="ticks")
sns.pairplot(total_kospi, diag_kind="kde")
plt.show()

In [None]:
# Kospi box plot: horizontal boxes drawn from total_kospi
fig, ax = plt.subplots(figsize=(10, 6))  # figure size is optional
sns.boxplot(data=total_kospi, orient="h", ax=ax)
ax.set_title("박스플랏")
plt.show()

In [None]:
# Kosdaq scatter-plot matrix (pairwise plots of total_kosdaq, KDE on the diagonal)
sns.set(style="ticks")
sns.pairplot(total_kosdaq, diag_kind="kde")
plt.show()

In [None]:
# Kosdaq box plot: horizontal boxes drawn from total_kosdaq
fig, ax = plt.subplots(figsize=(10, 6))  # figure size is optional
sns.boxplot(data=total_kosdaq, orient="h", ax=ax)
ax.set_title("박스플랏")
plt.show()