# 목차

### 1. Import library

### 2. Preprocessing Data
- [2.1 Sector Preprocessing : 2차전지](#2.1-Sector-Preprocessing-:-2차전지)
- [2.2 Sector Preprocessing : 바이오](#2.2-Sector-Preprocessing-:-바이오)
- [2.3 Sector Preprocessing : 인터넷](#2.3-Sector-Preprocessing-:-인터넷)
- [2.4 Sector Preprocessing : 게임](#2.4-Sector-Preprocessing-:-게임)
- [2.5 종목 Preprocessing : 삼성전자](#2.5-종목-Preprocessing-:-삼성전자)

## 1. Import library

In [1]:
import time
import urllib.request
import datetime
import time
import json
import pandas as pd
import math
import requests
import re
from tqdm import tqdm

# Kkma morphological analyzer instance.
from konlpy.tag import Kkma        
kkma = Kkma()
# Okt tagger instance; the preprocessing loops below call `t.pos(...)` on it.
from konlpy.tag import Okt         
t = Okt() 
# NOTE(review): wildcard import pulls every konlpy tagger into the namespace.
from konlpy.tag import *
import nltk
import pickle
import copy

In [2]:
# Bounds ('YYYY-MM') of the period whose monthly news files are preprocessed.
start_date, end_date = '2021-06', '2023-07'

# Timestamp versions of the bounds, consumed by pd.date_range in the loops below.
# NOTE(review): those loops use freq='M' (month-end stamps), so a month is only
# visited when its last day is <= `end`; the month of `end_date` itself is
# therefore skipped -- confirm this is intended.
start, end = pd.to_datetime(start_date), pd.to_datetime(end_date)

In [8]:
# Load the stopword list: one entry per line of stopwords.txt, with the
# newline character removed from each entry.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.replace('\n', '') for line in f]

# Second Okt tagger instance, bound to the name `okt`.
okt = Okt()

## 2. Preprocessing Data

### 2.1 Sector Preprocessing : 2차전지

In [3]:
# Sector name; the following loop uses it to build the input/output CSV paths.
sector = "2차전지"

In [None]:
# Preprocess every monthly news CSV of the current `sector`:
# strip special characters, POS-tag with Okt, drop particles/suffixes and
# stopwords, and save the result next to the original columns.

# Loop invariants, built once instead of once per month / per token.
special_chars = re.compile(r'[^\w\s]')  # anything that is not a word char or whitespace
excluded_pos = {'Josa', 'Suffix'}       # Okt tags for particles and suffixes
stopword_set = set(stopwords)           # O(1) membership instead of scanning the list

# freq='M' produces month-end timestamps, so a month is visited only when its
# last day lies within [start, end].
# NOTE: newer pandas (2.2+) spells this alias 'ME'; 'M' is kept so the cell
# still runs on older versions.
for date in pd.date_range(start, end, freq='M'):
    month = date.strftime('%Y%m')

    # 1. Read the raw news CSV for this month.
    file_name = f"./{sector}/news_{month}_{sector}.csv"
    news_data = pd.read_csv(file_name, encoding='utf-8-sig')

    # 2. Cleaning: remove special characters from every article body.
    #    str(x) keeps non-string cells (e.g. missing values) from raising.
    news_data['clean_content'] = news_data['content'].apply(
        lambda x: special_chars.sub('', str(x))
    )

    # 3-6. Tokenize + POS-tag each article, drop particles/suffixes and
    #      stopwords, then join the surviving tokens with single spaces.
    preprocessed = []
    for news_text in tqdm(news_data['clean_content']):
        kept = [
            word
            for word, tag in t.pos(news_text)
            if tag not in excluded_pos and word not in stopword_set
        ]
        preprocessed.append(' '.join(kept))

    # 7. Attach the preprocessed text as a new column and save.
    news_data['preprocess_context'] = preprocessed
    file_name = f"./{sector}_전처리/preprocess_news_{month}_{sector}.csv"
    news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

### 2.2 Sector Preprocessing : 바이오

In [15]:
# Sector name; the following loop uses it to build the input/output CSV paths.
sector = "바이오"

In [None]:
# Preprocess every monthly news CSV of the current `sector`:
# strip special characters, POS-tag with Okt, drop particles/suffixes and
# stopwords, and save the result next to the original columns.

# Loop invariants, built once instead of once per month / per token.
special_chars = re.compile(r'[^\w\s]')  # anything that is not a word char or whitespace
excluded_pos = {'Josa', 'Suffix'}       # Okt tags for particles and suffixes
stopword_set = set(stopwords)           # O(1) membership instead of scanning the list

# freq='M' produces month-end timestamps, so a month is visited only when its
# last day lies within [start, end].
# NOTE: newer pandas (2.2+) spells this alias 'ME'; 'M' is kept so the cell
# still runs on older versions.
for date in pd.date_range(start, end, freq='M'):
    month = date.strftime('%Y%m')

    # 1. Read the raw news CSV for this month.
    file_name = f"./{sector}/news_{month}_{sector}.csv"
    news_data = pd.read_csv(file_name, encoding='utf-8-sig')

    # 2. Cleaning: remove special characters from every article body.
    #    str(x) keeps non-string cells (e.g. missing values) from raising.
    news_data['clean_content'] = news_data['content'].apply(
        lambda x: special_chars.sub('', str(x))
    )

    # 3-6. Tokenize + POS-tag each article, drop particles/suffixes and
    #      stopwords, then join the surviving tokens with single spaces.
    preprocessed = []
    for news_text in tqdm(news_data['clean_content']):
        kept = [
            word
            for word, tag in t.pos(news_text)
            if tag not in excluded_pos and word not in stopword_set
        ]
        preprocessed.append(' '.join(kept))

    # 7. Attach the preprocessed text as a new column and save.
    news_data['preprocess_context'] = preprocessed
    file_name = f"./{sector}_전처리/preprocess_news_{month}_{sector}.csv"
    news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

    # Debug aid: uncomment to preview the original vs. preprocessed text.
    # print(news_data[['content', 'preprocess_context']].head())

### 2.3 Sector Preprocessing : 인터넷

In [18]:
# Sector name; the following loop uses it to build the input/output CSV paths.
sector = "인터넷"

In [None]:
# Preprocess every monthly news CSV of the current `sector`:
# strip special characters, POS-tag with Okt, drop particles/suffixes and
# stopwords, and save the result next to the original columns.

# Loop invariants, built once instead of once per month / per token.
special_chars = re.compile(r'[^\w\s]')  # anything that is not a word char or whitespace
excluded_pos = {'Josa', 'Suffix'}       # Okt tags for particles and suffixes
stopword_set = set(stopwords)           # O(1) membership instead of scanning the list

# freq='M' produces month-end timestamps, so a month is visited only when its
# last day lies within [start, end].
# NOTE: newer pandas (2.2+) spells this alias 'ME'; 'M' is kept so the cell
# still runs on older versions.
for date in pd.date_range(start, end, freq='M'):
    month = date.strftime('%Y%m')

    # 1. Read the raw news CSV for this month.
    file_name = f"./{sector}/news_{month}_{sector}.csv"
    news_data = pd.read_csv(file_name, encoding='utf-8-sig')

    # 2. Cleaning: remove special characters from every article body.
    #    str(x) keeps non-string cells (e.g. missing values) from raising.
    news_data['clean_content'] = news_data['content'].apply(
        lambda x: special_chars.sub('', str(x))
    )

    # 3-6. Tokenize + POS-tag each article, drop particles/suffixes and
    #      stopwords, then join the surviving tokens with single spaces.
    preprocessed = []
    for news_text in tqdm(news_data['clean_content']):
        kept = [
            word
            for word, tag in t.pos(news_text)
            if tag not in excluded_pos and word not in stopword_set
        ]
        preprocessed.append(' '.join(kept))

    # 7. Attach the preprocessed text as a new column and save.
    news_data['preprocess_context'] = preprocessed
    file_name = f"./{sector}_전처리/preprocess_news_{month}_{sector}.csv"
    news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

    # Debug aid: uncomment to preview the original vs. preprocessed text.
    # print(news_data[['content', 'preprocess_context']].head())

### 2.4 Sector Preprocessing : 게임

In [20]:
# Sector name; the following loop uses it to build the input/output CSV paths.
sector = "게임"

In [None]:
# Preprocess every monthly news CSV of the current `sector`:
# strip special characters, POS-tag with Okt, drop particles/suffixes and
# stopwords, and save the result next to the original columns.

# Loop invariants, built once instead of once per month / per token.
special_chars = re.compile(r'[^\w\s]')  # anything that is not a word char or whitespace
excluded_pos = {'Josa', 'Suffix'}       # Okt tags for particles and suffixes
stopword_set = set(stopwords)           # O(1) membership instead of scanning the list

# freq='M' produces month-end timestamps, so a month is visited only when its
# last day lies within [start, end].
# NOTE: newer pandas (2.2+) spells this alias 'ME'; 'M' is kept so the cell
# still runs on older versions.
for date in pd.date_range(start, end, freq='M'):
    month = date.strftime('%Y%m')

    # 1. Read the raw news CSV for this month.
    file_name = f"./{sector}/news_{month}_{sector}.csv"
    news_data = pd.read_csv(file_name, encoding='utf-8-sig')

    # 2. Cleaning: remove special characters from every article body.
    #    str(x) keeps non-string cells (e.g. missing values) from raising.
    news_data['clean_content'] = news_data['content'].apply(
        lambda x: special_chars.sub('', str(x))
    )

    # 3-6. Tokenize + POS-tag each article, drop particles/suffixes and
    #      stopwords, then join the surviving tokens with single spaces.
    preprocessed = []
    for news_text in tqdm(news_data['clean_content']):
        kept = [
            word
            for word, tag in t.pos(news_text)
            if tag not in excluded_pos and word not in stopword_set
        ]
        preprocessed.append(' '.join(kept))

    # 7. Attach the preprocessed text as a new column and save.
    news_data['preprocess_context'] = preprocessed
    file_name = f"./{sector}_전처리/preprocess_news_{month}_{sector}.csv"
    news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

    # Debug aid: uncomment to preview the original vs. preprocessed text.
    # print(news_data[['content', 'preprocess_context']].head())

### 2.5 종목 Preprocessing : 삼성전자

In [22]:
# Stock name (reusing the `sector` variable); the following loop uses it to
# build the input/output CSV paths.
sector = "삼성전자"

In [None]:
# Preprocess every monthly news CSV of the current `sector`:
# strip special characters, POS-tag with Okt, drop particles/suffixes and
# stopwords, and save the result next to the original columns.

# Loop invariants, built once instead of once per month / per token.
special_chars = re.compile(r'[^\w\s]')  # anything that is not a word char or whitespace
excluded_pos = {'Josa', 'Suffix'}       # Okt tags for particles and suffixes
stopword_set = set(stopwords)           # O(1) membership instead of scanning the list

# freq='M' produces month-end timestamps, so a month is visited only when its
# last day lies within [start, end].
# NOTE: newer pandas (2.2+) spells this alias 'ME'; 'M' is kept so the cell
# still runs on older versions.
for date in pd.date_range(start, end, freq='M'):
    month = date.strftime('%Y%m')

    # 1. Read the raw news CSV for this month.
    file_name = f"./{sector}/news_{month}_{sector}.csv"
    news_data = pd.read_csv(file_name, encoding='utf-8-sig')

    # 2. Cleaning: remove special characters from every article body.
    #    str(x) keeps non-string cells (e.g. missing values) from raising.
    news_data['clean_content'] = news_data['content'].apply(
        lambda x: special_chars.sub('', str(x))
    )

    # 3-6. Tokenize + POS-tag each article, drop particles/suffixes and
    #      stopwords, then join the surviving tokens with single spaces.
    preprocessed = []
    for news_text in tqdm(news_data['clean_content']):
        kept = [
            word
            for word, tag in t.pos(news_text)
            if tag not in excluded_pos and word not in stopword_set
        ]
        preprocessed.append(' '.join(kept))

    # 7. Attach the preprocessed text as a new column and save.
    news_data['preprocess_context'] = preprocessed
    file_name = f"./{sector}_전처리/preprocess_news_{month}_{sector}.csv"
    news_data.to_csv(file_name, encoding='utf-8-sig', index=False)

    # Debug aid: uncomment to preview the original vs. preprocessed text.
    # print(news_data[['content', 'preprocess_context']].head())