<a href="https://colab.research.google.com/github/Gayeon6423/BusinessAI-Capston/blob/main/Modeling_Sentiment_Analysis_KoFinBERT_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Timeline

### 1. Import library
### 2. Load Data
### 3. Sentiment Analysis

### 1. Import library

In [None]:
# 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")
# 모듈 설치
# !pip install konlpy
!pip install transformers
!pip install tensorflow_addons
# 데이터 처리 모듈
import pandas as pd
import numpy as np
import copy
import re
import math
import json
import requests
import urllib.request
from tqdm import tqdm
import datetime
import time
import os
import torch
# 텍스트 관련 모듈
# from konlpy.tag import Okt
# okt = Okt()
# from konlpy.tag import *
# import nltk
# 전처리 모듈
from sklearn.preprocessing import MinMaxScaler
#시각화 모듈
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
# 모델링 관련 모듈
from torch.utils.data import DataLoader, TensorDataset
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, pipeline, TFBertForSequenceClassification,BertForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss

- Checking GPU Working

In [2]:
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print("GPU 작동 중")
  mirrored_strategy = tf.distribute.MirroredStrategy()
else:
  print("GPU 미작동 중")

GPU 작동 중


### 2. Load Data

In [3]:
class LoadGoogleDriveData():
  def __init__(self, data = None):
    self.data = data

  def loadData(self, file_path: str, file_name_extension,
               columnTF: bool, unicode: str) -> pd.DataFrame():
    self.data = pd.read_csv(os.path.join(file_path + file_name_extension),
                            index_col = columnTF,
                            sep = ",",
                            na_values = "NaN",
                            encoding = unicode)
    return self.data

  def loadTxTData(self, file_path: str, file_name_extension,
               columnTF: bool, unicode: str) -> pd.DataFrame():
    self.data = pd.read_csv(os.path.join(file_path + file_name_extension),
                            index_col = columnTF,
                            sep = "|",
                            na_values = "NaN",
                            encoding = unicode)
    return self.data

  def loadExcelData(self, file_path: str, file_name_extension,
               columnTF: bool) -> pd.DataFrame():
    self.data = pd.read_excel(os.path.join(file_path + file_name_extension),
                              index_col = columnTF)
    return self.data

  # 용량이 큰 csv 파일 읽어오기(fopen - fread와 유사한 방식)
  def loadDataWithChunking(self, file_path: str, file_name_extension,
                           chunking_row_num: int, columnTF: bool, unicode: str) -> pd.DataFrame():
    chunkdata = pd.read_csv(os.path.join(file_path + file_name_extension),
                            chunksize = chunking_row_num,
                            index_col = columnTF,
                            sep = ",",
                            na_values = "NaN",
                            encoding = unicode)
    self.data = list(chunkdata)
    self.data = pd.concat(self.data)

    return self.data

In [4]:
mountInstance = LoadGoogleDriveData()

### 3. Sentiment Analysis

- 참고 자료

```
- financial_phrasebank : https://huggingface.co/datasets/financial_phrasebank
- 한국어 경제 뉴스 기사 감정 분류 github : https://github.com/park-gb/financial-news-sentiment-classifier
- 한국어 경제 뉴스 기사 감정 분류 blog : https://heytech.tistory.com/394
- kofinbert finbert : https://huggingface.co/kwoncho/KoFinBERT
- finbert hugging face : https://huggingface.co/yiyanghkust/finbert-tone
```


- kofinbert 예시

```
tokenizer = AutoTokenizer.from_pretrained("kwoncho/KoFinBERT")
kofinbert = AutoModelForSequenceClassification.from_pretrained("kwoncho/KoFinBERT")
text_classifier = pipeline("sentiment-analysis", model=kofinbert, tokenizer=tokenizer)
sentences = ["SK이노베이션의 자원 개발 자회사인 SK어스온이 남중국해 해상 광구에서 원유 생산에 성공했다",
             "SK이노베이션의 자원 개발 자회사인 SK어스온이 남중국해 해상 광구에서 원유 생산에 실패했다"]
text_classifier(sentences)
```







- Import Tokenizer & KoFinBert Model From Hugging Face

In [6]:
tokenizer = AutoTokenizer.from_pretrained("kwoncho/KoFinBERT")
model = AutoModelForSequenceClassification.from_pretrained("kwoncho/KoFinBERT")
text_classifier = pipeline("sentiment-analysis", model = model, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]



```
# 예시 데이터
file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_Data/Kospi_Preprocessing_News_Data/'
file_name = "news_preprocess_202301_kospi.csv"
news_preprocess_202301_kospi = mountInstance.loadData(file_path = file_path, file_name_extension = file_name, unicode='utf-8-sig', columnTF=False)

# 일자별 감정지수 저장할 데이터프레임 생성
daily_sentiments = pd.DataFrame(columns = ['Date','Positive','Negative','Neutral'])
# 고유한 기사 작성 날짜
unique_dates = news_preprocess_202301_kospi['pubdate'].unique()
# 파일 저장 경로
file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Sentiment_Score_Data/'
keyword = 'Kospi'

# 각 날짜에 대한 감정 분석
for date in unique_dates:
    # 날짜에 대한 기사 제목 결합
    text_for_date = news_preprocess_202301_kospi[news_preprocess_202301_kospi['pubdate']==date]['title'].tolist()
    tokenized_texts = [[word for word in tokenizer.tokenize(text)] for text in text_for_date]

    # 뉴스 본문 최대 길이(max tokens is 749 on news content)
    # 뉴스 기사 최대 길이(max tokens is 80 on news content)
    max_token_length = 100
    tokenized_texts = [" ".join(tokens[:max_token_length]) for tokens in tokenized_texts]
    # input 층 설정
    inputs = tokenizer(tokenized_texts, padding=True, truncation=True, return_tensors="pt", max_length=max_token_length)

    with torch.no_grad():
      outputs = model(**inputs)
      logits = outputs.logits

    probabilities = torch.softmax(logits, dim=1)
    positive_prob = probabilities[:, 2].mean().item()
    negative_prob = probabilities[:, 0].mean().item()
    neutral_prob = probabilities[:, 1].mean().item()

    daily_sentiments = daily_sentiments.append({"Date": date, "Positive": positive_prob, "Negative": negative_prob, "Neutral": neutral_prob}, ignore_index=True)
    daily_sentiments.to_csv(file_path + f'Sentiment_Score_{keyword}_{date}.csv')
```



- 일자별 감정분석 메서드

In [67]:
def sentiment_scoring(start_date, end_date, keyword):
    # sentiment score dataframe
    daily_sentiments_tmp = pd.DataFrame(columns = ['Date','Positive','Negative','Neutral'])
    daily_sentiment = pd.DataFrame(columns = ['Date','Positive','Negative','Neutral'])

    # Read the Month News Data
    for month in pd.date_range(start_date, end_date, freq = 'M'):
        file_path = f'/content/drive/MyDrive/산업 AI 캡스톤/DATA/Preprocessing_Data/{keyword}_Preprocessing_News_Data/'
        file_name = f"news_preprocess_{month.strftime('%Y%m')}_{keyword}.csv"
        month_news_data = mountInstance.loadData(file_path = file_path, file_name_extension = file_name, unicode='utf-8-sig', columnTF=False)

        # unique pub date
        unique_dates = month_news_data['pubdate'].unique()
        # Save File path
        file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Sentiment_Score_Data/'

        # 각 날짜에 대한 Sentiment Analysis
        for date in unique_dates:
            # 날짜에 대한 기사 제목 결합
            ## 데이터 수집 완료 후 에는 장 시작 시간, 장 마감 시간도 고려해서 평균 계산
            text_for_date = month_news_data[month_news_data['pubdate']==date]['title'].tolist()
            tokenized_texts = [[word for word in tokenizer.tokenize(text)] for text in text_for_date]

            # 뉴스 본문 최대 길이(max tokens is 749 on news content)
            # 뉴스 기사 최대 길이(max tokens is 80 on news content)
            max_token_length = 100
            tokenized_texts = [" ".join(tokens[:max_token_length]) for tokens in tokenized_texts]
            # input 층 설정
            inputs = tokenizer(tokenized_texts, padding=True, truncation=True, return_tensors="pt", max_length=max_token_length)

            with torch.no_grad():
              outputs = model(**inputs)
              logits = outputs.logits

            probabilities = torch.softmax(logits, dim=1)
            # sentiment 분류 확률
            positive_prob = probabilities[:, 2].mean().item()
            negative_prob = probabilities[:, 0].mean().item()
            neutral_prob = probabilities[:, 1].mean().item()
            # sentiment score 계산
            daily_sentiments_tmp = daily_sentiments_tmp.append({"Date": date, "Positive": positive_prob, "Negative": negative_prob, "Neutral": neutral_prob}, ignore_index=True)
            daily_sentiments_tmp['sentiment_score'] = daily_sentiments_tmp['Positive'] - daily_sentiments_tmp['Negative']

        daily_sentiments = daily_sentiments_tmp.append(daily_sentiments_tmp)
        daily_sentiments.to_csv(file_path + f'Sentiment_Score_{keyword}_{month.strftime("%Y%m")}_KoFinBERT.csv',index=False)

- Kospi Sentiment Analysis

In [None]:
start_date = '2023-01'
end_date = '2023-02'
keyword = "Kospi"
sentiment_scoring(start_date, end_date, keyword)

- Kosdaq Sentiment Analysis

In [None]:
start_date = '2023-01'
end_date = '2023-02'
keyword = "Kosdaq"
sentiment_scoring(start_date, end_date, keyword)

In [69]:
# 뉴스 제목 기반 감정분석
file_path = '/content/drive/MyDrive/산업 AI 캡스톤/DATA/Sentiment_Score_Data/'
file_name = 'Sentiment_Score_Kospi_202301_KoFinBERT.csv'
Sentiment_Score_Kospi_202301 = mountInstance.loadData(file_path = file_path, file_name_extension = file_name, unicode='utf-8-sig', columnTF=False)
Sentiment_Score_Kospi_202301.head(5)

Unnamed: 0,Date,Positive,Negative,Neutral,sentiment_score
0,20230103.0,0.024744,0.091738,0.883518,-0.066994
1,20230104.0,0.023362,0.012003,0.964635,0.01136
2,20230105.0,0.038854,0.009683,0.951463,0.029171
3,20230106.0,0.047947,0.011895,0.940158,0.036051
4,20230109.0,0.022619,0.032354,0.945027,-0.009735


### [본문]
- 1 month sentiment modeling runtime : 150분(2시간 30분)
- 30 month sentiment modeling runtime : 4500분(187시간)

### [제목]
- 1 month sentiment modeling runtime : 10분
- 30 month sentiment modeling runtime : 300분(6시간)

### [해결책]
- 해결책1 : 제목만 가지고 진행
- 해결책2 : 문장을 한 문장으로 요약 후 진행