In [None]:
# Root directory of the project; the data paths used in this notebook are built as root_path + "/data/..."
root_path = "C:/sh/study/krx데이콘/krx_2022/sh"

## 2022-06-01 ~ 2022-08-09 데이터 파일로 분할

+ 모듈 적재

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import datetime as dt

import datetime
import scrapetube
from googleapiclient.discovery import build

import soynlp
from soynlp.noun import LRNounExtractor_v2
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

import warnings
warnings.filterwarnings('ignore')

+ raw data 크롤링 및 분할 함수

In [None]:
# Look up a ticker code by company name
def get_code(symbol):
    """Return the short ticker code ('단축코드') for a company name.

    Reads ``<root_path>/data/code/krx_code.csv`` on every call and looks
    ``symbol`` up in its '한글 종목약명' column.

    Args:
        symbol: Company short name as listed in the '한글 종목약명' column.

    Returns:
        The matching '단축코드' value, or ``0`` (after printing a hint) when
        the lookup raises ``KeyError``.
    """
    krx = pd.read_csv(root_path + '/data/code/krx_code.csv', encoding='utf-8')
    krx = krx.set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except KeyError:
        # Only a failed lookup is treated as "unknown name"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        print('종목명을 다시 확인해주세요.')
        return 0

# Crawl posts from a stock's Naver Finance discussion board
def get_comment_csv(symbol,page,year,month,day):
    """Scrape discussion-board rows for ``symbol`` back to a cutoff date.

    Pages ``1..page`` of ``finance.naver.com/item/board.naver`` are requested
    in order. Crawling stops once a row dated before ``year-month-day`` has
    been read (that row is still included in the result), or when a page
    contains no rows.

    Args:
        symbol: Company name, resolved to a ticker code via ``get_code``.
        page: Maximum number of board pages to request.
        year, month, day: Cutoff date.

    Returns:
        DataFrame with columns '날짜', '댓글', '조회수', '좋아요', '싫어요'.
        Rows whose cell layout is not recognised carry the comment 'error'
        and zero counts.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        ValueError: if a row's date cell cannot be parsed as ``Y.M.D``.
    """
    code = get_code(symbol)
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    cutoff = dt.datetime(year, month, day)

    date_list = []     # post date
    comment_list = []  # post title / comment text
    view_list = []     # view count
    good_list = []     # likes
    bad_list = []      # dislikes

    reached_cutoff = False
    for i in range(1, page + 1):
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={i}'
        # A timeout keeps a stalled connection from hanging the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Parse the row list once per page instead of once per row.
        section = bs.find('div', {'class': 'section inner_sub'})
        rows = [] if section is None else section.find_all('tr', {'onmouseover': 'mouseOver(this)'})
        if not rows:
            # No rows on this page: treat it as the end of the board instead
            # of requesting the remaining pages.
            break

        for row in rows[:20]:
            root = row.text.split('\n')
            if len(root) < 2:
                # Row without a date cell: give up on the rest of this page.
                break

            date_list.append(root[1].replace('.', '-'))

            if len(root) == 14:  # reply row
                comment_list.append('답글:' + root[4])
                view_list.append(root[10])
                good_list.append(root[11])
                bad_list.append(root[12])
            elif len(root) == 13:  # regular row
                comment_list.append(root[3])
                view_list.append(root[9])
                good_list.append(root[10])
                bad_list.append(root[11])
            else:  # unrecognised layout
                comment_list.append('error')
                view_list.append(0)
                good_list.append(0)
                bad_list.append(0)

            # The date cell is "Y.M.D <time>"; only the date part is compared.
            tp = [int(part) for part in root[1].split()[0].split('.')]
            if dt.datetime(tp[0], tp[1], tp[2]) < cutoff:
                reached_cutoff = True
                break
        if reached_cutoff:
            break
        print(f'\r{i}페이지 크롤링 완료.',end='')

    df = pd.DataFrame()
    df['날짜'] = date_list
    df['댓글'] = comment_list
    df['조회수'] = view_list
    df['좋아요'] = good_list
    df['싫어요'] = bad_list
    return df

# Load the list of company names
def get_company_name():
    """Return the '종목명' column of KODEX_KTOP_30_20220629.xlsx as a list.

    The workbook is read with ``header=2`` and the row labelled 0 is dropped
    before the names are collected.
    """
    workbook_path = root_path + '/data/code/KODEX_KTOP_30_20220629.xlsx'
    holdings = pd.read_excel(workbook_path, header=2)
    holdings = holdings.drop(0, axis=0)
    return holdings['종목명'].tolist()

# Crawl board comments back to the given year/month/day for every company.
# The returned list holds one DataFrame per company, in company-name order.
def get_date_comment(year,month,day):
    """Crawl each company's board (up to 10000 pages) back to the cutoff date.

    Returns:
        List of DataFrames from ``get_comment_csv``, ordered like
        ``get_company_name()``.
    """
    frames = []
    for name in get_company_name():
        print(name,"크롤링")
        frames.append(get_comment_csv(name,10000,year,month,day))
        print()
    return frames

# Strip everything except Hangul from the comments
def clean_sents_df(target):
    """Return a cleaned copy of ``target`` with a '정제된 댓글' column.

    Cleaning steps applied to the '댓글' column:
      1. remove the "[삭제된 게시물의 답글]" marker,
      2. remove the "답글:" prefix,
      3. replace every non-Hangul-syllable character with a space,
      4. collapse runs of spaces and strip both ends.
    Rows whose cleaned text is empty are dropped and the index is reset.

    The patterns are regular expressions, so ``regex=True`` is passed
    explicitly: pandas >= 2.0 treats ``str.replace`` patterns as literal
    text by default, which would silently disable steps 1, 3 and 4.

    Args:
        target: DataFrame with a '댓글' column. It is not modified.

    Returns:
        New DataFrame with the extra '정제된 댓글' column.
    """
    df = target.copy()  # do not add columns to the caller's frame
    cleaned = df['댓글'].str.replace(r'\[삭제된 게시물의 답글\]', ' ', regex=True)
    cleaned = cleaned.str.replace('답글:', ' ', regex=False)
    cleaned = cleaned.str.replace(r'[^가-힣]', ' ', regex=True)
    cleaned = cleaned.str.replace(r' +', ' ', regex=True).str.strip()
    df['정제된 댓글'] = cleaned
    df = df[df['정제된 댓글'] != '']
    df = df.reset_index(drop=True)
    return df

# Build the tokenizer used to tokenize comments
def return_tokenizer():
    """Train a noun extractor on corpus_target.txt and wrap it in an LTokenizer.

    The corpus at ``<root_path>/data/code/corpus_target.txt`` is fed to
    ``LRNounExtractor_v2.train_extract``; each extracted entry's ``.score``
    becomes that word's score in the returned ``LTokenizer``.
    """
    corpus_path = root_path + "/data/code/corpus_target.txt"
    sentences = DoublespaceLineCorpus(corpus_path,iter_sent=True)
    extractor = LRNounExtractor_v2(verbose=True)
    extracted = extractor.train_extract(sentences)
    noun_scores = {}
    for word, stats in extracted.items():
        noun_scores[word] = stats.score
    return LTokenizer(scores=noun_scores)

# Gather every comment source into one list
def get_data_list():
    """Return the per-company board frames followed by the YouTube frame.

    Board comments are crawled back to 2022-06-01; the YouTube comments are
    read from ``<root_path>/data/youtube/sampro.csv`` and appended last.
    """
    # Crawl first, then read the CSV (same order as before).
    frames = get_date_comment(2022,6,1)
    youtube_frame = pd.read_csv(root_path + "/data/youtube/sampro.csv")
    frames.append(youtube_frame)
    return frames

# Clean and tokenize every frame, collecting the results in a list
def comment_prep(data_list):
    """Return cleaned frames with an added '토큰화 댓글' (tokenized) column.

    Each frame goes through ``clean_sents_df``; every cleaned comment is
    then tokenized with the tokenizer from ``return_tokenizer``.
    """
    tokenizer = return_tokenizer()

    def _prepare(frame):
        cleaned = clean_sents_df(frame)
        cleaned['토큰화 댓글'] = [tokenizer(str(text)) for text in cleaned['정제된 댓글']]
        return cleaned

    return [_prepare(frame) for frame in data_list]

# Crawl, preprocess and save all comments to a single file
def date_crawler():
    """Crawl every source, clean it, and write ``<root_path>/data/alldf.csv``.

    The output keeps only the '날짜' and '정제된 댓글' columns; rows with a
    missing value in either column are dropped. The concatenated index is
    written as-is (it restarts at 0 for every source frame).
    """
    data_list = get_data_list()
    pp_list = comment_prep(data_list)
    # One concat over the whole list instead of re-copying the growing frame
    # once per source.
    df_day = pd.concat(pp_list)
    df_day = df_day[["날짜","정제된 댓글"]].dropna()
    df_day.to_csv(root_path + "/data/alldf.csv")

+ 데이터 크롤링
    + 전체 데이터를 하나의 파일로 저장

In [None]:
# Run the full crawl; the date_crawler defined in the cell above writes data/alldf.csv
date_crawler()

In [None]:
# Dates to split into: every day from 2022-06-01 to 2022-08-09 inclusive,
# formatted as 'YYYY-MM-DD' strings.
target_date = pd.date_range('2022-06-01', '2022-08-09').strftime('%Y-%m-%d').tolist()

df = pd.read_csv(root_path + "/data/alldf.csv")

# Bring every date to the same 'YYYY-MM-DD' form by keeping the first
# 10 characters.
df["날짜"] = df["날짜"].astype(str).str[:10]

# Write one CSV per target date (dates without rows still get a file).
for date in target_date:
    df2 = df[df["날짜"]==date]
    df2.to_csv(root_path + f"/data/date/{date}.csv")

+ 날짜별 파일 저장
    + 하나의 파일로 저장된 데이터를 날짜별 데이터로 분할

In [None]:
# Dates to split into: every day from 2022-06-01 to 2022-08-09 inclusive,
# formatted as 'YYYY-MM-DD' strings.
target_date = pd.date_range('2022-06-01', '2022-08-09').strftime('%Y-%m-%d').tolist()

df = pd.read_csv(root_path + "/data/alldf.csv")

# The comparison below needs plain 'YYYY-MM-DD' values. Without this step any
# stored value that is longer than 10 characters (e.g. a date followed by a
# time) never matches, and the per-date files would be written empty.
df["날짜"] = df["날짜"].astype(str).str[:10]

# Write one CSV per target date.
for date in target_date:
    df2 = df[df["날짜"]==date]
    df2.to_csv(root_path + f"/data/date/{date}.csv")

## 하루치 데이터 크롤링
+ 날짜 정보를 받아 데이터를 크롤링하는 모듈
+ 날짜는 컴퓨터에 내장된 시계에서 받아서 사용

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import datetime as dt

import datetime
import scrapetube
from googleapiclient.discovery import build

import soynlp
from soynlp.noun import LRNounExtractor_v2
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Look up a ticker code by company name
def get_code(symbol):
    """Return the short ticker code ('단축코드') for a company name.

    Reads ``<root_path>/data/code/krx_code.csv`` on every call and looks
    ``symbol`` up in its '한글 종목약명' column.

    Args:
        symbol: Company short name as listed in the '한글 종목약명' column.

    Returns:
        The matching '단축코드' value, or ``0`` (after printing a hint) when
        the lookup raises ``KeyError``.
    """
    krx = pd.read_csv(root_path + '/data/code/krx_code.csv', encoding='utf-8')
    krx = krx.set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except KeyError:
        # Only a failed lookup is treated as "unknown name"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        print('종목명을 다시 확인해주세요.')
        return 0

# Crawl posts from a stock's Naver Finance discussion board
def get_comment_csv(symbol,page,year,month,day):
    """Scrape discussion-board rows for ``symbol`` back to a cutoff date.

    Pages ``1..page`` of ``finance.naver.com/item/board.naver`` are requested
    in order. Crawling stops once a row dated before ``year-month-day`` has
    been read (that row is still included in the result), or when a page
    contains no rows.

    Args:
        symbol: Company name, resolved to a ticker code via ``get_code``.
        page: Maximum number of board pages to request.
        year, month, day: Cutoff date.

    Returns:
        DataFrame with columns '날짜', '댓글', '조회수', '좋아요', '싫어요'.
        Rows whose cell layout is not recognised carry the comment 'error'
        and zero counts.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        ValueError: if a row's date cell cannot be parsed as ``Y.M.D``.
    """
    code = get_code(symbol)
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    cutoff = dt.datetime(year, month, day)

    date_list = []     # post date
    comment_list = []  # post title / comment text
    view_list = []     # view count
    good_list = []     # likes
    bad_list = []      # dislikes

    reached_cutoff = False
    for i in range(1, page + 1):
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={i}'
        # A timeout keeps a stalled connection from hanging the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Parse the row list once per page instead of once per row.
        section = bs.find('div', {'class': 'section inner_sub'})
        rows = [] if section is None else section.find_all('tr', {'onmouseover': 'mouseOver(this)'})
        if not rows:
            # No rows on this page: treat it as the end of the board instead
            # of requesting the remaining pages.
            break

        for row in rows[:20]:
            root = row.text.split('\n')
            if len(root) < 2:
                # Row without a date cell: give up on the rest of this page.
                break

            date_list.append(root[1].replace('.', '-'))

            if len(root) == 14:  # reply row
                comment_list.append('답글:' + root[4])
                view_list.append(root[10])
                good_list.append(root[11])
                bad_list.append(root[12])
            elif len(root) == 13:  # regular row
                comment_list.append(root[3])
                view_list.append(root[9])
                good_list.append(root[10])
                bad_list.append(root[11])
            else:  # unrecognised layout
                comment_list.append('error')
                view_list.append(0)
                good_list.append(0)
                bad_list.append(0)

            # The date cell is "Y.M.D <time>"; only the date part is compared.
            tp = [int(part) for part in root[1].split()[0].split('.')]
            if dt.datetime(tp[0], tp[1], tp[2]) < cutoff:
                reached_cutoff = True
                break
        if reached_cutoff:
            break
        print(f'\r{i}페이지 크롤링 완료.',end='')

    df = pd.DataFrame()
    df['날짜'] = date_list
    df['댓글'] = comment_list
    df['조회수'] = view_list
    df['좋아요'] = good_list
    df['싫어요'] = bad_list
    return df

# Load the list of company names
def get_company_name():
    """Return the '종목명' column of KODEX_KTOP_30_20220629.xlsx as a list.

    The workbook is read with ``header=2`` and the row labelled 0 is dropped
    before the names are collected.
    """
    workbook_path = root_path + '/data/code/KODEX_KTOP_30_20220629.xlsx'
    holdings = pd.read_excel(workbook_path, header=2)
    holdings = holdings.drop(0, axis=0)
    return holdings['종목명'].tolist()

# Crawl board comments back to the given year/month/day for every company.
# The returned list holds one DataFrame per company, in company-name order.
def get_date_comment(year,month,day):
    """Crawl each company's board (up to 10000 pages) back to the cutoff date.

    Returns:
        List of DataFrames from ``get_comment_csv``, ordered like
        ``get_company_name()``.
    """
    frames = []
    for name in get_company_name():
        print(name,"크롤링")
        frames.append(get_comment_csv(name,10000,year,month,day))
        print()
    return frames

# YouTube comment crawler
class YoutubeAPI():
    """Collect one day's comments from a YouTube channel and save them as CSV.

    Constructing an instance runs the whole pipeline immediately:
    list the channel's video ids (scrapetube) -> fetch snippets for the
    first batch(es) of ids (YouTube Data API) -> keep videos whose
    ``publishedAt`` contains ``date`` -> page through their comment threads
    -> write ``<root_path>/data/youtube/sampro_<date>.csv``.

    Args:
        date: Date string matched as a substring of the API's
            ``publishedAt`` values (callers in this file pass 'YYYY-MM-DD').
        api_key: Optional YouTube Data API key. When omitted, the
            ``YOUTUBE_API_KEY`` environment variable is used, then the key
            embedded below.
    """
    def __init__ (self,date,api_key=None):
        import os

        self.date = date
        # SECURITY: this key is committed in source and should be treated as
        # leaked. Rotate it and supply the replacement through `api_key` or
        # the YOUTUBE_API_KEY environment variable; the literal is kept only
        # so existing callers keep working.
        self.api_key = (
            api_key
            or os.environ.get('YOUTUBE_API_KEY')
            or 'AIzaSyB-ttkB5mrZ6eo_iXlZdSv7zu105SgS2-E'
        )
        self.youtube = build('youtube', 'v3', developerKey=self.api_key)
        self.channel_id = 'UChlv4GSd7OQl3js-jkLOnFA' # 삼프로TV

        self.get_video_ids()

    def get_video_ids(self):
        """List the channel's video ids and hand them to ``get_video_infos``."""
        videos = scrapetube.get_channel(self.channel_id)
        video_ids = [video['videoId'] for video in videos]

        (date, date_range) = self.get_date_input()
        self.get_video_infos(video_ids, date, date_range)

    def get_date_input(self):
        """Return ``(date, date_range)``.

        ``date_range`` is fixed at 0, so ``get_video_infos`` inspects only
        the first 50 video ids.
        """
        date = self.date
        date_range = 0

        return (date, date_range)

    def get_video_infos(self, video_ids, date, date_range):
        """Keep videos published on ``date`` and pass them to ``get_comments``.

        Ids are queried in batches of 50, for ``date_range + 1`` batches.
        Videos whose title contains one of the excluded programme names are
        skipped.
        """
        excluded_titles = ('글로벌 이슈체크', '글로벌 마켓브리핑', '직장인 vlog')
        video_infos = []

        for i in range(date_range + 1):
            batch = video_ids[i * 50:(i + 1) * 50]
            if not batch:
                # Ran out of ids; do not send a request with an empty id list.
                break

            video_response = self.youtube.videos().list(
                part='snippet',
                id=','.join(batch)).execute()

            for item in video_response['items']:
                snippet = item['snippet']
                title = snippet['title']
                if any(keyword in title for keyword in excluded_titles):
                    continue
                # NOTE(review): substring match against the raw timestamp;
                # confirm which time zone `publishedAt` uses relative to `date`.
                if date in snippet['publishedAt'].split()[0]:
                    video_infos.append([title, snippet['publishedAt'], item['id']])

        df_ids = pd.DataFrame(video_infos, columns=['title', 'video_date', 'id'])

        self.get_comments(date, df_ids)

    def get_comments(self, date, df_ids):
        """Fetch top-level comments posted on ``date`` and write the CSV.

        Comment threads are requested 100 at a time and followed through
        ``nextPageToken`` until exhausted. The result is outer-merged with
        ``df_ids`` on 'id', so videos without matching comments still appear.
        """
        comments = []

        for video_id in df_ids['id']:
            page_token = None
            while True:
                request_args = {'part': 'snippet', 'videoId': video_id, 'maxResults': 100}
                if page_token:
                    request_args['pageToken'] = page_token
                # Reuse the client built in __init__ rather than building a
                # new one for every video.
                response = self.youtube.commentThreads().list(**request_args).execute()
                if not response:
                    break

                for item in response.get('items', []):
                    comment = item['snippet']['topLevelComment']['snippet']
                    if date in comment['publishedAt'].split()[0]:
                        comments.append([video_id, comment['textDisplay'], comment['authorDisplayName'], comment['publishedAt'], comment['likeCount']])

                page_token = response.get('nextPageToken')
                if not page_token:
                    break

        df_comments = pd.DataFrame(comments, columns=['id', 'comment', 'author', 'comment_date', 'num_likes'])

        df = pd.merge(df_comments, df_ids, on='id', how='outer')

        df.to_csv(root_path + f'/data/youtube/sampro_{date}.csv', index=False)

    def update_db(self, df):
        """Placeholder; not implemented."""
        pass

    def execute_daily(self):
        """Placeholder; not implemented."""
        pass

# Strip everything except Hangul from the comments
def clean_sents_df(target):
    """Return a cleaned copy of ``target`` with a '정제된 댓글' column.

    Cleaning steps applied to the '댓글' column:
      1. remove the "[삭제된 게시물의 답글]" marker,
      2. remove the "답글:" prefix,
      3. replace every non-Hangul-syllable character with a space,
      4. collapse runs of spaces and strip both ends.
    Rows whose cleaned text is empty are dropped and the index is reset.

    The patterns are regular expressions, so ``regex=True`` is passed
    explicitly: pandas >= 2.0 treats ``str.replace`` patterns as literal
    text by default, which would silently disable steps 1, 3 and 4.

    Args:
        target: DataFrame with a '댓글' column. It is not modified.

    Returns:
        New DataFrame with the extra '정제된 댓글' column.
    """
    df = target.copy()  # do not add columns to the caller's frame
    cleaned = df['댓글'].str.replace(r'\[삭제된 게시물의 답글\]', ' ', regex=True)
    cleaned = cleaned.str.replace('답글:', ' ', regex=False)
    cleaned = cleaned.str.replace(r'[^가-힣]', ' ', regex=True)
    cleaned = cleaned.str.replace(r' +', ' ', regex=True).str.strip()
    df['정제된 댓글'] = cleaned
    df = df[df['정제된 댓글'] != '']
    df = df.reset_index(drop=True)
    return df

# Build the tokenizer used to tokenize comments
def return_tokenizer():
    """Train a noun extractor on corpus_target.txt and wrap it in an LTokenizer.

    The corpus at ``<root_path>/data/code/corpus_target.txt`` is fed to
    ``LRNounExtractor_v2.train_extract``; each extracted entry's ``.score``
    becomes that word's score in the returned ``LTokenizer``.
    """
    corpus_path = root_path + "/data/code/corpus_target.txt"
    sentences = DoublespaceLineCorpus(corpus_path,iter_sent=True)
    extractor = LRNounExtractor_v2(verbose=True)
    extracted = extractor.train_extract(sentences)
    noun_scores = {}
    for word, stats in extracted.items():
        noun_scores[word] = stats.score
    return LTokenizer(scores=noun_scores)

# Collect the day's comment data (boards + YouTube) and return it
def get_data_list(y,m,d):
    """Return board frames crawled back to y-m-d plus that day's YouTube frame.

    Constructing ``YoutubeAPI(date)`` writes
    ``<root_path>/data/youtube/sampro_<date>.csv``; that file is then read
    back, reduced to 'video_date' and 'comment', and renamed to the board
    column names '날짜' and '댓글' before being appended last.
    """
    frames = get_date_comment(y,m,d)

    # Left-pad single-character month/day with a zero.
    month_part = f"0{m}" if len(str(m)) == 1 else m
    day_part = f"0{d}" if len(str(d)) == 1 else d
    date = f"{y}-{month_part}-{day_part}"

    YoutubeAPI(date)  # called for its side effect: the CSV read just below
    youtube_frame = pd.read_csv(root_path + f"/data/youtube/sampro_{date}.csv")
    youtube_frame = youtube_frame[["video_date","comment"]]
    youtube_frame.columns = ["날짜","댓글"]

    frames.append(youtube_frame)
    return frames

# Clean and tokenize every frame, collecting the results in a list
def comment_prep(data_list):
    """Return cleaned frames with tokenized comments and 'YYYY-MM-DD' dates.

    For each input frame:
      * ``clean_sents_df`` adds '정제된 댓글' and drops empty rows,
      * every cleaned comment is tokenized into '토큰화 댓글',
      * '날짜' is cut to its first 10 characters.

    The date column is converted with ``astype(str)`` before slicing, so a
    missing or non-string date no longer raises; it becomes a string that
    will not match any real date.
    """
    tokenizer = return_tokenizer()

    pp_list = []
    for company_data in data_list:
        target_df = clean_sents_df(company_data)
        target_df['토큰화 댓글'] = [tokenizer(str(text)) for text in target_df['정제된 댓글']]
        # Vectorised replacement for the per-row slicing loop.
        target_df["날짜"] = target_df["날짜"].astype(str).str[:10]
        pp_list.append(target_df)

    return pp_list

# Crawl, preprocess and save one day's comments
def date_crawler(y,m,d):
    """Crawl and clean all sources, then write the rows dated y-m-d.

    Output: ``<root_path>/data/date/<YYYY-MM-DD>.csv`` with the columns
    '날짜' and '정제된 댓글'; rows with a missing value are dropped.

    Args:
        y, m, d: Year, month and day (ints, or strings of digits).
    """
    data_list = get_data_list(y,m,d)
    pp_list = comment_prep(data_list)
    # One concat over the whole list instead of re-copying the growing frame.
    df_day = pd.concat(pp_list)

    today = f"{y}-{int(m):02d}-{int(d):02d}"
    df_day = df_day[["날짜","정제된 댓글"]]
    # dropna() returns a new frame; avoids in-place edits on a filtered slice.
    df = df_day[df_day["날짜"] == today].dropna()
    df.to_csv(root_path + f"/data/date/{today}.csv")

def get_today_ymd():
    """Return today's (year, month, day) from the system clock.

    The clock is read once so the three parts always belong to the same
    day, even when the call happens around midnight.
    """
    today = datetime.datetime.today()
    return today.year, today.month, today.day

+ 댓글 크롤링

In [None]:
# Run the crawl: read today's date from the system clock and pass it to the crawler
y,m,d = get_today_ymd()
date_crawler(y,m,d)


## 버트모델 학습

+ 구글 드라이브 연결

In [None]:
# Use the Google Colab helper package
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

<!-- !pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.96
!pip install transformers==3.0.2
!pip install torch
!pip install gluonnlp
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install pykrx
!pip install soynlp
!pip install xlrd==1.2.0 -->

+ 모듈 적재

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split

import pandas as pd

+ GPU 설정

In [None]:
# Device setup: use the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+ 모델 학습

In [None]:
# Load the labelled training data
file_path2 = f"{root_path}/data/code/train_label_-1_1.csv"
df2 = pd.read_csv(file_path2)

# Map label -1 to 0 and store the column as int32.
# The result is assigned back to the column: element-wise chained assignment
# (df2["label"].iloc[i] = 0) is not guaranteed to write through to df2, and a
# bare astype() call discards its result.
df2["label"] = df2["label"].replace(-1, 0).astype("int32")

# Train / test split (80 / 20, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(df2["정제된 댓글"],df2["label"],test_size=0.2,random_state=11)
df_train = pd.concat([x_train,y_train],axis = 1)
df_test = pd.concat([x_test,y_test],axis = 1)

# Hyper-parameters
max_len = 128
batch_size = 64
warmup_ratio = 0.1
num_epochs = 2
max_grad_norm = 1
log_interval = 100
learning_rate =  0.00001

# Dataset class that converts the data into the form the KoBERT model expects
class BERTDataset(Dataset):
    """Dataset of transformed sentences paired with int32 labels.

    ``dataset[sent_idx]`` and ``dataset[label_idx]`` are iterated as the
    sentence and label collections; every sentence goes through
    ``nlp.data.BERTSentenceTransform``.
    """
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = list(map(lambda text: to_bert_input([text]), dataset[sent_idx]))
        self.labels = list(map(np.int32, dataset[label_idx]))

    def __getitem__(self, i):
        # The transform output tuple with the label appended as the last item.
        features = self.sentences[i]
        target = (self.labels[i], )
        return features + target

    def __len__(self):
        return len(self.labels)


# Classifier that wraps the KoBERT module with a linear classification head
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear layer.

    Args:
        bert: Encoder module; called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` and expected to return a pair whose
            second element is the pooled output.
        hidden_size: Width of the pooled output.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output; no
            dropout layer is created when it is ``None`` or 0.
        params: Unused.
    """
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Fall back to the raw pooled output when dropout is disabled;
        # previously `out` was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

# Load the pretrained KoBERT model and its vocabulary
bertmodel, vocab = get_pytorch_kobert_model()

# Tokenizer built from the KoBERT tokenizer and the vocabulary above
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Convert the train / test frames into BERT inputs and wrap them in loaders
data_train = BERTDataset(df_train, "정제된 댓글", "label", tok, max_len, True, False)
data_test = BERTDataset(df_test, "정제된 댓글", "label", tok, max_len, True, False)
# NOTE(review): no shuffle=True on the training loader — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

# Build the classifier (dropout 0.5) and move it to the target device
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

# Optimizer setup: parameters whose name contains 'bias' or
# 'LayerNorm.weight' get weight decay 0.0, all others 0.01
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total optimizer steps and the share of them used for warm-up
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

# Cosine learning-rate schedule with warm-up over the whole run
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Model training: accuracy helper
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose arg-max equals ``Y``.

    Args:
        X: 2-D tensor of scores, one row per sample.
        Y: 1-D tensor of target class indices.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    batch_len = predicted.size()[0]
    return n_correct / batch_len

# Train for num_epochs; after each epoch evaluate on the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    # Report the epoch average (previously accumulated but never shown).
    print("epoch {} train acc {}".format(e+1, train_acc / max(len(train_dataloader), 1)))

    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / max(len(test_dataloader), 1)))

# 모델 저장
# torch.save(model, root_path +"/data/model/"+'라벨_2_dr_0.5.pt') 
# torch.save(model.state_dict(),root_path +"/data/model/"+ 'model_state_dict_라벨_2_dr_0.5.pt')

+ 날짜별 데이터 공포/탐욕 점수 예측

In [None]:
# Load the pretrained KoBERT model and its vocabulary
bertmodel, vocab = get_pytorch_kobert_model()

# Parameters (same values as in the training cell)
max_len = 128
batch_size = 64
warmup_ratio = 0.1
num_epochs = 2
max_grad_norm = 1
log_interval = 100
learning_rate =  0.00001

# Dataset that prepares the rows fed into KoBERT
class BERTDataset(Dataset):
    """Dataset of transformed sentences paired with int32 labels.

    ``dataset`` is iterated row by row; ``row[sent_idx]`` is the sentence
    and ``row[label_idx]`` the label. Every sentence goes through
    ``nlp.data.BERTSentenceTransform``.
    """
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = list(map(lambda row: to_bert_input([row[sent_idx]]), dataset))
        self.labels = list(map(lambda row: np.int32(row[label_idx]), dataset))

    def __getitem__(self, i):
        # The transform output tuple with the label appended as the last item.
        features = self.sentences[i]
        target = (self.labels[i], )
        return features + target

    def __len__(self):
        return len(self.labels)

# fear/greed scoring: softmax expressed in percent
def new_softmax(a) :
    """Return the softmax of ``a`` as percentages rounded to 3 decimals.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow.
    """
    shifted = a - np.max(a)
    weights = np.exp(shifted)
    percent = (weights / np.sum(weights)) * 100
    return np.round(percent, 3)

# Classifier definition
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear layer.

    Args:
        bert: Encoder module; called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` and expected to return a pair whose
            second element is the pooled output.
        hidden_size: Width of the pooled output.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output; no
            dropout layer is created when it is ``None`` or 0.
        params: Unused.
    """
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Fall back to the raw pooled output when dropout is disabled;
        # previously `out` was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

# Prediction helper
def predict(predict_sentence):
    """Score one sentence with the loaded classifier.

    Args:
        predict_sentence: Text to classify.

    Returns:
        ``[p_fear, p_greed, emotion]`` where the two probabilities are
        percentages rounded to 3 decimals (class 0 and class 1) and
        ``emotion`` is "fear" when class 0 has the larger share, otherwise
        "greed".
    """
    # The dataset needs a label column; '0' is a dummy value.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # A single sentence does not need worker processes; starting them for
    # every call dominated the run time.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    probability = []
    with torch.no_grad():  # inference only
        for token_ids, valid_length, segment_ids, _label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            for logits in out:
                percents = np.round(new_softmax(logits.detach().cpu().numpy()), 3).tolist()
                emotion = "fear" if np.argmax(percents) == 0 else "greed"
                probability = [np.round(p, 3) for p in percents]
                probability.append(emotion)
    return probability

## Load the trained model
# map_location lets a checkpoint saved on a GPU load on whatever `device` is
# in use (including CPU-only hosts).
# NOTE(review): the first torch.load unpickles a whole model object; only load
# checkpoint files from a trusted source.
model = torch.load(root_path +"/data/model/"+'라벨_2_dr_0.5.pt', map_location=device)
model.load_state_dict(torch.load(root_path +"/data/model/"+'model_state_dict_라벨_2_dr_0.5.pt', map_location=device))

# Tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
# Predict fear/greed scores for each date file
import os

# Every day from 2022-06-01 to 2022-08-18 inclusive, as 'YYYY-MM-DD' strings.
target_date = pd.date_range('2022-06-01', '2022-08-18').strftime('%Y-%m-%d').tolist()

for date in target_date:
    score_path = root_path + f"/data/score/{date}_score.csv"
    date_path = root_path + f"/data/date/{date}.csv"

    # Skip dates that already have a score file.
    if os.path.exists(score_path):
        print(f"{date} 점수 예측 완료")
        continue
    # Skip dates with no crawled data file.
    if not os.path.exists(date_path):
        continue

    try:
        df = pd.read_csv(date_path)
        print(f"{date} 예측 진행중")
        comments = df["정제된 댓글"]
        df_pred = []
        for i, comment in enumerate(comments, start=1):
            print(f'\r{i}/{len(comments)} 완료.',end='')
            fg_result = predict(comment)
            # Signed confidence gap: negative for fear, positive for greed.
            score = abs(fg_result[0] - fg_result[1])
            if fg_result[2] == "fear":
                score = -score
            df_pred.append(score)

        # Write once, after every comment is scored. Writing inside the loop
        # rewrote the file per comment and left a partial file behind on
        # failure, which then made the date look finished.
        if df_pred:
            # NOTE(review): "socre" looks like a typo for "score"; the column
            # name is kept because readers of these files may depend on it.
            df2 = pd.DataFrame(df_pred,columns = ["socre"])
            df2.to_csv(score_path)
        print()
    except Exception as err:
        # Keep going with the next date, but report the failure instead of
        # hiding it.
        print(f"\n{date} 예측 실패: {err}")