## 날짜별 댓글 합치기
+ youtube 폴더의 sampro 파일은 유튜브 댓글 로우데이터
+ 그 외 종토방 댓글은 실시간 크롤링으로 받아옴

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import datetime as dt

import datetime
import scrapetube
from googleapiclient.discovery import build

import soynlp
from soynlp.noun import LRNounExtractor_v2
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

import warnings
warnings.filterwarnings('ignore')

# Project root directory; the data paths used below are built from it.
root_path = "C:/sh/study/krx데이콘/krx_2022/sh"

In [4]:
# Look up a stock's short code by its Korean short name.
def get_code(symbol):
    """Return the KRX short code ('단축코드') of the stock named *symbol*.

    The lookup table is read from ``data/code/krx_code.csv`` under
    ``root_path`` on every call.

    Args:
        symbol: Korean short name of the stock, matched against the
            '한글 종목약명' column.

    Returns:
        The '단축코드' value for that stock, read as text so that leading
        zeros are preserved. If the lookup fails, a message is printed and
        ``0`` is returned.
    """
    krx = pd.read_csv(
        root_path + '/data/code/krx_code.csv',
        encoding='utf-8',
        # Numeric inference would turn a code such as "005930" into 5930,
        # which no longer works in the board URL built from it.
        dtype={'단축코드': str},
    )
    krx = krx.set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except KeyError:
        # Unknown stock name (or missing code column): keep the 0 sentinel
        # that callers already receive, without hiding unrelated errors.
        print('종목명을 다시 확인해주세요.')
        return 0

# Scrape posts from a stock's Naver Finance discussion board.
def _parse_board_row(fields):
    """Pick (comment, views, likes, dislikes) out of one row's split text.

    The number of newline-separated fields distinguishes the row layouts:
    14 fields is a reply, 13 is a regular post. Any other length is recorded
    as an ``'error'`` placeholder with zero counts.
    """
    if len(fields) == 14:  # reply: the extra cell shifts every column by one
        return '답글:' + fields[4], fields[10], fields[11], fields[12]
    if len(fields) == 13:  # regular post
        return fields[3], fields[9], fields[10], fields[11]
    return 'error', 0, 0, 0


def get_comment_csv(symbol, page, year, month, day):
    """Collect board posts for *symbol* until one is older than the cutoff.

    Board pages 1..``page`` are requested in order and at most 20 rows are
    read from each. Crawling stops at the first row dated before
    ``year``-``month``-``day``; that row is still part of the result, which
    matches the previous behaviour. A page without usable rows is skipped
    and crawling continues with the next page.

    Args:
        symbol: Korean short name of the stock, resolved via ``get_code``.
        page: Highest page number to request.
        year: Cutoff year.
        month: Cutoff month.
        day: Cutoff day.

    Returns:
        A DataFrame with the columns '날짜', '댓글', '조회수', '좋아요' and
        '싫어요', one row per scraped post. It is empty when ``get_code``
        could not resolve the stock name.

    Raises:
        requests.exceptions.RequestException: If a request fails or a page
            does not respond within 30 seconds.
    """
    columns = {'날짜': [], '댓글': [], '조회수': [], '좋아요': [], '싫어요': []}
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    cutoff = dt.datetime(year, month, day)

    code = get_code(symbol)
    # get_code signals an unknown name with the int 0; requesting pages for
    # that code would only produce empty results, so skip the crawl.
    unresolved = isinstance(code, int) and code == 0
    last_page = 0 if unresolved else page

    reached_cutoff = False
    for i in range(1, last_page + 1):
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={i}'
        # A timeout keeps one stalled response from hanging the whole crawl.
        res = requests.get(url, headers=headers, timeout=30)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Locate the row list once per page instead of once per row.
        section = bs.find('div', {'class': 'section inner_sub'})
        if section is None:
            rows = []
        else:
            rows = section.find_all('tr', {'onmouseover': 'mouseOver(this)'})

        for row in rows[:20]:
            fields = row.text.split('\n')
            if len(fields) < 2:
                # No date cell: stop reading this page, as before.
                break
            comment, views, likes, dislikes = _parse_board_row(fields)
            columns['날짜'].append(fields[1].replace('.', '-'))
            columns['댓글'].append(comment)
            columns['조회수'].append(views)
            columns['좋아요'].append(likes)
            columns['싫어요'].append(dislikes)

            # The date cell looks like "YYYY.MM.DD hh:mm"; compare the date part.
            posted = [int(part) for part in fields[1].split()[0].split('.')]
            if dt.datetime(posted[0], posted[1], posted[2]) < cutoff:
                reached_cutoff = True
                break
        if reached_cutoff:
            break
        print(f'\r{i}페이지 크롤링 완료.',end='')

    df = pd.DataFrame()
    for name, values in columns.items():
        df[name] = values
    return df

# Load the stock names to crawl.
def get_company_name():
    """Return the '종목명' column of the KODEX KTOP 30 workbook as a list.

    The workbook is read with ``header=2`` and the row labelled 0 is
    dropped before the names are extracted.
    """
    workbook = root_path + '/data/code/KODEX_KTOP_30_20220629.xlsx'
    holdings = pd.read_excel(workbook, header=2)
    holdings = holdings.drop(0, axis=0)
    names = holdings.종목명
    return names.tolist()

# Crawl board comments back to the given date for every listed company.
def get_date_comment(year,month,day):
    """Return one comment DataFrame per company, in company-list order.

    Each company from ``get_company_name`` is crawled with
    ``get_comment_csv`` using a page limit of 10000 and the given cutoff
    date. Progress is printed per company.
    """
    frames = []
    for name in get_company_name():
        print(name,"크롤링")
        frames.append(get_comment_csv(name,10000,year,month,day))
        print()
    return frames

# Strip everything except Hangul syllables from the comments.
def clean_sents_df(target):
    """Return a copy of *target* with a cleaned '정제된 댓글' column added.

    Cleaning steps applied to the '댓글' column:
      1. remove the "[삭제된 게시물의 답글]" marker and the "답글:" prefix,
      2. replace every character outside 가-힣 with a space,
      3. collapse runs of spaces and trim both ends.

    Rows whose cleaned text is an empty string are dropped and the index is
    renumbered from 0. The input frame is left unmodified.

    Args:
        target: DataFrame with a '댓글' column of strings.

    Returns:
        A new DataFrame containing the original columns plus '정제된 댓글'.
    """
    df = target.copy()
    # regex is passed explicitly: from pandas 2.0 the default of
    # Series.str.replace is literal matching, which would silently skip the
    # pattern-based steps.
    cleaned = (
        df['댓글']
        .str.replace(r'\[삭제된 게시물의 답글\]', ' ', regex=True)
        .str.replace('답글:', ' ', regex=False)
        .str.replace('[^가-힣]', ' ', regex=True)
        .str.replace(' +', ' ', regex=True)
        .str.strip()
    )
    df['정제된 댓글'] = cleaned
    df = df[df['정제된 댓글'] != '']
    return df.reset_index(drop=True)

# Build the tokenizer used for the comments from the prepared corpus.
def return_tokenizer():
    """Train a noun extractor on the corpus file and return an LTokenizer.

    The corpus is read from ``data/code/corpus_target.txt`` under
    ``root_path``. Each extracted word's ``.score`` attribute becomes that
    word's entry in the tokenizer's score table.
    """
    corpus_path = root_path + "/data/code/corpus_target.txt"
    sentences = DoublespaceLineCorpus(corpus_path,iter_sent=True)
    extractor = LRNounExtractor_v2(verbose=True)
    extracted = extractor.train_extract(sentences)
    noun_scores = {}
    for word, stat in extracted.items():
        noun_scores[word] = stat.score
    return LTokenizer(scores=noun_scores)

# Gather the raw comment frames: crawled board posts plus YouTube comments.
def get_data_list():
    """Return the list of raw comment DataFrames to preprocess.

    Board comments are crawled for every company back to 2022-06-01, then
    the YouTube comment file ``data/youtube/sampro.csv`` is appended as the
    last element. Nothing is written to disk here.
    """
    # Crawl everything posted since 2022-06-01.
    data_list = get_date_comment(2022,6,1)
    data_list.append(pd.read_csv(root_path + "/data/youtube/sampro.csv"))
    return data_list

# Clean, tokenize and date-normalise every frame in the list.
def comment_prep(data_list):
    """Return the preprocessed version of each DataFrame in *data_list*.

    For every frame the comments are cleaned with ``clean_sents_df``, a
    '토큰화 댓글' column is filled by running the tokenizer on each cleaned
    comment, and '날짜' is cut down to its first 10 characters.
    """
    tokenizer = return_tokenizer()

    pp_list = []
    for raw_frame in data_list:
        prepared = clean_sents_df(raw_frame)
        prepared['토큰화 댓글'] = [
            tokenizer(str(sentence)) for sentence in prepared['정제된 댓글']
        ]
        # Keep only the leading 10 characters of each date value.
        prepared["날짜"] = [stamp[:10] for stamp in prepared["날짜"]]
        pp_list.append(prepared)

    return pp_list

# Crawl, preprocess and save the comments.
def date_crawler():
    """Run the full pipeline and write today's comments to ``data/alldf.csv``.

    All preprocessed frames are stacked, reduced to the '날짜' and
    '정제된 댓글' columns, filtered to rows dated today, stripped of rows with
    missing values, and written to ``data/alldf.csv`` under ``root_path``
    (index included).
    """
    pp_list = comment_prep(get_data_list())
    # One concat over the whole list avoids re-copying the accumulated frame
    # on every iteration.
    df_day = pd.concat(pp_list)[["날짜","정제된 댓글"]]
    today = str(datetime.datetime.today())[:10]
    # NOTE(review): only today's rows are kept, yet the crawl goes back to
    # 2022-06-01 and alldf.csv is later split by a list of past dates.
    # Confirm that this filter is intended.
    # dropna() returns a new frame instead of mutating a filtered slice.
    df = df_day[df_day["날짜"] == today].dropna()
    df.to_csv(root_path + "/data/alldf.csv")

In [3]:
# Dates to export: every day from 2022-06-01 through 2022-08-09, as
# 'YYYY-MM-DD' strings.
target_date = [
    (dt.date(2022, 6, 1) + dt.timedelta(days=offset)).isoformat()
    for offset in range((dt.date(2022, 8, 9) - dt.date(2022, 6, 1)).days + 1)
]

In [5]:
# Run the whole pipeline: crawl, preprocess, and write data/alldf.csv.
date_crawler()

삼성전자 크롤링
2660페이지 크롤링 완료.

In [None]:
# Split the saved comment file into one CSV per target date.
df = pd.read_csv(f"{root_path}/data/alldf.csv")

for date in target_date:
    # A file is written for every date, even when no rows match.
    df2 = df.loc[df["날짜"].eq(date)]
    df2.to_csv(f"{root_path}/data/date/{date}.csv")