- 원본 데이터: newsdata/
    - ‘Content’ 컬럼: 뉴스 본문 원본 그대로
    - ‘Content_split’ 컬럼: ‘Content’ 컬럼 데이터를 kss 라이브러리로 문장 분리한 결과입니다.
- 전처리 이후 데이터: preprocessed_data/
    - ‘Sent’ 컬럼이 전처리 이후 기사 본문이고, 나머지는 기사 제목, 랭킹, 조회수 등 메타데이터입니다.
    - 언론사별 상위 20개씩 정렬되어 있습니다. (ex. MBC 1-20위 기사 -> KBS 1-20위 기사 -> SBS 1-20위 기사)
    - 넉넉하게 8/1 ~ 10/13 기간 동안 방송 3사의 상위 20개 기사를 수집했습니다. 총 74개의 파일, (31+30+13)x20x3=4440개의 기사입니다.


In [None]:
### kss: 문장 분리에 사용
### newspaper3k: 뉴스 기사 크롤링에 사용

# !pip install kss
# !pip install newspaper3k

## 1. 네이버 뉴스 크롤링 - 기사 분야별

In [1]:
import os
import re
import kss
import json
import glob
import time
import requests
import pandas as pd
from newspaper import Article
from bs4 import BeautifulSoup
from pathlib import Path

def get_href(soup):
    """Return the article links found on a section list page.

    Looks up the ``<div class="list_body newsflash_body">`` container in
    ``soup`` and, for every ``<dt class="photo">`` inside it, takes the
    ``href`` of the first ``<a>`` element.

    Args:
        soup: Parsed list page exposing ``find`` (a BeautifulSoup document
            in this notebook).

    Returns:
        list: The collected ``href`` values, in page order.
    """
    container = soup.find("div", class_="list_body newsflash_body")
    return [
        thumbnail.find("a")["href"]
        for thumbnail in container.find_all("dt", class_="photo")
    ]

In [2]:
def get_href_daily(section, date, sid2=245, timeout=10):
    """Fetch one day's Naver News list page and return its article links.

    Args:
        section: Top-level section code sent as ``sid1`` (e.g. ``103``).
        date: Day to fetch, in ``YYYYMMDD`` form (int or str).
        sid2: Sub-section code sent as ``sid2``. Defaults to ``245``, the
            value that was previously hard-coded in the URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang a multi-day crawl.

    Returns:
        list: Article hrefs extracted by ``get_href``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    custom_header = {
        'referer' : 'https://www.naver.com/',
        'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Resulting URL, e.g.:
    # https://news.naver.com/main/list.naver?mode=LS2D&mid=shm&sid2=245&sid1=103&date=20220801
    url = "https://news.naver.com/main/list.naver"
    query = {
        "mode": "LS2D",
        "mid": "shm",
        "sid2": str(sid2),
        "sid1": str(section),
        "date": str(date),
    }
    req = requests.get(url, params=query, headers=custom_header, timeout=timeout)
    # Fail here with a clear HTTP error instead of later while parsing an
    # error page that lacks the expected markup.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    return get_href(soup)


def crawl_daily(news_list, date, save_path):
    """Download every article of one day and save them as a JSON table.

    Each URL is fetched and parsed with ``newspaper.Article``. The result
    is written to ``<save_path>/<date>.json`` with the columns ``date``,
    ``id`` (position of the article in ``news_list``, as a string),
    ``title``, ``content`` (article text split on newlines) and ``url``.

    Args:
        news_list: Article URLs to download.
        date: Day the articles belong to; used for the ``date`` column and
            the output file name.
        save_path: Directory that receives the JSON file. It is created
            when it does not exist yet.
    """
    # Create the target directory before downloading anything: otherwise a
    # missing directory is only detected by ``to_json`` after the whole
    # day has already been crawled.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    titles = []
    contents = []
    for news_url in news_list:
        article = Article(news_url, language='ko')
        article.download()
        article.parse()
        titles.append(article.title)
        contents.append(article.text.split('\n'))

    df = pd.DataFrame({'date': [str(date)] * len(news_list),
                       'id': [str(i) for i in range(len(news_list))],
                       'title': titles,
                       'content': contents,
                       'url': news_list
                      })

    out_path = os.path.join(save_path, str(date) + '.json')
    df.to_json(out_path, orient='table', index=False, force_ascii=False, indent=4)
    print('Saved to: ', out_path)

In [14]:
from datetime import datetime, timedelta

# Naver News section codes (sid1):
# politics 100, economy 101, society 102, life/culture 103, world 104, IT/science 105

# Smoke-test configuration
# section = 103
# date = 20220801
# days = 3
# save_path = '/workspace/chitchat_cb/test'


section = 103

### August
# date = 20220801
# days = 31

### September
# date = 20220901
# days = 30

### October
date = 20221001
days = 24


save_path = '/workspace/chitchat_cb/newsdata_culture'

# Step through real calendar days. Incrementing the YYYYMMDD integer with
# ``date += 1`` yields invalid dates as soon as a run crosses a month
# boundary (20220831 -> 20220832).
current_day = datetime.strptime(str(date), '%Y%m%d').date()
for _ in range(days):
    news_list = get_href_daily(section, date)
    crawl_daily(news_list, date, save_path)
    current_day += timedelta(days=1)
    date = int(current_day.strftime('%Y%m%d'))

Saved to:  /workspace/chitchat_cb/newsdata_culture/20221001.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221002.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221003.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221004.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221005.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221006.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221007.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221008.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221009.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221010.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221011.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221012.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221013.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221014.json
Saved to:  /workspace/chitchat_cb/newsdata_culture/20221015.json
Saved to:  /workspace/chi

## 2. 전처리
- [참고 코드](https://github.com/HanNayeoniee/boostcamp/blob/main/week10-KLUE/(2%EA%B0%95)%20%EC%9E%90%EC%97%B0%EC%96%B4%EC%9D%98%20%EC%A0%84%EC%B2%98%EB%A6%AC%20-%200_%ED%95%9C%EA%B5%AD%EC%96%B4%EC%A0%84%EC%B2%98%EB%A6%AC.ipynb)
- private repo이므로 권한 요청하기

In [9]:
### Sentence splitting with the kss library
def split_sentence(data):
    """Split every text chunk into sentences and flatten the result.

    Args:
        data: Iterable of strings (the newline-separated chunks of one
            article in this notebook).

    Returns:
        list: All sentences returned by ``kss.split_sentences`` for each
        chunk, concatenated in input order.
    """
    return [
        sentence
        for chunk in data
        for sentence in kss.split_sentences(chunk)
    ]


# A sentence matching any of these patterns is dropped as a whole.
# Raw strings are required: '\d' and '\(' in a normal string literal are
# invalid escape sequences and trigger a SyntaxWarning on Python 3.12+.
_DROP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'[\w\.-]+@[\w\.-]+',  # e-mail address (name@host)
    r'@[\w\.-]+',  # @handle
    r'\d{2,3}-\d{3,4}-\d{4}$',  # landline phone number at the end
    r'\d{3}-\d{3,4}-\d{4}$',  # mobile phone number at the end
    r'^◀',
    r'^▷',
    r'^=',
    r'^MBC뉴스',
    r'^영상제공 : ',
    # KBS
    r'^KBS 뉴스',
    r'^영상편집:',
    r'^촬영기자:',
    r'^\(영상취재 :',
))


def remove_pattern(texts):
    """Drop sentences that contain contact details or broadcaster boilerplate.

    A sentence is removed when it contains an e-mail address or @handle,
    ends with a phone number, or starts with one of the markers/credit
    prefixes listed in ``_DROP_PATTERNS``.

    Args:
        texts: Iterable of sentence strings.

    Returns:
        list: The sentences that match none of the patterns, in input order.
    """
    return [
        text for text in texts
        if not any(pattern.search(text) for pattern in _DROP_PATTERNS)
    ]


def remove_stops(texts):
    """Remove sentences that are exactly one of the known boilerplate lines.

    Only whole-sentence matches are removed; a sentence that merely
    contains one of the boilerplate strings is kept.

    Args:
        texts: Iterable of sentence strings.

    Returns:
        list: The remaining sentences, in input order.
    """
    boilerplate = ['MBC 뉴스는 24시간 여러분의 제보를 기다립니다.', '[뉴스투데이]', '[탐사기획 스트레이트]']
    return [sentence for sentence in texts if sentence not in boilerplate]

In [10]:
# Credit and tag patterns stripped by ``remove_press``, applied top to bottom.
# Order matters: a more specific pattern must run before a general one that
# matches part of the same text. Previously the single-role credit patterns
# ran before the compound "A·B" ones, so "영상취재·편집: 위동원" was reduced
# to the residue "영상취재·", and "(영상편집 : 박지인" to "(".
_PRESS_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"\(사진=[가-힣]{3,4}\)$",  # (사진=연합뉴스)
    r"\(사진=[가-힣]{3,4} [가-힣]{3,4}\)$",  # (사진=온라인 커뮤니티)
    # Compound credits first, so the single-role patterns cannot eat their tail.
    r"(영상취재|영상편집|그래픽|영상취재|편집)·(영상취재|영상편집|그래픽|영상취재|편집): [가-힣]{2,4}",  # 영상취재·편집: 위동원
    r"(영상취재|영상편집|그래픽|영상취재|편집)·(영상취재|영상편집|그래픽|영상취재|편집):[가-힣]{2,4}",  # 영상취재·편집:위동원
    r"(촬영기자:|영상편집:|그래픽:|영상취재:|편집:)[가-힣]{2,4}",  # 촬영기자:윤대민/영상편집:최근혁/그래픽:김석훈
    r"(촬영기자: |영상편집: |그래픽: |영상취재: |편집: )[가-힣]{2,4}",  # 촬영기자: 윤대민/영상편집: 최근혁/그래픽: 김석훈
    # Two-person credit before the one-person forms.
    r"\((영상취재 : |영상편집 : |그래픽 : |편집: )[가-힣]{2,4}·[가-힣]{2,4}",  # (영상취재 : 김균종·조창현
    # Parenthesised form before the bare form so the "(" is removed as well.
    # NOTE(review): the "\)" inside the last alternative of the next two
    # patterns looks like a typo for "편집 : " - kept as is, please confirm.
    r"\((영상취재 : |영상편집 : |그래픽 : |편집 : \))[가-힣]{2,4}",  # (영상편집 : 박지인
    r"(영상취재 : |영상편집 : |그래픽 : |편집 : \))[가-힣]{2,4}",  # 영상편집 : 박지인
    r"\[탐사기획 스트레이트]",
    r"\[설문 참여하기]",
    r"<기자>",
    r"<앵커>",
    r"앵커>",
))


def remove_press(texts):
    """Strip photo/staff credits and speaker tags from each sentence.

    Every pattern in ``_PRESS_PATTERNS`` is removed in turn; the sentence
    is whitespace-stripped after each removal so that the ``$``-anchored
    patterns keep matching at the end of the text.

    Args:
        texts: Iterable of sentence strings.

    Returns:
        list: The cleaned sentences in input order, without the ones that
        became empty.
    """
    outs = []
    for text in texts:
        for pattern in _PRESS_PATTERNS:
            text = pattern.sub("", text).strip()
        if text:
            outs.append(text)
    return outs

In [11]:
def remove_url(texts):
    """Remove URLs from each sentence and drop sentences left empty.

    Two kinds of links are removed: ``scheme://...`` / ``www....`` style
    addresses, and ``pic.<host>...`` links. The text is whitespace-stripped
    after each removal, e.g. ``주소: www.naver.com`` -> ``주소:``.

    Args:
        texts: Iterable of sentence strings.

    Returns:
        list: The cleaned, non-empty sentences in input order.
    """
    web_link = re.compile(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*")
    pic_link = re.compile(r"pic\.(\w+\.)+\S*")

    cleaned = []
    for sentence in texts:
        sentence = web_link.sub("", sentence).strip()
        sentence = pic_link.sub("", sentence).strip()
        if not sentence:
            continue
        cleaned.append(sentence)
    return cleaned


def filter(texts):
    """Delete decorative symbols from each sentence and drop empty results.

    The symbols are removed in place without trimming, so surrounding
    whitespace is preserved.

    Args:
        texts: Iterable of sentence strings.

    Returns:
        list: The sentences with the symbols removed, excluding the ones
        that became empty strings.
    """
    without_symbols = (re.sub('[▲━■▶◀△☎■▲※🎧]', '', sentence) for sentence in texts)
    return [sentence for sentence in without_symbols if sentence]

In [15]:
## Run the preprocessing pipeline over every crawled file

target = './newsdata_culture/*.json'
json_list = glob.glob(target)
print('json파일 개수:', len(json_list))

# Make sure the output directory exists before the first write.
os.makedirs('./newsdata_culture_res/', exist_ok=True)

for file in json_list:
    # The files hold non-ASCII text (saved with force_ascii=False), so do not
    # rely on the platform default encoding when reading them back.
    with open(file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)["data"]
    df = pd.DataFrame(json_data)

    sents = []
    for data in df["content"]:
        split_sent = split_sentence(data)
        outs = remove_pattern(split_sent)
        outs = remove_stops(outs)
        outs = remove_press(outs)
        outs = remove_url(outs)
        outs = filter(outs)
        # Store the cleaned sentences. Appending ``split_sent`` here saved the
        # raw split text and silently discarded every cleaning step above.
        sents.append(outs)

    df['sent'] = sents
    save_path = os.path.join('./newsdata_culture_res/', Path(file).stem + '.json')
    df.to_json(save_path, orient='table', index=False, force_ascii=False, indent=4)
    print('Save to:', save_path)

json파일 개수: 85
Save to: ./newsdata_culture_res/20220815.json
Save to: ./newsdata_culture_res/20220822.json
Save to: ./newsdata_culture_res/20221012.json
Save to: ./newsdata_culture_res/20220804.json
Save to: ./newsdata_culture_res/20221008.json
Save to: ./newsdata_culture_res/20220910.json
Save to: ./newsdata_culture_res/20220823.json
Save to: ./newsdata_culture_res/20220814.json
Save to: ./newsdata_culture_res/20220912.json
Save to: ./newsdata_culture_res/20221010.json
Save to: ./newsdata_culture_res/20220830.json
Save to: ./newsdata_culture_res/20220827.json
Save to: ./newsdata_culture_res/20220922.json
Save to: ./newsdata_culture_res/20221024.json
Save to: ./newsdata_culture_res/20220802.json
Save to: ./newsdata_culture_res/20220909.json
Save to: ./newsdata_culture_res/20220803.json
Save to: ./newsdata_culture_res/20220923.json
Save to: ./newsdata_culture_res/20220806.json
Save to: ./newsdata_culture_res/20220927.json
Save to: ./newsdata_culture_res/20220810.json
Save to: ./newsdata_

In [None]:
import natsort


## Merge every preprocessed file into one spreadsheet

target = './newsdata_culture_res/*.json'
json_list = glob.glob(target)
json_list = natsort.natsorted(json_list)
print('json파일 개수:', len(json_list))

if not json_list:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError('No preprocessed files match ' + target)

# Load all files the same way and concatenate once at the end, instead of
# special-casing the first file and re-copying the growing frame with a
# ``pd.concat`` call per file.
frames = []
for file in json_list:
    with open(file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)["data"]
    df = pd.DataFrame(json_data)
    # news_id = "<date>_<id>", unique per article across all days.
    df['news_id'] = df['date'].astype(str) + '_' + df['id'].astype(str)
    frames.append(df[['news_id', 'url', 'title', 'sent']])

total_df = pd.concat(frames)

total_df.to_excel('./final.xlsx', index=False)
