## 네이버 뉴스 개편되기 전

In [37]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_news(sectionid, date):
    """Scrape the Naver "popular day" ranking page and every article it lists.

    Args:
        sectionid: Value inserted as the ``sectionId`` query parameter.
        date: Value inserted as the ``date`` query parameter.

    Returns:
        A pandas DataFrame built from one dict per ``ranking_text`` element,
        with the keys ``LinkSrc``, ``Title``, ``Views`` and ``Content``.
    """
    ranking_url = (
        "https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId="
        + str(sectionid)
        + "&date="
        + str(date)
    )
    ranking_page = BeautifulSoup(requests.get(ranking_url).text, "html.parser")

    # Phase 1: collect link, title and view count for every ranked entry.
    articles = []
    for entry in ranking_page.find_all(class_='ranking_text'):
        anchor = entry.find('a')
        articles.append({
            'LinkSrc': anchor['href'],
            'Title': anchor['title'],
            'Views': entry.find(class_="ranking_view").get_text(),
        })

    # Phase 2: download each article and attach its cleaned body text.
    for article in articles:
        article_resp = requests.get("http://news.naver.com" + article['LinkSrc'])
        article_page = BeautifulSoup(article_resp.text, "html.parser")
        body = article_page.find(id="articleBodyContents")
        article['Content'] = clean_text(body)

    return pd.DataFrame(articles)
    
# Clean up article text
def clean_text(text):
    """Return the text of *text* with Latin letters, punctuation and boilerplate removed.

    Args:
        text: Object exposing ``get_text()`` (the callers pass the result of
            ``soup.find(...)``).

    Returns:
        The cleaned string.
    """
    content = text.get_text()
    # Drop every ASCII letter.
    cleaned_text = re.sub(r'[a-zA-Z]', '', content)
    # Replace punctuation, symbols, newlines and tabs with a space. Written as a
    # raw string so the backslashes reach the regex engine unchanged instead of
    # relying on invalid string escapes such as "\{".
    cleaned_text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace("🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    # The longer phrase must be removed first: it contains the shorter one, so
    # removing the shorter phrase first would leave "동영상 뉴스 " behind.
    cleaned_text = cleaned_text.replace("동영상 뉴스 오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    return cleaned_text

## 네이버 뉴스 개편된 이후

In [16]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# This request fails with the ConnectionError shown below.
# NOTE(review): presumably because no User-Agent header is sent -- the later
# cells pass one and succeed; confirm.
url = "https://news.naver.com/main/ranking/office.nhn?officeId=214"
resp = requests.get(url)
# Parser name fixed ("html.pareser" is not a parser BeautifulSoup knows).
soup = BeautifulSoup(resp.text, "html.parser")

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### 언론사 ID

In [30]:
# Broadcaster name -> ID string used as the ``officeId`` query value.
press_ID = {
    "MBC": "214",
    "KBS": "056",
    "SBS": "055",
}

### 텍스트 처리

In [2]:
def clean_text(text):
    """Return the text of *text* with Latin letters, punctuation and boilerplate removed.

    Args:
        text: Object exposing ``get_text()`` (the callers pass the result of
            ``soup.find(...)``).

    Returns:
        The cleaned string.
    """
    content = text.get_text()
    # Drop every ASCII letter.
    cleaned_text = re.sub(r'[a-zA-Z]', '', content)
    # Replace punctuation, symbols, newlines and tabs with a space. Raw string:
    # same character class, without invalid string escapes such as "\{".
    cleaned_text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    # Remove fixed boilerplate fragments, in this order.
    cleaned_text = cleaned_text.replace("🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    cleaned_text = cleaned_text.replace("동영상 뉴스                       뉴스데스크 ", "")
    cleaned_text = cleaned_text.replace("동영상 뉴스                       뉴스투데이 ", "")
    cleaned_text = cleaned_text.replace("앵커  ", "")
    return cleaned_text

### 뉴스 크롤링

In [5]:
# demo v0.1
import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

url = "https://news.naver.com/main/ranking/office.nhn?officeId=214&date=20210114"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")

# Box 0 is read for the view-count ranking, box 1 for the comment-count ranking.
ranking_box = soup.find_all(class_="rankingnews_box_inner")
l = []

# --- Ranking by views ---
# Fixed: the original indexed an undefined name (`ranking_text`) here.
rank = ranking_box[0].find_all(class_ = "list_ranking_num")
url_list = ranking_box[0].find_all(class_="list_content")

for num in range(20):
    d = {}
    d['Rank'] = rank[num].get_text()
    d['URL'] = url_list[num].find('a')['href']
    d['Title'] = url_list[num].find('a').get_text()
    d['View'] = url_list[num].find(class_ = "list_view").get_text()
    l.append(d)

# Download each article body and store its cleaned text.
for link in l:
    resp = requests.get("http://news.naver.com" + link['URL'], headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    content = soup.find(id="articleBodyContents")
    link['Content'] = clean_text(content)

# NOTE(review): the file name says 20210112 while the URL requests 20210114 -- confirm.
df = pd.DataFrame(l)
df.to_csv("20210112 MBC 많이 본 뉴스.csv", sep=",", index=False, encoding="utf-8-sig")


# --- Ranking by comments (appended to the same list) ---
rank = ranking_box[1].find_all(class_ = "list_ranking_num")
url_list = ranking_box[1].find_all(class_="list_content")

for num in range(20):
    d = {}
    d['Rank'] = rank[num].get_text()
    d['URL'] = url_list[num].find('a')['href']
    d['Title'] = url_list[num].find('a').get_text()
    d['Comment'] = url_list[num].find(class_ = "list_comment nclicks('RBP.dcmtnwscmt')").get_text()
    l.append(d)

for link in l:
    # Entries from the first pass already have their body; do not download them again.
    if 'Content' in link:
        continue
    resp = requests.get("http://news.naver.com" + link['URL'], headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    content = soup.find(id="articleBodyContents")
    link['Content'] = clean_text(content)

# Overwrites the file written above, now with both rankings.
df = pd.DataFrame(l)
df.to_csv("20210112 MBC 많이 본 뉴스.csv", sep=",", index=False, encoding="utf-8-sig")

In [43]:
# demo v0.2 (2021.01.14, MBC)
import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

url = "https://news.naver.com/main/ranking/office.nhn?officeId=214&date=20210114"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")
ranking_box = soup.find_all(class_="rankingnews_box_inner")

# (output key, CSS class of the counter element) for box 0 and box 1.
metric_by_box = (
    ("View", "list_view"),
    ("Comment", "list_comment nclicks('RBP.dcmtnwscmt')"),
)

# Collect rank, link, title and the box-specific counter for 20 entries per box.
l = []
for box_index, (metric_key, metric_class) in enumerate(metric_by_box):
    box = ranking_box[box_index]
    rank_tags = box.find_all(class_="list_ranking_num")
    item_tags = box.find_all(class_="list_content")

    for position in range(20):
        entry = {'Rank': rank_tags[position].get_text()}
        item = item_tags[position]
        entry['URL'] = item.find('a')['href']
        entry['Title'] = item.find('a').get_text()
        entry[metric_key] = item.find(class_=metric_class).get_text()
        l.append(entry)

# Download every article and attach its cleaned body text.
for article in l:
    article_resp = requests.get("https://news.naver.com" + article['URL'], headers=headers)
    article_page = BeautifulSoup(article_resp.text, "html.parser")
    article['Content'] = clean_text(article_page.find(id="articleBodyContents"))

df = pd.DataFrame(l)
df.to_csv("20210114_MBC_ranking_news.csv", sep=",", index=False, encoding="utf-8-sig")

In [5]:
# demo v0.3 (2021.01.15, MBC, KBS, SBS)
import os
# Switch to the crawler folder; the CSV paths used below are relative.
os.chdir(r"C:/Users/cjy89/NLP/Naver_news_crawling/")
# Broadcaster name -> ID string used as the ``officeId`` query value.
press_ID = {
    "MBC": "214",
    "KBS": "056",
    "SBS": "055",
}

import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup


def clean_text(text):
    """Return the text of *text* with Latin letters, punctuation and boilerplate removed.

    Args:
        text: Object exposing ``get_text()`` (the callers pass the result of
            ``soup.find(...)``).

    Returns:
        The cleaned string.
    """
    content = text.get_text()
    # Drop every ASCII letter.
    cleaned_text = re.sub(r'[a-zA-Z]', '', content)
    # Replace punctuation, symbols, newlines and tabs with a space. Raw string:
    # same character class, without invalid string escapes such as "\{".
    cleaned_text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    # Remove fixed boilerplate fragments, in this order.
    cleaned_text = cleaned_text.replace("🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    cleaned_text = cleaned_text.replace("동영상 뉴스                       뉴스데스크 ", "")
    cleaned_text = cleaned_text.replace("동영상 뉴스                       뉴스투데이 ", "")
    cleaned_text = cleaned_text.replace("앵커  ", "")
    return cleaned_text

# The request headers are the same for every press, so build them once.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

for press in press_ID:
    start = time.time()
    url = "https://news.naver.com/main/ranking/office.nhn?officeId=" + press_ID[press] + "&date=20210115"
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    ranking_box = soup.find_all(class_="rankingnews_box_inner")
    l = []

    # Box 0 is read for view counts, box 1 for comment counts.
    for ranking_type in range(2):
        ranking = ranking_box[ranking_type].find_all(class_="list_ranking_num")
        url_list = ranking_box[ranking_type].find_all(class_="list_content")

        # 20 entries are read from each box.
        for rank in range(20):
            d = {}
            d['Rank'] = ranking[rank].get_text()
            d['URL'] = url_list[rank].find('a')['href']
            d['Title'] = url_list[rank].find('a').get_text()
            if (ranking_type == 0):
                d['View'] = url_list[rank].find(class_="list_view").get_text()
            elif (ranking_type == 1):
                d['Comment'] = url_list[rank].find(class_="list_comment nclicks('RBP.dcmtnwscmt')").get_text()
            l.append(d)

    # Download every article and attach its cleaned body text.
    for news in l:
        resp = requests.get("https://news.naver.com" + news['URL'], headers=headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        content = soup.find(id="articleBodyContents")
        news['Content'] = clean_text(content)

    df = pd.DataFrame(l)
    # The CSV goes into a per-press sub-directory; make sure it exists so the
    # write does not fail on a fresh checkout.
    os.makedirs(press, exist_ok=True)
    title = press + "/20210115_" + press + "_ranking_news.csv"
    df.to_csv(title, sep=",", index=False, encoding="utf-8-sig")
    end = time.time()
    print("Crawling " + press + " news :", end - start)

Crawling MBC news : 10.095808267593384
Crawling KBS news : 8.139859437942505
Crawling SBS news : 8.135042905807495


In [43]:
# demo v0.4: take a date as input and crawl that day automatically
import os
# Work from the project folder; the CSV paths used below are relative.
os.chdir(r"C:\Users\cjy89\NLP\Project_news_crawling\Naver")

from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
import re

# Broadcaster name -> ID string used as the ``officeId`` query value.
# Only MBC is active; the remaining entries are kept commented out below.
press_ID = {"MBC":"214"}
#, "KBS": "056", "SBS": "055", "JTBC": "437"}

"""
def clean_text(text):
    content = text.get_text()
    cleaned_text = re.sub('[a-zA-Z]', '', content)
    cleaned_text = re.sub(
        '[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace(
        "🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    cleaned_text = cleaned_text.replace(
        "동영상 뉴스                       뉴스데스크 ", "")
    cleaned_text = cleaned_text.replace(
        "동영상 뉴스                       뉴스투데이 ", "")
    cleaned_text = cleaned_text.replace("앵커  ", "")
    return cleaned_text
"""

# Given an 8-digit date, fetch that day's ranking news.
def get_ranking_news(date):
    """Crawl the ranking news of every press in ``press_ID`` for *date* and save one CSV per press.

    For each press the ranking page is downloaded, 20 entries are read from
    each of the first two ranking boxes (views, then comments), every article
    body is downloaded, and the rows are written to
    ``<press>/<date>_<press>_ranking_news.csv``. Timing information is printed.

    Args:
        date: Date as an 8-digit number or numeric string (e.g. ``20210115``);
            it is inserted into the URL and stored as ``int(date)``.

    Returns:
        None.
    """
    total_time = 0
    # The request headers are the same for every request, so build them once.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

    # One pass per press
    for press in press_ID:
        start = time.time()
        url = "https://news.naver.com/main/ranking/office.nhn?officeId=" + press_ID[press] + "&date=" + str(date)
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.text, "html.parser")

        ranking_box = soup.find_all(class_="rankingnews_box_inner")
        l = []

        # View counts first, then comment counts
        for ranking_type in range(2):
            ranking = ranking_box[ranking_type].find_all(class_="list_ranking_num")
            url_list = ranking_box[ranking_type].find_all(class_="list_content")

            # In rank order (1 ~ 20)
            for rank in range(20):
                d = {}
                d['Date'] = int(date)
                d['Press'] = press
                d['Rank'] = ranking[rank].get_text()
                d['URL'] = url_list[rank].find('a')['href']
                d['Title'] = url_list[rank].find('a').get_text()
                if (ranking_type == 0):
                    d['View'] = url_list[rank].find(class_="list_view").get_text()
                elif (ranking_type == 1):
                    d['Comment'] = url_list[rank].find(class_="list_comment nclicks('RBP.dcmtnwscmt')").get_text()
                l.append(d)

        # Fetch the article bodies
        for news in l:
            resp = requests.get("https://news.naver.com" + news['URL'], headers=headers)
            soup = BeautifulSoup(resp.text, "html.parser")
            contents = soup.find(id="articleBodyContents").get_text()
            # Replace symbols, newlines and tabs with a space. Raw string: same
            # character class, without invalid string escapes such as "\{".
            news['Content'] = re.sub(r'[\{\}\[\]\/?\(\);:|*~`!^\-_+<>▶▽♡◀━@\#$&\\\=\'\"ⓒ(\n)(\t)]', ' ', contents)

        df = pd.DataFrame(l)
        # The CSV goes into a per-press sub-directory; make sure it exists so
        # the write does not fail when the folder is missing.
        os.makedirs(press, exist_ok=True)
        title = press + "/" + str(date) + "_" + press + "_ranking_news.csv"
        df.to_csv(title, sep=",", index=False, encoding="utf-8-sig")
        end = time.time()
        total_time += end - start
        print("Crawling " + str(date) + " " + press + " news :", end - start)
    print("Total time :", total_time)
    # Skip the average when there is nothing to average over.
    if press_ID:
        print("Average time : ", total_time/len(press_ID))
    print("───────────────────")

In [44]:
# Crawl 29 consecutive date numbers starting at 20210101, one call per date.
for day in range(20210101, 20210101 + 29):
    get_ranking_news(day)

Crawling 20210101 MBC news : 8.793712377548218
Total time : 8.793712377548218
Average time :  8.793712377548218
───────────────────
Crawling 20210102 MBC news : 8.152604103088379
Total time : 8.152604103088379
Average time :  8.152604103088379
───────────────────
Crawling 20210103 MBC news : 7.707432508468628
Total time : 7.707432508468628
Average time :  7.707432508468628
───────────────────
Crawling 20210104 MBC news : 8.801400661468506
Total time : 8.801400661468506
Average time :  8.801400661468506
───────────────────
Crawling 20210105 MBC news : 8.02721381187439
Total time : 8.02721381187439
Average time :  8.02721381187439
───────────────────
Crawling 20210106 MBC news : 7.572917461395264
Total time : 7.572917461395264
Average time :  7.572917461395264
───────────────────
Crawling 20210107 MBC news : 8.192397594451904
Total time : 8.192397594451904
Average time :  8.192397594451904
───────────────────
Crawling 20210108 MBC news : 8.231377840042114
Total time : 8.231377840042114
A

## 자연어처리

In [121]:
# MBC 데이터 불러오기 -> 조회수 별, 댓글 수 별 따로 저장
import os
import re
import pandas as pd
os.chdir(r"C:\Users\cjy89\NLP\Project_news_crawling\Naver")

# Show every row and column when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
crawling_list = sorted(os.listdir('MBC/'))

# Collect the per-file slices and concatenate once at the end instead of
# growing a DataFrame inside the loop (which re-copies all rows each time).
view_frames = []
comment_frames = []

for item in crawling_list:
    df = pd.read_csv("MBC/"+item, sep=",", encoding="utf-8-sig", engine="python")
    # The first 20 rows go to the view ranking, the remaining rows to the comment ranking.
    view_frames.append(df.iloc[:20, :])
    comment_frames.append(df.iloc[20:, :])

MBC_ranking_view = pd.concat(view_frames, axis=0, ignore_index=True)
MBC_ranking_comment = pd.concat(comment_frames, axis=0, ignore_index=True)

# Each table keeps only its own counter column.
MBC_ranking_view.drop(["Comment"], axis=1, inplace = True)
MBC_ranking_comment.drop(["View"], axis=1, inplace = True)

In [148]:
# Silence pandas' SettingWithCopyWarning
pd.set_option('mode.chained_assignment',  None)

# Article preprocessing
# Columns are selected by position.
# NOTE(review): positions 4 / 6 / 5 are assumed to be the Title and Content columns -- confirm against the CSV header.
title_view = MBC_ranking_view.iloc[:, 4]
title_comment = MBC_ranking_comment.iloc[:, 4]
contents_view = MBC_ranking_view.iloc[:, 6]
contents_comment = MBC_ranking_comment.iloc[:, 5]

titles = [title_view, title_comment]
contents = [contents_view, contents_comment]

# Fixed phrases that are replaced by a single space, applied in this order.
_BOILERPLATE_PHRASES = (
    'flash 오류를 우회하기 위한 함수 추가 function  flash removeCallback',
    "동영상 뉴스",
    "뉴스투데이",
    "뉴스데스크",
    "정오뉴스",
    "앵커",
    "리포트",
)


def _clean_article(text):
    """Cut *text* at the first 'MBC뉴스', blank out boilerplate phrases and strip it."""
    # partition() keeps the whole text when the marker is absent. The previous
    # slice on str.find() used -1 in that case and dropped the last character.
    cleaned_text = text.partition('MBC뉴스')[0]
    for phrase in _BOILERPLATE_PHRASES:
        cleaned_text = cleaned_text.replace(phrase, ' ')
    return cleaned_text.strip()


for contents_type in contents:
    # Positional read and write so the Series is updated in place.
    for i in range(len(contents_type)):
        contents_type.iloc[i] = _clean_article(contents_type.iloc[i])

---

In [151]:
# 제목으로 사용된 단어 빈도수 분석
from konlpy.tag import Hannanum
from konlpy.utils import pprint
from collections import Counter

# Join every title (view ranking first, then comment ranking) into one
# string; the string starts with a space and each title is preceded by one.
total_title = " " + "".join(
    " " + item
    for title_type in titles
    for item in title_type
)

# Extract nouns and count how often each one occurs.
hannanum = Hannanum()
nouns = hannanum.nouns(total_title)
count = Counter(nouns)

# Among the 50 most common nouns, keep and print those longer than one character.
wordInfo = {}
for tags, counts in count.most_common(50):
    if len(str(tags)) <= 1:
        continue
    wordInfo[tags] = counts
    print ("%s : %d" % (tags, counts))

단독 : 66
백신 : 51
대통령 : 44
확진 : 37
감염 : 36
오늘 : 31
거리두 : 30
코로나19 : 29
1천 : 28
신규 : 27
접종 : 26
국민 : 22
확산 : 21
정부 : 20
수사 : 20
검찰 : 18
이재명 : 18
아파트 : 17
우려 : 17
2주 : 16
뉴스 : 16
연장 : 16
이상 : 15
사과 : 15
금지 : 15
서울 : 15
전국 : 14
적용 : 14
징역 : 14
하루 : 13
재난지원금 : 13
검토 : 13
트럼프 : 13
'국정농단' : 13
총리 : 12


In [142]:
title_view[0]

'신규 확진 다시 1천 명대…내일 거리두기 단계 조정'

In [140]:
total_title

' ?'

In [147]:
# Show the titles of the comment ranking (second entry of `titles`).
titles[1]

0                          "文 대통령 속 간첩사상"…전광훈 '막말' 재시동
1      [MBC여론조사①] 국정운영 '부정 평가' 52.5%  vs '긍정 평가' 43.4%
2                           차기 대권 '3강'…이재명·윤석열·이낙연 초박빙
3                            문 대통령 "국민 일상의 회복으로 보답하겠다"
4                            모더나 백신 2천만 명분 계약…2분기부터 접종
5                            '의사 시험' 결국 허용…"의료 공백 막아야"
6                         신규 확진 다시 1천 명대…내일 거리두기 단계 조정
7                          모더나도 계약 완료…"9월까지 1차 접종 마무리"
8                           모더나 "5월부터 한국에 코로나19 백신 공급"
9       [MBC여론조사②] 백신준비 '잘하고 있다' 48.7%…'잘못하고 있다' 48.8%
10                 정부, 모더나 백신 2천만명분 계약완료…총 5천600만명분 확보
11                          조마조마한 '1천 명대'…내일 거리두기 재연장?
12               김종인 "사면? 처음 듣는 얘기"…안철수 "사면 선거이용 용납못해"
13                              문 대통령 "국민 일상의 회복으로 보답"
14                             1천 명 넘을 듯…내일 거리두기 단계 결정
15                           文, 피스아이 초계비행…한반도 전역 안보 점검
16                      '백신 대응' 팽팽한 민심…공수처 "검찰 개혁에 도움"
17                            '군기' 잡으려 새벽에 소집…위치 추적까지?
18        