## 네이버 뉴스 개편되기 전

In [37]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_news(sectionid, date):
    """Scrape Naver's daily popular-news ranking page into a DataFrame.

    Downloads the ``popularDay.nhn`` ranking page for the given section and
    date, collects every ``ranking_text`` entry, then downloads each linked
    article and stores its cleaned body text.

    Args:
        sectionid: Value for the ``sectionId`` query parameter; converted
            with ``str``.
        date: Value for the ``date`` query parameter; converted with ``str``.

    Returns:
        pandas.DataFrame with one row per ranked article and the columns
        ``LinkSrc``, ``Title``, ``Views`` and ``Content``. ``Content`` is
        ``""`` for an article page without an ``articleBodyContents``
        element.
    """
    # Send a browser-like User-Agent: elsewhere in this notebook a request to
    # news.naver.com without one ends in a ConnectionError.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    url = "https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&sectionId=" + str(sectionid) + "&date=" + str(date)
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    # One dict per ranked entry: relative link, title and view counter.
    l = []
    for item in soup.find_all(class_='ranking_text'):
        anchor = item.find('a')
        l.append({
            'LinkSrc': anchor['href'],
            'Title': anchor['title'],
            'Views': item.find(class_="ranking_view").get_text(),
        })

    # Follow every link and attach the cleaned article body.
    for link in l:
        resp = requests.get("http://news.naver.com" + link['LinkSrc'], headers=headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        content = soup.find(id="articleBodyContents")
        # find() returns None when the element is absent; guard here so the
        # result does not depend on how clean_text treats None.
        link['Content'] = clean_text(content) if content is not None else ""

    return pd.DataFrame(l)
    
# text 정제하기
def clean_text(text):
    """Return article text with Latin letters, punctuation and boilerplate removed.

    Args:
        text: Object exposing ``get_text()`` (the callers pass the
            BeautifulSoup element holding the article body), or ``None``
            when no such element was found.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    content = text.get_text()
    # Drop ASCII letters entirely, then turn every listed punctuation /
    # symbol / newline / tab character into a single space.
    cleaned_text = re.sub(r'[a-zA-Z]', '', content)
    cleaned_text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace("🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
    # The longer phrase must be removed first: once its shorter suffix has
    # been stripped, the longer phrase can never match.
    cleaned_text = cleaned_text.replace("동영상 뉴스 오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
    cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
    return cleaned_text

## 네이버 뉴스 개편된 이후

In [16]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# This cell raises an error: the request below is sent without a User-Agent
# header and ends in the ConnectionError shown in the cell output (presumably
# the server drops such requests -- the later cells add a browser User-Agent).
url = "https://news.naver.com/main/ranking/office.nhn?officeId=214"
resp = requests.get(url)
# Parser name fixed ("html.pareser" is not a parser BeautifulSoup knows).
soup = BeautifulSoup(resp.text, "html.parser")

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### 언론사 ID

In [30]:
# Broadcaster name -> value used for the "officeId" query parameter.
press_ID = dict(MBC="214", KBS="056", SBS="055")

### 텍스트 처리

In [2]:
def clean_text(text):
    """Return article text with Latin letters, punctuation and boilerplate removed.

    Args:
        text: Object exposing ``get_text()`` (the callers pass the
            BeautifulSoup element holding the article body), or ``None``
            when no such element was found.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Drop ASCII letters entirely, then turn every listed punctuation /
    # symbol / newline / tab character into a single space. Raw literals keep
    # the regex backslashes from being parsed as (invalid) string escapes.
    cleaned_text = re.sub(r'[a-zA-Z]', '', text.get_text())
    cleaned_text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    # Boilerplate phrases, removed in this order: the "동영상 뉴스 ..." entries
    # only match after the phrase before them has been stripped out.
    boilerplate = (
        "🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b",
        "오류를 우회하기 위한 함수 추가 ",
        "무단전재 및 재배포 금지",
        "동영상 뉴스                       뉴스데스크 ",
        "동영상 뉴스                       뉴스투데이 ",
        "앵커  ",
    )
    for phrase in boilerplate:
        cleaned_text = cleaned_text.replace(phrase, "")
    return cleaned_text

### 뉴스 크롤링

In [5]:
# demo v0.1
import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

# Demo v0.1: crawl one broadcaster's ranking page (officeId=214) for a single
# day, fetch every ranked article and dump the rows to CSV.
url = "https://news.naver.com/main/ranking/office.nhn?officeId=214&date=20210114"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")

# This script reads a view counter from the first "rankingnews_box_inner"
# element and a comment counter from the second one.
ranking_box = soup.find_all(class_="rankingnews_box_inner")
l = []

# --- Box 0: rows carrying a view count ---------------------------------------
# (the original indexed an undefined name, `ranking_text`, here)
rank = ranking_box[0].find_all(class_="list_ranking_num")
url_list = ranking_box[0].find_all(class_="list_content")

# Pair each rank label with its content cell; slicing keeps at most 20 rows
# and does not raise when the page lists fewer.
for rank_tag, item in zip(rank[:20], url_list[:20]):
    anchor = item.find('a')
    d = {}
    d['Rank'] = rank_tag.get_text()
    d['URL'] = anchor['href']
    d['Title'] = anchor.get_text()
    d['View'] = item.find(class_="list_view").get_text()
    l.append(d)

for link in l:
    resp = requests.get("http://news.naver.com" + link['URL'], headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    content = soup.find(id="articleBodyContents")
    # find() returns None when the article page has no such element.
    link['Content'] = clean_text(content) if content is not None else ""

df = pd.DataFrame(l)
# NOTE(review): the file name says 20210112 but the URL requests
# date=20210114 -- confirm which one is intended.
df.to_csv("20210112 MBC 많이 본 뉴스.csv", sep=",", index=False, encoding="utf-8-sig")

# --- Box 1: rows carrying a comment count ------------------------------------
rank = ranking_box[1].find_all(class_="list_ranking_num")
url_list = ranking_box[1].find_all(class_="list_content")

for rank_tag, item in zip(rank[:20], url_list[:20]):
    anchor = item.find('a')
    d = {}
    d['Rank'] = rank_tag.get_text()
    d['URL'] = anchor['href']
    d['Title'] = anchor.get_text()
    d['Comment'] = item.find(class_="list_comment nclicks('RBP.dcmtnwscmt')").get_text()
    l.append(d)

for link in l:
    # Rows from box 0 already have their body; only fetch the new ones.
    if 'Content' in link:
        continue
    resp = requests.get("http://news.naver.com" + link['URL'], headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    content = soup.find(id="articleBodyContents")
    link['Content'] = clean_text(content) if content is not None else ""

# `l` now holds both rankings, so this rewrites the same file with all rows.
df = pd.DataFrame(l)
df.to_csv("20210112 MBC 많이 본 뉴스.csv", sep=",", index=False, encoding="utf-8-sig")

In [43]:
# demo v0.2 (2021.01.14, MBC)
import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup

# Demo v0.2: same crawl as v0.1 with both ranking boxes handled by one loop.
url = "https://news.naver.com/main/ranking/office.nhn?officeId=214&date=20210114"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")

ranking_box = soup.find_all(class_="rankingnews_box_inner")
l = []

# (column name, CSS class) of the counter read from each box, in page order:
# box 0 supplies a view count, box 1 a comment count.
count_fields = (
    ("View", "list_view"),
    ("Comment", "list_comment nclicks('RBP.dcmtnwscmt')"),
)

# zip() and the [:20] slices stop early instead of raising IndexError when
# the page has fewer boxes or fewer rows than expected.
for box, (count_key, count_class) in zip(ranking_box, count_fields):
    ranking = box.find_all(class_="list_ranking_num")
    url_list = box.find_all(class_="list_content")

    for rank_tag, item in zip(ranking[:20], url_list[:20]):
        anchor = item.find('a')
        d = {}
        d['Rank'] = rank_tag.get_text()
        d['URL'] = anchor['href']
        d['Title'] = anchor.get_text()
        d[count_key] = item.find(class_=count_class).get_text()
        l.append(d)

for news in l:
    resp = requests.get("https://news.naver.com" + news['URL'], headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    content = soup.find(id="articleBodyContents")
    # find() returns None when the article page has no such element.
    news['Content'] = clean_text(content) if content is not None else ""

df = pd.DataFrame(l)
df.to_csv("20210114_MBC_ranking_news.csv", sep=",", index=False, encoding="utf-8-sig")

In [5]:
# demo v0.3 (2021.01.15, MBC, KBS, SBS)
import os
# Work from the crawler's folder so the relative "<press>/..." CSV paths used
# further down resolve there. NOTE(review): machine-specific absolute path.
os.chdir(r"C:/Users/cjy89/NLP/Naver_news_crawling/")
# Broadcaster name -> value used for the "officeId" query parameter.
press_ID = dict(MBC="214", KBS="056", SBS="055")

import re
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup


def clean_text(text):
    """Return article text with Latin letters, punctuation and boilerplate removed.

    Args:
        text: Object exposing ``get_text()`` (the callers pass the
            BeautifulSoup element holding the article body), or ``None``
            when no such element was found.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Drop ASCII letters entirely, then turn every listed punctuation /
    # symbol / newline / tab character into a single space. Raw literals keep
    # the regex backslashes from being parsed as (invalid) string escapes.
    cleaned_text = re.sub(r'[a-zA-Z]', '', text.get_text())
    cleaned_text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
    # Boilerplate phrases, removed in this order: the "동영상 뉴스 ..." entries
    # only match after the phrase before them has been stripped out.
    boilerplate = (
        "🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b",
        "오류를 우회하기 위한 함수 추가 ",
        "무단전재 및 재배포 금지",
        "동영상 뉴스                       뉴스데스크 ",
        "동영상 뉴스                       뉴스투데이 ",
        "앵커  ",
    )
    for phrase in boilerplate:
        cleaned_text = cleaned_text.replace(phrase, "")
    return cleaned_text

# Demo v0.3: crawl the 2021-01-15 ranking page of every broadcaster in
# press_ID and write one CSV per broadcaster under "<press>/".

# Request headers are the same for every call, so build them once.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# (column name, CSS class) of the counter read from each ranking box, in page
# order: box 0 supplies a view count, box 1 a comment count.
count_fields = (
    ("View", "list_view"),
    ("Comment", "list_comment nclicks('RBP.dcmtnwscmt')"),
)

for press in press_ID:
    start = time.time()
    url = "https://news.naver.com/main/ranking/office.nhn?officeId=" + press_ID[press] + "&date=20210115"
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    ranking_box = soup.find_all(class_="rankingnews_box_inner")
    l = []

    # zip() and the [:20] slices stop early instead of raising IndexError
    # when the page has fewer boxes or fewer rows than expected.
    for box, (count_key, count_class) in zip(ranking_box, count_fields):
        ranking = box.find_all(class_="list_ranking_num")
        url_list = box.find_all(class_="list_content")

        for rank_tag, item in zip(ranking[:20], url_list[:20]):
            anchor = item.find('a')
            d = {}
            d['Rank'] = rank_tag.get_text()
            d['URL'] = anchor['href']
            d['Title'] = anchor.get_text()
            d[count_key] = item.find(class_=count_class).get_text()
            l.append(d)

    for news in l:
        resp = requests.get("https://news.naver.com" + news['URL'], headers=headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        content = soup.find(id="articleBodyContents")
        # find() returns None when the article page has no such element.
        news['Content'] = clean_text(content) if content is not None else ""

    df = pd.DataFrame(l)
    # to_csv does not create missing folders; make sure "<press>/" exists.
    os.makedirs(press, exist_ok=True)
    title = press + "/20210115_" + press + "_ranking_news.csv"
    df.to_csv(title, sep=",", index=False, encoding="utf-8-sig")
    end = time.time()
    print("Crawling " + press + " news :", end - start)

Crawling MBC news : 10.095808267593384
Crawling KBS news : 8.139859437942505
Crawling SBS news : 8.135042905807495


In [11]:
# demo v0.4 날짜를 입력받아 자동으로 크롤링
import os
# Switch the working directory so the relative "<press>/..." CSV paths written
# by get_ranking_news resolve under this folder.
# NOTE(review): machine-specific absolute Windows path -- adjust before
# running elsewhere.
os.chdir(r"C:\Users\cjy89\NLP\Project_news_crawling\Naver")

from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
import re

# Broadcaster name -> value used for the "officeId" query parameter.
# MBC is left out of this run:
# press_ID = {"MBC":"214"}
press_ID = {"KBS": "056", "SBS": "055", "JTBC": "437"}

# Earlier text-cleaning helper, kept for reference only. It is commented out
# (rather than wrapped in a plain string literal) so its regex backslashes are
# not parsed as string escapes; get_ranking_news() below strips characters
# with its own inline pattern instead of calling it.
#
# def clean_text(text):
#     content = text.get_text()
#     cleaned_text = re.sub('[a-zA-Z]', '', content)
#     cleaned_text = re.sub(
#         '[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>▶▽♡◀━@\#$%&\\\=\(\'\"ⓒ(\n)(\t)]', ' ', cleaned_text)
#     cleaned_text = cleaned_text.replace(
#         "🇲\u200b🇮\u200b🇱\u200b🇱\u200b🇮\u200b🇪\u200b", "")
#     cleaned_text = cleaned_text.replace("오류를 우회하기 위한 함수 추가 ", "")
#     cleaned_text = cleaned_text.replace("무단전재 및 재배포 금지", "")
#     cleaned_text = cleaned_text.replace(
#         "동영상 뉴스                       뉴스데스크 ", "")
#     cleaned_text = cleaned_text.replace(
#         "동영상 뉴스                       뉴스투데이 ", "")
#     cleaned_text = cleaned_text.replace("앵커  ", "")
#     return cleaned_text

# 8자리로 된 날짜를 입력하면 해당 날짜의 ranking news를 가져온다.
def get_ranking_news(date):
    """Crawl one day's ranking news for every broadcaster in ``press_ID``.

    For each broadcaster the ranking page for ``date`` is downloaded, up to
    20 rows are read from each of its first two ranking boxes (the first
    supplies a ``View`` count, the second a ``Comment`` count), every linked
    article is fetched, and the rows are written to
    ``<press>/<date>_<press>_ranking_news.csv``. Per-broadcaster, total and
    average crawl times are printed.

    Args:
        date: Date of the ranking page as an 8-digit number or string
            (e.g. ``20210101``); must be convertible with ``int``.

    Returns:
        None. Results are written to CSV files and timings are printed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    # Characters replaced by a space in the article body. Compiled once, from
    # a raw literal so the backslashes are regex escapes, not string escapes.
    strip_chars = re.compile(r'[\{\}\[\]\/?\(\);:|*~`!^\-_+<>▶▽♡◀━@\#$&\\\=\'\"ⓒ(\n)(\t)]')
    # (column name, CSS class) of the counter read from each ranking box, in
    # page order: views first, then comments.
    count_fields = (
        ("View", "list_view"),
        ("Comment", "list_comment nclicks('RBP.dcmtnwscmt')"),
    )
    total_time = 0

    # One pass per broadcaster.
    for press in press_ID:
        start = time.time()
        url = "https://news.naver.com/main/ranking/office.nhn?officeId=" + press_ID[press] + "&date=" + str(date)
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.text, "html.parser")

        ranking_box = soup.find_all(class_="rankingnews_box_inner")
        l = []

        # zip() and the [:20] slices stop early instead of raising IndexError
        # when the page has fewer boxes or fewer rows than expected.
        for box, (count_key, count_class) in zip(ranking_box, count_fields):
            ranking = box.find_all(class_="list_ranking_num")
            url_list = box.find_all(class_="list_content")

            # Rows in ranking order (at most 20).
            for rank_tag, item in zip(ranking[:20], url_list[:20]):
                anchor = item.find('a')
                d = {}
                d['Date'] = int(date)
                d['Press'] = press
                d['Rank'] = rank_tag.get_text()
                d['URL'] = anchor['href']
                d['Title'] = anchor.get_text()
                d[count_key] = item.find(class_=count_class).get_text()
                l.append(d)

        # Fetch the article body for every collected row.
        for news in l:
            resp = requests.get("https://news.naver.com" + news['URL'], headers=headers)
            soup = BeautifulSoup(resp.text, "html.parser")
            body = soup.find(id="articleBodyContents")
            # find() returns None when the article page has no such element.
            if body is None:
                news['Content'] = ""
            else:
                news['Content'] = strip_chars.sub(' ', body.get_text())

        df = pd.DataFrame(l)
        # to_csv does not create missing folders; make sure "<press>/" exists.
        os.makedirs(press, exist_ok=True)
        title = press + "/" + str(date) + "_" + press + "_ranking_news.csv"
        df.to_csv(title, sep=",", index=False, encoding="utf-8-sig")
        end = time.time()
        total_time += end - start
        print("Crawling " + str(date) + " " + press + " news :", end - start)
    print("Total time :", total_time)
    # Skip the average when press_ID is empty to avoid dividing by zero.
    if press_ID:
        print("Average time : ", total_time/len(press_ID))
    print("───────────────────")

In [12]:
# Crawl 31 consecutive date values, 20210101 through 20210131.
for day in range(20210101, 20210101 + 31):
    get_ranking_news(day)

Crawling 20210101 KBS news : 9.752766847610474
Crawling 20210101 SBS news : 8.970605373382568
Crawling 20210101 JTBC news : 9.061842679977417
Total time : 27.78521490097046
Average time :  9.261738300323486
───────────────────
Crawling 20210102 KBS news : 8.599104166030884
Crawling 20210102 SBS news : 8.933659791946411
Crawling 20210102 JTBC news : 8.457802057266235
Total time : 25.99056601524353
Average time :  8.663522005081177
───────────────────
Crawling 20210103 KBS news : 10.385263919830322
Crawling 20210103 SBS news : 8.947070598602295
Crawling 20210103 JTBC news : 8.91811490058899
Total time : 28.250449419021606
Average time :  9.416816473007202
───────────────────
Crawling 20210104 KBS news : 8.9640531539917
Crawling 20210104 SBS news : 9.114179611206055
Crawling 20210104 JTBC news : 9.843366861343384
Total time : 27.921599626541138
Average time :  9.307199875513712
───────────────────
Crawling 20210105 KBS news : 8.779188394546509
Crawling 20210105 SBS news : 8.28168988227844

NameError: name 'printf' is not defined