In [1]:
import json
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

In [2]:
# HTTP headers sent with every BigKinds request made by this notebook.
headers = {
    # Content negotiation.
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "ko,en;q=0.9,en-US;q=0.8",
    "Content-Type": "application/json;charset=UTF-8",
    # Site-identifying headers.
    "Host": "www.bigkinds.or.kr",
    "Origin": "https://www.bigkinds.or.kr",
    "Referer": "https://www.bigkinds.or.kr/v2/news/index.do",
    # Browser identification string (wrapped only for readability).
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
    ),
}

In [3]:
# Payload builder for the article-list (search) request.
def request_data(keyword, start_date, end_date):
  """Return the UTF-8 encoded JSON body for a news search request.

  Args:
    keyword: Search term; formatted into the "searchKey" field.
    start_date: Formatted into the "startDate" field.
    end_date: Formatted into the "endDate" field.

  Returns:
    bytes: The JSON document, serialised with `json.dumps` defaults.
  """
  payload = dict(
    indexName="news",
    searchKey=f"{keyword}",
    searchKeys=[{}],
    byLine="",
    searchFilterType="1",
    searchScopeType="1",
    searchSortType="date",
    sortMethod="date",
    mainTodayPersonYn="",
    startDate=f"{start_date}",
    endDate=f"{end_date}",
    newsIds=[],
    categoryCodes=[],
    providerCodes=[],
    incidentCodes=[],
    networkNodeType="",
    topicOrigin="",
    dateCodes=[],
    editorialIs="false",
    startNo=1,
    # NOTE(review): presumably large enough to get every hit in one response — confirm.
    resultNumber=1000000,
    isTmUsable="false",
    isNotTmUsable="false",
  )
  serialized = json.dumps(payload)
  return serialized.encode('utf-8')

In [4]:
# Fetch the body of a single article.
def request_content(new_id, timeout=30):
  """Return the 'CONTENT' field of one article's detail record.

  Sends a GET request to the detail-view endpoint using the module-level
  `headers` and reads `['detail']['CONTENT']` from the JSON response.

  Args:
    new_id: Article identifier; sent as the 'docId' query parameter.
    timeout: Seconds passed to `requests.get` as its timeout. Without one a
      stalled connection would block the crawl indefinitely.

  Returns:
    The value stored under ['detail']['CONTENT'] in the response.

  Raises:
    KeyError: If the response JSON has no 'detail' or 'CONTENT' key.
    requests.exceptions.RequestException: On network failure or timeout.
  """
  url = 'https://www.bigkinds.or.kr/news/detailView.do'
  params = {
    'docId': f'{new_id}',
    'returnCnt': '1',
    'sectionDiv': '1000',
  }

  response = requests.get(url, headers=headers, params=params, timeout=timeout)
  result = response.json()
  return result['detail']['CONTENT']

In [5]:
# End-to-end crawl: search, fetch every article body, save the result as CSV.
def news_crawl(keyword, start_date, end_date):
  """Crawl the articles matching `keyword` and save them as one CSV file.

  Posts the payload built by `request_data` to the search endpoint, then calls
  `request_content` once per entry of the response's 'resultList', pausing
  3 seconds after every article request.

  Args:
    keyword: Search term; also the name of the output folder.
    start_date: Start of the search window; also the CSV file-name prefix.
    end_date: End of the search window.

  Returns:
    A DataFrame with one row per article and the columns 기사제목 (title),
    언론사 (provider), 년도 (year), 월 (month), url and 기사내용 (body text).
    The same frame is written to './{keyword}/{start_date}_{keyword}.csv'.

  Raises:
    KeyError: If the search response has no 'resultList' entry.
    requests.exceptions.RequestException: On network failure or timeout.
  """
  search_url = 'https://www.bigkinds.or.kr/api/news/search.do'
  columns = ['기사제목', '언론사', '년도', '월', 'url', '기사내용']

  # Create the output folder up front so a missing directory cannot discard
  # the crawl when the CSV is written at the end.
  os.makedirs(f'./{keyword}', exist_ok=True)

  payload = request_data(keyword, start_date, end_date)
  # The module-level `headers` are reused instead of a duplicated local copy.
  res = requests.post(search_url, headers=headers, data=payload, timeout=60).json()

  # Rows are collected in a list and turned into a frame once; concatenating
  # a one-row frame per article is quadratic and repeats index label 0.
  rows = []
  for item in tqdm(res['resultList']):
    try:
      date = item['DATE']  # presumably 'YYYYMMDD...' given the slices below — TODO confirm
      row = {
        '기사제목': item['TITLE'],
        '언론사': item['PROVIDER'],
        '년도': date[0:4],
        '월': date[4:6],
        'url': item['PROVIDER_LINK_PAGE'],
      }
      news_id = item['NEWS_ID']
    except KeyError:
      continue  # Incomplete search record; no article request was made.

    try:
      content = request_content(news_id)
    except KeyError:
      continue  # Detail response without ['detail']['CONTENT'].
    finally:
      time.sleep(3)  # Pause after every article request, successful or not.

    content = content.replace('<br/>', ' ')
    # NOTE(review): the leading '.' is unescaped, so it matches any character,
    # not only a literal period — confirm this is the intended pattern.
    row['기사내용'] = re.sub('(. [가-힣]*=.*)', '', content).strip()
    rows.append(row)

  # Passing `columns` keeps the header row even when no article was collected.
  df = pd.DataFrame(rows, columns=columns)
  df.to_csv(f'./{keyword}/{start_date}_{keyword}.csv', index=False)

  return df


In [6]:
import os

# Search keyword; a folder with the same name holds the output files.
keyword = '항만'
if keyword not in os.listdir():
    os.mkdir(f'./{keyword}')

# Days to crawl, one 'YYYY-MM-DD' string per day.
date = pd.date_range("2003-05-01","2023-05-01").strftime('%Y-%m-%d').values
date = date[::-1]  # Newest day first; comment this line out to go oldest first.

# File-name prefixes (the text before the first '_') of files already saved.
fold_list = os.listdir(f'./{keyword}/')
suc_list = [fname.split('_')[0] for fname in fold_list]

for dt in date:
    if dt in suc_list:
        continue  # Already saved by an earlier run.
    print(dt)
    news_crawl(keyword, dt, dt)

2003-05-01


  5%|▌         | 2/37 [00:06<01:48,  3.10s/it]

----------