# Crawling Naver Stock Reports

In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [6]:
# Step 1: address of the listing page to crawl (first page of results).
url = (
    'https://finance.naver.com/research/company_list.naver'
    '?&page=1'
)

In [8]:
# Step 2: request the URL and receive the HTML response.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns 4xx/5xx answers into an error instead of letting
# an error page be parsed as if it were the listing.
response = requests.get(url, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [11]:
# Peek at the first 200 characters of the decoded body.
html_preview = response.text[0:200]
html_preview

'<!--  global include -->\n\n\t\n\t\n\t\n\t\n\t\n<html lang=\'ko\'>\n<head>\n\n\n\t\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t<title>종목분석 리포트 : 네이버페이 증권</title>\n\t\t\t\n\t\t\n\t\n\n\n\n\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n\n'

In [10]:
# 3. HTML > BeautifulSoup > css-selector > DataFrame

In [13]:
# Parse the raw response bytes with Python's built-in HTML parser.
# The resulting object is queried with CSS selectors via select() / select_one().
dom = BeautifulSoup(markup=response.content, features='html.parser')
type(dom)

bs4.BeautifulSoup

In [19]:
# A fully qualified alternative selector (not used):
#   #contentarea_left > div.box_type_m > table.type_1 > tbody > tr:nth-child(3)
# The short direct-child selector below is the one actually applied.
selector = 'table > tr'
elements = dom.select(selector)
# Number of <tr> elements matched.
len(elements)

55

In [21]:
# Take the row at index 2 as a sample and list its <td> cells,
# showing how many there are alongside the cells themselves.
element = elements[2]
tag = element.select('td')
(len(tag), tag)

(6,
 [<td style="padding-left:10">
  <a class="stock_item" href="/item/main.naver?code=066570" title="LG전자">LG전자</a>
  </td>,
  <td><a href="company_read.naver?nid=77126&amp;page=1">AI데이터센터 냉각 시장 공략</a><img alt="NEW" class="ico_new" height="8" src="https://ssl.pstatic.net/imgstock/images5/ico_research_new.gif" width="8"/></td>,
  <td>교보증권</td>,
  <td class="file"><a href="https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf" target="_blank"><img align="absmiddle" alt="pdf" src="https://ssl.pstatic.net/imgstock/images5/down.gif"/></a></td>,
  <td class="date" style="padding-left:5px">24.09.23</td>,
  <td class="date">226</td>])

In [39]:
# Map the six <td> cells of the sample row onto named report fields.
# Cells 0, 1 and 3 carry their value inside an <a>; the others are plain text.
stock_anchor = tag[0].select_one('a')
title_anchor = tag[1].select_one('a')
pdf_anchor = tag[3].select_one('a')

data = {
    'stock_item': stock_anchor.text,
    'stock_link': stock_anchor.get('href'),
    'title': title_anchor.text,
    'title_link': title_anchor.get('href'),
    'writer': tag[2].text,
    'pdf_link': pdf_anchor.get('href'),
    'date': tag[4].text,
    'pv': tag[5].text,
}
data

{'stock_item': 'LG전자',
 'stock_link': '/item/main.naver?code=066570',
 'title': 'AI데이터센터 냉각 시장 공략',
 'title_link': 'company_read.naver?nid=77126&page=1',
 'writer': '교보증권',
 'pdf_link': 'https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf',
 'date': '24.09.23',
 'pv': '226'}

In [45]:
# Convert every report row of the listing table into a dict and collect them.


def _anchor_text_and_href(cell):
    """Return ``(text, href)`` of the first <a> inside ``cell``.

    Returns ``(None, None)`` when the cell holds no <a>, so that a row with a
    missing link does not abort the whole crawl with an AttributeError.
    """
    anchor = cell.select_one('a')
    if anchor is None:
        return None, None
    return anchor.text, anchor.get('href')


def _parse_report_row(cells):
    """Build one report record (dict) from the six <td> cells of a table row."""
    stock_item, stock_link = _anchor_text_and_href(cells[0])
    title, title_link = _anchor_text_and_href(cells[1])
    _, pdf_link = _anchor_text_and_href(cells[3])
    return {
        'stock_item': stock_item,
        'stock_link': stock_link,
        'title': title,
        'title_link': title_link,
        'writer': cells[2].text,
        'pdf_link': pdf_link,
        'date': cells[4].text,
        'pv': cells[5].text,
    }


rows = []
for row_element in elements:
    cells = row_element.select('td')
    # Only rows with exactly six cells are treated as report rows; rows with
    # any other cell count (presumably headers / separators) are skipped.
    if len(cells) == 6:
        rows.append(_parse_report_row(cells))

In [47]:
# One DataFrame row per collected report dict; show the last two as a check.
df = pd.DataFrame(data=rows)
df.iloc[-2:]

Unnamed: 0,stock_item,stock_link,title,title_link,writer,pdf_link,date,pv
28,동국S&C,/item/main.naver?code=100130,금리 하락으로 미국에서 훈풍이 불어온다,company_read.naver?nid=77098&page=1,iM증권,https://stock.pstatic.net/stock-research/compa...,24.09.19,1501
29,SOOP,/item/main.naver?code=067160,"경쟁사 스트리머 이적, 트래픽 유입 기대",company_read.naver?nid=77097&page=1,신한투자증권,https://stock.pstatic.net/stock-research/compa...,24.09.19,1455


In [50]:
# file download

In [52]:
import os

# Last two entries of the current working directory listing.
os.listdir('.')[-2:]

['Untitled2.ipynb', 'Untitled3.ipynb']

In [54]:
# Directory that will receive the downloaded report files.
path = 'reports'
# Check whether the directory (or file) already exists.
os.path.exists(path)

False

In [73]:
# Create the download directory if it is missing. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race; it still raises
# if `path` exists but is a regular file rather than a directory.
os.makedirs(path, exist_ok=True)

In [57]:
# Confirm the directory is now present.
reports_dir_exists = os.path.exists(path)
reports_dir_exists

True

In [58]:
# List the current working directory; `path` should appear among the entries.
os.listdir('.')

['.ipynb_checkpoints',
 '.virtual_documents',
 '01_requests_naver_stock.ipynb',
 '02_requests_daum_exchange.ipynb',
 '03_rest_api (1).ipynb',
 '03_rest_api.ipynb',
 '04_requests_zigbang.ipynb',
 '04_requests_zigbang_code.ipynb',
 '05_html.ipynb',
 '06_css_selector.ipynb',
 '07_naver_relational_keywords.ipynb',
 '08_naver_stock_report.ipynb',
 'reports',
 'Untitled.ipynb',
 'untitled.txt',
 'Untitled1.ipynb',
 'Untitled2.ipynb',
 'Untitled3.ipynb']

In [63]:
# Pick the report at index label 0 as the single-file download example.
first_report = df.loc[0]
title = first_report['title']
pdf_link = first_report['pdf_link']
(title, pdf_link)

('AI데이터센터 냉각 시장 공략',
 'https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf')

In [64]:
# Fetch the report PDF itself (pdf_link), not the listing page (url):
# the body of this response is what gets written to disk as the .pdf file.
response = requests.get(pdf_link, timeout=10)
# Fail loudly on 4xx/5xx so an error page is never saved as a PDF.
response.raise_for_status()
response

<Response [200]>

In [66]:
# Report titles are free text and may contain characters that are illegal or
# misinterpreted in file names (e.g. ':' or '/'); replace them with '_' so
# the file lands where expected on every platform.
safe_title = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in title)
filename = f'{path}/{safe_title}.pdf'
print(filename)

# Binary mode: the response body is raw bytes.
with open(filename, 'wb') as file:
    file.write(response.content)

reports/AI데이터센터 냉각 시장 공략.pdf


In [68]:
import shutil

# Remove the download directory together with everything inside it.
# Guarded so that re-running the cell after the directory is already gone
# does not raise FileNotFoundError.
if os.path.exists(path):
    shutil.rmtree(path)
os.path.exists(path)

False

In [69]:
# Demonstrate that listing a removed directory fails. The error is caught and
# displayed so the notebook can still be executed top to bottom.
try:
    os.listdir(path)
except FileNotFoundError as err:
    print(f'FileNotFoundError: {err}')

FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: 'reports'

In [74]:
# Download the PDF of every report in `df` into the `path` directory.

# An earlier cell deletes the directory with shutil.rmtree, so recreate it
# here; otherwise open() below fails when the notebook runs top to bottom.
os.makedirs(path, exist_ok=True)

for idx, row in df.iterrows():
    title, pdf_link = row['title'], row['pdf_link']
    print(idx, title, pdf_link)

    # Skip rows without a usable attachment link (None / NaN / empty).
    if not isinstance(pdf_link, str) or not pdf_link:
        continue

    # Request the report's own PDF link rather than the listing-page `url`,
    # which would store the same HTML page under every file name.
    response = requests.get(pdf_link, timeout=10)
    response.raise_for_status()  # never save an HTTP error page as a PDF

    # Titles may contain characters that are invalid in file names
    # (e.g. ':' in "3Q24 Preview: ..."); replace them with '_'.
    safe_title = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in title)
    filename = f'{path}/{safe_title}.pdf'
    with open(filename, 'wb') as file:
        file.write(response.content)

0 AI데이터센터 냉각 시장 공략 https://stock.pstatic.net/stock-research/company/34/20240923_company_481199000.pdf
1 설계와 시공능력으로 액침냉각 사업 확대 https://stock.pstatic.net/stock-research/company/34/20240923_company_279682000.pdf
2 액침냉각, 기술적 강점을 확보해 나가는 중 https://stock.pstatic.net/stock-research/company/62/20240923_company_647832000.pdf
3 3Q24 Preview: 국내 수주 강세 VS 중국 부진 .. https://stock.pstatic.net/stock-research/company/57/20240923_company_621805000.pdf
4 Re-rating 구간 돌입 https://stock.pstatic.net/stock-research/company/63/20240923_company_747255000.pdf
5 높아지는 Peak sales https://stock.pstatic.net/stock-research/company/29/20240923_company_582032000.pdf
6 Metsera, 너는 계획이 다 있구나 https://stock.pstatic.net/stock-research/company/39/20240923_company_387902000.pdf
7 의대 열풍, 나만 믿어 https://stock.pstatic.net/stock-research/company/21/20240923_company_352867000.pdf
8 속도가 느려도, 방향성은 맞다 https://stock.pstatic.net/stock-research/company/39/20240923_company_584932000.pdf
9 빅파마들의 RPT 방향, 우리도 간다 https://stock.pstatic.net/stoc