# 금융 개별종목 수집
* html 파일 읽어오기(requests라이브러리, BeautifulSoup라이브러리)
* 결측 데이터 제거하기
* 데이터 프레임합치기
* 중복데이터 제거하기
* 날짜 column의 첫 row값 확인
* 파일로 저장하고 읽어오기

##  라이브러리 로드

In [1]:
# 라이브러리 로드
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

## html 파일 읽어오기

In [4]:
# 종목 URL
item_code = "373220"
page_no = 1
url = f"https://finance.naver.com/item/sise_day.naver?code={item_code}&page={page_no}"

In [5]:
# User-Agent 인증 (브라우저 인증)
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"}
print(headers["user-agent"])

Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36


In [6]:
# requests로 html 요청
response = requests.get(url, headers=headers)
response.status_code

200

In [1]:
# BeautifulSoup으로 html 파싱
html = bs(response.text, 'lxml')
html

NameError: name 'bs' is not defined

In [8]:
tables = html.find_all('table')
tables

[<table cellspacing="0" class="type2">
 <tr>
 <th>날짜</th>
 <th>종가</th>
 <th>전일비</th>
 <th>시가</th>
 <th>고가</th>
 <th>저가</th>
 <th>거래량</th>
 </tr>
 <tr>
 <td colspan="7" height="8"></td>
 </tr>
 <tr onmouseout="mouseOut(this)" onmouseover="mouseOver(this)">
 <td align="center"><span class="tah p10 gray03">2022.05.19</span></td>
 <td class="num"><span class="tah p11">415,500</span></td>
 <td class="num">
 <img alt="상승" height="6" src="https://ssl.pstatic.net/imgstock/images/images4/ico_up.gif" style="margin-right:4px;" width="7"/><span class="tah p11 red02">
 				5,500
 				</span>
 </td>
 <td class="num"><span class="tah p11">398,500</span></td>
 <td class="num"><span class="tah p11">417,000</span></td>
 <td class="num"><span class="tah p11">398,500</span></td>
 <td class="num"><span class="tah p11">318,385</span></td>
 </tr>
 <tr onmouseout="mouseOut(this)" onmouseover="mouseOver(this)">
 <td align="center"><span class="tah p10 gray03">2022.05.18</span></td>
 <td class="num"><span class

## pandas의 read_html로 값을 데이터프레임으로 수집하기

In [9]:
# 네이버의 일별 시세는 cp949 인코딩
table = pd.read_html(str(tables), encoding="cp949")
table

[            날짜        종가      전일비        시가        고가        저가       거래량
 0          NaN       NaN      NaN       NaN       NaN       NaN       NaN
 1   2022.05.19  415500.0   5500.0  398500.0  417000.0  398500.0  318385.0
 2   2022.05.18  410000.0   1500.0  410000.0  413000.0  407500.0  233063.0
 3   2022.05.17  408500.0   8000.0  400500.0  411000.0  399000.0  230327.0
 4   2022.05.16  400500.0      0.0  407000.0  407500.0  397000.0  200731.0
 5   2022.05.13  400500.0  12500.0  390500.0  401500.0  390500.0  228636.0
 6          NaN       NaN      NaN       NaN       NaN       NaN       NaN
 7          NaN       NaN      NaN       NaN       NaN       NaN       NaN
 8          NaN       NaN      NaN       NaN       NaN       NaN       NaN
 9   2022.05.12  388000.0   3500.0  390000.0  395500.0  386000.0  251570.0
 10  2022.05.11  391500.0   2000.0  394000.0  398000.0  391500.0  239355.0
 11  2022.05.10  393500.0    500.0  387500.0  398500.0  382000.0  407928.0
 12  2022.05.09  394000.0

## 결측 데이터 제거

In [10]:
# dropna를 통해 결측치가 들어있는 row 제거
temp = table[0].dropna()
temp

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2022.05.19,415500.0,5500.0,398500.0,417000.0,398500.0,318385.0
2,2022.05.18,410000.0,1500.0,410000.0,413000.0,407500.0,233063.0
3,2022.05.17,408500.0,8000.0,400500.0,411000.0,399000.0,230327.0
4,2022.05.16,400500.0,0.0,407000.0,407500.0,397000.0,200731.0
5,2022.05.13,400500.0,12500.0,390500.0,401500.0,390500.0,228636.0
9,2022.05.12,388000.0,3500.0,390000.0,395500.0,386000.0,251570.0
10,2022.05.11,391500.0,2000.0,394000.0,398000.0,391500.0,239355.0
11,2022.05.10,393500.0,500.0,387500.0,398500.0,382000.0,407928.0
12,2022.05.09,394000.0,7500.0,397000.0,400000.0,390000.0,289113.0
13,2022.05.06,401500.0,0.0,395000.0,401500.0,390000.0,350875.0


# 전체 일자 데이터 수집하기

In [11]:
# 페이지별 데이터 수집 함수 만들기
def get_day_list(item_code, page_no):
    """
    일자별 시세를 페이지별로 수집
    """ 
    url = f"https://finance.naver.com/item/sise_day.naver?code={item_code}&page={page_no}"
    headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"}
    
    response = requests.get(url, headers=headers)
    
    html = bs(response.text, 'lxml')
    tables = html.select('table')
    table = pd.read_html(str(tables), encoding="cp949")
    temp = table[0].dropna()
    
    return temp

In [12]:
# 반복문을 통한 전체 일자 데이터 수집하기
import time
# web page 시작번호
page_no = 1
#데이터를 저장할 빈 변수 선언
item_list = []
item_code = "373220"
item_name = "LG에너지솔루션"

prev = ""

while True:
    print(page_no)
    
    df_one_page = get_day_list(item_code, page_no)
    # 마지막 날짜를 가져옴
    curr = df_one_page.iloc[-1, 0]
    # 마지막 날짜를 비교했을 때 같으면 반복문을 종료
    if curr == prev:
        break
    
    item_list.append(df_one_page)
    page_no = page_no + 1
    prev = curr

1
2
3
4
5
6
7
8
9


In [13]:
# 데이터프레임으로 합치기
df = pd.concat(item_list)

In [14]:
# 종목코드, 종목명 컬럼 추가
df["종목코드"] = item_code
df["종목명"] = item_name
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종목코드,종목명
1,2022.05.19,415500.0,5500.0,398500.0,417000.0,398500.0,318385.0,373220,LG에너지솔루션
2,2022.05.18,410000.0,1500.0,410000.0,413000.0,407500.0,233063.0,373220,LG에너지솔루션
3,2022.05.17,408500.0,8000.0,400500.0,411000.0,399000.0,230327.0,373220,LG에너지솔루션
4,2022.05.16,400500.0,0.0,407000.0,407500.0,397000.0,200731.0,373220,LG에너지솔루션
5,2022.05.13,400500.0,12500.0,390500.0,401500.0,390500.0,228636.0,373220,LG에너지솔루션


In [15]:
# 컬럼 순서 변경하기
cols = ['종목코드', '종목명', '날짜', '종가', '전일비', '시가', '고가', '저가', '거래량']
df = df[cols]
df.head()

Unnamed: 0,종목코드,종목명,날짜,종가,전일비,시가,고가,저가,거래량
1,373220,LG에너지솔루션,2022.05.19,415500.0,5500.0,398500.0,417000.0,398500.0,318385.0
2,373220,LG에너지솔루션,2022.05.18,410000.0,1500.0,410000.0,413000.0,407500.0,233063.0
3,373220,LG에너지솔루션,2022.05.17,408500.0,8000.0,400500.0,411000.0,399000.0,230327.0
4,373220,LG에너지솔루션,2022.05.16,400500.0,0.0,407000.0,407500.0,397000.0,200731.0
5,373220,LG에너지솔루션,2022.05.13,400500.0,12500.0,390500.0,401500.0,390500.0,228636.0


In [16]:
# 중복데이터 제거하기
df.drop_duplicates()

Unnamed: 0,종목코드,종목명,날짜,종가,전일비,시가,고가,저가,거래량
1,373220,LG에너지솔루션,2022.05.19,415500.0,5500.0,398500.0,417000.0,398500.0,318385.0
2,373220,LG에너지솔루션,2022.05.18,410000.0,1500.0,410000.0,413000.0,407500.0,233063.0
3,373220,LG에너지솔루션,2022.05.17,408500.0,8000.0,400500.0,411000.0,399000.0,230327.0
4,373220,LG에너지솔루션,2022.05.16,400500.0,0.0,407000.0,407500.0,397000.0,200731.0
5,373220,LG에너지솔루션,2022.05.13,400500.0,12500.0,390500.0,401500.0,390500.0,228636.0
...,...,...,...,...,...,...,...,...,...
1,373220,LG에너지솔루션,2022.02.07,548000.0,44000.0,520000.0,548000.0,511000.0,1911176.0
2,373220,LG에너지솔루션,2022.02.04,504000.0,27000.0,476500.0,505000.0,476000.0,2088996.0
3,373220,LG에너지솔루션,2022.02.03,477000.0,27000.0,458000.0,495500.0,441000.0,2918435.0
4,373220,LG에너지솔루션,2022.01.28,450000.0,55000.0,476000.0,483000.0,445000.0,4559773.0


In [17]:
# 중복데이터가 있는지 확인하기
df.duplicated().sum()

0

In [18]:
# 기술통계값 구하기
df.describe()

Unnamed: 0,종가,전일비,시가,고가,저가,거래량
count,75.0,75.0,75.0,75.0,75.0,75.0
mean,429573.333333,10900.0,430953.333333,438753.333333,421040.0,970808.7
std,35904.68739,14496.271655,40763.50492,42390.110184,31833.685202,1934300.0
min,359500.0,0.0,363000.0,367500.0,355000.0,115652.0
25%,408250.0,2250.0,405500.0,412500.0,398750.0,317397.0
50%,431000.0,6000.0,432000.0,436500.0,418500.0,413972.0
75%,441500.0,12750.0,443250.0,448250.0,437250.0,889387.0
max,548000.0,92000.0,597000.0,598000.0,524000.0,15946990.0


In [19]:
item_list = []
item_code = "373220"
item_name = "LG에너지솔루션"

In [20]:
def get_item_list(item_code, item_name):
    """
    일별 시세를 수집하는 함수
    """
    page_no = 1
    # 데이터를 저장할 빈 변수 선언
    item_list = []
    prev = ""

    while True:
        df_one_page = get_day_list(item_code, page_no)
        # 마지막 날짜를 가져옴
        curr = df_one_page.iloc[-1, 0]
        # 마지막 날짜를 비교했을 때 같으면 반복문을 종료
        if curr == prev:
            print(f'{page_no}쪽 완료')
            break

        item_list.append(df_one_page)
        
        # 짝수 페이지에서만 * 를 출력하게
        if page_no % 2 == 0:
            print("*", end="")
            
        page_no = page_no + 1
        # 현재 날짜를 이전 날짜 변수에 담아서 다음번에 비교합니다.
        prev = curr
    
    # 수집한 내용을 하나로 합치기
    df = pd.concat(item_list)
    # 파일명을 위해 마지막 날짜 가져오기
    date = df.iloc[0]['날짜']
    # 파일명 만들기
    file_name = f"news_{item_code}_{item_name}_{date}.csv"
    # 파일로 저장하기
    df.to_csv(file_name, index=False)
    
    return df
    

In [21]:
get_item_list(item_code, item_name)

****9쪽 완료


Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2022.05.19,415500.0,5500.0,398500.0,417000.0,398500.0,318385.0
2,2022.05.18,410000.0,1500.0,410000.0,413000.0,407500.0,233063.0
3,2022.05.17,408500.0,8000.0,400500.0,411000.0,399000.0,230327.0
4,2022.05.16,400500.0,0.0,407000.0,407500.0,397000.0,200731.0
5,2022.05.13,400500.0,12500.0,390500.0,401500.0,390500.0,228636.0
...,...,...,...,...,...,...,...
1,2022.02.07,548000.0,44000.0,520000.0,548000.0,511000.0,1911176.0
2,2022.02.04,504000.0,27000.0,476500.0,505000.0,476000.0,2088996.0
3,2022.02.03,477000.0,27000.0,458000.0,495500.0,441000.0,2918435.0
4,2022.01.28,450000.0,55000.0,476000.0,483000.0,445000.0,4559773.0


# 파일로 저장하기

In [22]:
# 날짜 컬럼
# date
date = df.iloc[0]['날짜']

In [23]:
# 파일명 만들기
file_name = f"news_{item_code}_{item_name}_{date}.csv"

# 파일로 저장하기
df.to_csv(file_name, index=False)

In [24]:
# 저장된 파일 확인하기
pd.read_csv('news_373220_LG에너지솔루션_2022.05.18.csv')

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
0,2022.05.18,410500.0,2000.0,410000.0,413000.0,408500.0,74923.0
1,2022.05.17,408500.0,8000.0,400500.0,411000.0,399000.0,230327.0
2,2022.05.16,400500.0,0.0,407000.0,407500.0,397000.0,200731.0
3,2022.05.13,400500.0,12500.0,390500.0,401500.0,390500.0,228636.0
4,2022.05.12,388000.0,3500.0,390000.0,395500.0,386000.0,251570.0
...,...,...,...,...,...,...,...
69,2022.02.07,548000.0,44000.0,520000.0,548000.0,511000.0,1911176.0
70,2022.02.04,504000.0,27000.0,476500.0,505000.0,476000.0,2088996.0
71,2022.02.03,477000.0,27000.0,458000.0,495500.0,441000.0,2918435.0
72,2022.01.28,450000.0,55000.0,476000.0,483000.0,445000.0,4559773.0
