In [1]:
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## 데이터 수집

In [2]:
# Naver Finance ETF item-list API endpoint. The query string asks for
# etfType=0 with results ordered by market_sum, descending.
url = (
    'https://finance.naver.com/api/sise/etfItemList.nhn'
    '?etfType=0&targetColumn=market_sum&sortOrder=desc'
)

In [3]:
# Send a browser-like User-Agent with the request.
# NOTE(review): presumably the endpoint rejects or throttles the default
# python-requests User-Agent -- confirm.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.141 Whale/3.15.136.29 Safari/537.36'}

# The timeout keeps this cell from hanging indefinitely on a stalled
# connection; raise_for_status() surfaces HTTP errors here instead of as a
# confusing JSON/KeyError in the next cell.
res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()

In [6]:
# Parse the response body as JSON and pull out the list of ETF records,
# which is nested under result -> etfItemList.
etfItemList = res.json()['result']['etfItemList']
# Show how many ETF records were returned.
len(etfItemList)

593

## 데이터 프레임 변환

In [7]:
# Turn the list of ETF records into a DataFrame.
df = pd.DataFrame(etfItemList)
# Preview the first rows to check that the columns look as expected.
df.head()

Unnamed: 0,itemcode,etfTabCode,itemname,nowVal,risefall,changeVal,changeRate,nav,threeMonthEarnRate,quant,amonut,marketSum
0,69500,1,KODEX 200,31425,5,-155,-0.49,31510.0,-10.6016,2100418,65964,52118
1,371460,4,TIGER 차이나전기차SOLACTIVE,17125,5,-365,-2.09,17176.0,32.8018,6904743,119235,37908
2,122630,3,KODEX 레버리지,14780,5,-150,-1.0,14837.0,-21.4418,20353741,300394,24128
3,133690,4,TIGER 미국나스닥100,69990,5,-1230,-1.73,69780.0,-6.4225,99724,6986,20724
4,102110,1,TIGER 200,31495,5,-150,-0.47,31576.0,-10.5467,820110,25811,20362


## 데이터 저장

In [8]:
# Stamp the raw snapshot with today's date (YYYY-MM-DD) so each day's
# download is written to its own file.
# `datetime` is imported in the notebook's top import cell.
date = datetime.today().strftime('%Y-%m-%d')

# NOTE(review): the "_row" suffix presumably means "raw". It is left as-is
# because a later cell derives the processed file name by stripping exactly
# this suffix.
file_name = f'etf_{date}_row.csv'
df.to_csv(file_name, index=False)

## 데이터 확인

In [13]:
# Reload the snapshot that was just written, using `file_name` instead of a
# hard-coded dated path so the notebook still runs top to bottom on any day.
# itemcode is read as object (string) so the codes are not parsed as integers
# and keep any leading zeros.
df = pd.read_csv(file_name, dtype={'itemcode': object})
df.head()

Unnamed: 0,itemcode,etfTabCode,itemname,nowVal,risefall,changeVal,changeRate,nav,threeMonthEarnRate,quant,amonut,marketSum
0,69500,1,KODEX 200,31425,5,-155,-0.49,31510.0,-10.6016,2100418,65964,52118
1,371460,4,TIGER 차이나전기차SOLACTIVE,17125,5,-365,-2.09,17176.0,32.8018,6904743,119235,37908
2,122630,3,KODEX 레버리지,14780,5,-150,-1.0,14837.0,-21.4418,20353741,300394,24128
3,133690,4,TIGER 미국나스닥100,69990,5,-1230,-1.73,69780.0,-6.4225,99724,6986,20724
4,102110,1,TIGER 200,31495,5,-150,-0.47,31576.0,-10.5467,820110,25811,20362


In [14]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593 entries, 0 to 592
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   itemcode            593 non-null    object 
 1   etfTabCode          593 non-null    int64  
 2   itemname            593 non-null    object 
 3   nowVal              593 non-null    int64  
 4   risefall            593 non-null    int64  
 5   changeVal           593 non-null    int64  
 6   changeRate          593 non-null    float64
 7   nav                 593 non-null    float64
 8   threeMonthEarnRate  551 non-null    float64
 9   quant               593 non-null    int64  
 10  amonut              593 non-null    int64  
 11  marketSum           593 non-null    int64  
dtypes: float64(3), int64(7), object(2)
memory usage: 55.7+ KB


#### quant를 기준으로 내림차순 정렬한 후 상위 10개 추출

In [18]:
# Order rows by quant, largest first, and keep the first ten.
df.sort_values(by='quant', ascending=False).iloc[:10]

Unnamed: 0,itemcode,etfTabCode,itemname,nowVal,risefall,changeVal,changeRate,nav,threeMonthEarnRate,quant,amonut,marketSum
5,252670,3,KODEX 200선물인버스2X,3190,2,30,0.95,3192.0,21.5384,138661549,443280,20272
39,251340,3,KODEX 코스닥150선물인버스,5025,5,-15,-0.3,5027.0,9.8039,29203379,147557,4166
22,114800,3,KODEX 인버스,4925,2,25,0.51,4925.0,10.9852,28677568,141280,9264
2,122630,3,KODEX 레버리지,14780,5,-150,-1.0,14837.0,-21.4418,20353741,300394,24128
21,233740,3,KODEX 코스닥150레버리지,8545,2,70,0.83,8646.0,-25.3304,14756284,124973,9374
1,371460,4,TIGER 차이나전기차SOLACTIVE,17125,5,-365,-2.09,17176.0,32.8018,6904743,119235,37908
74,271050,5,KODEX WTI원유선물인버스(H),4305,5,-150,-3.37,4304.0,-0.1122,5453353,23497,1761
65,217770,5,TIGER 원유선물인버스(H),3000,5,-105,-3.38,2991.0,-0.4808,5307544,15918,2219
27,229200,1,KODEX 코스닥150,11060,2,50,0.45,11197.0,-12.0116,3862300,42539,6420
94,252710,3,TIGER 200선물인버스2X,3340,2,30,0.91,3341.0,22.1402,3188865,10673,1313


#### etfTabCode를 기준으로 빈도수 확인하기

In [19]:
# Count rows per etfTabCode, then order the result by code rather than by frequency.
df['etfTabCode'].value_counts().sort_index()

1     66
2    210
3     37
4    149
5     18
6     62
7     51
Name: etfTabCode, dtype: int64

## 데이터 전처리

#### etfTabName 생성

In [27]:
# Category labels, one per line. After splitting on newlines, each label's
# position is used as the lookup index for an etfTabCode value.
etfcode = (
    '전체\n'
    '국내 시장지수\n'
    '국내 업종/테마\n'
    '국내 파생\n'
    '해외 주식\n'
    '원자재\n'
    '채권\n'
    '기타'
)

In [28]:
# Display the raw string to confirm the labels are separated by newlines.
etfcode

'전체\n국내 시장지수\n국내 업종/테마\n국내 파생\n해외 주식\n원자재\n채권\n기타'

In [29]:
# Split etfcode on newlines to build the etf_tab_name list;
# list position i holds the label for tab code i.
etf_tab_name = etfcode.split('\n')

In [30]:
# Display the resulting list of category labels.
etf_tab_name

['전체', '국내 시장지수', '국내 업종/테마', '국내 파생', '해외 주식', '원자재', '채권', '기타']

In [32]:
def find_etf_tab_name(no):
    """Look up the ETF category label for a tab code.

    Args:
        no: Integer index into the notebook-level ``etf_tab_name`` list.

    Returns:
        The element of ``etf_tab_name`` stored at position ``no``.

    Raises:
        IndexError: If ``no`` is outside the bounds of ``etf_tab_name``.
    """
    tab_label = etf_tab_name[no]
    return tab_label

In [33]:
# Add a human-readable category column by looking up each etfTabCode in
# etf_tab_name. The column name was previously misspelled 'etfTabNaem'.
# A later cell relabels every column by position, so nothing downstream
# refers to this name directly.
df['etfTabName'] = df['etfTabCode'].map(find_etf_tab_name)

In [34]:
# Preview the frame to verify the new category column.
df.head()

Unnamed: 0,itemcode,etfTabCode,itemname,nowVal,risefall,changeVal,changeRate,nav,threeMonthEarnRate,quant,amonut,marketSum,etfTabNaem
0,69500,1,KODEX 200,31425,5,-155,-0.49,31510.0,-10.6016,2100418,65964,52118,국내 시장지수
1,371460,4,TIGER 차이나전기차SOLACTIVE,17125,5,-365,-2.09,17176.0,32.8018,6904743,119235,37908,해외 주식
2,122630,3,KODEX 레버리지,14780,5,-150,-1.0,14837.0,-21.4418,20353741,300394,24128,국내 파생
3,133690,4,TIGER 미국나스닥100,69990,5,-1230,-1.73,69780.0,-6.4225,99724,6986,20724,해외 주식
4,102110,1,TIGER 200,31495,5,-150,-0.47,31576.0,-10.5467,820110,25811,20362,국내 시장지수


#### 컬럼명 변경

In [45]:
# Korean display names for the columns, one per line, listed in the same
# order as the DataFrame's columns (they are assigned by position later).
col_name = (
    '종목코드\n'
    '탭코드\n'
    '종목명\n'
    '현재가\n'
    '등락구분\n'
    '전일비\n'
    '등락률\n'
    '순자산가치(NAV)\n'
    '3개월수익률\n'
    '거래량\n'
    '거래대금(백만)\n'
    '시가총액(억)\n'
    '유형'
)

In [46]:
# Split the newline-separated string into a list of column labels.
# NOTE(review): this rebinds col_name from str to list, so re-running this
# cell on its own raises AttributeError (a list has no .split).
col_name = col_name.split('\n')

In [47]:
# Replace all column labels at once. The assignment is positional: col_name
# must list exactly one label per existing column, in the current column order.
df.columns = col_name

In [48]:
# Show the first two rows to confirm the renamed columns.
df.head(2)

Unnamed: 0,종목코드,탭코드,종목명,현재가,등락구분,전일비,등락률,순자산가치(NAV),3개월수익률,거래량,거래대금(백만),시가총액(억),유형
0,69500,1,KODEX 200,31425,5,-155,-0.49,31510.0,-10.6016,2100418,65964,52118,국내 시장지수
1,371460,4,TIGER 차이나전기차SOLACTIVE,17125,5,-365,-2.09,17176.0,32.8018,6904743,119235,37908,해외 주식


#### 브랜드 컬럼 추가

In [63]:
# The brand is the first space-separated token of the product name,
# e.g. "KODEX 200" -> "KODEX".
df['브랜드'] = df['종목명'].str.split(' ', n=1).str[0]

In [68]:
# Show product name and extracted brand side by side for a visual check.
df[['종목명','브랜드']]

Unnamed: 0,종목명,브랜드
0,KODEX 200,KODEX
1,TIGER 차이나전기차SOLACTIVE,TIGER
2,KODEX 레버리지,KODEX
3,TIGER 미국나스닥100,TIGER
4,TIGER 200,TIGER
...,...,...
588,KBSTAR 모멘텀밸류,KBSTAR
589,KBSTAR 200에너지화학,KBSTAR
590,KBSTAR 200생활소비재,KBSTAR
591,KBSTAR 200산업재,KBSTAR


#### 인버스 컬럼 생성
- 인버스는 지수가 하락하면 수익률이 오르는 상품

In [69]:
# Flag rows whose product name contains the substring "인버스" (inverse).

df['인버스'] = df['종목명'].str.contains('인버스')

In [70]:
# Frequency of True/False in the inverse flag.
df['인버스'].value_counts()

False    552
True      41
Name: 인버스, dtype: int64

In [71]:
# Share of each flag value, expressed as a percentage.
100 * df['인버스'].value_counts(normalize=True)

False    93.086003
True      6.913997
Name: 인버스, dtype: float64

#### 레버리지 컬럼 생성
- 레버리지는 타인의 자본을 지렛대처럼 이용하여 자기 자본의 이익률을 높임

In [72]:
# Flag rows whose product name contains the substring "레버리지" (leverage).

df['레버리지'] = df['종목명'].str.contains('레버리지')

In [73]:
# Frequency of True/False in the leverage flag.

df['레버리지'].value_counts()

False    555
True      38
Name: 레버리지, dtype: int64

In [74]:
# Share of each flag value, expressed as a percentage.
100 * df['레버리지'].value_counts(normalize=True)

False    93.591906
True      6.408094
Name: 레버리지, dtype: float64

#### 환헤지(H) 컬럼 생성

In [77]:
# Flag rows whose product name ends with "H)".
# NOTE(review): presumably this suffix marks currency-hedged products -- confirm.

df['환헤지(H)'] = df['종목명'].str.endswith('H)')

In [78]:
# Frequency of True/False in the hedge flag.
df['환헤지(H)'].value_counts()

False    526
True      67
Name: 환헤지(H), dtype: int64

In [79]:
# Share of each flag value, expressed as a percentage.
100 * df['환헤지(H)'].value_counts(normalize=True)

False    88.701518
True     11.298482
Name: 환헤지(H), dtype: float64

#### 등락구분 컬럼 빈도수

In [81]:
# Frequency of each 등락구분 (rise/fall indicator) code.
# NOTE(review): judging by the sign of 전일비 in the rows shown, 2 presumably
# means up and 5 down, with 3 presumably unchanged -- confirm.
df['등락구분'].value_counts() # observed codes: 5, 2, 3

5    379
2    173
3     41
Name: 등락구분, dtype: int64

In [82]:
# Display the frame with the derived columns (브랜드, 인버스, 레버리지, 환헤지(H))
# for a final visual check before saving.
df

Unnamed: 0,종목코드,탭코드,종목명,현재가,등락구분,전일비,등락률,순자산가치(NAV),3개월수익률,거래량,거래대금(백만),시가총액(억),유형,브랜드,인버스,레버리지,환헤지(H)
0,069500,1,KODEX 200,31425,5,-155,-0.49,31510.0,-10.6016,2100418,65964,52118,국내 시장지수,KODEX,False,False,False
1,371460,4,TIGER 차이나전기차SOLACTIVE,17125,5,-365,-2.09,17176.0,32.8018,6904743,119235,37908,해외 주식,TIGER,False,False,False
2,122630,3,KODEX 레버리지,14780,5,-150,-1.00,14837.0,-21.4418,20353741,300394,24128,국내 파생,KODEX,False,True,False
3,133690,4,TIGER 미국나스닥100,69990,5,-1230,-1.73,69780.0,-6.4225,99724,6986,20724,해외 주식,TIGER,False,False,False
4,102110,1,TIGER 200,31495,5,-150,-0.47,31576.0,-10.5467,820110,25811,20362,국내 시장지수,TIGER,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588,252720,2,KBSTAR 모멘텀밸류,11900,5,-90,-0.75,11930.0,-13.5732,7,0,12,국내 업종/테마,KBSTAR,False,False,False
589,284990,2,KBSTAR 200에너지화학,9865,2,65,0.66,9890.0,-11.8706,72,0,10,국내 업종/테마,KBSTAR,False,False,False
590,287330,2,KBSTAR 200생활소비재,6555,5,-20,-0.30,6589.0,-9.0344,2575,16,9,국내 업종/테마,KBSTAR,False,False,False
591,287320,2,KBSTAR 200산업재,10795,2,65,0.61,10820.0,-8.6887,8,0,9,국내 업종/테마,KBSTAR,False,False,False


In [83]:
# Display the raw snapshot's file name, from which the output name is derived next.
file_name

'etf_2022-07-19_row.csv'

In [85]:
# Derive the processed file's name by removing the '_row' marker from the
# raw file name (str.replace removes every occurrence of the substring).
save_file_name = file_name.replace('_row','')
# Display the resulting name.
save_file_name

'etf_2022-07-19.csv'

In [86]:
# Write the processed frame to CSV without the index column.
df.to_csv(save_file_name, index=False)