In [1]:
!pip install pykrx

Defaulting to user installation because normal site-packages is not writeable
Collecting pykrx
  Obtaining dependency information for pykrx from https://files.pythonhosted.org/packages/1e/5a/ffc6741ce31570a968c0c757dc3ff28e0a89f3fbc8d09b4c444a98226c21/pykrx-1.0.45-py3-none-any.whl.metadata
  Downloading pykrx-1.0.45-py3-none-any.whl.metadata (62 kB)
     ---------------------------------------- 62.4/62.4 kB 3.5 MB/s eta 0:00:00
Collecting datetime (from pykrx)
  Obtaining dependency information for datetime from https://files.pythonhosted.org/packages/95/88/3b9d4042b396221a132180b392ab2a174031a6fb579f7927f3909fc183a7/DateTime-5.2-py3-none-any.whl.metadata
  Using cached DateTime-5.2-py3-none-any.whl.metadata (33 kB)
Collecting deprecated (from pykrx)
  Obtaining dependency information for deprecated from https://files.pythonhosted.org/packages/20/8d/778b7d51b981a96554f29136cd59ca7880bf58094338085bcf2a979a0e6a/Deprecated-1.2.14-py2.py3-none-any.whl.metadata
  Downloading Deprecated-1.2.


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
from tqdm import tqdm
from pykrx import stock

# 1. Load the input tables: the listing/price table and the raw financial
#    statements (the latter is stored in cp949 encoding).
pricedata = pd.read_csv('enlist_ver_2.0.csv')
fsdata = pd.read_csv('kor_fs.csv', encoding='cp949')

# 2. Give the financial-statement table its working column names, renumber
#    the rows from zero and save a snapshot to disk.
fs_column_names = [
    '회사명', '종목코드', '날짜', '당기순이익', '자산', '부채', '자본금',
    '이익잉여금', '매출액', '매출총이익', '영업이익', '자본', '유동자산',
]
fsdata.columns = fs_column_names
fsdata = fsdata.reset_index(drop=True)
fsdata.to_csv('fs_ver_1.0.csv')

# 3. Drop every ticker that does not have exactly 12 distinct reporting dates.
#    The distinct dates are counted once per ticker (NaN counts as a value,
#    which matches len(Series.unique())) instead of looping over a separately
#    maintained ticker list, so this step runs on a fresh kernel.
dates_per_ticker = fsdata.groupby('종목코드')['날짜'].nunique(dropna=False)
incomplete_tickers = dates_per_ticker[dates_per_ticker != 12].index
fsdata = fsdata[~fsdata['종목코드'].isin(incomplete_tickers)]

# 4. Keep only the financial-statement rows whose ticker also appears in the
#    price data.
pricelist = pricedata['종목코드'].unique().tolist()

# Collect the per-ticker slices first and concatenate a single time: growing a
# DataFrame with concat inside the loop re-copies everything gathered so far
# on each iteration. Rows stay grouped by ticker, in pricelist order.
matched_frames = [fsdata[fsdata['종목코드'] == code] for code in tqdm(pricelist)]
fsdata_filt = pd.concat(matched_frames) if matched_frames else pd.DataFrame()

# 5. Remove the rows that belong to reporting periods which are not needed.
# TODO: this list is abbreviated -- the trailing `...` is a literal Ellipsis
# placeholder and has to be replaced with the remaining periods to exclude.
date_list = ['2011/03', '2012/03', '2013/03', ...]

# Filter in one pass with a mask built from the same frame that is being
# filtered, so the mask and the rows always share one index.
fsdata_filt_date = fsdata_filt[~fsdata_filt['날짜'].isin(date_list)].copy()

# 6. Derive financial ratios and period-over-period growth rates.
fsdata_cal = fsdata_filt_date.copy()

# Level ratios. The two margin columns are scaled by 100; ROA and asset
# turnover are left as plain ratios.
fsdata_cal['ROA'] = fsdata_cal['당기순이익'] / fsdata_cal['자산']
fsdata_cal['자산회전율'] = fsdata_cal['매출액'] / fsdata_cal['자산']
fsdata_cal['매출액영업이익률'] = fsdata_cal['영업이익'] / fsdata_cal['매출액'] * 100
fsdata_cal['매출총이익률'] = fsdata_cal['매출총이익'] / fsdata_cal['매출액'] * 100

# Growth rates are computed inside each ticker so that the first row of one
# company is never compared with the last row of the previous company.
# Assumes the rows of a ticker are already in chronological order -- TODO confirm.
growth_sources = {
    '매출액증가율': '매출액',
    '영업이익증가율': '영업이익',
    '순이익증가율': '당기순이익',
    '총자본증가율': '자본',
}
for growth_col, source_col in growth_sources.items():
    fsdata_cal[growth_col] = fsdata_cal.groupby('종목코드')[source_col].pct_change()

# Remove the 2011/12 rows once the growth rates have been derived from them.
fsdata_cal = fsdata_cal.drop(fsdata_cal[fsdata_cal['날짜'] == '2011/12'].index)
fsdata_cal['부채비율'] = fsdata_cal['부채'] / fsdata_cal['자본']

# 7. Fetch market capitalisation for each date and join it onto the
#    fundamentals.
# NOTE(review): date_list holds the 'YYYY/MM' period labels that step 5 removes
# from the fundamentals, so as written no '날짜' value can match below; pykrx
# presumably expects 'YYYYMMDD' trading dates -- confirm which dates should be
# queried here and how they map onto the '날짜' labels.
cap_frames = []
for date in tqdm(date_list):
    date_data = stock.get_market_cap_by_ticker(date)
    date_data['날짜'] = date
    cap_frames.append(date_data)

# Concatenate once after the loop instead of re-copying the accumulated frame
# on every iteration.
result = pd.concat(cap_frames, axis=0) if cap_frames else pd.DataFrame()
result.reset_index(inplace=True)

# Join on (ticker, date). A column-wise concat would pair rows purely by index
# position, attaching a market cap to an unrelated company/period.
# reset_index() places the former index first; presumably that is the ticker
# -- TODO confirm, and confirm '종목코드' uses the same ticker strings (a CSV
# round-trip can strip leading zeros). The left join keeps every fundamentals
# row; rows without a market-cap match get NaN ratios.
ticker_col = result.columns[0]
fs_cap_data = fsdata_cal.merge(
    result,
    left_on=['종목코드', '날짜'],
    right_on=[ticker_col, '날짜'],
    how='left',
)

# Valuation multiples.
# NOTE(review): PBR uses 자본금 + 이익잉여금 as book value rather than the
# 자본 column -- confirm this is intended.
fs_cap_data['PER'] = fs_cap_data['시가총액'] / fs_cap_data['당기순이익']
fs_cap_data['PBR'] = fs_cap_data['시가총액'] / (fs_cap_data['자본금'] + fs_cap_data['이익잉여금'])
fs_cap_data['PSR'] = fs_cap_data['시가총액'] / fs_cap_data['매출액']

# 8. Pre-process the price data. From here on `pricedata` refers to the price
#    history, replacing the table loaded under the same name in step 1.
pricedata = pd.read_csv('price_ver_3.0.csv')
pricedata.columns = ['X', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', '종목코드']
pricedata = pricedata.drop(columns='X')
pricedata['Date'] = pd.to_datetime(pricedata['Date'])

# Order by ticker, then date, and index by Date. A plain sort + set_index
# gives the same layout as applying set_index/sort_index per group, without
# relying on groupby.apply passing the grouping column to the function (newer
# pandas deprecates that, which would lose '종목코드'). Rows without a ticker
# are dropped first, as grouping by '종목코드' would have skipped them.
pricedata = (
    pricedata
    .dropna(subset=['종목코드'])
    .sort_values(['종목코드', 'Date'])
    .set_index('Date')
)