## Import

In [1]:
import io
import os
import pickle

import numpy as np
import pandas as pd
import FinanceDataReader as fdr
import pandas_datareader.data as pdr
import yfinance as yf
import requests
from bs4 import BeautifulSoup

## Read Data

In [2]:
# The analysis window runs from October 2013 through June 2023.
# NOTE(review): `end` is two days past June 30 -- presumably so that readers
# treating the end bound as exclusive still return the last June session; confirm.
start = '2013-10-01'
end = '2023-07-02'

#### 주식 데이터
Microsoft, Mastercard, Intel, McDonald's, Apple의 주가 데이터를 수집한다.

In [3]:
# Download daily price history for each ticker over [start, end].
# The generator is consumed left to right by the unpacking, so the requests
# are issued in the same order as the target names.
msft, ma, intc, mcd, aapl = (
    fdr.DataReader(symbol, start, end)
    for symbol in ('MSFT', 'MA', 'INTC', 'MCD', 'AAPL')
)
display(msft)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-10-01,33.349998,33.610001,33.299999,33.580002,28.063431,36718700
2013-10-02,33.360001,34.029999,33.290001,33.919998,28.347569,46946800
2013-10-03,33.880001,34.000000,33.419998,33.860001,28.297424,38703800
2013-10-04,33.689999,33.990002,33.619999,33.880001,28.314150,33008100
2013-10-07,33.599998,33.709999,33.200001,33.299999,27.829426,35069300
...,...,...,...,...,...,...
2023-06-26,333.720001,336.109985,328.489990,328.600006,327.241577,21520600
2023-06-27,331.859985,336.149994,329.299988,334.570007,333.186890,24354100
2023-06-28,334.660004,337.980011,333.809998,335.850006,334.461609,20259500
2023-06-29,334.709991,336.109985,332.619995,335.049988,333.664886,16997000


#### 경제지표
GDP(국내총생산), CPI(소비자물가지수), PPI(생산자물가지수), 실업률, 3년 만기 국채 금리, 달러/유로 환율,<br>
소매 판매, 순수출(무역 수지), S&P500 변동성 지수(VIX), 나스닥 종합지수, WTI 원유 가격, S&P500, SOX(반도체지수)

In [4]:
# FRED series IDs to download, one column per ID.
econ = [
    'GDP', 'CPIAUCNS', 'PPIACO', 'UNRATE', 'GS3', 'DEXUSEU',
    'RSAFS', 'NETEXC', 'VIXCLS', 'NASDAQCOM', 'DCOILWTICO',
]

# Start from a daily calendar and left-join every series onto it, so each
# calendar day keeps a row and days without an observation are left as NaN.
economics = pd.DataFrame({'DATE': pd.date_range('2013-10-01', '2023-07-01')})
for series_id in econ:
    fred_series = pdr.DataReader(series_id, 'fred', start, end)
    economics = economics.merge(fred_series, on='DATE', how='left')

In [5]:
# The S&P 500 and the SOX (semiconductor index) are collected separately.
sp500_close = fdr.DataReader('S&P500', start, end)['Close'].rename('SP500')
sox_adj_close = yf.download("^SOX", start, end)['Adj Close'].rename('SOX')

# Put both series side by side and turn the date index into a 'Date' column.
index_prices = pd.concat([sp500_close, sox_adj_close], axis=1).reset_index()

# NOTE(review): joining with left_on/right_on keeps the right-hand key, so the
# result carries a 'Date' column next to 'DATE' (NaT on days without index data).
# Also, this cell reassigns `economics`, so re-running it without re-running the
# previous cell merges the index columns a second time.
economics = economics.merge(index_prices, left_on='DATE', right_on='Date', how='left')
display(economics)

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,DATE,GDP,CPIAUCNS,PPIACO,UNRATE,GS3,DEXUSEU,RSAFS,NETEXC,VIXCLS,NASDAQCOM,DCOILWTICO,Date,SP500,SOX
0,2013-10-01,17192.019,233.546,202.500,7.2,0.63,1.3534,419288.0,-275.552,15.54,3817.98,102.09,2013-10-01,1695.000000,495.769989
1,2013-10-02,,,,,,1.3592,,,16.60,3815.02,104.15,2013-10-02,1693.869995,495.100006
2,2013-10-03,,,,,,1.3642,,,17.67,3774.34,103.29,2013-10-03,1678.660034,493.799988
3,2013-10-04,,,,,,1.3582,,,16.74,3807.75,103.83,2013-10-04,1690.500000,497.720001
4,2013-10-05,,,,,,,,,,,,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3556,2023-06-27,,,,,,1.0961,,,13.74,13555.67,67.68,2023-06-27,4378.410156,3642.409912
3557,2023-06-28,,,,,,1.0904,,,13.43,13591.75,69.54,2023-06-28,4376.859863,3609.479980
3558,2023-06-29,,,,,,1.0881,,,13.54,13591.33,69.85,2023-06-29,4396.439941,3614.179932
3559,2023-06-30,,,,,,1.0920,,,13.59,13787.92,70.66,2023-06-30,4450.379883,3673.060059


#### 재무제표
stockanalysis.com에서 10년치 분기별 재무제표를 크롤링한다.

In [6]:
def _read_statement_rows(url, selector, row_names, headers):
    """Download one statement page and return only the rows named in `row_names`.

    The first table parsed from the elements matching `selector` is used. Its
    'Quarter Ended' column (which holds the line-item names) is renamed to
    'QuarterEnded', and rows are kept when that column's value is in `row_names`.
    The original row index of the table is preserved.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within the timeout.
        ValueError: if pandas finds no table in the selected HTML.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error status instead of parsing an error page as a table.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    element_tables = soup.select(selector)
    # pandas >= 2.1 deprecates passing literal HTML to read_html; wrap it in StringIO.
    table = pd.read_html(io.StringIO(str(element_tables)))[0].rename(columns={'Quarter Ended': 'QuarterEnded'})
    return table[table['QuarterEnded'].isin(row_names)]


def FS(ticker):
    """Scrape selected quarterly financial-statement items for `ticker` from stockanalysis.com.

    Three pages are read (income statement, balance sheet, ratios), the selected
    line items are stacked, and the result is transposed.

    Args:
        ticker: Ticker symbol as used in the stockanalysis.com URL (e.g. 'msft').

    Returns:
        A DataFrame with one row per period column of the site's tables and one
        column per selected line item. The first transposed row, which holds the
        line-item names, is dropped, so columns are labelled by each item's row
        position in its source table rather than by name. These labels can repeat
        across the three statements, and their order follows the site's row order,
        not the order of the lists below.

    Raises:
        requests.HTTPError, requests.Timeout: on a failed or stalled download.
        ValueError: if a page contains no parsable table.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
    base_url = f"https://stockanalysis.com/stocks/{ticker}/financials/"

    # Revenue, revenue growth, net income, net income growth, profit margin, dividend per share
    income = ['Revenue', 'Revenue Growth (YoY)', 'Net Income', 'Net Income Growth', 'Profit Margin','Dividend Per Share']
    df_income = _read_statement_rows(f"{base_url}?p=quarterly", "table[data-test='financials']", income, headers)

    # Total current assets, book value per share
    # NOTE(review): the original comment said "total assets", but the row selected
    # is 'Total Current Assets' -- confirm which one is intended.
    balance = ['Total Current Assets', 'Book Value Per Share']
    df_balance = _read_statement_rows(f"{base_url}balance-sheet/?p=quarterly", 'html > body', balance, headers)

    # Market cap, PER, PBR, debt-to-equity ratio, quick ratio, return on capital, dividend yield
    ratio = ['Market Capitalization', 'PE Ratio', 'PB Ratio','Debt / Equity Ratio', 'Quick Ratio', 
             'Return on Capital (ROIC)', 'Dividend Yield']
    df_ratio = _read_statement_rows(f"{base_url}ratios/?p=quarterly", 'html > body', ratio, headers)

    # Stack the three selections, flip so periods become rows, and drop the
    # first row (the 'QuarterEnded' line-item names).
    df = pd.concat([df_income, df_balance, df_ratio]).T.iloc[1:]
    return df

In [7]:
# Scrape the financial statements for each ticker; map() calls FS in tuple
# order as the unpacking consumes it.
msft_fs, ma_fs, intc_fs, mcd_fs, aapl_fs = map(
    FS, ('msft', 'ma', 'intc', 'mcd', 'aapl')
)
display(msft_fs)

Unnamed: 0,0,1,13,14,23,27,6,32,0.1,3,5,8,9,12,13.1
2023-09-30,56517,12.76%,22291,26.97%,0.680,39.44%,207586,29.71,2345948,30.43,10.63,0.39,1.45,26.94%,0.90%
2023-06-30,56189,8.34%,20081,19.96%,0.680,35.74%,184257,27.74,2532081,34.99,12.28,0.29,1.54,26.96%,0.80%
2023-03-31,52857,7.08%,18299,9.39%,0.680,34.62%,163889,26.16,2146049,31.09,11.02,0.31,1.66,27.81%,0.90%
2022-12-31,52747,1.97%,16425,-12.47%,0.680,31.14%,157823,24.58,1787732,26.51,9.76,0.33,1.66,29.81%,1.10%
2022-09-30,50122,10.60%,17556,-14.38%,0.620,35.03%,160812,23.28,1736943,24.89,10.01,0.35,1.59,31.79%,1.10%
2022-06-30,51865,12.38%,16740,1.71%,0.620,32.28%,169684,22.29,1920840,26.41,11.53,0.37,1.57,32.04%,0.90%
2022-03-31,49360,18.35%,16728,8.22%,0.620,33.89%,153922,21.74,2311359,31.90,14.19,0.38,1.77,31.35%,0.80%
2021-12-31,51728,20.09%,18765,21.35%,0.620,36.28%,174188,21.32,2525084,35.47,15.78,0.40,2.05,31.02%,0.70%
2021-09-30,45317,21.97%,20505,47.59%,0.560,45.25%,174326,20.23,2118598,31.21,13.94,0.42,1.96,28.73%,0.80%
2021-06-30,46152,21.35%,16458,46.92%,0.560,35.66%,184406,18.87,2040304,33.30,14.37,0.48,1.90,27.37%,0.80%


## Save Data

In [8]:
# Save each ticker's (price history, financial statements) pair to data/<ticker>.pkl.
# The output directory is created if missing so the cell works on a fresh clone.
os.makedirs('data', exist_ok=True)
ticker_payloads = {
    'msft': (msft, msft_fs),
    'ma': (ma, ma_fs),
    'intc': (intc, intc_fs),
    'mcd': (mcd, mcd_fs),
    'aapl': (aapl, aapl_fs),
}
for ticker_name, payload in ticker_payloads.items():
    # `with` closes (and flushes) the file as soon as the dump finishes; the
    # original left the handle open until garbage collection.
    with open(f'data/{ticker_name}.pkl', 'wb') as pkl_file:
        pickle.dump(payload, pkl_file)

In [9]:
# Save the merged economic-indicator frame to data/economics.pkl.
# The directory is created if missing, and `with` closes (and flushes) the file
# as soon as the dump finishes instead of leaving the handle open.
os.makedirs('data', exist_ok=True)
with open('data/economics.pkl', 'wb') as pkl_file:
    pickle.dump(economics, pkl_file)