In [22]:
# 必要なパッケージのインストール

In [1]:
# Uncomment to install the packages this notebook depends on.
# Prefer `%pip install ...` over `!pip install ...` so the install targets the
# environment of the running kernel.
#!pip install pandas
#!pip install datapackage
#!pip install SQLAlchemy
#!pip install tableschema-sql

In [23]:
# 銘柄データを取得してSQLiteで保存
# 参考（https://rainbow-engine.com/python-sqlalchemy-importcsv/）

In [15]:
# Build the SQLAlchemy engine backed by a local SQLite database file.
from sqlalchemy import create_engine

engine = create_engine(
    'sqlite:///periodic-table-datapackage_company.db'
)

In [20]:
# Load the S&P 500 constituents data package from its DataHub descriptor URL.
from datapackage import Package

package = Package(
    'https://datahub.io/core/s-and-p-500-companies/datapackage.json'
)
# Uncomment to list the resources bundled in the package:
#package.resource_names

['validation_report', 'constituents_csv', 'constituents_json', 's-and-p-500-companies_zip', 'constituents']


In [21]:
# Persist the package through its SQL storage backend, using the database behind `engine`.
package.save(engine=engine, storage='sql')

Storage <Engine(sqlite:///periodic-table-datapackage_company.db)/None>

In [35]:
# 銘柄情報テーブルの内容確認

In [33]:
# Reflect the stored `constituents_csv` table so its columns can be inspected.
from sqlalchemy import Column, Integer, MetaData, String, Table

tbl = Table(
    'constituents_csv',
    MetaData(),
    autoload_with=engine,
)

In [34]:
# Show the column names of the reflected table (`columns` is the long form of `c`).
print(tbl.columns.keys())

['Symbol', 'Name', 'Sector']


In [38]:
# Count the rows stored in the constituents table.
# Engine.execute() is deprecated since SQLAlchemy 1.4 and removed in 2.0, so the
# statement is wrapped in text() and run on an explicitly opened connection; the
# `with` block releases the connection afterwards.
from sqlalchemy import text

with engine.connect() as conn:
    print(list(conn.execute(text('SELECT count() from constituents_csv'))))

[(505,)]


In [26]:
# SQLで銘柄情報を取得

In [80]:
# Preview the `Symbol` column for the first ten rows of the constituents table.
# Engine.execute() is deprecated since SQLAlchemy 1.4 and removed in 2.0, so the
# statement is wrapped in text() and run on an explicitly opened connection.
from sqlalchemy import text

with engine.connect() as conn:
    # Alternative query: every row whose Name contains "Bank".
    #print(list(conn.execute(text('SELECT * from constituents_csv where Name like \'%Bank%\''))))
    print(list(conn.execute(text('SELECT Symbol from constituents_csv limit 10'))))

[('3M Company',), ('A.O. Smith Corp',), ('Abbott Laboratories',), ('AbbVie Inc.',), ('Accenture plc',), ('Activision Blizzard',), ('Acuity Brands Inc',), ('Adobe Systems Inc',), ('Advance Auto Parts',), ('Advanced Micro Devices Inc',)]


In [33]:
# 追加情報付きのデータ取得

In [2]:
# Load the S&P 500 companies-with-financials data package from its DataHub descriptor URL.
from datapackage import Package

package = Package(
    'https://datahub.io/core/s-and-p-500-companies-financials/datapackage.json'
)

In [34]:
## 取得したデータ内容の確認

In [17]:
# List the names of the resources bundled in the financials data package.
package.resource_names
# Uncomment to read the contents of the financials CSV resource:
# package.get_resource('constituents-financials_csv').read()

['validation_report',
 'constituents_csv',
 'constituents-financials_csv',
 'constituents_json',
 'constituents-financials_json',
 's-and-p-500-companies-financials_zip',
 'constituents',
 'constituents-financials']

In [27]:
# Point `engine` at a separate work database, then store the financials package in it.
# Note that this rebinds `engine`: cells below talk to the "-wk" database file.
engine = create_engine(
    'sqlite:///periodic-table-datapackage_company-wk.db'
)
package.save(engine=engine, storage='sql')

Storage <Engine(sqlite:///periodic-table-datapackage_company-wk.db)/None>

In [35]:
## 登録されたテーブル名の確認

In [30]:
# Reflect the database behind `engine` and list the table names found in it.
meta = MetaData()
meta.reflect(engine)
meta.tables.keys()

dict_keys(['constituents', 'constituents_csv', 'constituents_financials', 'constituents_financials_csv'])

In [36]:
# Reflect the stored `constituents_financials_csv` table so its columns can be inspected.
from sqlalchemy import Column, Integer, MetaData, String, Table

tbl = Table(
    'constituents_financials_csv',
    MetaData(),
    autoload_with=engine,
)

In [37]:
# Show the column names of the reflected table (`columns` is the long form of `c`).
print(tbl.columns.keys())

['Symbol', 'Name', 'Sector', 'Price', 'Price/Earnings', 'Dividend Yield', 'Earnings/Share', '52 Week Low', '52 Week High', 'Market Cap', 'EBITDA', 'Price/Sales', 'Price/Book', 'SEC Filings']


In [39]:
# Print the first row of the financials table as a sanity check of the stored values.
# Engine.execute() is deprecated since SQLAlchemy 1.4 and removed in 2.0, so the
# statement is wrapped in text() and run on an explicitly opened connection; the
# `with` block releases the connection afterwards.
from sqlalchemy import text

with engine.connect() as conn:
    print(list(conn.execute(text('SELECT * from constituents_financials_csv limit 1'))))

[('MMM', '3M Company', 'Industrials', 222.89, 24.31, 2.3328617, 7.92, 259.77, 175.49, 138721055226.0, 9048000000.0, 4.3902707, 11.34, 'http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=MMM')]


In [46]:
# 1970~2018年の株価データの確認

In [57]:
# Read the zipped CSV of historical stock prices into a DataFrame.
# The first line of the file is used as the header row.
import pandas as pd

df = pd.read_csv(
    './resource_files/historical_stock_prices.csv.zip',
    header=0,
    sep=',',
)

In [58]:
# Summarize the loaded frame: row count, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20973889 entries, 0 to 20973888
Data columns (total 8 columns):
ticker       object
open         float64
close        float64
adj_close    float64
low          float64
high         float64
volume       int64
date         object
dtypes: float64(5), int64(1), object(2)
memory usage: 1.3+ GB


In [60]:
# Peek at the first five rows (5 is the default, spelled out here).
df.head(n=5)

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14


In [61]:
# Peek at the last five rows (5 is the default, spelled out here).
df.tail(n=5)

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
20973884,NZF,14.6,14.59,14.59,14.58,14.62,137500,2018-08-20
20973885,NZF,14.6,14.58,14.58,14.57,14.61,151200,2018-08-21
20973886,NZF,14.58,14.59,14.59,14.57,14.63,185400,2018-08-22
20973887,NZF,14.6,14.57,14.57,14.57,14.64,135600,2018-08-23
20973888,NZF,14.6,14.69,14.69,14.59,14.69,180900,2018-08-24


In [64]:
# Earliest and latest date across the whole price history.
# The reductions are named as strings: passing the builtins min/max makes pandas
# substitute its own Series.min/Series.max, which newer pandas versions warn about.
# `date` is held as text here, so min/max compare the strings; that matches
# chronological order because the values are formatted as YYYY-MM-DD.
df.agg({"date": ["min", "max"]})

Unnamed: 0,date
min,1970-01-02
max,2018-08-24


In [63]:
# Earliest and latest date available for the single ticker MMM.
# The reductions are named as strings: passing the builtins min/max makes pandas
# substitute its own Series.min/Series.max, which newer pandas versions warn about.
df.query("ticker=='MMM'").agg({"date": ["min", "max"]})

Unnamed: 0,date
min,1970-01-02
max,2018-08-24


In [65]:
# Macrotrendsからデータを抽出

In [66]:
## クローラーの準備

In [67]:
### 必要なパッケージのインストール

In [90]:
# Uncomment to install Selenium, which the crawler cells below import.
# !pip install selenium

In [7]:
import os

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome launch options; uncomment the add_argument line to run headless.
options = Options()
# options.add_argument("--headless");
# Start Chrome through the chromedriver binary kept under ./resource_files.
# `options=` replaces the deprecated `chrome_options=` keyword, which makes Selenium
# emit a DeprecationWarning.
# NOTE(review): `executable_path` is superseded by a Service object in Selenium 4 —
# confirm the installed Selenium version before migrating that argument as well.
driver = webdriver.Chrome(executable_path="./resource_files/chromedriver", options=options)

  import sys


In [9]:
# Base address of the Macrotrends stock chart pages; a ticker symbol is appended to it.
url = (
    "https://www.macrotrends.net"
    "/stocks/charts/"
)

In [27]:
# MacrotrendsのTopページへアクセス
driver.get(os.path.join(url, "MMM"))
target_url = driver.current_url

# 以降、Topページからの検索はjQueryでうまく実行できないため、銘柄ごとのURLを取得して、必要な情報のパスを追加してアクセスする

In [63]:
## revenue
driver.get(os.path.join(target_url, "revenue"))
elements = driver.find_elements_by_css_selector('#style-1 > .col-xs-6:nth-child(2) > .historical_data_table.table > tbody > tr')

# サンプルの取得例
# elements[0].text.split(" $")

In [70]:
## gross-profit
driver.get(os.path.join(target_url, "gross-profit"))
elements = driver.find_elements_by_css_selector('#style-1 > .col-xs-6:nth-child(2) > .historical_data_table.table > tbody > tr')

# サンプルの取得例
elements[0].text.split(" $")

['2020-09-30', '4,047']