In [1]:
import requests

def download_file(url, local_filename):
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024): 
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

# Download the filing index

In [2]:
years = [i for i in range(1993, 2019)]
quarters = [i for i in range(1, 5)]
index_url = 'https://www.sec.gov/Archives/edgar/full-index/{0}/QTR{1}/master.idx'

for year in years:
    for quarter in quarters:
        print("Start download year {0}, Q{1}".format(year, quarter))

        try:
            url = index_url.format(year, quarter)
            filename = './data/index/{0}Q{1}.master.idx'.format(year, quarter)
            download_file(url, filename)
        except Exception:
            print("Failed download year {0}, Q{1}".format(year, quarter))

Start download year 1993, Q1
Start download year 1993, Q2
Start download year 1993, Q3
Start download year 1993, Q4
Start download year 1994, Q1
Start download year 1994, Q2
Start download year 1994, Q3
Start download year 1994, Q4
Start download year 1995, Q1
Start download year 1995, Q2
Start download year 1995, Q3
Start download year 1995, Q4
Start download year 1996, Q1
Start download year 1996, Q2
Start download year 1996, Q3
Start download year 1996, Q4
Start download year 1997, Q1
Start download year 1997, Q2
Start download year 1997, Q3
Start download year 1997, Q4
Start download year 1998, Q1
Start download year 1998, Q2
Start download year 1998, Q3
Start download year 1998, Q4
Start download year 1999, Q1
Start download year 1999, Q2
Start download year 1999, Q3
Start download year 1999, Q4
Start download year 2000, Q1
Start download year 2000, Q2
Start download year 2000, Q3
Start download year 2000, Q4
Start download year 2001, Q1
Start download year 2001, Q2
Start download

# only download 10K/10Q and their amendment

In [11]:
import pandas as pd
from sqlalchemy import create_engine
import os
con = create_engine('postgresql://postgres:0103@localhost/research')

In [8]:
ciks = pd.read_sql("""select cik from public.security""", con=con).cik.tolist()

In [9]:
f_10K = ['10-K', '10-K405', '10KSB', '10-KSB', '10KSB40']
f_10KA = ['10-K/A', '10-K405/A', '10KSB/A', '10-KSB/A', '10KSB40/A']
f_10KT = ['10-KT', '10KT405', '10-KT/A', '10KT405/A']
f_10Q = ['10-Q', '10QSB', '10-QSB']
f_10QA = ['10-Q/A', '10QSB/A', '10-QSB/A']
f_10QT = ['10-QT', '10-QT/A']
# List of all 10-X related forms
f_10X = f_10K + f_10KA + f_10KT + f_10Q + f_10QA + f_10QT
# Regulation A+ related forms
f_1X = ['1-A', '1-A/A', '1-K', '1-SA', '1-U', '1-Z']

In [20]:
root_dir = './data/index/'
files = os.listdir(root_dir)

filings = []
for file in files:
    df = pd.read_csv(root_dir + file,
        sep='|',
        skiprows=[i for i in range(9)] + [10],
        parse_dates=['Date Filed'])
    
    filings.append(df.loc[df['Form Type'].isin(f_10X) & df['CIK'].isin(ciks)])

filings = pd.concat(filings)

In [21]:
len(filings)

42853

In [23]:
filings.head()

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename
100,1000228,HENRY SCHEIN INC,10-K,2014-02-11,edgar/data/1000228/0001000228-14-000010.txt
621,1000697,WATERS CORP /DE/,10-K,2014-02-27,edgar/data/1000697/0001193125-14-072515.txt
706,1001039,WALT DISNEY CO/,10-Q,2014-02-05,edgar/data/1001039/0001001039-14-000082.txt
762,1001082,DISH Network CORP,10-K,2014-02-21,edgar/data/1001082/0001104659-14-012023.txt
881,1001250,ESTEE LAUDER COMPANIES INC,10-Q,2014-02-05,edgar/data/1001250/0001104659-14-006766.txt


# only download 10-K for now and only look at the recent two year

In [25]:
downloads = filings.loc[(filings['Form Type'] == '10-K') & (filings['Date Filed'] > '2016-01-01')]

In [26]:
downloads

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename
943,1002047,"NetApp, Inc.",10-K,2017-06-20,edgar/data/1002047/0001564590-17-012758.txt
9574,1037038,RALPH LAUREN CORP,10-K,2017-05-18,edgar/data/1037038/0001037038-17-000004.txt
22618,1087423,RED HAT INC,10-K,2017-04-26,edgar/data/1087423/0001193125-17-139562.txt
38970,1170010,CARMAX INC,10-K,2017-04-21,edgar/data/1170010/0001170010-17-000053.txt
58899,12659,H&R BLOCK INC,10-K,2017-06-16,edgar/data/12659/0001574842-17-000019.txt
73981,1341439,ORACLE CORP,10-K,2017-06-27,edgar/data/1341439/0001193125-17-214833.txt
103786,14693,BROWN FORMAN CORP,10-K,2017-06-15,edgar/data/14693/0000014693-17-000119.txt
119604,1530721,Michael Kors Holdings Ltd,10-K,2017-05-31,edgar/data/1530721/0001530721-17-000022.txt
136896,1585364,PERRIGO Co plc,10-K,2017-05-22,edgar/data/1585364/0001585364-17-000071.txt
143199,1604778,"Qorvo, Inc.",10-K,2017-05-23,edgar/data/1604778/0001604778-17-000031.txt


In [43]:
root_url = 'https://www.sec.gov/Archives/'
file_dir = './data/tenk/'
for row in downloads.itertuples():
    url = root_url + row.Filename
    outfilename = file_dir + '-'.join(row.Filename.split('/')[2:])
    print('start download {0}'.format(row._2))
    download_file(url, outfilename)

start download NetApp, Inc.
start download RALPH LAUREN CORP
start download RED HAT INC
start download CARMAX INC
start download H&R BLOCK INC
start download ORACLE CORP
start download BROWN FORMAN CORP
start download Michael Kors Holdings Ltd
start download PERRIGO Co plc
start download Qorvo, Inc.
start download CONSTELLATION BRANDS, INC.
start download CA, INC.
start download GENERAL MILLS INC
start download LOWES COMPANIES INC
start download ELECTRONIC ARTS INC.
start download XILINX INC
start download MICROCHIP TECHNOLOGY INC
start download SYMANTEC CORP
start download J M SMUCKER Co
start download MCKESSON CORP
start download TAKE TWO INTERACTIVE SOFTWARE INC
start download NetApp, Inc.
start download RALPH LAUREN CORP
start download RED HAT INC
start download CARMAX INC
start download H&R BLOCK INC
start download ORACLE CORP
start download BROWN FORMAN CORP
start download Michael Kors Holdings Ltd
start download Qorvo, Inc.
start download CONSTELLATION BRANDS, INC.
start downloa

start download Zoetis Inc.
start download Intercontinental Exchange, Inc.
start download Allergan plc
start download Allegion plc
start download Hilton Worldwide Holdings Inc.
start download NAVIENT CORP
start download IHS Markit Ltd.
start download Synchrony Financial
start download PayPal Holdings, Inc.
start download Kraft Heinz Co
start download Alphabet Inc.
start download Fortive Corp
start download TechnipFMC plc
start download ABBOTT LABORATORIES
start download CATERPILLAR INC
start download CENTURYLINK, INC
start download JPMORGAN CHASE & CO
start download JOHNSON & JOHNSON
start download CINCINNATI FINANCIAL CORP
start download COCA COLA CO
start download COLGATE PALMOLIVE CO
start download TEXTRON INC
start download MOLSON COORS BREWING CO
start download CORNING INC /NY
start download ADVANCED MICRO DEVICES INC
start download CUMMINS INC
start download TARGET CORP
start download GRAINGER W W INC
start download CSX CORP
start download DELTA AIR LINES INC /DE/
start download C

start download DOLLAR TREE INC
start download DTE ENERGY CO
start download LOCKHEED MARTIN CORP
start download STATE STREET CORP
start download BALL Corp
start download LEUCADIA NATIONAL CORP
start download TEXAS INSTRUMENTS INC
start download THERMO FISHER SCIENTIFIC INC.
start download TIFFANY & CO
start download WALT DISNEY CO/
start download TYSON FOODS INC
start download ROCKWELL AUTOMATION INC
start download F5 NETWORKS INC
start download BECTON DICKINSON & CO
start download AGILENT TECHNOLOGIES INC
start download MONSANTO CO /NEW/
start download ROCKWELL COLLINS INC
start download AMERISOURCEBERGEN CORP
start download ACUITY BRANDS INC
start download TransDigm Group INC
start download Viacom Inc.
start download TE Connectivity Ltd.
start download VISA INC.
start download Accenture plc
start download Walgreens Boots Alliance, Inc.
start download WestRock Co
start download Hewlett Packard Enterprise Co
start download VARIAN MEDICAL SYSTEMS INC
start download AIR PRODUCTS & CHEMICA

start download General Growth Properties, Inc.
start download HUNTINGTON INGALLS INDUSTRIES, INC.
start download KINDER MORGAN, INC.
start download Marathon Petroleum Corp
start download Norwegian Cruise Line Holdings Ltd.
start download Fortune Brands Home & Security, Inc.
start download Delphi Automotive PLC
start download Xylem Inc.
start download TripAdvisor, Inc.
start download Express Scripts Holding Co.
start download Phillips 66
start download AbbVie Inc.
start download Eaton Corp plc
start download Zoetis Inc.
start download Intercontinental Exchange, Inc.
start download Allergan plc
start download Allegion plc
start download Hilton Worldwide Holdings Inc.
start download NAVIENT CORP
start download Synchrony Financial
start download PayPal Holdings, Inc.
start download Kraft Heinz Co
start download Alphabet Inc.
start download ABBOTT LABORATORIES
start download CATERPILLAR INC
start download CENTURYLINK, INC
start download JPMORGAN CHASE & CO
start download JOHNSON & JOHNSON
s

start download MARTIN MARIETTA MATERIALS INC
start download TRACTOR SUPPLY CO /DE/
start download LABORATORY CORP OF AMERICA HOLDINGS
start download ESSEX PROPERTY TRUST INC
start download LENNAR CORP /NEW/
start download SOUTHERN CO
start download PPL Corp
start download BB&T CORP
start download APARTMENT INVESTMENT & MANAGEMENT CO
start download SOUTHWEST AIRLINES CO
start download DAVITA HEALTHCARE PARTNERS INC.
start download CAPITAL ONE FINANCIAL CORP
start download CHEVRON CORP
start download STANLEY BLACK & DECKER, INC.
start download DOLLAR TREE INC
start download DTE ENERGY CO
start download LOCKHEED MARTIN CORP
start download STATE STREET CORP
start download BALL CORP
start download LEUCADIA NATIONAL CORP
start download TEXAS INSTRUMENTS INC
start download THERMO FISHER SCIENTIFIC INC.
start download TIFFANY & CO
start download HENRY SCHEIN INC
start download WATERS CORP /DE/
start download DISH Network CORP
start download AMEREN CORP
start download AFFILIATED MANAGERS GROUP,

start download US BANCORP \DE\
start download M&T BANK CORP
start download FMC CORP
start download FORD MOTOR CO
start download GAP INC
start download GENERAL DYNAMICS CORP
start download GENERAL ELECTRIC CO
start download GENUINE PARTS CO
start download GOODYEAR TIRE & RUBBER CO /OH/
start download Arconic Inc.
start download HESS CORP
start download HALLIBURTON CO
start download HASBRO INC
start download HERSHEY CO
start download AMERICAN ELECTRIC POWER CO INC
start download HUMANA INC
start download HUNTINGTON BANCSHARES INC/MD
start download AMERICAN EXPRESS CO
start download AFLAC INC
start download ILLINOIS TOOL WORKS INC
start download ANDEAVOR
start download INTEL CORP
start download INTERNATIONAL BUSINESS MACHINES CORP
start download INTERNATIONAL FLAVORS & FRAGRANCES INC
start download INTERNATIONAL PAPER CO /NEW/
start download INTERPUBLIC GROUP OF COMPANIES, INC.
start download AMERICAN INTERNATIONAL GROUP INC
start download KANSAS CITY SOUTHERN
start download KELLOGG CO
st