In [None]:
# default_exp parquet

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded automatically whenever it is changed
%load_ext autoreload
%autoreload 2

# Download Data From SEC

This notebook provides the code to download the business report data of all companies which is provided by the SEC (US Securities and Exchange Commission).
There is a zip file for each quarter since 2009, but only the files starting from 2012 contain all report data of all companies which reported in that period.

In [None]:
# imports
from bfh_cas_bgd_fs2020_sa.core import * # initialze spark
import urllib.request  # used to download resources from the web 
import shutil          # provides high level file operations
import time            # used to measure execution time
import os

In [None]:
# basic definitions
# Target folder for the downloaded zip files. The previous hard-coded location
# stays the default, but it can be overridden with the SEC_ZIPS_DIR environment
# variable so the notebook also runs on machines without a "d:" drive.
output_path = os.environ.get("SEC_ZIPS_DIR", "d:/data/sec_zips")

In [None]:
from pathlib import Path

# Ensure the download folder exists: missing parent folders are created as
# well, and an already existing folder is not treated as an error.
Path(output_path).mkdir(exist_ok=True, parents=True)

## prepare download urls

In [None]:
# Building blocks for the SEC download links.
# Every quarterly archive is named "<year>q<quarter>.zip", e.g. 2020q1.zip.
format_str = "{}q{}.zip"
# Folder on the SEC server that hosts the quarterly archives.
sec_base_path = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/"
# First and last year (both inclusive) for which archives are requested.
start_year, end_year = 2009, 2020

In [None]:
# One download link per (year, quarter) combination, ordered by year and then
# by quarter; end_year itself is included.
download_urls = [
    sec_base_path + format_str.format(year, quarter)
    for year in range(start_year, end_year + 1)
    for quarter in range(1, 5)
]

Unfortunately, the file 2020q1.zip is located at a different location, for whatever reason. Of course, we could parse the site https://www.sec.gov/dera/data/financial-statement-data-sets.html to extract the download links directly from there, but since this thesis isn't about parsing and scraping data from HTML, we simply add the proper link to the list.

In [None]:
# 2020q1.zip is published in a different folder than the other quarters,
# so its link is added to the list by hand.
download_urls += ["https://www.sec.gov/files/node/add/data_distribution/2020q1.zip"]

## downloading with spark

Since we have a couple of files, the download should be done in parallel. Of course, basic parallel python packages like "multiprocessing" could be used, but since this is a thesis about Spark, Spark should be used to do the things it shines at: parallelize work.

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

### using sparkContext.parallelize to parallelize with spark

The blog post "https://medium.com/@joshua_robinson/parallelizing-downloads-with-spark-16bab8e337eb" shows how the download of resources can be parallelized with spark.

In [None]:
# Download URL and save to outpath.
def downloader(url, outpath):
    """Download ``url`` into the folder ``outpath`` unless the file already exists.

    The file name is the last path segment of the URL. The download is a
    best-effort operation: any failure (e.g. a 404 or a timeout) is swallowed,
    because there is no channel to report it back to the caller.

    The data is first written to a temporary ``.part`` file and only moved to
    its final name once the transfer has completed. An interrupted download
    therefore never leaves a truncated zip file behind that the
    "already downloaded" check would skip on the next run.

    Args:
        url: address of the resource to download.
        outpath: existing folder in which the file is stored.

    Returns:
        None
    """
    # From URL construct the destination path and filename.
    file_name = os.path.basename(urllib.parse.urlparse(url).path)
    file_path = os.path.join(outpath, file_name)

    # Check if the file has already been downloaded.
    if os.path.exists(file_path):
        return

    # The process id keeps the temporary name unique when several worker
    # processes happen to download a file with the same name at the same time.
    part_path = "{}.{}.part".format(file_path, os.getpid())

    # Download and write to file.
    try:
        with urllib.request.urlopen(url, timeout=5) as urldata,\
                open(part_path, 'wb') as out_file:
            shutil.copyfileobj(urldata, out_file)
        # Only a completely transferred file becomes visible under its final name.
        os.replace(part_path, file_path)
    except Exception:
        # We cannot really provide feedback, so failures are ignored (most of
        # them are expected to be 404 errors). Remove whatever was written so far.
        try:
            os.remove(part_path)
        except OSError:
            pass  # nothing had been written yet
        

In [None]:
# Convert URL list to an RDD in order to distribute to workers.
# For a quick test run, parallelize only the first two entries instead:
# listing = spark.sparkContext.parallelize(download_urls[:2])
listing = spark.sparkContext.parallelize(download_urls)

In [None]:
# Run the download for every URL on the workers and measure the wall-clock time.
start_time = time.time()
listing.foreach(lambda download_url: downloader(download_url, output_path))
execution_time = time.time() - start_time

# Report the duration in seconds and the resulting size of the download folder.
folder_size = get_size_format(get_directory_size(output_path))
print("execution time:      ", execution_time)
print("size output folder:  ", folder_size)

execution time:       207.43249559402466
size output folder:   1.53GB


It took about 3.5 minutes to download the 1.53 GB of data on my laptop over my private WLAN, connected to my provider Quickline.

### using user defined functions 

It should be possible to use user defined functions (UDFs) and a Spark dataframe containing the urls to download the files. Let us find out what this could look like.
A simple example of how to define a UDF can be found here: https://changhsinlee.com/pyspark-udf/

As a first step, the download_urls list has to be converted into a Spark dataframe. This can be done in one line, but since we want to have a meaningful column name, the default column name "value" is changed to "url".

In [None]:
from pyspark.sql.types import StringType

# A plain list of strings yields a dataframe with a single column called
# "value"; rename it right away so the column carries a meaningful name.
download_urls_df = (
    spark.createDataFrame(download_urls, StringType())
         .withColumnRenamed("value", "url")
)
download_urls_df.printSchema()

root
 |-- url: string (nullable = true)



Next, the downloader function is adapted. It now also returns a result and accepts just one parameter.

In [None]:
def downloader_function(url, outpath=None):
    """Download ``url`` and report the outcome as a string.

    The file name is the last path segment of the URL. The data is first
    written to a temporary ``.part`` file and only moved to its final name once
    the transfer has completed, so an interrupted download never leaves a
    truncated zip file behind that a later run would report as
    "already downloaded".

    Args:
        url: address of the resource to download.
        outpath: existing folder in which the file is stored. When omitted,
            the notebook-level ``output_path`` is used, which keeps the
            function callable with a single argument (as needed for the UDF).

    Returns:
        "already downloaded" if the target file exists, "success" if the
        download completed, "failed" if any error occurred.
    """
    target_dir = output_path if outpath is None else outpath

    # From URL construct the destination path and filename.
    file_name = os.path.basename(urllib.parse.urlparse(url).path)
    file_path = os.path.join(target_dir, file_name)

    # Check if the file has already been downloaded.
    if os.path.exists(file_path):
        return "already downloaded"

    # The process id keeps the temporary name unique when several worker
    # processes happen to download a file with the same name at the same time.
    part_path = "{}.{}.part".format(file_path, os.getpid())

    # Download and write to file.
    try:
        with urllib.request.urlopen(url, timeout=30) as urldata,\
                open(part_path, 'wb') as out_file:
            shutil.copyfileobj(urldata, out_file)
        # Only a completely transferred file becomes visible under its final name.
        os.replace(part_path, file_path)
        return "success"
    except Exception:
        # Remove whatever was written so far; the caller only gets the status.
        try:
            os.remove(part_path)
        except OSError:
            pass  # nothing had been written yet
        return "failed"

In order to use the above download function as a UDF, it needs to be converted/registered as a udf-function. This is a one-liner.

In [None]:
from pyspark.sql.functions import udf

# The download has side effects and its result changes between calls
# ("success" the first time, "already downloaded" afterwards). Spark treats
# UDFs as deterministic by default and may then invoke them more often than
# they appear in the query, so the UDF is explicitly marked as nondeterministic.
# The function is passed directly; a wrapping lambda is not needed.
downloader_udf = udf(downloader_function, StringType()).asNondeterministic()

Finally, we execute it by simply using a select statement which calls our udf function. With this approach we can also return a result for every url.

In [None]:
start_time = time.time()
result_df =  download_urls_df.select('url', downloader_udf('url').alias('result'))
result_df_failed = result_df.filter("result='failed'")
# Every Spark action re-evaluates the lazy plan and therefore re-runs the
# download UDF. Calling count() and collect() one after the other executed all
# downloads twice and could report a count that did not match the collected
# rows. The failures are therefore materialized exactly once.
failed_rows = result_df_failed.collect()
print(len(failed_rows))
print(failed_rows)
execution_time = (time.time() - start_time)
print("execution time:      ", execution_time)
print("size output folder:  ", get_size_format(get_directory_size(output_path)))

3
[Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2020q3.zip', result='failed'), Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2020q4.zip', result='failed')]
execution time:       330.34656715393066
size output folder:   1.53GB


With this approach, it took 5.5 minutes, but that is more likely due to slower transport of the data over the network than due to the fact that this version used a UDF.