In [None]:
# default_exp prepare_whole

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded automatically whenever it is changed
%load_ext autoreload
%autoreload 2

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import get_spark_session # initialze spark
from pathlib import Path
from typing import List, Tuple, Union, Set
import urllib.request  # used to download resources from the web 
import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import zipfile

from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.dataframe import DataFrame

In [None]:
# Basic Definitions
# NOTE: every folder has to end with "/" - file names are appended to
# extract_temp_folder and target_csv_folder by plain string concatenation.
all_zip_folder = "d:/data/sec_zips/"        # destination of the downloaded quarterly zip files
target_csv_folder = "d:/data/zip_joined/"   # destination of the joined csv output, one entry per zip file
extract_temp_folder = "d:/data/tmp/"        # scratch space for the files extracted from the zip files

In [None]:
# Make sure all working folders exist (no error if they are already there)
for folder in (all_zip_folder, target_csv_folder, extract_temp_folder):
    Path(folder).mkdir(parents=True, exist_ok=True)

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 01_Download_ZIP

### Prepare download urls

In [None]:
# Definitions used to build the download URLs.
# The file name of each quarterly archive is appended directly to this base path.
sec_base_path = "https://www.sec.gov/files/dera/data/financial-statement-data-sets/"
start_year = 2009        # first year to download (inclusive)
end_year   = 2020        # last year to download (inclusive)
format_str = "{}q{}.zip" # file name pattern "<year>q<quarter>.zip", e.g. 2020q1.zip

In [None]:
# Build one download link per quarter (1-4) for every year from start_year to end_year
download_urls = [
    sec_base_path + format_str.format(year, quarter)
    for year in range(start_year, end_year + 1)
    for quarter in range(1, 5)
]

In [None]:
# Extra link for 2020q1 that lives under a different path. Its file name is identical
# to the one of the regular 2020q1 link, and the downloader skips files that already
# exist locally, so only one of the two links ends up being stored.
# NOTE(review): presumably a fallback for the regular 2020q1 link - confirm it is still needed.
download_urls.append("https://www.sec.gov/files/node/add/data_distribution/2020q1.zip")

In [None]:
# Turn the list of links into a Spark DataFrame with one string column and
# rename that column from "value" to "url".
download_urls_df = (
    spark.createDataFrame(download_urls, StringType())
         .withColumnRenamed("value", "url")
)

### Download the data

In [None]:
def downloader_function(url, target_folder=None):
    """
    Download the file behind `url` into a local folder unless it is already there.

    The local file name is the last path segment of the URL. The data is first
    written to a temporary "<name>.<pid>.part" file and only renamed to its final
    name once the transfer has completed. That way an interrupted download never
    leaves a truncated file behind that a later run would report as
    "already downloaded".

    Parameters
    ----------
    url : str
        Address of the file to download.
    target_folder : str, optional
        Folder the file is stored in. Defaults to the notebook-level
        `all_zip_folder`.

    Returns
    -------
    str
        "already downloaded", "success" or "failed: <reason>". Problems are
        reported through the return value instead of being raised, so a single
        broken link does not abort the processing of the remaining links.
    """
    if target_folder is None:
        target_folder = all_zip_folder

    # From URL construct the destination path and filename.
    file_name = os.path.basename(urllib.parse.urlparse(url).path)
    if not file_name:
        # Without a file name the destination would be the folder itself,
        # which always exists and would be misreported as "already downloaded".
        return "failed: no file name in url {}".format(url)
    file_path = os.path.join(target_folder, file_name)

    # Check if the file has already been downloaded.
    if os.path.exists(file_path):
        return "already downloaded"

    # Download into a process-specific temporary file first.
    part_path = "{}.{}.part".format(file_path, os.getpid())
    try:
        with urllib.request.urlopen(url, timeout=30) as urldata, \
                open(part_path, 'wb') as out_file:
            shutil.copyfileobj(urldata, out_file)
        # Only a completely written file gets its final name.
        os.replace(part_path, file_path)
        return "success"
    except Exception as ex:
        return "failed: {}".format(ex)
    finally:
        # Remove the leftovers of a failed transfer (no-op after a successful rename).
        if os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError:
                pass

In [None]:
# Expose the downloader as a Spark UDF; the status text it returns becomes a string column
downloader_udf = udf(lambda url: downloader_function(url), returnType=StringType())

In [None]:
# Run the downloader for every link and measure how long the whole action takes
start_time = time.time()
download_status_df = download_urls_df.select('url', downloader_udf('url').alias('result'))
result_df = download_status_df.collect()  # list of Row(url, result), not a DataFrame
execution_time = time.time() - start_time
print("execution time:      ", execution_time)

execution time:       14.016472101211548


In [None]:
# Show the download status per link; result_df holds the list of Row objects returned by collect()
result_df

[Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2009q1.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2009q2.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2009q3.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2009q4.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2010q1.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2010q2.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2010q3.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/financial-statement-data-sets/2010q4.zip', result='already downloaded'),
 Row(url='https://www.sec.gov/files/dera/data/fi

## Join SEC data

In [None]:
# Define constants for the names of the files inside the zip file
SUB_TXT = "sub.txt"
PRE_TXT = "pre.txt"
NUM_TXT = "num.txt"
TAG_TXT = "tag.txt"  # NOTE(review): not referenced by the join code in this section - confirm it is needed

In [None]:
# Collect the paths of all zip files in the download folder as plain strings
all_zip_path = Path(all_zip_folder)
zip_files = list(map(str, all_zip_path.glob("*.zip")))

In [None]:
def read_csv_in_zip_into_df_extract(zip_file: str, data_file: str) -> DataFrame:
    """
    Extract one file from a zip archive to disk and load it into a Spark DataFrame.

    The file is copied to "<extract_temp_folder><zip name without .zip>_<data_file>"
    and then read with spark.read.csv as a tab separated file with a header row.
    The data is streamed to disk in chunks instead of being loaded into memory
    as a whole, and both file handles are closed before Spark reads the file.

    The extracted file is intentionally not deleted here.
    NOTE(review): the DataFrame is presumably evaluated only later by the caller,
    so the file has to stay on disk until then - confirm before adding cleanup.

    Parameters
    ----------
    zip_file : str
        Path of the zip archive.
    data_file : str
        Name of the file inside the archive, e.g. "sub.txt".

    Returns
    -------
    DataFrame
        The content of the extracted file.

    Raises
    ------
    KeyError
        If `data_file` is not contained in the archive (raised by zipfile).
    """
    # unique name per archive and contained file, so extracts do not overwrite each other
    zip_name = Path(zip_file).name.replace(".zip", "").replace("/", "").replace("\\", "")
    extracted_path = extract_temp_folder + zip_name + "_" + data_file

    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as member, open(extracted_path, "wb") as f_temp:
            shutil.copyfileobj(member, f_temp)

    # NOTE(review): stripping "/dbfs" looks like a leftover from running on Databricks;
    # it does nothing for paths that do not contain "/dbfs" - confirm it is still needed.
    f_temp_dbfs = extracted_path.replace("/dbfs", "")

    return spark.read.csv(f_temp_dbfs, sep='\t', header=True)

In [None]:
def join_files(zip_file: str, target_folder: str) -> str:
    """
    Join the sub, pre and num files of one zip archive and write the result as csv.

    The num data is inner-joined with the sub data on "adsh" and then
    left-joined with the pre data on "adsh", "tag" and "version". The result is
    written gzip-compressed and with a header row to
    "<target_folder>/<zip name without .zip>".

    Nothing is done when the target path already exists.
    NOTE(review): a write that failed half way may also leave the target path
    behind, in which case the archive is skipped on later runs - verify the
    output before relying on the "already Joined" message.

    Parameters
    ----------
    zip_file : str
        Path of the zip archive to process.
    target_folder : str
        Folder that receives the joined output, with or without a trailing
        separator.

    Returns
    -------
    str
        The path that was written, or "<zip_file> :  already Joined" if the
        target path already existed.
    """
    zip_name = Path(zip_file).name.replace(".zip", "").replace("/", "").replace("\\", "")
    # os.path.join also works when target_folder has no trailing separator
    target_path = os.path.join(target_folder, zip_name)

    if os.path.exists(target_path):
        return zip_file + " : " + " already Joined"

    df_sub = read_csv_in_zip_into_df_extract(zip_file, SUB_TXT)
    df_pre = read_csv_in_zip_into_df_extract(zip_file, PRE_TXT)
    df_num = read_csv_in_zip_into_df_extract(zip_file, NUM_TXT)

    df_joined = df_num.join(df_sub, ["adsh"]).join(df_pre, ["adsh", "tag", "version"], "left")

    # NOTE(review): stripping "/dbfs" looks like a leftover from running on Databricks;
    # it does nothing for paths that do not contain "/dbfs" - confirm it is still needed.
    target_path = target_path.replace("/dbfs", "")
    df_joined.write.csv(target_path, compression="gzip", header=True)

    return target_path

In [None]:
# Join every zip file; a failure is reported and the loop continues with the next file
for file in zip_files:
    try:
        join_result = join_files(file, target_csv_folder)
    except Exception as ex:
        print("failed: ", file, str(ex))
    else:
        print(join_result)

d:\data\sec_zips\2009q1.zip :  already Joined
d:\data\sec_zips\2009q2.zip :  already Joined
d:\data\sec_zips\2009q3.zip :  already Joined
d:\data\sec_zips\2009q4.zip :  already Joined
d:\data\sec_zips\2010q1.zip :  already Joined
d:\data\sec_zips\2010q2.zip :  already Joined
d:\data\sec_zips\2010q3.zip :  already Joined
d:\data\sec_zips\2010q4.zip :  already Joined
d:\data\sec_zips\2011q1.zip :  already Joined
d:\data\sec_zips\2011q2.zip :  already Joined
d:\data\sec_zips\2011q3.zip :  already Joined
d:\data\sec_zips\2011q4.zip :  already Joined
d:\data\sec_zips\2012q1.zip :  already Joined
d:\data\sec_zips\2012q2.zip :  already Joined
d:\data\sec_zips\2012q3.zip :  already Joined
d:\data\sec_zips\2012q4.zip :  already Joined
d:\data\sec_zips\2013q1.zip :  already Joined
d:\data\sec_zips\2013q2.zip :  already Joined
d:\data\sec_zips\2013q3.zip :  already Joined
d:\data\sec_zips\2013q4.zip :  already Joined
d:\data\sec_zips\2014q1.zip :  already Joined
d:\data\sec_zips\2014q2.zip :  alr

In [None]:
# Helper code to clear the extract_temp_folder: delete the folder with everything
# in it (rmtree raises if the folder does not exist) and create it again empty.
shutil.rmtree(extract_temp_folder)
Path(extract_temp_folder).mkdir(parents=True, exist_ok=True) # recreate the directory after it was deleted