In [None]:
# default_exp join_sec_data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded automatically whenever it is changed
%load_ext autoreload
%autoreload 2

# 01_02_Join_SEC_Data

This notebook contains the code to join the attributes from the three files "num.txt", "sub.txt", and "pre.txt" into one single CSV file, which can then be used for further processing.

In [None]:
# imports
from bfh_cas_bgd_fs2020_sa.core import * # initialze spark

from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys
import zipfile

In [None]:
# basic definitions
# folder containing the zip files that are used for the experiments in this notebook
zip_folder = "./data/"
zip_path = Path(zip_folder)

# folder into which single files of a zip archive are extracted temporarily
extract_temp_folder = "./tmp/extract/"
Path(extract_temp_folder).mkdir(parents=True, exist_ok=True) # create directory if necessary

# folder that receives the joined CSV output
target_folder = "./tmp/joined/"
Path(target_folder).mkdir(parents=True, exist_ok=True) # create directory if necessary

## Init Spark

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## Create Zip-Files dataframe

In [None]:
# paths of all zip files in the data folder; sorted, so that zip_files[0] refers to the
# same archive on every run, independent of the order in which the file system lists them
zip_files = sorted(str(file) for file in zip_path.glob("*.zip"))

## Read file inside zip and convert it to a spark dataframe

In [None]:
# Define constants for the names of the files inside the zip file
SUB_TXT = "sub.txt"
PRE_TXT = "pre.txt"
NUM_TXT = "num.txt"
TAG_TXT = "tag.txt"

I was looking for a way to read the content of a CSV file inside a zip file directly into a spark dataframe. But after spending some time researching, I wasn't able to find a way to do it directly.<br>
Since that doesn't seem possible, we need to find other solutions and compare them.

### Baseline -> loading an extracted num.txt directly into a Spark dataframe
In order to compare the performance of loading CSV data into a spark dataframe, we should have a baseline value.<br>
We will load the extracted num.txt file from 2019q3 and see how long it takes.<br>
Note: the num.txt file has to be extracted into the folder "tmp/2019q3/" before this cell is executed.

In [None]:
# baseline: measure how long it takes to load and count the already extracted num.txt
start = time.time()
df_test_num = spark.read.csv('tmp/2019q3/num.txt', sep='\t', header=True)
print(df_test_num.count()) # we need to execute an action, otherwise only the graph is created
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  1.0419988632202148


In [None]:
# show the first row to check how the header and the columns were parsed
print(df_test_num.first())

Row(adsh='0001625376-19-000017', tag='EntityPublicFloat', version='dei/2014', coreg=None, ddate='20180430', qtrs='0', uom='USD', value='0.0000', footnote=None)


The result is pretty reasonable. It took about one second to load and parse the file into a spark dataframe. (We have to keep in mind that the file is very likely cached after the first load, so the cell should be executed twice to get a comparable value.)

### V1: Extract file from zip and load it with spark.read.csv

One solution could be to extract the content, write it to a file on disk, and then load that file into a spark dataframe. We cannot use a temporary file (tempfile.TemporaryFile()), since spark will try to access it from another process, which is not possible for such a temporary file.

In [None]:
# preview of the flattened archive name that is used to build a unique name for the extracted file
test_zip = zip_files[0] # defined here, so that the cell also works when the notebook is run top to bottom
test_zip.replace(".zip","").replace("/","").replace("\\","")

'data2019q3'

In [None]:
import tempfile

test_zip = zip_files[0]
data_file = NUM_TXT

start = time.time()
# unique file name inside the extract folder, derived from the zip path and the data file name;
# a separate variable name is used, so that the imported tempfile module is not shadowed
extracted_file = extract_temp_folder + test_zip.replace(".zip","").replace("/","").replace("\\","")+"_"+data_file
with zipfile.ZipFile(test_zip, "r") as container_zip:
    with container_zip.open(data_file) as f, open(extracted_file, "wb") as fp:
        shutil.copyfileobj(f, fp) # stream the content instead of holding the whole file in memory
# the extracted file is closed, and therefore completely written, before spark reads it
df_test_num = spark.read.csv(extracted_file, sep='\t', header=True)
print(df_test_num.count()) # execute an action, otherwise only the graph is created
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  2.1444883346557617


As expected, it takes a little longer, but it is still a very good result.

### V2: Using spark.read.csv with RDD parallelize

In [None]:
# NOTE(review): pandas and StringIO are imported here, but not used in this cell
import pandas as pd
from io import StringIO

start = time.time()
with zipfile.ZipFile(test_zip, "r") as container_zip:
    with container_zip.open(NUM_TXT) as f:
        # the zip member yields bytes, so every line is decoded and kept in memory
        lines = [line.decode("utf-8") for line in f.readlines()]
        # hand the lines over as RDD and let spark parse them as tab separated csv
        df_test_num = spark.read.csv(spark.sparkContext.parallelize(lines), sep='\t', header=True)
        print(df_test_num.count())        
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  14.239033222198486


It takes longer than loading the file directly, but it is easy to implement.

### V3: Load data into tuples and create spark dataframe from tuple

Another solution is to load the data into a list of tuples and then use that list of tuples to create the spark dataframe. This is code I wrote a few months ago, slightly adapted.<br>
This code is not suitable for CSV files containing real text columns, because no escaping is handled.

In [None]:
def clear_empty_fields(row: List[str]) -> Tuple[Union[str,None]]:
    """ Returns the entries of the row as a tuple in which every empty string is replaced by None.
        All other entries are kept unchanged and in their original order.
    """
    return tuple(None if field == '' else field for field in row)

In [None]:
def get_file_data(zip_file: str, data_file: str) -> Tuple[List[str],List[Tuple[str]]]:
    """ Extracts the file `data_file` from the zip file `zip_file` and parses it into tuples.

        The function assumes that the first row is a header row and that the columns are
        separated by a tab character. No string escaping or quoting is handled.
        Empty fields are converted to None (see clear_empty_fields).

        Every line is decoded as utf-8 first. If that fails, windows-1252 is tried. A line that
        cannot be decoded with either encoding is reported on stderr and skipped.

        zip_file:  path of the zip file
        data_file: name of the file inside the zip file
        returns:   the list of column names and the list of data rows as tuples;
                   two empty lists if the file contains no lines at all
    """

    def _decode(raw_line: bytes) -> Union[str, None]:
        # decodes one raw line; returns None if neither of the two encodings fits
        try:
            return raw_line.decode("utf-8")
        except UnicodeDecodeError as ex:
            # sometimes there were encoding problems when storing to windows fs. if utf8 failed,
            # trying to read as windows-1252 helped in these cases
            try:
                return raw_line.decode("windows-1252")
            except UnicodeDecodeError:
                sys.stderr.write(f"{ex}   {raw_line!r}\n")
                return None

    tuple_lines: List[Tuple[str]] = []
    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as f:
            for raw_line in f:
                line = _decode(raw_line)
                if line is None:
                    continue
                # remove the line ending (also a windows style \r\n) before splitting into columns
                tuple_lines.append(clear_empty_fields(line.rstrip("\r\n").split("\t")))

    if not tuple_lines:
        return [], []

    # the first row is the header row, since we know that all files that we read have one
    return list(tuple_lines[0]), tuple_lines[1:]

In order to test the code above and to get a feeling for the performance, we measure the time that is needed to load the num.txt file directly from the zip file and convert it into a list of tuples.

In [None]:
# A short check to see if the reading works
start = time.time()
headers, list_of_tuples = get_file_data(zip_files[0], NUM_TXT)
print(headers[0],":", list_of_tuples[1][0])
duration = time.time() - start
print("duration: ", duration)

adsh : 0000034563-19-000064
duration:  10.322304248809814


It takes about 10 seconds. <br>
Just creating the list of tuples is already much slower than extracting the file and using spark.read.csv. Let's check how long the creation of a spark dataframe out of the tuples will take.

In [None]:
# measure reading the tuples and creating a spark dataframe out of them
start = time.time()
headers, list_of_tuples = get_file_data(zip_files[0], NUM_TXT)
df_tuple = spark.createDataFrame(list_of_tuples , headers)
print(df_tuple.count()) # execute an action, otherwise only the graph is created
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  132.30002903938293


It takes over 2 minutes. Of course, there may be better ways to do it, but since reading directly from a file performs far better, it doesn't make sense to follow this approach any further.

### Conclusion
Since extracting the file and reading it directly with the read.csv method performs best, we will use this approach. We have to pay attention that we create unique temporary files and that we clean them up once the job is finished. So we will extract them into their own folder "extract_temp_folder".

### DF Reader implementation

#### V1: Function: Extract the file into a temporary file and use spark.read.csv directly with the file

The following method will be the method we are going to use in the final version

In [None]:
def read_csv_in_zip_into_df_extract(zip_file: str, data_file: str) -> DataFrame:
    """
       Extracts the file `data_file` from the zip file `zip_file` and stores it on disk
       inside the folder `extract_temp_folder`.
       Uses spark.read.csv to read the extracted data into the df.

       The name of the extracted file is derived from the path of the zip file and the name of
       the data file, so that files from different zip files do not overwrite each other.
       The extracted file is not deleted here, it has to stay available as long as the
       returned dataframe is used.

       zip_file:  path of the zip file
       data_file: name of the tab separated file (with header row) inside the zip file
       returns:   spark dataframe defined on the extracted file
    """
    # create a unique file name to extract the data; path separators and the colon of a
    # windows drive letter are removed, since they are not valid inside a file name
    flat_zip_name = zip_file.replace(".zip","").replace("/","").replace("\\","").replace(":","")
    extracted_file = extract_temp_folder + flat_zip_name + "_" + data_file

    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as f, open(extracted_file, "wb") as fp:
            shutil.copyfileobj(f, fp) # stream the content instead of holding the whole file in memory

    # the extracted file is closed, and therefore completely written, before spark reads it
    return spark.read.csv(extracted_file, sep='\t', header=True)

#### V2: Function: Reading directly from Zip into RDD and create Dataframe from RDD

To compare the overall performance, we also implement the method that reads the data directly from the zip file into memory and uses an RDD to create the dataframe.

In [None]:
def read_csv_in_zip_into_df_direct(zip_file: str, data_file: str) -> DataFrame:
    """
       Reads the file `data_file` directly from the zip file `zip_file` into memory and
       creates the df from an RDD containing the lines of the file.

       The zip file provides the content as bytes, therefore every line is decoded as utf-8.

       zip_file:  path of the zip file
       data_file: name of the tab separated file (with header row) inside the zip file
       returns:   spark dataframe created from the lines of the file
    """
    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as f:
            lines = [line.decode("utf-8") for line in f]

    return spark.read.csv(spark.sparkContext.parallelize(lines), sep='\t', header=True)

## Joining the data into one spark dataframe

We will use the spark dataframe's join method to join the data, and we will compare the performance of the two variants described above.

### Note about joining

When defining joins, we have to pay attention to define them correctly if the join columns have the same name in both dataframes. Otherwise it can happen, that the join columns appear twice in the resulting dataframe.<br>
Details can be found here: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html

In [None]:
# load the two dataframes that are used to demonstrate the join behaviour
df_sub = read_csv_in_zip_into_df_extract(zip_files[0], SUB_TXT)
df_num = read_csv_in_zip_into_df_extract(zip_files[0], NUM_TXT)

# this join produces a df with two duplicated columns named adsh
# that should be prevented: 
df_join1 = df_num.join(df_sub, df_num.adsh == df_sub.adsh)
[x for x in df_join1.columns if x == "adsh"] # shows that the column adsh is twice in the dataframe

['adsh', 'adsh']

In [None]:
# correct way of joining using a list with the column names
df_join1 = df_num.join(df_sub, ["adsh"])
[x for x in df_join1.columns if x == "adsh"] # shows that the column adsh appears now only once in the df

['adsh']

### V1: Extract the file into a temporary file and use spark.read.csv directly with the file

In [None]:
# this just creates and prepares the graph, nothing happens yet
df_sub = read_csv_in_zip_into_df_extract(zip_files[0], SUB_TXT)
df_pre = read_csv_in_zip_into_df_extract(zip_files[0], PRE_TXT)
df_num = read_csv_in_zip_into_df_extract(zip_files[0], NUM_TXT)

In [None]:
# joining is the same as above
start = time.time()
df_joined = df_num.join(df_sub, ["adsh"]).join(df_pre, ["adsh","tag","version"],"left")
print("count: ", df_joined.count()) # again, calling count to ensure that the df is completely initialized
duration = time.time() - start
print("duration: ", duration)

count:  2570409
duration:  6.892997980117798


The result speaks for itself. Only 7 seconds were needed to load the data and join it. Also here, during these 7 seconds, almost 100% of the available CPU power was used on all cores.

![TaskManager_Extract.png](./images/TaskManager_Extract.png)

### V2: Reading directly from Zip into RDD and create Dataframe from RDD

In [None]:
# this just creates and prepares the graph, nothing happens yet
df_sub = read_csv_in_zip_into_df_direct(zip_files[0], SUB_TXT)
df_pre = read_csv_in_zip_into_df_direct(zip_files[0], PRE_TXT)
df_num = read_csv_in_zip_into_df_direct(zip_files[0], NUM_TXT)

The following cell defines the join. <br>
df_num and df_sub have to be joined based on the attribute "adsh". We can use an inner join for that, since we know that every entry in num has a reference in sub. The result of this join is then joined with pre. We use a left outer join for that, since not every entry in num must have an entry in pre. Since it is possible that there is more than one entry in pre for an entry in num, the total number of records will likely be larger than the number of rows in num alone.

In [None]:
# joining the dataframes together
start = time.time()
df_joined = df_num.join(df_sub, ["adsh"]).join(df_pre, ["adsh","tag","version"],"left")
print("count: ", df_joined.count()) # again, calling count to ensure that the df is completely initialized
duration = time.time() - start
print("duration: ", duration)

count:  2570409
duration:  40.54203271865845


To execute the whole graph, it took about 40 seconds.<br>
Checking the Windows TaskManager, we see that almost 100% CPU was used on all cores to execute the task.

![TaskManager_RDD.png](./images/TaskManager_RDD.png)

## Glueing it together and adding writing as csv

### V1: Final version

In [None]:
def join_files(zip_file: str, target_folder: str) -> str:
    """
        Joins the content of the 3 csv files (num.txt, sub.txt, pre.txt) that are contained
        in the zip file and writes the result as gzip compressed csv with a header row.

        num is joined with sub on "adsh" (inner join), the result is joined with pre
        on "adsh", "tag", and "version" (left join).

        zip_file:      path of the zip file
        target_folder: folder in which the output is created; the output gets the name
                       of the zip file without its extension
        returns:       path of the written output

        An already existing output for the same zip file is overwritten, so that the
        function can be executed repeatedly.
    """

    # os.path.join only adds a separator if target_folder does not already end with one
    target_path = os.path.join(target_folder, Path(zip_file).stem)

    df_sub = read_csv_in_zip_into_df_extract(zip_file, SUB_TXT)
    df_pre = read_csv_in_zip_into_df_extract(zip_file, PRE_TXT)
    df_num = read_csv_in_zip_into_df_extract(zip_file, NUM_TXT)

    df_joined = df_num.join(df_sub, ["adsh"]).join(df_pre, ["adsh","tag","version"],"left")

    df_joined.write.csv(target_path, mode="overwrite", compression="gzip", header=True)

    return target_path

In [None]:
# measure extracting, joining, and writing the result for the first zip file
start = time.time()
print(join_files(zip_files[0], target_folder))
duration = time.time() - start
print("duration: ", duration)

./tmp/joined/data2019q3
duration:  65.65727496147156


It takes about 65 seconds, which is a little surprising, since a count on the joined df only took about 7 seconds. So the question is whether this is caused just by the writing, or whether there were optimisations possible for the count, which would also explain the difference between "V1: extract to file" and "V2: Reading directly from Zip into RDD".

### V2: Final Version

In order to check if this version is also slower when data is written into a csv, let us adapt the code and use the direct "rdd" version.

In [None]:
def join_files_rdd(zip_file: str) -> str:
    """
        Joins the content of the 3 csv files (num.txt, sub.txt, pre.txt) that are contained
        in the zip file and writes the result as gzip compressed csv with a header row.
        The files are read directly from the zip file into memory (RDD variant).

        num is joined with sub on "adsh" (inner join), the result is joined with pre
        on "adsh", "tag", and "version" (left join).

        zip_file: path of the zip file
        returns:  path of the written output, which is located inside the folder defined
                  by the notebook level variable `target_folder`

        An already existing output for the same zip file is overwritten, so that the
        function can be executed repeatedly.
    """

    # os.path.join only adds a separator if target_folder does not already end with one
    target_path = os.path.join(target_folder, Path(zip_file).stem)

    df_sub = read_csv_in_zip_into_df_direct(zip_file, SUB_TXT)
    df_pre = read_csv_in_zip_into_df_direct(zip_file, PRE_TXT)
    df_num = read_csv_in_zip_into_df_direct(zip_file, NUM_TXT)

    df_joined = df_num.join(df_sub, ["adsh"]).join(df_pre, ["adsh","tag","version"],"left")

    df_joined.write.csv(target_path, mode="overwrite", compression="gzip", header=True)

    return target_path

In [None]:
# measure reading, joining, and writing the result with the RDD variant
start = time.time()
print(join_files_rdd(zip_files[0]))
duration = time.time() - start
print("duration: ", duration)

./tmp/joined/data2019q3
duration:  96.1389946937561


This proves that the RDD version (V2) is slower and we will stick with the extract version (V1)

## Running on all 46 zipfiles

As a final test and to get a baseline, we will run the "joining" on all 46 zip files.<br>
Since the processing of every single zip file already uses all cores in parallel, there is no need to try to parallelize the processing of multiple zip files at the same time. So this will be just one loop which processes the files sequentially.

In [None]:
# NOTE(review): absolute, machine specific paths; they have to be adapted before running elsewhere
all_zip_file_folder = "D:/data/sec_zips/"
all_zip_path = Path(all_zip_file_folder)
# replaces the list of zip files that was used for the experiments above
zip_files = [str(file) for file in all_zip_path.glob("*.zip")]

target = "d:/data/zip_joined/"
Path(target).mkdir(parents=True, exist_ok=True) # create directory if necessary

In [None]:
# process all zip files sequentially and measure the total duration
start = time.time()
print(len(zip_files))
for file in zip_files:
    try: 
        print(file)
        print(join_files(file, target))
    except Exception as ex:
        # a failing zip file is reported, the loop continues with the next one
        print("failed: ", file, str(ex))
duration = time.time() - start

print("duration: ", duration)

46
D:\data\sec_zips\2009q1.zip
d:/data/zip_joined/2009q1
D:\data\sec_zips\2009q2.zip
d:/data/zip_joined/2009q2
D:\data\sec_zips\2009q3.zip
d:/data/zip_joined/2009q3
D:\data\sec_zips\2009q4.zip
d:/data/zip_joined/2009q4
D:\data\sec_zips\2010q1.zip
d:/data/zip_joined/2010q1
D:\data\sec_zips\2010q2.zip
d:/data/zip_joined/2010q2
D:\data\sec_zips\2010q3.zip
d:/data/zip_joined/2010q3
D:\data\sec_zips\2010q4.zip
d:/data/zip_joined/2010q4
D:\data\sec_zips\2011q1.zip
d:/data/zip_joined/2011q1
D:\data\sec_zips\2011q2.zip
d:/data/zip_joined/2011q2
D:\data\sec_zips\2011q3.zip
d:/data/zip_joined/2011q3
D:\data\sec_zips\2011q4.zip
d:/data/zip_joined/2011q4
D:\data\sec_zips\2012q1.zip
d:/data/zip_joined/2012q1
D:\data\sec_zips\2012q2.zip
d:/data/zip_joined/2012q2
D:\data\sec_zips\2012q3.zip
d:/data/zip_joined/2012q3
D:\data\sec_zips\2012q4.zip
d:/data/zip_joined/2012q4
D:\data\sec_zips\2013q1.zip
d:/data/zip_joined/2013q1
D:\data\sec_zips\2013q2.zip
d:/data/zip_joined/2013q2
D:\data\sec_zips\2013q3.z

It took about 2600 seconds (about 43 min), all files were processed and there was no exception. All content in the joined folder has a total size of about 5.5GB.  

In [None]:
# stop the spark session, all processing is done
spark.stop()

In [None]:
# clear extract folder: removes the folder including all files that were extracted into it
shutil.rmtree(extract_temp_folder)