In [None]:
# default_exp join_sec_data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded again whenever it is changed
%load_ext autoreload
%autoreload 2

# 01_02_Join_SEC_Data

This notebook contains the code to join the attributes from the three files "num.txt", "sub.txt", and "pre.txt" into one single CSV file, which can then be used for further processing.

In [None]:
# imports
from bfh_cas_bgd_fs2020_sa.core import * # initialze spark

from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys
import zipfile

In [None]:
# basic definitions
# folder that is searched for zip files, once as plain string and once as Path object
zip_folder = "./data/" 
zip_path = Path(zip_folder)

## Init Spark

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## Create Zip-Files dataframe

In [None]:
# collect the paths of all zip files inside zip_path as plain strings
zip_files = list(map(str, zip_path.glob("*.zip")))

In [None]:
# convert the list into a Spark dataframe
from pyspark.sql.types import StringType

# the dataframe is created with a column named "value", which is renamed to "url"
zip_files_df = (
    spark.createDataFrame(zip_files, StringType())
         .withColumnRenamed("value","url")
)
zip_files_df.printSchema()

root
 |-- url: string (nullable = true)



## Read file inside zip and convert it to a spark dataframe

In [None]:
# Define constants for the names of the files inside the zip file
SUB_TXT = "sub.txt"
PRE_TXT = "pre.txt"
NUM_TXT = "num.txt"
TAG_TXT = "tag.txt"

I was looking for a way to read the content of a CSV file inside a zip file directly into a Spark dataframe. But after spending some time researching, I wasn't able to find a way to do it directly.<br>
Since that doesn't seem to be possible, we need to find other solutions and compare them.

### Baseline -> loading an extracted num.txt directly into a Spark dataframe
In order to compare the performance of loading CSV data into a Spark dataframe, we should have a baseline value.<br>
We will load the extracted num.txt file from 2019q3 and see how long it takes.
Note that num.txt has to be extracted into the folder "tmp/2019q3/" beforehand.

In [None]:
# Baseline: measure how long it takes to read the already extracted num.txt with spark.read.csv
start = time.time()
df_test_num = spark.read.csv('tmp/2019q3/num.txt', sep='\t', header=True)
print(df_test_num.count()) # we need to execute an action, otherwise only the graph is created
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  0.8069617748260498


In [None]:
# print the first row of the loaded dataframe
print(df_test_num.first())

Row(adsh='0001625376-19-000017', tag='EntityPublicFloat', version='dei/2014', coreg=None, ddate='20180430', qtrs='0', uom='USD', value='0.0000', footnote=None)


The result is pretty reasonable. It took less than a second to load and parse the file into a Spark dataframe (we have to keep in mind that the disk very likely caches this file after the first load).

### Extract file from zip and load it with spark.read.csv

One solution could be to extract the content, write it to a regular file, and then load that file into a Spark dataframe. We cannot use a temporary file (tempfile.TemporaryFile()), since Spark will try to access it from another process, which is not possible for a temporary file.

In [None]:
import tempfile # NOTE(review): tempfile is not used in this cell

# the first zip file that was found serves as test candidate
test_zip = zip_files[0]

# measure extracting num.txt from the zip file into a regular file and reading that file with Spark
start = time.time()
with zipfile.ZipFile(test_zip, "r") as container_zip:
    with container_zip.open(NUM_TXT) as f:
        # write the extracted content to a regular file, which is then passed to Spark by its name
        with open("./tmp/tempfile.xt", "wb+") as fp:
            data = f.read()
            fp.write(data)
            fp.seek(0) # NOTE(review): presumably meant to flush the written data before Spark reads the file - confirm
            df_test_num = spark.read.csv(fp.name, sep='\t', header=True)
            print(df_test_num.count())
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  2.333956718444824


As expected, it takes a little longer, but it is still a very good result.

### Load data into tuples and create spark dataframe from tuple

Another solution is to load the data into a list of tuples and then use that list of tuples to create the Spark dataframe. This is code I wrote a few months ago, slightly adapted.<br>
This code is not suitable for CSV files containing real text columns, because no escaping is checked.

In [None]:
def clear_empty_fields(row: List[str]) -> Tuple[Union[str,None]]:
    """ Converts a row of string fields into a tuple in which every empty string is replaced by None.
        All other entries are kept unchanged and in their original order.
    """
    cleaned = []
    for entry in row:
        cleaned.append(None if entry == '' else entry)
    return tuple(cleaned)

In [None]:
def get_file_data(zip_file: str, data_file: str) -> Tuple[List[str],List[Tuple[str]]]:
    """ This function extracts the file with the name provided in data_file from a zipfile which name is provided in zip_file.
        It then parses the file and returns the header names together with a list of all data tuples.
        The function assumes, that there is a header row and that the columns are separated by a tab character.
        Furthermore, it assumes that no string escaping has to be done.

        Every line is decoded as utf-8 first. If that fails, windows-1252 is tried instead.
        A line that cannot be decoded with either encoding is reported on stderr and skipped.

        Returns a tuple (headers, rows): headers is the list of fields of the first line,
        rows is the list of tuples of all following lines. Empty fields are converted to None.
        For a file without any line, two empty lists are returned.
    """
    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as f:
            raw_lines = f.readlines()

    tuple_lines: List[Tuple[str]] = []
    for raw_line in raw_lines:
        try:
            text = raw_line.decode("utf-8")
        except UnicodeDecodeError as ex:
            # sometimes there were encoding problems when storing to windows fs. if utf8 failed, trying to read
            # as windows-1252 helped in these cases
            try:
                text = raw_line.decode("windows-1252")
            except UnicodeDecodeError:
                # write() accepts exactly one string, so the message has to be assembled first
                sys.stderr.write(f"{ex}   {raw_line!r}\n")
                continue
        tuple_lines.append(clear_empty_fields(text.replace("\n", "").split("\t")))

    if not tuple_lines:
        # nothing could be read, so there is neither a header nor data
        return [], []

    # split off the header row, since we know that all files that we read have a header row
    return list(tuple_lines[0]), tuple_lines[1:]

In order to test the code above and to get a feeling for the performance, we measure the time that is needed to load the num.txt file directly from the zip file and convert it into a list of tuples.

In [None]:
# A short check to see if the reading works
start = time.time()
headers, list_of_tuples = get_file_data(zip_files[0], NUM_TXT)
print(headers[0],":", list_of_tuples[1][0])
duration = time.time() - start
print("duration: ", duration)

adsh : 0000034563-19-000064
duration:  8.604996681213379


It takes about 9 seconds. <br>
Just creating the list of tuples is already much slower than extracting the file and using spark.read.csv. Let's check how long the creation of a Spark dataframe out of the list of tuples will take.

In [None]:
# measure reading the data into a list of tuples and creating a Spark dataframe from it
start = time.time()
headers, list_of_tuples = get_file_data(zip_files[0], NUM_TXT)
df_tuple = spark.createDataFrame(list_of_tuples , headers)
print(df_tuple.count()) # execute an action, so that the dataframe is really created
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  132.30002903938293


It takes over 2 minutes. Of course, it may be that there are better ways to do it, but since reading directly from a file performs way better, it doesn't make sense to follow this approach.

### Using spark.read.csv with RDD parallelize

In [None]:
# NOTE(review): pandas and StringIO are not used in this cell
import pandas as pd
from io import StringIO

# measure reading num.txt from the zip file into a list of lines, which is handed over to spark.read.csv as RDD
start = time.time()
with zipfile.ZipFile(test_zip, "r") as container_zip:
    with container_zip.open(NUM_TXT) as f:
        # decode every line into a string, since the zip file delivers bytes
        lines = [line.decode("utf-8") for line in f.readlines()]
        df_test_num = spark.read.csv(spark.sparkContext.parallelize(lines), sep='\t', header=True)
        print(df_test_num.count())        
duration = time.time() - start
print("duration: ", duration)

2325267
duration:  13.77500033378601


In [None]:
# list the column names of the dataframe that was read from the zip file
df_test_num.columns

['adsh',
 'tag',
 'version',
 'coreg',
 'ddate',
 'qtrs',
 'uom',
 'value',
 'footnote']

It takes longer than loading the extracted file directly, but it would be very easy to implement.

### Conclusion
Since this is mainly a one-time operation, I will use the "Using spark.read.csv with RDD parallelize" approach first. If that doesn't work out well, I will go for the extract-and-save-to-disk approach.

### DF Reader implementation

In [None]:
def read_csv_in_zip_into_df(zip_file: str, data_file: str) -> DataFrame:
    """ Reads the file with the name provided in data_file from the zip file which name is provided in zip_file
        and returns its content as a Spark dataframe.

        The file is expected to be utf-8 encoded, to have a header row and to use a tab character as separator.
        The lines are read into memory and passed to spark.read.csv as an RDD of strings.
        The function uses the notebook-level Spark session "spark".
    """
    # use the provided parameters, so that every call reads the requested file from the requested zip file
    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as f:
            lines = [line.decode("utf-8") for line in f.readlines()]
    return spark.read.csv(spark.sparkContext.parallelize(lines), sep='\t', header=True)

## Joining the data into one spark dataframe

In [None]:
# this takes some time till loaded
# timings of a previous run:
# lap1:  2.0500004291534424
# lap2:  66.70102596282959
# lap3:  133.26235961914062 -> loading the data and creating a tuple is only about 8 seconds... so about 2 minutes are needed to create the df from the tuple
start = time.time()
df_sub = read_csv_in_zip_into_df(zip_files[0], SUB_TXT)
lap1 = time.time()
lap1_time = lap1-start
print("lap1: ", lap1_time)
df_pre = read_csv_in_zip_into_df(zip_files[0], PRE_TXT)
lap2 = time.time()
lap2_time = lap2-lap1
print("lap2: ", lap2_time)
df_num = read_csv_in_zip_into_df(zip_files[0], NUM_TXT)
lap3 = time.time()
lap3_time = lap3-lap2
# the third measurement is labeled as lap3, so that it can be told apart from the second one
print("lap3: ", lap3_time)

lap1:  2.0500004291534424
lap2:  66.70102596282959
lap2:  133.26235961914062


In [None]:
# joining with a column expression produces a df with two duplicated columns named adsh
# that should be prevented: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html
df_join1 = df_num.join(df_sub, on=df_num["adsh"] == df_sub["adsh"])
[name for name in df_join1.columns if name == "adsh"] # shows that the column adsh is twice in the dataframe

['adsh', 'adsh']

In [None]:
# correct way of joining: passing a list with the column names
df_join1 = df_num.join(df_sub, on=["adsh"])
[name for name in df_join1.columns if name == "adsh"] # shows that the column adsh appears now only once in the df

['adsh']

In [None]:
# joining the dataframes together: num with sub on adsh,
# followed by a left join with pre on adsh, version and tag
df_joined = (
    df_num
    .join(df_sub, on=["adsh"])
    .join(df_pre, on=["adsh","version","tag"], how="left")
)

In [None]:
# count the rows of the joined dataframe
df_joined.count() # this will start the whole DAG and executes the join

2570409

In [None]:
# stop the Spark session that was created at the beginning of the notebook
spark.stop()

In [None]:
# show which zip file is the first entry of the list, i.e. the one that was used in the cells above
zip_files[0]

'data\\2019q3.zip'