In [None]:
# default_exp join_sec_data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is reloaded again whenever it is modified
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 01_02_Join_SEC_Data

This notebook contains the code to join the attributes from the three files "num.txt", "sub.txt", and "pre.txt" into one single CSV file, which can then be used for further processing.

In [None]:
# imports
from bfh_cas_bgd_fs2020_sa.core import * # initialze spark

from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys
import zipfile

In [None]:
# basic definitions: folder that is searched for the zip files
zip_folder = "./data/" 
zip_path = Path(zip_folder)

## Init Spark

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## Create Zip-Files dataframe

In [None]:
# Path.glob() yields its matches in an arbitrary, filesystem-dependent order; sort the paths so
# that zip_files[0] (used by the cells below) refers to the same archive on every run
zip_files = sorted(str(file) for file in zip_path.glob("*.zip"))

In [None]:
# turn the list of paths into a Spark dataframe with one string column named "url"
from pyspark.sql.types import StringType

zip_files_df = (
    spark.createDataFrame(zip_files, StringType())
         .withColumnRenamed("value","url")
)
zip_files_df.printSchema()

root
 |-- url: string (nullable = true)



## Join the data

### Read file inside zip and convert it to a spark dataframe

I was looking for a way to read the content of a CSV file inside a zip file directly into a Spark dataframe. But after spending some time researching, I wasn't able to find a way to do it directly.<br>

One solution could be to extract the content and write it as a temporary file and then load that file into a spark dataframe. But spark does the loading in the background, which means, as soon as the cell is finished, the tempfile gets deleted and the spark task fails.

Another solution is to load the data into a list of tuples and then use that list of tuples to create the Spark dataframe. This is code I wrote a few months ago, slightly adapted.<br>
This code is not suitable for CSV files containing real text columns, because no escaping is checked.

In [None]:
def clear_empty_fields(row: List[str]) -> Tuple[Union[str,None]]:
    """ Returns the entries of row as a tuple in which every empty string is replaced by None.
        All other entries are kept unchanged and in their original order.
    """
    return tuple(None if field == '' else field for field in row)

In [None]:
def get_file_data(zip_file: str, data_file: str) -> Tuple[List[str],List[Tuple[str]]]:
    """ Reads a tab separated text file that is stored inside a zip archive.

        The member data_file of the archive zip_file is read line by line. Every line is decoded
        as UTF-8 (with windows-1252 as fallback), stripped of its line terminator ("\\n" or
        "\\r\\n"), split at the tab characters and passed through clear_empty_fields, so that
        empty fields become None. No quoting or string escaping is interpreted.

        Lines which can be decoded with neither of the two encodings are reported on stderr
        and skipped.

        Args:
            zip_file: path of the zip archive
            data_file: name of the file inside the archive

        Returns:
            A pair (headers, rows): the fields of the first line as a list and one tuple for
            every following line.

        Raises:
            ValueError: if not a single line could be read, i.e. there is no header row.
    """
    tuple_lines: List[Tuple[str]] = []
    with zipfile.ZipFile(zip_file, "r") as container_zip:
        with container_zip.open(data_file) as f:
            # iterate over the file object instead of calling readlines(), so the raw bytes of
            # the whole file are not kept in memory in addition to the parsed tuples
            for raw_line in f:
                try:
                    text = raw_line.decode("utf-8")
                except UnicodeDecodeError as ex:
                    # sometimes there were encoding problems when storing to windows fs. if utf8
                    # failed, trying to read as windows-1252 helped in these cases
                    try:
                        text = raw_line.decode("windows-1252")
                    except UnicodeDecodeError:
                        # write() accepts exactly one string; report the line and go on
                        sys.stderr.write("{}   {!r}\n".format(ex, raw_line))
                        continue
                # also strip "\r", otherwise the last field of files with windows line endings
                # would keep a trailing carriage return
                tuple_lines.append(clear_empty_fields(text.rstrip("\r\n").split("\t")))

    if not tuple_lines:
        raise ValueError("no header row found in '{}' inside '{}'".format(data_file, zip_file))

    # the first line is the header row, since we know that all files that we read have one
    return list(tuple_lines[0]), tuple_lines[1:]

In [None]:
# A short check to see if the reading works:
# prints the name of the first column and the value of that column in the second data row
headers, list_of_tuples = get_file_data(zip_files[0], "sub.txt")
print(headers[0],":", list_of_tuples[1][0])

adsh : 0000002488-19-000104


In [None]:
def read_csv_in_zip_into_df(zip_file: str, data_file: str) -> DataFrame:
    """ Loads the file data_file contained in the zip archive zip_file into a spark dataframe.
        The content is parsed with get_file_data; the header row provides the column names.
    """
    column_names, rows = get_file_data(zip_file, data_file)
    return spark.createDataFrame(rows, schema=column_names)

In [None]:
# check if it is working
# describe() only returns the summary dataframe; the cell output is its repr (column names and types)
df = read_csv_in_zip_into_df(zip_files[0], "sub.txt")
df.describe()

DataFrame[summary: string, adsh: string, cik: string, name: string, sic: string, countryba: string, stprba: string, cityba: string, zipba: string, bas1: string, bas2: string, baph: string, countryma: string, stprma: string, cityma: string, zipma: string, mas1: string, mas2: string, countryinc: string, stprinc: string, ein: string, former: string, changed: string, afs: string, wksi: string, fye: string, form: string, period: string, fy: string, fp: string, filed: string, accepted: string, prevrpt: string, detail: string, instance: string, nciks: string, aciks: string]

## Joining the data into one dataframe

In [None]:
# Define constants for the names of the files inside the zip file
SUB_TXT = "sub.txt"
PRE_TXT = "pre.txt"
NUM_TXT = "num.txt"
TAG_TXT = "tag.txt"

In [None]:
# load three of the files of the first zip file into one dataframe each
# this takes some time till loaded
df_sub = read_csv_in_zip_into_df(zip_files[0], SUB_TXT)
df_pre = read_csv_in_zip_into_df(zip_files[0], PRE_TXT)
df_num = read_csv_in_zip_into_df(zip_files[0], NUM_TXT)

In [None]:
# joining with a column expression as join condition:
# this join produces a df with two duplicated columns named adsh
# that should be prevented: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html
df_join1 = df_num.join(df_sub, df_num.adsh == df_sub.adsh)
[x for x in df_join1.columns if x == "adsh"] # shows that the column adsh is twice in the dataframe

['adsh', 'adsh']

In [None]:
# correct way of joining: pass a list with the names of the join columns
df_join1 = df_num.join(df_sub, ["adsh"])
[x for x in df_join1.columns if x == "adsh"] # shows that the column adsh now appears only once in the df

['adsh']