In [None]:
# default_exp merge_to_single_parquet

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded whenever it is changed
%load_ext autoreload
%autoreload 2

# 01_03_Merge_To_Single_Parquet

The main goal of this notebook is to merge all quarterly CSV files into one DataFrame and store it as Parquet. Moreover, a schema with the correct data types is defined, so that Parquet stores the types appropriately.<br>
In addition, the ticker symbol is also merged into the dataset.

In [None]:
# imports
from bfh_cas_bgd_fs2020_sa.core import * # initialze spark

from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# basic definitions

def _list_subfolder_names(folder: Path) -> List[str]:
    """Return the names of the direct subdirectories of `folder`, sorted alphabetically.

    `Path.iterdir()` yields its entries in arbitrary order, so the names are
    sorted to make the printed listing reproducible across platforms and runs.
    Raises the usual `OSError` subclasses if `folder` cannot be listed.
    """
    return sorted(entry.name for entry in folder.iterdir() if entry.is_dir())


# our test folder just contains the content of two zip files
tst_csv_folders = "./tmp/joined/"
tst_csv_path = Path(tst_csv_folders)
tst_csv_path_list = _list_subfolder_names(tst_csv_path)
print("Test-paths: " , tst_csv_path_list)

tst_parquet_folder = "./tmp/parquet/"

# The "all"-folder contains the csv files from all of the zipfiles
# NOTE: the two "D:/..." locations are machine-specific absolute paths; adjust them to the local setup
all_csv_folders = "D:/data/zip_joined/"
all_csv_path = Path(all_csv_folders)
all_csv_path_list = _list_subfolder_names(all_csv_path)
print("All-paths: ", all_csv_path_list)

all_parquet_folder = "D:/data/parquet/"

Test-paths:  ['2019q3', '2019q4']
All-paths:  ['2009q1', '2009q2', '2009q3', '2009q4', '2010q1', '2010q2', '2010q3', '2010q4', '2011q1', '2011q2', '2011q3', '2011q4', '2012q1', '2012q2', '2012q3', '2012q4', '2013q1', '2013q2', '2013q3', '2013q4', '2014q1', '2014q2', '2014q3', '2014q4', '2015q1', '2015q2', '2015q3', '2015q4', '2016q1', '2016q2', '2016q3', '2016q4', '2017q1', '2017q2', '2017q3', '2017q4', '2018q1', '2018q2', '2018q3', '2018q4', '2019q1', '2019q2', '2019q3', '2019q4', '2020q1', '2020q2']


## Init Spark

In [None]:
spark = get_spark_session() # create the Spark session
spark # display the most important information of the session

## Define Schema for reading from CSV

In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, DoubleType, BooleanType

# Explicit schema for the joined quarterly CSV files. The columns are listed in the
# order in which they appear in the files, grouped by the SEC file they originate from.
# Every column is nullable. Inside the brackets no line-continuation backslashes are needed.
schema = StructType([
    # --- num.txt ---
    StructField("adsh",       StringType(),  True),
    StructField("tag",        StringType(),  True),
    StructField("version",    StringType(),  True),
    # coreg identifies a co-registrant by name according to the SEC data set description,
    # so it has to be read as a string; such a name cannot be parsed as an integer.
    # NOTE(review): verify against the actual num.txt content.
    StructField("coreg",      StringType(),  True),
    StructField("ddate",      DateType(),    True),  # date
    StructField("qtrs",       StringType(),  True),
    StructField("uom",        StringType(),  True),
    StructField("value",      DoubleType(),  True),
    StructField("footnote",   StringType(),  True),
    # --- sub.txt ---
    StructField("cik",        IntegerType(), True),
    StructField("name",       StringType(),  True),
    StructField("sic",        IntegerType(), True),
    StructField("countryba",  StringType(),  True),
    StructField("stprba",     StringType(),  True),
    StructField("cityba",     StringType(),  True),
    StructField("zipba",      StringType(),  True),
    StructField("bas1",       StringType(),  True),
    StructField("bas2",       StringType(),  True),
    StructField("baph",       StringType(),  True),
    StructField("countryma",  StringType(),  True),
    StructField("stprma",     StringType(),  True),
    StructField("cityma",     StringType(),  True),
    StructField("zipma",      StringType(),  True),
    StructField("mas1",       StringType(),  True),
    StructField("mas2",       StringType(),  True),
    StructField("countryinc", StringType(),  True),
    StructField("stprinc",    StringType(),  True),
    StructField("ein",        IntegerType(), True),
    StructField("former",     StringType(),  True),
    StructField("changed",    StringType(),  True),
    StructField("afs",        StringType(),  True),
    StructField("wksi",       IntegerType(), True),
    StructField("fye",        StringType(),  True),
    StructField("form",       StringType(),  True),
    StructField("period",     DateType(),    True),  # date
    StructField("fy",         IntegerType(), True),
    StructField("fp",         StringType(),  True),
    StructField("filed",      DateType(),    True),  # date
    StructField("accepted",   StringType(),  True),  # datetime, kept as string
    StructField("prevrpt",    IntegerType(), True),
    StructField("detail",     IntegerType(), True),
    StructField("instance",   StringType(),  True),
    StructField("nciks",      IntegerType(), True),
    StructField("aciks",      StringType(),  True),
    # --- pre.txt ---
    StructField("report",     IntegerType(), True),
    StructField("line",       IntegerType(), True),
    StructField("stmt",       StringType(),  True),
    StructField("inpth",      IntegerType(), True),
    StructField("rfile",      StringType(),  True),
    StructField("plabel",     StringType(),  True),
    StructField("negating",   StringType(),  True),
])

## Read all csv files into one DF

### Read the test dataset 

In [None]:
# measure how long Spark needs to load and count the CSV files of the test folders
start = time.time()
df_tst = spark.read.csv(
    tst_csv_folders + "*",      # wildcard: read every sub-folder of the test folder
    schema=schema,
    header=True,
    dateFormat="yyyyMMdd",
)
print(f"{df_tst.count():_}") # print number of lines in the test dataset
duration = time.time() - start
print("duration: ", duration)

5_239_639
duration:  7.223994970321655


Reading and counting just the two CSV folders is really fast, but we have to be aware that they are stored on an SSD.

In [None]:
# df_tst.show(1) # if we need to check that reading the schema was possible

### Read the whole dataset

In [None]:
# measure how long Spark needs to load and count the CSV files of all folders
start = time.time()
df_all = spark.read.csv(
    all_csv_folders + "*",      # wildcard: read every sub-folder of the "all" folder
    schema=schema,
    header=True,
    dateFormat="yyyyMMdd",
)
print(f"{df_all.count():_}") # print number of lines in the whole dataset
duration = time.time() - start
print("duration: ", duration)

109_392_813
duration:  332.6548318862915


The first time, reading and counting all CSV folders takes about 7 minutes. But they are also read from a "normal disk" and not an SSD. This is also clearly visible when checking the Windows Task Manager: the disk was at 100%. <br>
It took only 45 seconds the second time, and when I checked the Windows Task Manager, the CPU was at 100% and the disk was at 0%. So it looks as if the system cached the whole data.

In [None]:
#df_all.show(1) # if we need to check that reading the schema was possible

### Print all the contained column names

In [None]:
# print the names of all columns on one line for convenience
# (a single print of the joined names instead of a list comprehension used only for its side effects)
print(", ".join(df_all.columns))

adsh, tag, version, coreg, ddate, qtrs, uom, value, footnote, cik, name, sic, countryba, stprba, cityba, zipba, bas1, bas2, baph, countryma, stprma, cityma, zipma, mas1, mas2, countryinc, stprinc, ein, former, changed, afs, wksi, fye, form, period, fy, fp, filed, accepted, prevrpt, detail, instance, nciks, aciks, report, line, stmt, inpth, rfile, plabel, negating, 

## Merge TickerSymbol to the dataset

During the further analysis, it could make sense to know the TickerSymbol and the Exchange where the stock is traded. This information is available in a CSV located at http://rankandfiled.com/static/export/cik_ticker.csv. We simply load it into a dataframe and use the join method to join it with the rest.

### Read the cik-ticker dataset

In [None]:
from pyspark.sql.functions import col

# read the pipe-separated cik/ticker file, keep the four columns of interest and normalize their names
df_cik_ticker = (
    spark.read.csv("./data/cik_ticker.csv", sep="|", header=True)
    .select('CIK', 'Ticker', 'Name', 'Exchange')
    # renaming the columns
    .withColumnRenamed('Name', "name_cik_tic")
    .withColumnRenamed('Ticker', "ticker")
    .withColumnRenamed('Exchange', "exchange")
    # no schema is given when reading, so cast the key to integer to match the integer "cik" column of the main schema
    .withColumn("cik", col("CIK").cast(IntegerType()))
)

In [None]:
df_cik_ticker.show(5) # preview the first five rows of the ticker dataframe

+-------+------+--------------------+--------+
|    cik|ticker|        name_cik_tic|exchange|
+-------+------+--------------------+--------+
|1090872|     A|Agilent Technolog...|    NYSE|
|   4281|    AA|           Alcoa Inc|    NYSE|
|1332552| AAACU|Asia Automotive A...|    null|
|1287145|  AABB|  Asia Broadband Inc|     OTC|
|1024015|  AABC|Access Anytime Ba...|    null|
+-------+------+--------------------+--------+
only showing top 5 rows



### Available exchanges and count of traded stocks

In [None]:
# we convert to pandas in order to visualize the data
exchanges_df = (
    df_cik_ticker
    .select("cik", "exchange")
    .distinct()     # one row per distinct (cik, exchange) combination
    .toPandas()
)

In [None]:
ct = pd.crosstab(index=exchanges_df['exchange'], columns='count') # number of distinct (cik, exchange) rows per exchange
print(ct) # show the counts as a plain-text table

col_0      count
exchange        
BATS           4
NASDAQ      2669
NYSE        2880
NYSE ARCA    115
NYSE MKT     561
OTC         2345
OTCBB        330


### Join the test dataframe with the cik_ticker dataframe

Let's now join the test dataset together with the ticker information. We expect the same number of lines in the dataset as we had above.

In [None]:
# left join on "cik": every row of the test dataset is kept, ticker columns are added where a match exists
df_tst_join = df_tst.join(df_cik_ticker, on=["cik"], how="left")
print(f"{df_tst_join.count():_}") # merge and display the number of lines in the test dataset

5_239_639


In [None]:
# show a few columns to make sure the join worked
df_tst_join.select('adsh', 'cik', 'ticker', 'name_cik_tic', 'exchange').show(2)

+--------------------+----+------+--------------------+--------+
|                adsh| cik|ticker|        name_cik_tic|exchange|
+--------------------+----+------+--------------------+--------+
|0000002178-19-000107|2178|    AE|Adams Resources &...|NYSE MKT|
|0000002178-19-000107|2178|    AE|Adams Resources &...|NYSE MKT|
+--------------------+----+------+--------------------+--------+
only showing top 2 rows



### Join the complete dataframe with the cik_ticker dataframe

We only create the definition of the joined dataframe, we don't execute it yet.

In [None]:
# left join of the complete dataset with the ticker information on the "cik" column
df_all_join = df_all.join(df_cik_ticker, on=["cik"], how="left")

## Storing as Parquet with default partitions

The correct data type is defined for each column and the ticker symbols have been added to the dataset. Now it is time to store the whole dataset as a new Parquet file.<br>
We will simply use Parquet's default partition.

### Create Parquet file with default partitions on the test dataset

In [None]:
# make sure the target folder is empty; errors while deleting (e.g. the folder does not exist yet) are ignored
shutil.rmtree(path=tst_parquet_folder, ignore_errors=True)

# time the write of the joined test dataset as Parquet
start = time.time()
df_tst_join.write.parquet(path=tst_parquet_folder)
duration = time.time() - start
print("duration: ", duration)

duration:  39.32404708862305


### Create Parquet file with default partitions on the whole dataset

In [None]:
# make sure the target folder is empty; errors while deleting (e.g. the folder does not exist yet) are ignored
shutil.rmtree(path=all_parquet_folder, ignore_errors=True)

# time the write of the complete joined dataset as Parquet
start = time.time()
df_all_join.write.parquet(path=all_parquet_folder)
duration = time.time() - start
print("duration: ", duration)

duration:  758.595376253128


It took about 13 minutes to store all the data in parquet.<br>
Let us compare the file sizes of the compressed zip folder and the parquet folder:

In [None]:
# print the formatted on-disk size of the CSV input folder and of the Parquet output folder
for label, folder in (
    ('compressed zip: ', all_csv_folders),
    ('parquet       : ', all_parquet_folder),
):
    print(label, get_size_format(get_directory_size(folder)))

compressed zip:  4.81GB
parquet       :  5.05GB


It is about the same size, especially if we consider that the Parquet version also contains the ticker information. Moreover, Parquet contains more meta information, which will help to access the data faster.

## Stop Spark

In [None]:
spark.stop() # stop the Spark session created at the beginning of the notebook