In [None]:
# default_exp parquet

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded automatically whenever it is changed
%load_ext autoreload
%autoreload 2

# 00_01_Parquet_Basics
Notebook to explore the basics of Parquet
Links
* Spark documentation https://spark.apache.org/docs/2.4.5/

In [None]:
# common imports
import os
import zipfile
from bfh_cas_bgd_fs2020_sa.core import *

In [None]:
# Check the current working dir: the relative folders used in this notebook are resolved against it
print(os.getcwd())

C:\ieu\projects\bfh_cas_bgd_fs2020_sa


In [None]:
# Basic definitions: all folders are given relative to the current working directory
data_folder = "./data/"        # folder with the zipped test data
temp_folder = "./tmp/"         # folder that receives the unzipped files
parquet_folder = "./parquet/"  # root folder for the parquet output
data_files = [
    "2019q3.zip",
    "2019q4.zip",
]

## Init Spark
This code initialises the SparkSession and therefore the SparkContext. Pressing the link "Spark UI" opens the Spark UI for this session.

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## Datafiles
The directory contains two zipfiles (2019q3.zip, 2019q4.zip). Each of them contains 4 csv files. The columns and the relation between these files are described in the readme.htm.<br>
Each zip file contains all quarterly and yearly reports that were filed during the quarter denoted by the filename.

### Unpacking
In a first step, the contents of the zip files are unzipped and placed in separate folders.

In [None]:
# Unpack every zip archive into its own sub-folder of temp_folder,
# named after the archive without its extension (2019q3.zip -> <temp_folder>/2019q3).
for data_file in data_files:
    path_to_zip_file = os.path.join(data_folder, data_file)
    # strip the extension explicitly instead of slicing off a fixed number of characters
    archive_name = os.path.splitext(data_file)[0]
    directory_to_extract_to = os.path.join(temp_folder, archive_name)
    # the context manager closes the archive even if the extraction fails
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(directory_to_extract_to)

the sizes of the directories

In [None]:
# Report the size on disk of the zipped input and of the unzipped content
for label, folder in (('Data folder: ', data_folder), ('Temp folder: ', temp_folder)):
    print(label, get_size_format(get_directory_size(folder)))

Data folder:  76.76MB
Temp folder:  748.04MB


## Using Parquet
let us read a csv file and store it as a parquet file
* API doc of the csv reader: https://spark.apache.org/docs/2.4.5/api/python/pyspark.sql.html?highlight=parquet#pyspark.sql.DataFrameReader.csv
* API doc of the parquet writer: https://spark.apache.org/docs/2.4.5/api/python/pyspark.sql.html?highlight=parquet#pyspark.sql.DataFrameWriter.parquet

In [None]:
# imports that are used in this section
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType
from pyspark.sql.functions import countDistinct, year, month

In [None]:
# as a test csv file, "num.txt" from the folder 2019q3 is used
test_file = temp_folder + '2019q3/num.txt'
print('size of test file: ', get_size_format(os.path.getsize(test_file)))

size of test file:  236.70MB


### Reading a CSV file
As a first step, the csv file has to be loaded into a spark df. <br>
The file has a header row and the columns are separated by a TAB (\t).

In [None]:
# read the TAB separated file; the first line is used as header, no schema is given
df_test_num = spark.read.csv(test_file, sep='\t', header=True)

When checking the format in the next cell, we see that all columns were read as a string. That is ok for most of the columns but when checking the definitions for the num.txt file in the readme.htm we see, that ddate is a date in the format 'yyyymmdd', qtrs and coreg are 'int' and value is a float.

In [None]:
# inspect the first row, the total number of rows and the column types of the dataframe
print('first row:      ', df_test_num.head(1))
print('number of rows :', df_test_num.count())
df_test_num.printSchema()

first row:       [Row(adsh='0001625376-19-000017', tag='EntityPublicFloat', version='dei/2014', coreg=None, ddate='20180430', qtrs='0', uom='USD', value='0.0000', footnote=None)]
number of rows : 2325267
root
 |-- adsh: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- version: string (nullable = true)
 |-- coreg: string (nullable = true)
 |-- ddate: string (nullable = true)
 |-- qtrs: string (nullable = true)
 |-- uom: string (nullable = true)
 |-- value: string (nullable = true)
 |-- footnote: string (nullable = true)



In [None]:
# show how many different reports are available in this quarter
df2 = df_test_num.select(countDistinct("adsh"))
df2.show()

+--------------------+
|count(DISTINCT adsh)|
+--------------------+
|                6283|
+--------------------+



Actually, Spark can try to infer the types of the columns from the data itself, so let's try that by using the "inferSchema" option.

In [None]:
# read the same file again, this time asking the reader to infer the column types from the data
df_test_num = spark.read.csv(test_file, sep='\t', header=True, inferSchema=True)

As we can see in the next cell, we were only partially successful. The reader was able to detect that qtrs is an integer and that value is a double. But it failed to recognize that ddate is actually a date and that coreg should be an int. That was to be expected: ddate looks like an int and the coreg field is only used in special situations, so there is a good chance that its content is None for all entries in the file. <br>
It looks as if we have to define the schema by hand

In [None]:
# inspect the first row and the inferred column types
print(df_test_num.head(1))
df_test_num.printSchema()

[Row(adsh='0001625376-19-000017', tag='EntityPublicFloat', version='dei/2014', coreg=None, ddate=20180430, qtrs=0, uom='USD', value=0.0, footnote=None)]
root
 |-- adsh: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- version: string (nullable = true)
 |-- coreg: string (nullable = true)
 |-- ddate: integer (nullable = true)
 |-- qtrs: integer (nullable = true)
 |-- uom: string (nullable = true)
 |-- value: double (nullable = true)
 |-- footnote: string (nullable = true)



All necessary classes to define a schema are located inside the package pyspark.sql.types and for our example we need the following import<br>
```from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType, IntegerType```

An important point is that the dateFormat has to be defined as parameter when calling spark.read.csv() 

In [None]:
# Hand-written schema for num.txt: one StructField per column, every column nullable.
# NOTE(review): coreg is typed as an integer here - confirm against readme.htm that it
# never holds text, otherwise those values cannot be parsed into this schema.
schema = StructType([
    StructField("adsh",     StringType(),  True),
    StructField("tag",      StringType(),  True),
    StructField("version",  StringType(),  True),
    StructField("coreg",    IntegerType(), True),
    StructField("ddate",    DateType(),    True),
    StructField("qtrs",     IntegerType(), True),
    StructField("uom",      StringType(),  True),
    StructField("value",    DoubleType(),  True),
    StructField("footnote", StringType(),  True),
])
# dateFormat describes how the DateType column is written in the file
df_test_num = spark.read.csv(
    test_file,
    sep='\t',
    header=True,
    dateFormat="yyyyMMdd",
    schema=schema,
)

In [None]:
# inspect the first row and the column types after reading with the explicit schema
print(df_test_num.head(1))
df_test_num.printSchema()

[Row(adsh='0001625376-19-000017', tag='EntityPublicFloat', version='dei/2014', coreg=None, ddate=datetime.date(2018, 4, 30), qtrs=0, uom='USD', value=0.0, footnote=None)]
root
 |-- adsh: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- version: string (nullable = true)
 |-- coreg: integer (nullable = true)
 |-- ddate: date (nullable = true)
 |-- qtrs: integer (nullable = true)
 |-- uom: string (nullable = true)
 |-- value: double (nullable = true)
 |-- footnote: string (nullable = true)



### Simple write as Parquet
As a first version, the dataframe is stored directly in parquet format without additional options.

In [None]:
# target folder for the parquet data written without additional options
parquet_folder_pure = parquet_folder + "pure/"
# save mode 'overwrite' overwrites the data, if they are already present
df_test_num.write.mode("overwrite").parquet(parquet_folder_pure)

In [None]:
# total size of the written parquet folder, followed by the names of the files it contains
print('size of parquet_folder_pure: ', get_size_format(get_directory_size(parquet_folder_pure)))
os.listdir(parquet_folder_pure)

size of parquet_folder_pure:  19.11MB


['.part-00000-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00001-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00002-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00003-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00004-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00005-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00006-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '.part-00007-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet.crc',
 '._SUCCESS.crc',
 'part-00000-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet',
 'part-00001-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet',
 'part-00002-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet',
 'part-00003-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet',
 'part-00004-a86b89ce-d5ca-43f6-b587-e58d0c1f77d3-c000.snappy.parquet',
 'part

Parquet was able to compress the data down to about 10% of the original size. It split the data up into 8 different data files, so every file contains approximately 300'000 data rows and has a size of about 3MB.

### Writing using partitions
Parquet can also store the data in different partitions, which will create a new directory for every partition.

As a first approach, we could try to create a partition for every report, that is, for every distinct "adsh" value. However, since we have about 6300 different reports in the used csv file, that would result in data files of less than 5kb each. Such small files are very inefficient for parquet, so we do something else.<br>
Since we read the "ddate" column as a proper date-format we can create partitions based on the year and month. In order to do that we need to add two columns for year and month to the dataframe.

In [None]:
# derive the two partition columns "year" and "month" from the "ddate" column
df_test_num = (
    df_test_num
    .withColumn("year", year("ddate"))
    .withColumn("month", month("ddate"))
)

In [None]:
# target folder for the parquet data partitioned by year and month
parquet_folder_by_month = parquet_folder + "month"
# one sub-folder is written per distinct 'year' value and, inside it, per 'month' value;
# save mode 'overwrite' replaces data that is already present
(
    df_test_num.write
    .partitionBy('year', 'month')
    .mode("overwrite")
    .parquet(parquet_folder_by_month)
)

Looking at the result may be a little surprising. There are folders for years from 1978 up to 2028. The 'ddate' column is defined as "The end date for the data value, rounded to the nearest month end". A lot of the values that appear in a report may not be from the reported period. For instance, results from the last couple of years are often also included in a yearly report, and expected returns for the following couple of years may appear in the report as well.<br>
The total size on disk has also increased significantly. It is still small compared to the original CSV file, but around 40% bigger than the size that was needed when the data was stored without defining partitions (26.72MB compared to 19.11MB).

In [None]:
# total size of the partitioned parquet folder, followed by its top-level entries (one folder per year)
print('size of parquet_folder_month: ', get_size_format(get_directory_size(parquet_folder_by_month)))
os.listdir(parquet_folder_by_month)

size of parquet_folder_month:  26.72MB


['._SUCCESS.crc',
 'year=1978',
 'year=1982',
 'year=1987',
 'year=1989',
 'year=1990',
 'year=1993',
 'year=1994',
 'year=1995',
 'year=1996',
 'year=1997',
 'year=1998',
 'year=1999',
 'year=2000',
 'year=2001',
 'year=2002',
 'year=2003',
 'year=2004',
 'year=2005',
 'year=2006',
 'year=2007',
 'year=2008',
 'year=2009',
 'year=2010',
 'year=2011',
 'year=2012',
 'year=2013',
 'year=2014',
 'year=2015',
 'year=2016',
 'year=2017',
 'year=2018',
 'year=2019',
 'year=2020',
 'year=2021',
 'year=2022',
 'year=2023',
 'year=2025',
 'year=2027',
 'year=2028',
 'year=__HIVE_DEFAULT_PARTITION__',
 '_SUCCESS']

### Reading parquet
Reading parquet is even simpler than reading a csv, since parquet contains meta information about the file structure.

In [None]:
# read the unpartitioned parquet data back; no schema or reader options are passed
# note: df_test_num is rebound here to the dataframe read from parquet_folder_pure
df_test_num = spark.read.parquet(parquet_folder_pure)
print('number of rows :', df_test_num.count())
df_test_num.printSchema()

number of rows : 2325267
root
 |-- adsh: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- version: string (nullable = true)
 |-- coreg: integer (nullable = true)
 |-- ddate: date (nullable = true)
 |-- qtrs: integer (nullable = true)
 |-- uom: string (nullable = true)
 |-- value: double (nullable = true)
 |-- footnote: string (nullable = true)

