In [None]:
# default_exp pivoting

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is reloaded automatically whenever it is changed
%load_ext autoreload
%autoreload 2

# pivoting

In order to create uniform datasets for the different primary financial statements (Income Statement, CashFlow, BalanceSheet, ...) it is easier if the data is converted from its vertical form to a horizontal form. This means that we pivot the values based on the tag.

Therefore, for every primary financial statement a separate dataset is created.

Statement types

- IS: IncomeStatement
- CF: CashFlow
- BS: BalanceSheet
- CI: Comprehensive Income
- EQ: Equity
- CP: CoverPage
- UN: Unclassifiable Statement

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import get_spark_session # initialze spark
from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# Folders used by this notebook.
# NOTE(review): these are absolute, machine-specific paths -- adjust them to the local setup before running.
all_filtered_folder        = "D:/data/parq_filtered"      # source folder with the prepared parquet file
all_pivot_selected_folder  = "D:/data/parq_pivot_select"  # target folder which will contain only the needed subset of columns
all_pivoted_folder         = "D:/data/parq_pivot_split"   # target folder for the different pivoted and separated datasets

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 01_Load Data

**only execute if necessary**

Loads the prepared and filtered data with all columns.

In [None]:
# cache() only marks the DataFrame for caching; the data is loaded into the cache
# when the first action (the count() in the next cell) is executed
df_all = spark.read.parquet(all_filtered_folder).cache()

In [None]:
# Reference values from an earlier run
#   Entries:  15_650_848
#   duration: 66 sec

# count() is an action: it forces Spark to read the whole dataset, which fills the cache
start = time.time()
print("Entries: ", f"{df_all.count():_}")
duration = time.time() - start
print("duration: ", duration)

Entries:  15_650_848
duration:  96.77243518829346


## 02_Select

**only execute if necessary**

Creates a new dataset containing only the columns that are needed during the next steps.

In [None]:
# keep only the columns that are needed during the next steps
df_all_selected = df_all.select(
    "stmt", "cik", "ticker", "adsh", "period", "form",
    "tag", "value", "report", "line", "fp", "uom",
)

In [None]:
# Remove the result of a previous run first, since the write below does not set an overwrite mode.
# ignore_errors=True also covers the case that the folder does not exist yet.
shutil.rmtree(all_pivot_selected_folder,  ignore_errors=True)
df_all_selected.write.parquet(all_pivot_selected_folder)

## 03_Pivoting

In [None]:
# Re-read the reduced dataset from disk, so this section can be run without executing sections 01 and 02.
# cache() only marks the DataFrame for caching; the count() in the next cell actually loads it.
df_all_selected = spark.read.parquet(all_pivot_selected_folder).cache()

In [None]:
# Reference values from an earlier run
#   Entries:  15_650_848
#   duration: 23 sec

# count() is an action: it forces Spark to read the whole dataset, which fills the cache
start = time.time()
print("Entries: ", f"{df_all_selected.count():_}")
duration = time.time() - start
print("duration: ", duration)

Entries:  15_650_848
duration:  16.718958616256714


In [None]:
# Columns holding the values to pivot; pivot_statement writes one dataset per entry of this list.
pivot_attrs = ['value'] # column that contains the value which has to be pivoted

def pivot_statement(all_data_df: "DataFrame", statement: str) -> None:
    """Pivot all rows of one statement type and write the result as parquet.

    The rows whose ``stmt`` column equals ``statement`` are grouped by
    cik, ticker, adsh, form, period and fp. For every column named in the
    module-level ``pivot_attrs`` list, the values of the ``tag`` column become
    columns and the result is written as a single parquet file to
    ``<all_pivoted_folder>/<statement>/<attr>``. An already existing output
    folder of the statement is removed before writing.

    Args:
        all_data_df: Spark DataFrame that provides the columns stmt, cik,
            ticker, adsh, form, period, fp, tag and the columns listed in
            ``pivot_attrs``.
        statement: statement type to process, e.g. 'IS' or 'BS'.
    """
    statement_folder = f"{all_pivoted_folder}/{statement}"

    # A column expression instead of concatenated SQL text: the statement value
    # is compared as data and cannot break the filter expression.
    all_stmt_data = all_data_df.where(col("stmt") == statement).cache()

    try:
        shutil.rmtree(statement_folder, ignore_errors=True)

        grouped_df = all_stmt_data.groupby(["cik", "ticker", "adsh", "form", "period", "fp"])

        for attr in pivot_attrs:
            # Using max() is not the best approach. Generally, a tag is only contained once in a report,
            # but there are exceptions like CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents
            # in the CF statement.
            pivoted_df = grouped_df.pivot("tag").max(attr)

            # repartition(1), so that only one file is created
            pivoted_df.repartition(1).write.parquet(f"{statement_folder}/{attr}")
    finally:
        # release the cached subset even if pivoting or writing failed
        all_stmt_data.unpersist()

In [None]:
def pivot_statements(all_data_df: "DataFrame", statements: List[str]) -> None:
    """Run ``pivot_statement`` once for every entry of ``statements``.

    Each statement type is printed before it is processed, so the progress
    can be followed in the cell output.

    Args:
        all_data_df: DataFrame that is handed unchanged to ``pivot_statement``.
        statements: statement types to process, in the given order.
    """
    for statement_type in statements:
        print(statement_type)
        pivot_statement(all_data_df, statement=statement_type)

## 99_Execution

In [None]:
# Statement types to pivot. The name has to be defined here, otherwise a fresh
# "Restart & Run All" fails with a NameError in the call below.
# Shorten the list (e.g. ['UN']) to re-create only selected datasets.
statements = ['IS','CF','CP','BS','CI','EQ','UN']
pivot_statements(df_all_selected, statements)

In [None]:
# stop the Spark session that was created with get_spark_session() at the top of the notebook
spark.stop()