In [None]:
# default_exp uniformbs

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is reloaded automatically whenever it is modified
%load_ext autoreload
%autoreload 2

# uniform BS

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import * 
from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# Input/output locations.
# NOTE(review): these are hard-coded absolute paths on a local drive - adapt them to the
# local environment before running the notebook.
# parquet folder with the complete unpivoted dataset (read directly with spark.read.parquet)
all_pivot_selected_folder  = "D:/data/parq_pivot_select"
# folder with the pivoted data; passed to load_data together with a statement code ("UN", "BS")
all_pivoted_folder = "D:/data/parq_pivot_split"
# target folder for the result; the file name is appended by plain string concatenation
# in the save step, so the trailing slash is required
all_processed_folder = "D:/data/parq_processed/"

# NOTE(review): col_list is not referenced in the visible part of this notebook
col_list =    ["stmt","cik","ticker", "adsh","period","filed", "form","tag","value","report", "line", "fp", "uom"]
# key columns that identify one row of the pivoted data; used as merge keys in prepare_un_values
pivot_group = ["cik","ticker","adsh","form","period","filed","fp", "qtrs"]

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 00_Raw_data

In [None]:
# load the complete unpivoted dataset - only needed for debugging
# NOTE(review): df_all_selected is not referenced again in the visible part of this notebook
df_all_selected = spark.read.parquet(all_pivot_selected_folder).cache()

In [None]:
# Sometimes data cannot be associated with the right statement (bs, is, cf, ..). In these cases, the data can appear under "UN".
# So if expected information cannot be found in the appropriate statement, we have to look in the UN statement.
un_pivot_value = load_data(all_pivoted_folder, spark, "UN", "value")
# convert to pandas; prepare_un_values selects and merges columns from this pandas frame
un_pivot_pd = un_pivot_value.toPandas()

In [None]:
# (rows, columns) of the UN data after the conversion to pandas
un_pivot_pd.shape

(5989, 1775)

In [None]:
def prepare_un_values(df_to_merge_into: pd.DataFrame, attr_list: List[str],
                      un_df: Union[pd.DataFrame, None] = None) -> pd.DataFrame:
    """Left-join selected columns of the "UN" (unknown statement) data onto a statement frame.

    The requested columns are taken from the UN frame, renamed with the prefix
    ``cpy_`` and merged onto ``df_to_merge_into`` using the key columns in
    ``pivot_group``. Rows without a matching UN entry get NaN in the ``cpy_`` columns.

    Parameters
    ----------
    df_to_merge_into : pandas.DataFrame
        Frame of a statement (e.g. the balance sheet). It must contain all
        ``pivot_group`` columns. It is not modified; a new frame is returned.
    attr_list : list of str
        Names of the UN columns to add. Repeated names and names that are
        key columns are ignored.
    un_df : pandas.DataFrame, optional
        Frame to take the columns from. Defaults to the module-level
        ``un_pivot_pd`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df_to_merge_into`` with one additional ``cpy_<name>`` column per requested attribute.

    Raises
    ------
    KeyError
        If a requested attribute is not a column of the UN frame.
    """
    if un_df is None:
        un_df = un_pivot_pd

    # keep the requested order, but drop repeated names and key columns - selecting a
    # column twice would create duplicate column labels in the merge input
    value_cols = [name for name in dict.fromkeys(attr_list) if name not in pivot_group]

    missing = [name for name in value_cols if name not in un_df.columns]
    if missing:
        raise KeyError(f"columns not present in the UN dataset: {missing}")

    # rename returns a new frame, so the UN frame itself stays untouched
    un_prepared = un_df[pivot_group + value_cols].rename(
        columns={name: "cpy_" + name for name in value_cols})

    # NOTE(review): if the UN frame holds several rows for the same key, the left join
    # multiplies the matching rows of df_to_merge_into - compare the shape before/after.
    return pd.merge(df_to_merge_into, un_prepared, how='left', on=pivot_group)

## 01_Balance_Sheet

In [None]:
# load the balance sheet data via the core helper load_data
# (presumably the pivoted "value" data of the statement "BS" - confirm in the core library)
bs_pivot_value = load_data(all_pivoted_folder, spark, "BS", "value")
# show the dimensions of the loaded Spark frame
spark_shape(bs_pivot_value)

(117868, 2244)

In [None]:
# convert the Spark frame to pandas; all following completion steps work on pandas frames
bs_pivot_pd = bs_pivot_value.toPandas()

In [None]:
# work on a copy, so the frame as loaded from Spark stays available unchanged
bs_pivot_pd_copy = bs_pivot_pd.copy()

### Merge Data from "unknown" statement

In [None]:
# merge relevant columns from the UN dataset (they are added with the prefix "cpy_")
# The merge starts from the untouched bs_pivot_pd, not from bs_pivot_pd_copy: merging onto
# the result of a previous run would add the cpy_ columns a second time (pandas then renames
# them to cpy_..._x / cpy_..._y), so this cell could not be re-run safely.
# pd.merge returns a new frame, bs_pivot_pd itself is not modified.
bs_pivot_pd_copy = prepare_un_values(bs_pivot_pd, [
    'AssetsNet',
    'Assets',
    'AssetsNoncurrent',
    'AssetsCurrent',
    'Liabilities',
    'LiabilitiesNoncurrent',
    'LiabilitiesCurrent',
    'StockholdersEquity',
    'LiabilitiesAndStockholdersEquity',
    'RetainedEarningsAccumulatedDeficit',
    'PartnersCapital',
    'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
])
# the row count must be unchanged; only the cpy_ columns are added
bs_pivot_pd_copy.shape

(117868, 2256)

### Assets

In [None]:
# number of missing values per column before the Assets information is completed
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   1801
AssetsNoncurrent   111944
AssetsCurrent   28053


In [None]:
# Step 1: take over values from the UN dataset (cpy_ columns) where they are present
for asset_tag in ('AssetsNet', 'Assets', 'AssetsNoncurrent', 'AssetsCurrent'):
    copy_if_not_empty(bs_pivot_pd_copy, 'cpy_' + asset_tag, asset_tag)

# Step 2: some reports provide AssetsNet instead of Assets - use its content for Assets
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsNet', 'Assets')

# Step 3: Assets = AssetsCurrent + AssetsNoncurrent;
# if one of the three columns is missing, calculate it from the other two
complete_addition(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# Step 4: only Assets is filled, AssetsCurrent and AssetsNoncurrent are empty:
# assume everything is current - copy Assets to AssetsCurrent and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# Step 5: only AssetsCurrent is filled, Assets and AssetsNoncurrent are empty:
# copy AssetsCurrent to Assets and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsCurrent', 'Assets', 'AssetsNoncurrent')

In [None]:
# check for how many entries Assets, AssetsNoncurrent and AssetsCurrent couldn't be completed
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   1592
AssetsNoncurrent   1592
AssetsCurrent   1592


### Liabilities

In [None]:
# number of missing values per column before the Liabilities information is completed
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   29982
LiabilitiesNoncurrent   103819
LiabilitiesCurrent   28416


In [None]:
# Step 1: take over values from the UN dataset (cpy_ columns) where they are present
for liability_tag in ('Liabilities', 'LiabilitiesNoncurrent', 'LiabilitiesCurrent'):
    copy_if_not_empty(bs_pivot_pd_copy, 'cpy_' + liability_tag, liability_tag)

# Step 2: Liabilities = LiabilitiesCurrent + LiabilitiesNoncurrent;
# same completion call as used for the Assets columns
complete_addition(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')

# Step 3: same two fallback calls as used for the Assets columns -
# first with Liabilities as the source column, then with LiabilitiesCurrent as the source column
copy_if_not_empty(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')
copy_if_not_empty(bs_pivot_pd_copy, 'LiabilitiesCurrent', 'Liabilities', 'LiabilitiesNoncurrent')

In [None]:
# check for how many entries Liabilities, LiabilitiesNoncurrent and LiabilitiesCurrent couldn't be completed
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   1900
LiabilitiesNoncurrent   1899
LiabilitiesCurrent   1900


### Equity
In the Equity section of the balance sheet, we are interested in the StockholdersEquity and the Earnings (tag: RetainedEarningsAccumulatedDeficit).

In [None]:
# number of missing values per column before the Equity information is completed
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit'])

StockholdersEquity   11898
RetainedEarningsAccumulatedDeficit   10396


In [None]:
# new, initially empty column that collects the equity value from the different tags
bs_pivot_pd_copy['Equity_hj'] = None

# take over values from the UN dataset (cpy_ columns) where they are present
for equity_tag in ('PartnersCapital',
                   'StockholdersEquity',
                   'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
                   'RetainedEarningsAccumulatedDeficit'):
    copy_if_not_empty(bs_pivot_pd_copy, 'cpy_' + equity_tag, equity_tag)

# fill Equity_hj from the following tags, processed in this order:
#  - StockholdersEquity
#  - PartnersCapital: if there is partners' capital but no StockholdersEquity,
#    we consider it the same as stockholders' equity
#  - StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest: if it is
#    present instead of StockholdersEquity, we use it as StockholdersEquity
for equity_source in ('StockholdersEquity',
                      'PartnersCapital',
                      'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest'):
    copy_if_not_empty(bs_pivot_pd_copy, equity_source, 'Equity_hj')

# if RetainedEarningsAccumulatedDeficit has no value, we set it to zero
set_to_zero_if_null(bs_pivot_pd_copy, 'RetainedEarningsAccumulatedDeficit')

In [None]:
# check for how many entries the Equity information couldn't be completed
print_null_count(bs_pivot_pd_copy, ['Equity_hj','RetainedEarningsAccumulatedDeficit'])

Equity_hj   2336
RetainedEarningsAccumulatedDeficit   0


### Save

In [None]:
# columns of the exported balance sheet: identifying columns followed by the completed values
bs_export_columns = ["cik","ticker", "adsh","period","filed","form", "qtrs","fp",
                     'Assets','AssetsNoncurrent', 'AssetsCurrent',
                     'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent',
                     'Equity_hj','RetainedEarningsAccumulatedDeficit']

# Build the target path with pathlib instead of string concatenation, so the result does not
# depend on all_processed_folder ending with a slash, and create the folder if it is missing
# (to_csv does not create directories and would fail otherwise).
bs_output_file = Path(all_processed_folder) / "bs_not_cleaned.csv"
bs_output_file.parent.mkdir(parents=True, exist_ok=True)

bs_pivot_pd_copy[bs_export_columns].to_csv(bs_output_file, index=False)

In [None]:
# stop the Spark session that was created at the beginning of the notebook
spark.stop()