In [None]:
# default_exp uniform

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is reloaded automatically whenever it is modified
%load_ext autoreload
%autoreload 2

# uniform and complete

...

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import get_spark_session # initialze spark
from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# Input/output folders used by this notebook.
# NOTE(review): absolute, machine-specific paths - consider deriving them from one configurable base directory.
all_pivot_selected_folder  = "D:/data/parq_pivot_select"
all_pivoted_folder = "D:/data/parq_pivot_split"
# NOTE: unlike the two folders above this one ends with "/"; file names are appended to it directly.
all_processed_folder = "D:/data/parq_processed/"

# column names; they match the columns displayed for df_all_selected further down
col_list =    ["stmt","cik","ticker", "adsh","period","form","tag","value","report", "line", "fp", "uom"]
# presumably the valid `attr` values for load_data - TODO confirm
pivot_attrs = ['value', 'report', 'line']
# financial statement codes; presumably the valid `stmt` values for load_data - TODO confirm
statements =  ['IS','CF','CP','BS','CI','EQ','UN']

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 00_Tools

In [None]:
def load_data(stmt:str, attr:str):
    """Read the pivoted parquet data of one statement/attribute pair and cache it.

    The data is read from ``<all_pivoted_folder>/<stmt>/<attr>`` through the
    notebook-level ``spark`` session.

    Args:
        stmt: sub-folder name of the statement, e.g. ``"BS"``.
        attr: sub-folder name of the pivoted attribute, e.g. ``"value"``.

    Returns:
        The result of ``spark.read.parquet(...).cache()``.
    """
    parquet_path = "/".join([all_pivoted_folder, stmt, attr])
    pivoted = spark.read.parquet(parquet_path)
    return pivoted.cache()

In [None]:
def spark_shape(self):
    """Return ``(row_count, column_count)`` of a DataFrame-like object.

    The row count comes from ``self.count()`` and the column count from
    ``len(self.columns)``, mirroring the ``shape`` attribute of pandas.
    """
    n_rows = self.count()
    n_cols = len(self.columns)
    return n_rows, n_cols

In [None]:
### Get the count of null values per column in pyspark (NaN values are not counted)
from pyspark.sql.functions import isnan, when, count, col
def get_empty_count(df):
    """Count the null entries of every column of a Spark DataFrame.

    Only ``isNull()`` is checked; NaN values are not counted.

    Returns:
        The result of ``df.select(...)`` with one aggregated count per column,
        each aliased with the name of the column it belongs to.
    """
    null_counters = []
    for name in df.columns:
        # when() yields a non-null marker only for null cells, count() counts those markers
        is_missing = col(name).isNull()
        null_counters.append(count(when(is_missing, name)).alias(name))
    return df.select(null_counters)

In [None]:
def complete_addition(df, sumcol, addcol1, addcol2):
    """Complete ``sumcol = addcol1 + addcol2`` in place where exactly one term is missing.

    Rows in which two or all three values are null are left untouched.

    Args:
        df: pandas DataFrame, modified in place.
        sumcol: column holding the sum.
        addcol1: column holding the first addend.
        addcol2: column holding the second addend.

    Returns:
        None.
    """
    def present(name):
        return df[name].notnull()

    def absent(name):
        return df[name].isnull()

    # only the second addend is missing: addcol2 = sumcol - addcol1
    rows = present(sumcol) & present(addcol1) & absent(addcol2)
    df.loc[rows, addcol2] = df.loc[rows, sumcol] - df.loc[rows, addcol1]

    # only the first addend is missing: addcol1 = sumcol - addcol2
    rows = present(sumcol) & present(addcol2) & absent(addcol1)
    df.loc[rows, addcol1] = df.loc[rows, sumcol] - df.loc[rows, addcol2]

    # only the sum is missing: sumcol = addcol1 + addcol2
    rows = absent(sumcol) & present(addcol2) & present(addcol1)
    df.loc[rows, sumcol] = df.loc[rows, addcol1] + df.loc[rows, addcol2]

In [None]:
def copy_if_not_empty(df, sourcecol, targetcol, to_zero_col = None):
    """Copy ``sourcecol`` into ``targetcol`` in place where the target is null.

    A row is affected when ``sourcecol`` is not null and ``targetcol`` is null.

    Args:
        df: pandas DataFrame, modified in place.
        sourcecol: column to copy from.
        targetcol: column to copy into.
        to_zero_col: optional column label(s); set to 0.0 on exactly the rows
            that were copied. ``None`` (default) leaves every other column alone.

    Returns:
        None.
    """
    do_copy = df[sourcecol].notnull() & df[targetcol].isnull()
    df.loc[do_copy, targetcol] = df.loc[do_copy, sourcecol]
    # identity check: `!= None` is evaluated element-wise for array-like labels
    if to_zero_col is not None:
        df.loc[do_copy, to_zero_col] = 0.0

In [None]:
def copy_if_not_empty_for_ticker(df, ticker, sourcecol, targetcol, to_zero_col = None):
    """Copy ``sourcecol`` into ``targetcol`` in place, restricted to one ticker.

    A row is affected when its ``'ticker'`` value equals ``ticker``,
    ``sourcecol`` is not null and ``targetcol`` is null.

    Args:
        df: pandas DataFrame with a ``'ticker'`` column, modified in place.
        ticker: ticker value selecting the rows to process.
        sourcecol: column to copy from.
        targetcol: column to copy into.
        to_zero_col: optional column; set to 0.0 on exactly the rows that were
            copied. ``None`` (default) leaves every other column alone.

    Returns:
        None.
    """
    # the whole mask is built from `df`, so it does not depend on any notebook global
    do_copy = (df['ticker'] == ticker) & df[sourcecol].notnull() & df[targetcol].isnull()

    df.loc[do_copy, targetcol] = df.loc[do_copy, sourcecol]
    if to_zero_col is not None:
        df.loc[do_copy, to_zero_col] = 0.0

In [None]:
def set_to_zero_if_null(df, col):
    """Replace every null value of column ``col`` with 0.0, modifying ``df`` in place.

    Returns:
        None.
    """
    df.loc[df[col].isnull(), col] = 0.0

In [None]:
def print_null_count(df, cols):
    """Print one line per column in ``cols``: the column name and its number of null values."""
    for name in cols:
        n_missing = df[name].isnull().sum()
        print(name, ' ', n_missing)

## 00_Raw_data

In [None]:
# Read the parquet data in all_pivot_selected_folder and cache it; used by the trial cells at the end.
df_all_selected = spark.read.parquet(all_pivot_selected_folder).cache()

## 01_Balance_Sheet

In [None]:
# Load the pivoted "value" data of the balance sheet ("BS") and display its (rows, columns) shape.
bs_pivot_value = load_data("BS", "value")
spark_shape(bs_pivot_value)

(133872, 2342)

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame; all further processing is done with pandas.
bs_pivot_pd = bs_pivot_value.toPandas()

In [None]:
# Work on a copy: the helper functions modify their frame in place, bs_pivot_pd stays untouched.
bs_pivot_pd_copy = bs_pivot_pd.copy()

### Assets

In [None]:
# null counts of the asset columns before completion
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   579
AssetsNoncurrent   127026
AssetsCurrent   30421


In [None]:
# where Assets is missing, fall back to AssetsNet
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsNet', 'Assets')
# fill in whichever single term of Assets = AssetsCurrent + AssetsNoncurrent is missing
complete_addition(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')
# if Assets contains data and AssetsCurrent does not, copy Assets to AssetsCurrent and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')
# if AssetsCurrent contains data and Assets does not, copy it to Assets and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsCurrent', 'Assets', 'AssetsNoncurrent')

In [None]:
# null counts of the asset columns after completion
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   419
AssetsNoncurrent   419
AssetsCurrent   419


In [None]:
# sanity check: the asset completion works in place, so the shape should be unchanged
bs_pivot_pd_copy.shape

(133872, 2342)

### Liabilities

In [None]:
# null counts of the liability columns before completion
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   34118
LiabilitiesNoncurrent   117766
LiabilitiesCurrent   30837


In [None]:
# fill in whichever single term of Liabilities = LiabilitiesCurrent + LiabilitiesNoncurrent is missing
complete_addition(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')
# if Liabilities contains data and LiabilitiesCurrent is empty, copy it to LiabilitiesCurrent and set LiabilitiesNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')
# if only LiabilitiesCurrent contains data, copy it to Liabilities and set LiabilitiesNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'LiabilitiesCurrent', 'Liabilities', 'LiabilitiesNoncurrent')

In [None]:
# null counts of the liability columns after completion
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   783
LiabilitiesNoncurrent   781
LiabilitiesCurrent   783


In [None]:
# sanity check: the liability completion works in place, so the shape should be unchanged
bs_pivot_pd_copy.shape

(133872, 2342)

### Equity

In [None]:
# null counts of the equity columns before completion
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit','LiabilitiesAndStockholdersEquity'])

StockholdersEquity   12883
RetainedEarningsAccumulatedDeficit   10583
LiabilitiesAndStockholdersEquity   710


In [None]:
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'LiabilitiesAndStockholdersEquity') # both totals have to be the same
copy_if_not_empty(bs_pivot_pd_copy, 'PartnersCapital', 'StockholdersEquity') # if there is partners' capital, we consider it the same as stockholders' equity
copy_if_not_empty(bs_pivot_pd_copy, 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest', 'StockholdersEquity') # ditto
# missing retained earnings are treated as 0.0
set_to_zero_if_null(bs_pivot_pd_copy, 'RetainedEarningsAccumulatedDeficit')

In [None]:
# null counts of the equity columns after completion
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit','LiabilitiesAndStockholdersEquity'])

StockholdersEquity   1277
RetainedEarningsAccumulatedDeficit   0
LiabilitiesAndStockholdersEquity   409


In [None]:
# sanity check: the equity completion works in place, so the shape should be unchanged
bs_pivot_pd_copy.shape

(133872, 2342)

### Clean empty companies

In [None]:
# Keep only the identifying columns and the balance sheet positions completed above.
# LiabilitiesAndStockholdersEquity is not part of the selection.
bs_cols_selected = bs_pivot_pd_copy[["cik","ticker", "adsh","period","form", 
                                     'Assets','AssetsNoncurrent', 'AssetsCurrent', 
                                     'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent', 
                                     'StockholdersEquity','RetainedEarningsAccumulatedDeficit']]

In [None]:
# ciks that have at least one row with a null in any of the selected columns
incomplete_ciks = bs_cols_selected[bs_cols_selected.isnull().sum(axis=1) > 0].cik.unique()

In [None]:
# Drop every row of a cik that has at least one incomplete row. The mask is built from
# bs_cols_selected itself, so the frame and its mask are guaranteed to share one index.
bs_cols_cleaned = bs_cols_selected[~bs_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
# total number of nulls left in the cleaned data; expected to be 0
bs_cols_cleaned.isnull().sum(axis=1).sum()

0

### Save

In [None]:
# Write the cleaned balance sheet data as CSV without the index column.
# all_processed_folder already ends with "/", so the file name is appended directly.
bs_cols_cleaned.to_csv(all_processed_folder + "bs.csv", index=False)

# xx_trials

In [None]:
# Trial: for ticker 'FXE', count the rows that have AssetsCurrent but no Assets.
# Uses bs_pivot_pd_copy; the previous spelling "bs_piivot_pd_copy" is not defined in this notebook.
selected = bs_pivot_pd_copy['ticker'] == 'FXE'
sel_df = bs_pivot_pd_copy.loc[selected]
do_copy = (sel_df['AssetsCurrent'].notnull()) & (sel_df['Assets'].isnull())
do_copy.sum()

34

In [None]:
# Trial: rows without StockholdersEquity, shown with related equity columns and sorted by period.
# NOTE(review): bs_pivot_liabilities_copy is not defined in the visible notebook - confirm where it comes from.
bs_pivot_liabilities_copy[(bs_pivot_liabilities_copy.StockholdersEquity.isnull())  ][["cik","ticker", "adsh","period", "StockholdersEquityBeforeTreasuryStock", "LiabilitiesAndStockholdersEquity", "RetainedEarningsAccumulatedDeficit"]].sort_values(by=['period'])

Unnamed: 0,cik,ticker,adsh,period,StockholdersEquityBeforeTreasuryStock,LiabilitiesAndStockholdersEquity,RetainedEarningsAccumulatedDeficit
119910,9892,BCR,0001193125-09-155929,2009-06-30,,2.762700e+09,1.097400e+09
19396,1357615,KBR,0001140361-09-017252,2009-06-30,,5.647000e+09,7.320000e+08
5106,783325,WEC,0000107815-09-000047,2009-06-30,,1.229830e+10,0.000000e+00
104965,1037676,ACI,0000950123-09-031842,2009-06-30,,4.028997e+09,4.684620e+08
92620,72903,XEL,0001104659-09-046313,2009-06-30,,2.498725e+10,0.000000e+00
...,...,...,...,...,...,...,...
26879,1089819,CNL,0001089819-20-000013,2020-06-30,,7.729180e+09,0.000000e+00
8114,783325,WEC,0000107815-20-000246,2020-06-30,,3.504200e+10,6.222800e+09
40671,799233,HTLD,0000799233-20-000044,2020-06-30,,9.304990e+08,8.558300e+08
91662,1549922,SMLP,0001564590-20-038480,2020-06-30,,2.586502e+09,0.000000e+00


In [None]:
# Trial: rows without StockholdersEquity, shown next to Assets and LiabilitiesAndStockholdersEquity, sorted by period.
# NOTE(review): bs_pivot_liabilities_copy is not defined in the visible notebook - confirm where it comes from.
bs_pivot_liabilities_copy[(bs_pivot_liabilities_copy.StockholdersEquity.isnull())][["cik","ticker", "adsh","period", "Assets", "StockholdersEquity", "LiabilitiesAndStockholdersEquity", "RetainedEarningsAccumulatedDeficit"]].sort_values(by=['period'])

Unnamed: 0,cik,ticker,adsh,period,Assets,StockholdersEquity,LiabilitiesAndStockholdersEquity,RetainedEarningsAccumulatedDeficit
14125,97476,TXN,0001140361-09-017223,2009-06-30,1.151000e+10,,1.151000e+10,2.116300e+10
5967,890801,MFE,0000950123-09-031492,2009-06-30,3.710919e+09,,3.710919e+09,6.208520e+08
78888,1170650,MHS,0000950123-09-027011,2009-06-30,1.777026e+10,,1.777030e+10,4.532500e+09
71915,1140859,ABC,0000950123-09-030790,2009-06-30,1.273575e+10,,1.273575e+10,2.806404e+09
128768,1004440,CEG,0001047469-09-007384,2009-06-30,1.999840e+10,,1.999840e+10,2.002700e+09
...,...,...,...,...,...,...,...,...
13387,29644,DCI,0000029644-20-000046,2020-07-31,2.244600e+09,,2.244600e+09,1.430000e+09
69179,65172,MSB,0001558370-20-011056,2020-07-31,1.293815e+07,,1.293815e+07,0.000000e+00
13338,1041859,PLCE,0001628280-20-013276,2020-07-31,1.167939e+09,,1.167939e+09,-6.386200e+07
64756,900075,CPRT,0000900075-20-000021,2020-07-31,3.455261e+09,,3.455261e+09,1.937853e+09


In [None]:
# Trial: look up a single filing by adsh.
# NOTE(review): the recorded output of this cell is a KeyError because 'report' and 'line' are not columns
# of this frame; bs_pivot_liabilities_copy is also not defined in the visible notebook.
bs_pivot_liabilities_copy[(bs_pivot_liabilities_copy.adsh == '0001104659-09-029605')][["cik","ticker", "adsh","period","report", "line"]].sort_values(by=['period'])

KeyError: "['line', 'report'] not in index"

In [None]:
# null count for every column of the pivoted BS values (still a Spark DataFrame)
empty_count = get_empty_count(bs_pivot_value)

In [None]:
# convert the null counts into a pandas DataFrame
empty_pd = empty_count.toPandas()

In [None]:
# reshape from wide to long: one row per original column name ('Tag') with its null count ('Count')
melt_pd = empty_pd.melt(var_name = 'Tag', value_name = "Count")

In [None]:
# check the column names of the melted frame
melt_pd.columns

Index(['Tag', 'Count'], dtype='object')

In [None]:
# Trial: inspect the BS records of a single filing (adsh), ordered by report and line.
pd_frame = df_all_selected.where("adsh == '0001104659-20-108360' and stmt = 'BS'").toPandas()
pd_frame.sort_values(['report', 'line'])

Unnamed: 0,stmt,cik,ticker,adsh,period,form,tag,value,report,line,fp,uom
5,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,CashAndCashEquivalentsAtCarryingValue,107700000.0,2,3,Q1,USD
10,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,RestrictedCashAndCashEquivalentsAtCarryingValue,6400000.0,2,4,Q1,USD
38,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,AccountsReceivableNetCurrent,166700000.0,2,5,Q1,USD
32,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,ContractWithCustomerAssetNetCurrent,45200000.0,2,6,Q1,USD
17,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,InventoryNet,597700000.0,2,7,Q1,USD
1,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,AssetsOfDisposalGroupIncludingDiscontinuedOper...,22000000.0,2,9,Q1,USD
12,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,OtherAssetsCurrent,61600000.0,2,10,Q1,USD
31,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,AssetsCurrent,1074200000.0,2,11,Q1,USD
26,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,PropertyPlantAndEquipmentNet,127000000.0,2,12,Q1,USD
9,BS,1750,AIR,0001104659-20-108360,2020-08-31,10-Q,IntangibleAssetsNetIncludingGoodwill,123100000.0,2,14,Q1,USD


In [None]:
# Trial: show the rows of one filing (adsh).
# NOTE(review): bs_pivot_report is not defined in the visible notebook - presumably load_data("BS", "report"); confirm.
bs_pivot_report.where("adsh == '0001492298-20-000025'").show()

In [None]:
# number of distinct (adsh, stmt) combinations in the selected data
df_all_selected.select('adsh','stmt').distinct().count()

585411

In [None]:
# number of distinct (adsh, stmt) combinations with stmt "BS"; the recorded result equals the row count of bs_pivot_value
df_all_selected.select('adsh','stmt').distinct().where('stmt = "BS"').count()

133872

In [None]:
# number of distinct (adsh, stmt) combinations with stmt "EQ"
df_all_selected.select('adsh','stmt').distinct().where('stmt = "EQ"').count()

86120

In [None]:
# Trial: list all column names that start with 'StockholdersEquity'.
# NOTE(review): bs_pivot_liabilities_copy is not defined in the visible notebook - confirm where it comes from.
[x for x in bs_pivot_liabilities_copy.columns.values if x.startswith('StockholdersEquity')]

['StockholdersEquity',
 'StockholdersEquityAttributableToParentNotAllowableForNetCapital',
 'StockholdersEquityBeforeTreasuryStock',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterestAdjustedBalance1',
 'StockholdersEquityNoteStockSplitConversionRatio',
 'StockholdersEquityNoteStockSplitConversionRatio1',
 'StockholdersEquityNoteSubscriptionsReceivable']