In [None]:
# default_exp uniform

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure the core library is reloaded whenever it is modified
%load_ext autoreload
%autoreload 2

# todos




# uniform and complete

Once the data is pivoted, we want to create a dataset that contains only columns which appear in every report.

For instance, we want the columns 'Assets', 'AssetsCurrent' and 'AssetsNoncurrent' to be present in every balance sheet. However, not all desired tags are always present, so we have to find ways to calculate and complete them. Often only two of 'Assets', 'AssetsCurrent' and 'AssetsNoncurrent' are present; since Assets = AssetsCurrent + AssetsNoncurrent, we can calculate the third.

Sometimes different tags are used to express the same meaning, so we have to figure out which tags belong together.

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import get_spark_session # initialze spark
from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# Input/output locations.
# NOTE(review): these are absolute, machine-specific paths - adjust them before running elsewhere.
# parquet data read as the complete unpivoted dataset (see 00_Raw_data)
all_pivot_selected_folder  = "D:/data/parq_pivot_select"
# root of the pivoted parquet data; load_data() appends "/<stmt>/<attr>" to it
all_pivoted_folder = "D:/data/parq_pivot_split"
# output folder for the csv files; file names are appended directly, hence the trailing slash
all_processed_folder = "D:/data/parq_processed/"

# col_list, pivot_attrs and statements are not referenced by the code visible in this part of the notebook
col_list =    ["stmt","cik","ticker", "adsh","period","form","tag","value","report", "line", "fp", "uom"]
# key columns of a pivoted row; used as the join key when merging the "UN" data
pivot_group = ["cik","ticker","adsh","form","period","fp", "qtrs"]
pivot_attrs = ['value', 'report', 'line']
statements =  ['IS','CF','CP','BS','CI','EQ','UN']

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 00_Tools

These are different helper methods which make transformation of the data easier

In [None]:
def load_data(stmt:str, attr:str):
    """ Reads the pivoted parquet data of one statement/attribute pair.

    The data is read from "<all_pivoted_folder>/<stmt>/<attr>" using the
    notebook-level spark session and the resulting dataframe is cached.
    """
    parquet_path = "/".join((all_pivoted_folder, stmt, attr))
    pivoted = spark.read.parquet(parquet_path)
    return pivoted.cache()

In [None]:
def spark_shape(self):
    """ Returns (number of rows, number of columns) of the given dataframe,
        analogous to the `shape` attribute of a pandas dataframe.
    """
    n_rows = self.count()
    n_cols = len(self.columns)
    return n_rows, n_cols

In [None]:
### Count the null values per column of a pyspark dataframe
# NOTE: only isNull() is checked; NaN values are not counted (isnan is imported but not used here)
from pyspark.sql.functions import isnan, when, count, col
def get_empty_count(df):
    """ Returns a spark dataframe which contains, for every column of df,
        the number of rows in which that column is null.
        The result columns carry the same names as the columns of df.
    """
    return df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])

In [None]:
def complete_addition(df, sumcol, addcol1, addcol2):
    """
    Completes rows for which the relation sumcol = addcol1 + addcol2 holds
    and exactly one of the three values is missing. The missing value is
    derived from the other two. The dataframe is modified in place and
    nothing is returned.
    """
    def _fill(target, known_a, known_b, derive):
        # rows where only the target is missing while both other values are known
        rows = df[target].isnull() & df[known_a].notnull() & df[known_b].notnull()
        df.loc[rows, target] = derive(df.loc[rows, known_a], df.loc[rows, known_b])

    # second addend = sum - first addend
    _fill(addcol2, sumcol, addcol1, lambda total, part: total - part)
    # first addend = sum - second addend
    _fill(addcol1, sumcol, addcol2, lambda total, part: total - part)
    # sum = first addend + second addend
    _fill(sumcol, addcol1, addcol2, lambda first, second: first + second)

In [None]:
def copy_if_not_empty(df, sourcecol, targetcol, to_zero_col = None):
    """
    Copies the value from the sourcecol to the targetcol in all rows where the
    sourcecol is not empty and the targetcol is empty. The dataframe is
    modified in place and nothing is returned.

    df:          pandas dataframe to modify
    sourcecol:   name of the column to copy from
    targetcol:   name of the column to fill
    to_zero_col: optional name of a column that is set to 0.0 in exactly
                 those rows where a value was copied
    """
    # the mask is evaluated before the copy, so it still identifies the
    # affected rows when the to_zero_col is written afterwards
    do_copy = (df[sourcecol].notnull()) & (df[targetcol].isnull())
    df.loc[do_copy, targetcol] = df.loc[do_copy, sourcecol]
    if to_zero_col is not None:
        df.loc[do_copy, to_zero_col] = 0.0

In [None]:
def sum_into_empty_target(df, add1col, add2col, targetcol):
    """
    Fills the targetcol with add1col + add2col in every row where both
    addends are present and the targetcol is still empty.
    The dataframe is modified in place.
    """
    target_missing = df[targetcol].isnull()
    both_present = df[add1col].notnull() & df[add2col].notnull()
    rows = both_present & target_missing
    df.loc[rows, targetcol] = df.loc[rows, add1col] + df.loc[rows, add2col]

In [None]:
def sum_cols_into_new_target(df, targetcol, sumcolslist):
    """ Creates (or overwrites) the targetcol with the row-wise sum of the
        columns listed in sumcolslist.
        Note that missing values in the listed columns are replaced with 0.0
        in the dataframe itself before they are added.
    """
    df[targetcol] = 0.0
    for source_name in sumcolslist:
        # null values must not propagate into the sum, so they are zeroed first
        set_to_zero_if_null(df, source_name)
        df[targetcol] = df[targetcol] + df[source_name]

In [None]:
def copy_if_not_empty_for_ticker(df, ticker, sourcecol, targetcol, to_zero_col = None):
    """ Copies the not empty sourcecol to the empty targetcol, restricted to
        the rows of a certain ticker. The dataframe is modified in place and
        nothing is returned.

        df:          pandas dataframe to modify; must contain a 'ticker' column
        ticker:      only rows whose 'ticker' equals this value are touched
        sourcecol:   name of the column to copy from
        targetcol:   name of the column to fill
        to_zero_col: optional name of a column that is set to 0.0 for the
                     affected rows
    """
    # all three conditions have to be evaluated on the dataframe that was passed in
    do_copy = (df['ticker'] == ticker) & (df[sourcecol].notnull()) & (df[targetcol].isnull())

    df.loc[do_copy, targetcol] = df.loc[do_copy, sourcecol]
    if to_zero_col is not None:
        df.loc[do_copy, to_zero_col] = 0.0

In [None]:
def set_to_zero_if_null(df, col):
    """ Replaces every null value of the given column with 0.0.
        The dataframe is modified in place.
    """
    df.loc[df[col].isnull(), col] = 0.0

In [None]:
def print_null_count(df, cols):
    """ Prints one line per provided column, showing the column name
        followed by the number of null values it contains.
    """
    null_counts = ((name, df[name].isnull().sum()) for name in cols)
    for name, n_missing in null_counts:
        print(name, ' ', n_missing)

## 00_Raw_data

In [None]:
# loading the complete unpivoted dataset - in case it is needed for debugging
df_all_selected = spark.read.parquet(all_pivot_selected_folder).cache()

In [None]:
# Sometimes the data could not be associated with the right statement (BS, IS, CF, ...). In these cases, the data can appear under "UN".
# So if expected information cannot be found in the appropriate statement, we have to look in the UN statement.
un_pivot_value = load_data("UN", "value")
un_pivot_pd = un_pivot_value.toPandas()

In [None]:
un_pivot_pd.shape

(6869, 1959)

In [None]:
def prepare_un_values(df_to_merge_into, attr_list):
    """ Left-joins selected columns of the "UN" dataset onto the provided dataframe.

        The columns named in attr_list are taken from un_pivot_pd and renamed
        with the prefix "cpy_", so that they do not clash with columns of the
        same name in df_to_merge_into. The join key is pivot_group.
        Returns the merged dataframe; the inputs are not modified.
    """
    wanted_cols = list(pivot_group) + list(attr_list)

    def _prefixed(name):
        # key columns keep their name, everything else is marked as a copy
        return name if name in pivot_group else ("cpy_" + name)

    un_prepared = un_pivot_pd[wanted_cols].rename(columns=_prefixed)

    return df_to_merge_into.merge(un_prepared, how='left', on=pivot_group)

## 01_Balance_Sheet

In [None]:
bs_pivot_value = load_data("BS", "value")
spark_shape(bs_pivot_value)

(135358, 2343)

In [None]:
bs_pivot_pd = bs_pivot_value.toPandas()

In [None]:
bs_pivot_pd_copy = bs_pivot_pd.copy()

### Assets

In [None]:
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   2065
AssetsNoncurrent   128512
AssetsCurrent   31907


In [None]:
# Sometimes AssetsNet is present instead of Assets; copy its content to Assets
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsNet', 'Assets')

# if one of the three provided columns is missing, calculate its content based on Assets = AssetsCurrent + AssetsNoncurrent
complete_addition(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# if Assets contains data but AssetsCurrent and AssetsNoncurrent are empty, assume that only AssetsCurrent is present:
# copy the value from Assets to AssetsCurrent and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# if AssetsCurrent contains data and Assets and AssetsNoncurrent are empty, assume that only AssetsCurrent is present:
# copy the value from AssetsCurrent to Assets and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsCurrent', 'Assets', 'AssetsNoncurrent')

In [None]:
# check for how many entries Assets, AssetsNoncurrent and AssetsCurrent couldn't be completed
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   1905
AssetsNoncurrent   1905
AssetsCurrent   1905


### Liabilities

In [None]:
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   35604
LiabilitiesNoncurrent   119252
LiabilitiesCurrent   32323


In [None]:
# Completing the Liabilities columns follows the same logic as for the Assets columns

complete_addition(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')

copy_if_not_empty(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')
copy_if_not_empty(bs_pivot_pd_copy, 'LiabilitiesCurrent', 'Liabilities', 'LiabilitiesNoncurrent')

In [None]:
# check for how many entries we were not able to complete the Liabilities information
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   2269
LiabilitiesNoncurrent   2267
LiabilitiesCurrent   2269


### Equity
In the Equity section of the balance sheet, we are interested in the StockholdersEquity and the retained earnings (tag: RetainedEarningsAccumulatedDeficit).

In [None]:
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit'])

StockholdersEquity   14369
RetainedEarningsAccumulatedDeficit   12069


In [None]:
# by definition, LiabilitiesAndStockholdersEquity has to match Assets in a balance sheet
# so if LiabilitiesAndStockholdersEquity is not set, we copy the value from the Assets column
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'LiabilitiesAndStockholdersEquity') # has to be the same

# if there is PartnersCapital but no StockholdersEquity, we consider it the same as StockholdersEquity
copy_if_not_empty(bs_pivot_pd_copy, 'PartnersCapital', 'StockholdersEquity') 

# if there is StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest instead of StockholdersEquity, we use this as StockholdersEquity
copy_if_not_empty(bs_pivot_pd_copy, 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest', 'StockholdersEquity') 

# if RetainedEarningsAccumulatedDeficit has no value, we set it to zero
set_to_zero_if_null(bs_pivot_pd_copy, 'RetainedEarningsAccumulatedDeficit')

In [None]:
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit'])

StockholdersEquity   2763
RetainedEarningsAccumulatedDeficit   0


### Save

In [None]:
bs_pivot_pd_copy[["cik","ticker", "adsh","period","form", "qtrs","fp",
                 'Assets','AssetsNoncurrent', 'AssetsCurrent', 
                 'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent', 
                 'StockholdersEquity','RetainedEarningsAccumulatedDeficit']] \
        .to_csv(all_processed_folder + "bs_not_cleaned.csv", index=False)

### Clean empty companies

In [None]:
# Select the key columns plus the completed balance sheet columns.
# Every column name has to be followed by a comma: adjacent string literals
# are silently concatenated by Python ("fp" 'Assets' would become 'fpAssets').
bs_cols_selected = bs_pivot_pd_copy[["cik","ticker", "adsh","period","form", "qtrs","fp",
                                     'Assets','AssetsNoncurrent', 'AssetsCurrent', 
                                     'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent', 
                                     'StockholdersEquity','RetainedEarningsAccumulatedDeficit']]

In [None]:
incomplete_ciks = bs_cols_selected[bs_cols_selected.isnull().sum(axis=1) > 0].cik.unique()

In [None]:
# Drop all reports of companies that have at least one incomplete report.
# The mask is built from the frame that is filtered, so it does not rely on
# index alignment with another dataframe.
bs_cols_cleaned = bs_cols_selected[~bs_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
bs_cols_cleaned.shape

(124520, 13)

In [None]:
bs_cols_cleaned.isnull().sum(axis=1).sum()

0

In [None]:
bs_cols_cleaned.to_csv(all_processed_folder + "bs.csv", index=False)

## 02_CashFlow

Operation
- NetIncomeLoss
- ProfitLoss
- NetCashProvidedByUsedInOperatingActivities: NetIncome plus other positions yields this position

Investing
- NetCashProvidedByUsedInInvestingActivities

Financing activities
- PaymentsForRepurchaseOfCommonStock: stock buybacks
- PaymentsOfDividends
- NetCashProvidedByUsedInFinancingActivities

Change in cash balance
- CashAndCashEquivalentsPeriodIncreaseDecrease: increase/decrease in cash

In [None]:
cf_pivot_value = load_data("CF", "value")
spark_shape(cf_pivot_value)

(351336, 3053)

In [None]:
cf_empty_count = get_empty_count(cf_pivot_value)

In [None]:
cf_pivot_pd = cf_pivot_value.toPandas()

In [None]:
cf_pivot_pd_copy = cf_pivot_pd.copy()

In [None]:
cf_pivot_pd.shape

### Cash Increase/Decrease
- 'CashAndCashEquivalentsPeriodIncreaseDecrease',
- 'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
- 'CashPeriodIncreaseDecrease',
- 'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'NetCashProvidedByUsedInContinuingOperations'

In [None]:
print_null_count(cf_pivot_pd_copy, ['CashAndCashEquivalentsPeriodIncreaseDecrease'])

CashAndCashEquivalentsPeriodIncreaseDecrease   30244


In [None]:
# merge relevant columns from the UN dataset
cf_pivot_pd_copy = prepare_un_values(cf_pivot_pd_copy, [
    'CashAndCashEquivalentsPeriodIncreaseDecrease',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect'
])
cf_pivot_pd_copy.shape

In [None]:
# if CashAndCashEquivalentsPeriodIncreaseDecrease is not present and CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
# is present, we can replace CashAndCashEquivalentsPeriodIncreaseDecrease.
# there are only about 12 entries where both are present
copy_if_not_empty(cf_pivot_pd_copy, 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashPeriodIncreaseDecrease', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInContinuingOperations', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or


# try to find data in joined un data
copy_if_not_empty(cf_pivot_pd_copy, 'cpy_CashAndCashEquivalentsPeriodIncreaseDecrease', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'cpy_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'cpy_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or

In [None]:
print_null_count(cf_pivot_pd_copy, ['CashAndCashEquivalentsPeriodIncreaseDecrease'])

CashAndCashEquivalentsPeriodIncreaseDecrease   1563


### Operation
- NetIncomeLoss
- NetCashProvidedByUsedInOperatingActivities: NetIncome plus other positions yields this position

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetIncomeLoss', 'ProfitLoss', 'NetCashProvidedByUsedInOperatingActivities'])

NetIncomeLoss   58102
ProfitLoss   84716
NetCashProvidedByUsedInOperatingActivities   32125


In [None]:
# if only ProfitLoss is set, copy content to NetIncomeLoss
# if onlyNetIncomeLoss is set, copy to ProfitLoss
copy_if_not_empty(cf_pivot_pd_copy, 'ProfitLoss', 'NetIncomeLoss')
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLoss', 'ProfitLoss')
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'ProfitLoss') # certain CFs just have this position
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'NetIncomeLoss')

copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivitiesContinuingOperations', 'NetCashProvidedByUsedInOperatingActivities')

copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivities', 'ProfitLoss') # certain CFs just have this position
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetIncomeLoss', 'ProfitLoss', 'NetCashProvidedByUsedInOperatingActivities'])

NetIncomeLoss   230
ProfitLoss   230
NetCashProvidedByUsedInOperatingActivities   501


### Investing
- NetCashProvidedByUsedInInvestingActivities

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInInvestingActivities'])

NetCashProvidedByUsedInInvestingActivities   33018
NetCashProvidedByUsedInInvestingActivitiesContinuingOperations   98645
CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations   128368


In [None]:
sum_into_empty_target(cf_pivot_pd_copy, 
                      'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
                      'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations',
                      'NetCashProvidedByUsedInInvestingActivities')

copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations', 'NetCashProvidedByUsedInInvestingActivities')
copy_if_not_empty(cf_pivot_pd_copy, 'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations', 'NetCashProvidedByUsedInInvestingActivities')

set_to_zero_if_null(cf_pivot_pd_copy, 'NetCashProvidedByUsedInInvestingActivities')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInInvestingActivities'])

NetCashProvidedByUsedInInvestingActivities   0
NetCashProvidedByUsedInInvestingActivitiesContinuingOperations   98645
CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations   128368


### Financing activities
- PaymentsForRepurchaseOfCommonStock: stock buybacks
- PaymentsOfDividends
- NetCashProvidedByUsedInFinancingActivities

('CashProvidedByUsedInDiscontinuedOperationsFinancingActivities',
 'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations',
 'NetCashProvidedByUsedInFinancingActivities',
 'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations')

#### NetCashProvidedByUsedInFinancingActivities

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInFinancingActivities'])

NetCashProvidedByUsedInFinancingActivities   33202
NetCashProvidedByUsedInFinancingActivitiesContinuingOperations   100132
CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations   130954


In [None]:
sum_into_empty_target(cf_pivot_pd_copy, 
                      'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations',
                      'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations',
                      'NetCashProvidedByUsedInFinancingActivities')

copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations', 'NetCashProvidedByUsedInFinancingActivities')
copy_if_not_empty(cf_pivot_pd_copy, 'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations', 'NetCashProvidedByUsedInFinancingActivities')

set_to_zero_if_null(cf_pivot_pd_copy, 'NetCashProvidedByUsedInFinancingActivities')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInFinancingActivities'])

NetCashProvidedByUsedInFinancingActivities   0


#### PaymentsOfDividends
Simply set to 0.0 if no data is present
 'PaymentsOfDividends',
 'PaymentsOfDividendsCommonStock',
 'PaymentsOfDividendsMinorityInterest',
 'PaymentsOfDividendsPreferredStockAndPreferenceStock',
 'PaymentsOfOrdinaryDividends',

In [None]:
sum_cols_into_new_target(cf_pivot_pd_copy, 'PaymentsOfDividendsTotal', 
                                 ['PaymentsOfDividends',
                                 'PaymentsOfDividendsCommonStock',
                                 'PaymentsOfDividendsMinorityInterest',
                                 'PaymentsOfDividendsPreferredStockAndPreferenceStock',
                                 'PaymentsOfOrdinaryDividends'])

#### PaymentsForRepurchaseOfCommonStock: Stock buybacks
 'PaymentsForRepurchaseOfCommonStock',
 'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
 'PaymentsForRepurchaseOfConvertiblePreferredStock',
 'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
 'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
 'PaymentsForRepurchaseOfRedeemablePreferredStock'

In [None]:
sum_cols_into_new_target(cf_pivot_pd_copy, 'PaymentsForRepurchaseOfStockTotal', 
                                 ['PaymentsForRepurchaseOfCommonStock',
                                     'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
                                     'PaymentsForRepurchaseOfConvertiblePreferredStock',
                                     'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
                                     'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
                                     'PaymentsForRepurchaseOfRedeemablePreferredStock'])

### Save

In [None]:
# Write the key columns plus the completed cash flow columns to csv.
# Every column name has to be followed by a comma: adjacent string literals
# are silently concatenated by Python, which would yield a non-existent column name.
cf_pivot_pd_copy[["cik","ticker", "adsh","period","form", 
                                     'CashAndCashEquivalentsPeriodIncreaseDecrease',
                                     'NetIncomeLoss', 'ProfitLoss', 
                                     'NetCashProvidedByUsedInOperatingActivities', 
                                     'NetCashProvidedByUsedInInvestingActivities',
                                     'NetCashProvidedByUsedInFinancingActivities',
                                     'PaymentsOfDividendsTotal',
                                     'PaymentsForRepurchaseOfStockTotal']] \
                  .to_csv(all_processed_folder + "cf_not_cleaned.csv", index=False)

### Clean empty companies

In [None]:
# Select the key columns plus the completed cash flow columns.
# The list items have to be separated by commas (a '.' is a syntax error and
# adjacent string literals would be concatenated into one column name).
cf_cols_selected = cf_pivot_pd_copy[["cik","ticker", "adsh","period","form", 
                                     'CashAndCashEquivalentsPeriodIncreaseDecrease',
                                     'NetIncomeLoss', 'ProfitLoss', 
                                     'NetCashProvidedByUsedInOperatingActivities', 
                                     'NetCashProvidedByUsedInInvestingActivities',
                                     'NetCashProvidedByUsedInFinancingActivities',
                                     'PaymentsOfDividendsTotal',
                                     'PaymentsForRepurchaseOfStockTotal']]

In [None]:
incomplete_ciks = cf_cols_selected[cf_cols_selected.isnull().sum(axis=1) > 0].cik.unique()

In [None]:
len(incomplete_ciks)

514

In [None]:
# Drop all reports of companies that have at least one incomplete report.
# The mask is built from the frame that is filtered, so it does not rely on
# index alignment with another dataframe.
cf_cols_cleaned = cf_cols_selected[~cf_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
cf_cols_cleaned.shape

(117038, 11)

In [None]:
cf_cols_cleaned.isnull().sum(axis=1).sum()

0

In [None]:
cf_cols_cleaned.to_csv(all_processed_folder + "cf.csv", index=False)

## 03_IncomeStatement

Gross Margin
- Net Sales
- Cost of Sales
- Gross Margin -> NetSales - CostOfSales

Operating Expenses
- R&D
- Selling, general and admin
- Total op expenses = R&D + Selling, general and admin

- Operating Income = Gross Margin - Total op expenses      -> OperatingIncomeLoss
- other income
- Income before provision for income taxes = operating income + other income

- Provision for income taxes
- Net income = Income before taxes -taxes                   -> NetIncomeLoss -> also available in CF(!)

Earning per share
- Basic
- Diluted

Shares used in computing earnings per share:
- basic
- diluted


In [None]:
is_pivot_value = load_data("IS", "value")
spark_shape(is_pivot_value)

(133579, 2004)

In [None]:
# is_empty_count = get_empty_count(is_pivot_value)

In [None]:
is_pivot_pd = is_pivot_value.toPandas()

In [None]:
# create a column that contains the number of not-null values of the row, reduced by the number of key columns
# NOTE(review): this presumably assumes that all pivot_group columns are never null - confirm
is_pivot_pd['value_count'] = is_pivot_pd.notnull().sum(axis=1)-len(pivot_group)

In [None]:
is_pivot_pd.shape

(133579, 2005)

In [None]:
is_pivot_pd_copy = is_pivot_pd.copy()

In [None]:
# if there are less than 5 columns with values it is likely that this is not a complete statement
# often this indicates, that the real information is inside the ComprehensiveIncome Statement and not in an IncomeStatement
is_pivot_pd_copy = is_pivot_pd_copy[is_pivot_pd_copy['value_count'] > 4] 
is_pivot_pd_copy.shape

(133134, 2005)

In [None]:
# merge relevant columns from the UN dataset
is_pivot_pd_copy = prepare_un_values(is_pivot_pd_copy, [
    'NetIncomeLoss',
    'ProfitLoss', 
    'NetIncomeLossAvailableToCommonStockholdersBasic',
    'NetIncomeLossAllocatedToLimitedPartners',
    'OperatingIncomeLoss',
    
    'Revenues',
    'SalesRevenueNet', 
    'RevenueFromContractWithCustomerExcludingAssessedTax',
    'RevenueFromContractWithCustomerIncludingAssessedTax'
])
is_pivot_pd_copy.shape

(133134, 2014)

#### NetIncome

In [None]:
print_null_count(is_pivot_pd_copy, [ 'NetIncomeLoss', 'NetIncomeLossAvailableToCommonStockholdersBasic', 'ProfitLoss'])

NetIncomeLoss   17468
NetIncomeLossAvailableToCommonStockholdersBasic   106041
ProfitLoss   86569


In [None]:
copy_if_not_empty(is_pivot_pd_copy, 'cpy_NetIncomeLoss', 'NetIncomeLoss')

copy_if_not_empty(is_pivot_pd_copy, 'ProfitLoss', 'NetIncomeLoss')
copy_if_not_empty(is_pivot_pd_copy, 'cpy_ProfitLoss', 'NetIncomeLoss')

copy_if_not_empty(is_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'NetIncomeLoss')
copy_if_not_empty(is_pivot_pd_copy, 'cpy_NetIncomeLossAvailableToCommonStockholdersBasic', 'NetIncomeLoss')

copy_if_not_empty(is_pivot_pd_copy, 'NetIncomeLossAllocatedToLimitedPartners', 'NetIncomeLoss')
copy_if_not_empty(is_pivot_pd_copy, 'cpy_NetIncomeLossAllocatedToLimitedPartners', 'NetIncomeLoss')

In [None]:
print_null_count(is_pivot_pd_copy, [ 'NetIncomeLoss'])

NetIncomeLoss   688


#### NetSales

In [None]:
print_null_count(is_pivot_pd_copy, [ 
    'Revenues',
    'SalesRevenueNet',
    'RevenueFromContractWithCustomerExcludingAssessedTax', # Sales
    'RevenueFromContractWithCustomerIncludingAssessedTax', # Sales
])

Revenues   83094
SalesRevenueNet   97291
RevenueFromContractWithCustomerExcludingAssessedTax   123053
RevenueFromContractWithCustomerIncludingAssessedTax   129931


In [None]:
copy_if_not_empty(is_pivot_pd_copy, 'cpy_Revenues', 'Revenues')
copy_if_not_empty(is_pivot_pd_copy, 'cpy_SalesRevenueNet', 'SalesRevenueNet')
copy_if_not_empty(is_pivot_pd_copy, 'cpy_RevenueFromContractWithCustomerExcludingAssessedTax', 'RevenueFromContractWithCustomerExcludingAssessedTax')
copy_if_not_empty(is_pivot_pd_copy, 'cpy_RevenueFromContractWithCustomerIncludingAssessedTax', 'RevenueFromContractWithCustomerIncludingAssessedTax')
#copy_if_not_empty(is_pivot_pd_copy, 'cpy_RevenuesExcludingInterestAndDividends', 'RevenuesExcludingInterestAndDividends')

copy_if_not_empty(is_pivot_pd_copy, 'SalesRevenueNet', 'Revenues')
copy_if_not_empty(is_pivot_pd_copy, 'RevenueFromContractWithCustomerExcludingAssessedTax', 'Revenues')
copy_if_not_empty(is_pivot_pd_copy, 'RevenueFromContractWithCustomerIncludingAssessedTax', 'Revenues')
copy_if_not_empty(is_pivot_pd_copy, 'RevenuesExcludingInterestAndDividends', 'Revenues')
copy_if_not_empty(is_pivot_pd_copy, 'RegulatedAndUnregulatedOperatingRevenue', 'Revenues')



# some companies provide NonInterestIncome and InterestAndDividendIncomeOperating instead of a Revenue
sum_into_empty_target(is_pivot_pd_copy,  
                      'InterestAndDividendIncomeOperating',
                      'NoninterestIncome',
                      'Revenues')

sum_into_empty_target(is_pivot_pd_copy,  
                      'InterestIncomeExpenseNet',
                      'NonoperatingIncomeExpense',
                      'Revenues')



sum_into_empty_target(is_pivot_pd_copy,  
                      'InterestAndDividendIncomeOperating',
                      'NonoperatingIncomeExpense',
                      'Revenues')


sum_into_empty_target(is_pivot_pd_copy,  
                      'InterestIncomeExpenseNet',
                      'NoninterestIncome',
                      'Revenues')



In [None]:
print_null_count(is_pivot_pd_copy, [ 'Revenues'])

Revenues   22278


#### OperatingIncomeLoss
<pre>
   Gross Profit
 - SellingGeneralAndAdministrativeExpense
 - ResearchAndDevelopmentExpense
 - AmortizationOfIntangibleAssets
 - BusinessCombinationAcquisitionRelatedCosts
 = OperatingIncomeLoss
</pre>
Look at separately: 10-Ks


In [None]:
print_null_count(is_pivot_pd_copy, ['OperatingIncomeLoss'])

OperatingIncomeLoss   34604


#### Gross Margin
- Revenue (NetSales)
- Cost of Sales
- Gross Margin

##### Revenue

In [None]:
 print_null_count(is_pivot_pd_copy, [   
     'Revenues',
'SalesRevenueNet',
     'RevenueFromContractWithCustomerExcludingAssessedTax', # Sales
'RevenueFromContractWithCustomerIncludingAssessedTax', # Sales

 'RevenueFromCollaborativeArrangementExcludingRevenueFromContractWithCustomer',
 'RevenueNotFromContractWithCustomer',
 'RevenueNotFromContractWithCustomerExcludingInterestIncome',
 'RevenueNotFromContractWithCustomerOther',
 'RevenuesFromExternalCustomers'])

Revenues   83795
SalesRevenueNet   97965
RevenueFromContractWithCustomerExcludingAssessedTax   123742
RevenueFromContractWithCustomerIncludingAssessedTax   130615
RevenueFromCollaborativeArrangementExcludingRevenueFromContractWithCustomer   133816
RevenueNotFromContractWithCustomer   133807
RevenueNotFromContractWithCustomerExcludingInterestIncome   133824
RevenueNotFromContractWithCustomerOther   133813
RevenuesFromExternalCustomers   133805


In [None]:
print_null_count(is_pivot_pd_copy, ['RevenueFromContractWithCustomerExcludingAssessedTax', 'CostOfGoodsAndServicesSold','GrossProfit'])

RevenueFromContractWithCustomerExcludingAssessedTax   123742
CostOfGoodsAndServicesSold   107983
GrossProfit   82255


# xx_trials

In [None]:
# Scratch checks: how the filings are distributed over form / fp / qtrs.
# Only the last statement is active; the commented-out variants carry trailing
# numbers that are presumably the counts observed when each one was run.
# index = is_pivot_pd_copy.form == '10-K'
# print('10-Ks', len(pd.unique(is_pivot_pd_copy[index].adsh))) # 32283

#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '4')].count() # 32188
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '0')].count() # 96

# len(pd.unique(is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & 
#                                (is_pivot_pd_copy.fp == 'FY') & 
#                                (is_pivot_pd_copy.qtrs.isin(['0','4']) )].adsh) )# 32207

# index = is_pivot_pd_copy.form == '10-Q'
# print('10-Qs', len(pd.unique(is_pivot_pd_copy[index].adsh))) # 101521
# is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '1')].count() # 101378
# is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '2')].count() # 101378
# is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '3')].count() # 101378
# Active check: per-column non-null counts of the 10-Q rows with qtrs == '4'.
is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '4')].count() # recorded output: 200 rows (the former note "101378" did not match it)

cik                                                                        200
ticker                                                                     200
adsh                                                                       200
form                                                                       200
period                                                                     200
                                                                          ... 
WeightedAverageNumberOfSharesRestrictedStock                                 0
WeightedAverageNumberOfSharesTreasuryStock                                   0
WeightedAverageNumerDilutedLimitedPartnershipUnitsOutstandingAdjustment      0
WellServiceExpense                                                           0
WriteOffOfDeferredDebtIssuanceCost                                           0
Length: 2328, dtype: int64

In [None]:
pd.set_option('display.max_rows', 40)

# Filings in which none of the revenue-like tags below is reported, ordered by period.
# The row mask checks eight tags; the displayed columns are the key columns plus
# the first seven of those tags.
#
# Further tags that were considered for display:
#   'OperatingLeasesIncomeStatementLeaseRevenue',
#   'RevenueFromCollaborativeArrangementExcludingRevenueFromContractWithCustomer',
#   'RevenueNotFromContractWithCustomer',
#   'RevenueNotFromContractWithCustomerExcludingInterestIncome',
#   'RevenueNotFromContractWithCustomerOther',
#   'RevenuesFromExternalCustomers',
#   'CostOfGoodsAndServicesSold',
#   'GrossProfit'
(
    is_pivot_pd_copy
    .loc[
        # keep a row only when every one of these columns is null
        is_pivot_pd_copy[[
            'Revenues',
            'SalesRevenueNet',
            'RevenueFromContractWithCustomerExcludingAssessedTax',
            'RevenueFromContractWithCustomerIncludingAssessedTax',
            'InterestAndDividendIncomeOperating',
            'NoninterestIncome',
            'RevenuesExcludingInterestAndDividends',
            'RegulatedAndUnregulatedOperatingRevenue',
        ]].isnull().all(axis=1),
        [
            "cik", "ticker", "adsh", "period", "form", "fp", "qtrs", "value_count",
            'Revenues',
            'SalesRevenueNet',
            'RevenueFromContractWithCustomerExcludingAssessedTax',  # Sales
            'RevenueFromContractWithCustomerIncludingAssessedTax',  # Sales
            'InterestAndDividendIncomeOperating',
            'NoninterestIncome',
            'RevenuesExcludingInterestAndDividends',
        ],
    ]
    .sort_values(by=['period'])
)

Unnamed: 0,cik,ticker,adsh,period,form,fp,qtrs,value_count,Revenues,SalesRevenueNet,RevenueFromContractWithCustomerExcludingAssessedTax,RevenueFromContractWithCustomerIncludingAssessedTax,InterestAndDividendIncomeOperating,NoninterestIncome,RevenuesExcludingInterestAndDividends
102763,1164727,NEM,0000891618-09-000150,2009-03-31,10-Q,Q1,1,27,,,,,,,
50670,831259,FCX,0000831259-09-000047,2009-03-31,10-Q,Q1,1,18,,,,,,,
88758,1274494,FSLR,0001274494-09-000019,2009-06-30,10-Q,Q2,1,18,,,,,,,
120548,831259,FCX,0000831259-09-000076,2009-06-30,10-Q,Q2,1,21,,,,,,,
71363,886982,GS,0000950123-09-029919,2009-06-30,10-Q,Q2,1,15,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22176,1580608,SC,0001580608-20-000095,2020-06-30,10-Q,Q2,1,24,,,,,,,
128215,894158,PP,0001104659-20-091503,2020-06-30,10-Q,Q2,1,12,,,,,,,
108742,1421461,IPI,0001421461-20-000023,2020-06-30,10-Q,Q2,1,18,,,,,,,
107678,764065,CLF,0000764065-20-000212,2020-06-30,10-Q,Q2,1,26,,,,,,,


In [None]:
# Show one filing with only the columns that hold at least one value.
(
    is_pivot_pd_copy
    .loc[is_pivot_pd_copy.adsh == "0000082166-20-000130"]
    .dropna(axis=1, how='all')
)

Unnamed: 0,cik,ticker,adsh,form,period,fp,qtrs,AntidilutiveSecuritiesExcludedFromComputationOfEarningsPerShareAmount,NetIncomeLoss,cpy_NetIncomeLoss
19921,82166,RAVN,0000082166-20-000130,10-Q,2020-07-31,Q2,1,321613.0,5819000.0,5819000.0


In [None]:
# Scratch checks on 10-K filings per fp / qtrs value. Every variant is commented out;
# the trailing numbers are presumably the counts seen when each variant was run.
# NOTE(review): the recorded output of this cell (21363) matches the note on the
# `qtrs == '1'` variant, so that line was presumably the active one at the time.
#is_pivot_pd_copy[(is_pivot_pd_copy.qtrs == '4') & (is_pivot_pd_copy.fp != 'FY')]
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '4')].count() # 32188
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY')].count() #54080
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') ].count() #54100
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp != 'FY')].count() #20
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '0')].count() # 96
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '1')].count() # 21363
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '2')].count() # 69
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '3')].count() # 56

cik                                                                        21363
ticker                                                                     21363
adsh                                                                       21363
form                                                                       21363
period                                                                     21363
                                                                           ...  
WeightedAverageNumberOfSharesRestrictedStock                                   0
WeightedAverageNumberOfSharesTreasuryStock                                     0
WeightedAverageNumerDilutedLimitedPartnershipUnitsOutstandingAdjustment        0
WellServiceExpense                                                             1
WriteOffOfDeferredDebtIssuanceCost                                            13
Length: 2328, dtype: int64

In [None]:
# Per row of this filing: number of non-null cells, not counting the pivot_group key columns.
(
    is_pivot_pd_copy
    .loc[is_pivot_pd_copy.adsh == '0001401521-20-000018']
    .notna()
    .sum(axis=1)
    - len(pivot_group)
)

128918    7
dtype: int64

In [None]:
#is_pivot_pd_copy[is_pivot_pd_copy.adsh == '0001401521-20-000018'].isnull().sum(axis=1)
# Count the rows with an irregular reporting span (qtrs == 0 or qtrs > 4) that carry no tag value at all.
#
# qtrs is compared as a string elsewhere in this notebook. A string comparison is
# lexicographic ('10' > '4' is False), so the values are converted to numbers first;
# otherwise spans of ten or more quarters would be silently skipped.
qtrs_numeric = pd.to_numeric(is_pivot_pd_copy.qtrs, errors='coerce')
irregular_null_counts = (
    is_pivot_pd_copy[(qtrs_numeric == 0) | (qtrs_numeric > 4)]
    .isnull()
    .sum(axis=1)
)

# A row counts as empty when all but 8 columns are null. This was hard-coded as 2320
# (= 2328 - 8) and is now derived from the frame width so it follows the data.
# NOTE(review): the 8 always-filled columns are presumably the 7 pivot_group keys plus value_count - confirm.
selection = irregular_null_counts == is_pivot_pd_copy.shape[1] - 8
selection.sum()

1556

In [None]:
# Pull the cash-flow count result into pandas.
# NOTE(review): cf_empty_count is created earlier in the notebook; presumably it is the per-column empty count of the CF pivot - confirm.
cf_empty_pd = cf_empty_count.toPandas()

In [None]:
# Sanity check of the frame dimensions (recorded run: one row, 3052 columns).
cf_empty_pd.shape

(1, 3052)

In [None]:
# Long format: one row per column of cf_empty_pd ('Tag') with its value ('Count'),
# plus 'diff' = 133811 - Count.
# NOTE(review): 133811 is presumably the total number of CF rows, which would make 'diff'
# the number of filled entries per tag - confirm, and derive it instead of hard-coding.
cf_melt_pd = (
    cf_empty_pd
    .melt(var_name='Tag', value_name="Count")
    .assign(diff=lambda frame: 133811 - frame['Count'])
)

In [None]:
# Candidate tags for the period increase/decrease of cash; show their counts.
canditates = [
    'CashAndCashEquivalentsPeriodIncreaseDecrease',
    'cpy_CashAndCashEquivalentsPeriodIncreaseDecrease',
    'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'CashPeriodIncreaseDecrease',
    'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect',
]

cf_melt_pd.loc[cf_melt_pd['Tag'].isin(canditates)]

Unnamed: 0,Tag,Count,diff
350,CashAndCashEquivalentsPeriodIncreaseDecrease,30244,103567
351,CashAndCashEquivalentsPeriodIncreaseDecreaseEx...,132424,1387
360,CashCashEquivalentsRestrictedCashAndRestricted...,130603,3208
361,CashCashEquivalentsRestrictedCashAndRestricted...,119033,14778
382,CashPeriodIncreaseDecrease,132236,1575
383,CashPeriodIncreaseDecreaseExcludingExchangeRat...,133766,45


In [None]:
# Bar chart of the 100 tags with the smallest 'Count'.
# The result is deliberately NOT stored in a variable called `sorted`: that name
# shadows Python's builtin sorted() for every cell executed afterwards.
# (In a kernel that already ran the old cell, `del sorted` or a restart restores the builtin.)
cf_top100_pd = (
    cf_melt_pd
    .sort_values('Count', ascending=True)
    .head(100)
    .reset_index(drop=True)   # new frame instead of inplace=True, so re-running the cell is safe
)
cf_top100_pd.plot.bar(x='Tag', y='Count', figsize=(15, 10))

In [None]:
# Compute the empty-count summary for the balance-sheet value pivot.
# NOTE(review): get_empty_count and bs_pivot_value are defined earlier in the notebook;
# the result is presumably a Spark DataFrame (it is converted with .toPandas() next) - confirm.
empty_count = get_empty_count(bs_pivot_value)

In [None]:
# Convert the count result to a pandas DataFrame for further inspection.
empty_pd = empty_count.toPandas()

In [None]:
# Reshape the wide count frame into long format: column name -> 'Tag', its value -> 'Count'.
melt_pd = empty_pd.melt(var_name = 'Tag', value_name = "Count")
# Generic pd.melt usage example, kept for reference:
# df2 = pd.melt(df, id_vars=["location", "name"], var_name="Date", value_name="Value")

In [None]:
# Check the column names produced by melt (recorded run: 'Tag' and 'Count').
melt_pd.columns

Index(['Tag', 'Count'], dtype='object')

In [None]:
# Load all selected entries of a single filing (identified by its adsh) into pandas for inspection.
# Earlier variant, restricted to the income statement of another filing:
#pd_frame = df_all_selected.where("adsh == '0000082166-20-000130' and stmt == 'IS'").toPandas()
pd_frame = df_all_selected.where("adsh == '0001193125-20-213555'").toPandas()
#print(pd_frame.sort_values(['report', 'line']))

In [None]:
# Raise the row display limit so the whole filing fits, then list its entries
# ordered by report, qtrs and line.
pd.set_option('display.max_rows', len(pd_frame) + 1)
(
    pd_frame
    .loc[:, ['fp', 'cik', 'tag', 'value', 'stmt', 'report', 'line', 'period', 'qtrs']]
    .sort_values(['report', 'qtrs', 'line'])
)

Unnamed: 0,fp,cik,tag,value,stmt,report,line,period,qtrs
23,Q2,880631,CashAndCashEquivalentsAtCarryingValue,50255000.0,BS,2,7,2020-06-30,0
6,Q2,880631,SecurityOwnedAndSoldNotYetPurchasedFairValueSe...,13110000.0,BS,2,8,2020-06-30,0
3,Q2,880631,AccountsReceivableNetCurrent,24372000.0,BS,2,9,2020-06-30,0
15,Q2,880631,PrepaidExpenseCurrent,5621000.0,BS,2,10,2020-06-30,0
34,Q2,880631,OtherAssetsCurrent,1414000.0,BS,2,11,2020-06-30,0
13,Q2,880631,AssetsCurrent,94772000.0,BS,2,12,2020-06-30,0
40,Q2,880631,PropertyPlantAndEquipmentNet,7835000.0,BS,2,13,2020-06-30,0
12,Q2,880631,HeldToMaturitySecuritiesNoncurrent,581000.0,BS,2,16,2020-06-30,0
39,Q2,880631,DeferredIncomeTaxAssetsNet,5540000.0,BS,2,17,2020-06-30,0
9,Q2,880631,InvestmentsInAffiliatesSubsidiariesAssociatesA...,11192000.0,BS,2,18,2020-06-30,0


In [None]:
# Scratch notes: tag names (with a `gaap_` prefix) that were being compared.
# They are kept as comments because, as bare identifiers in a code cell, they
# raise NameError and abort "Restart Kernel -> Run All".
# NOTE(review): each group presumably pairs a period-change tag with the matching
# balance tag (listed twice) - confirm the intent.
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents

In [None]:
# Print the rows of bs_pivot_report that belong to one filing (selected by adsh).
bs_pivot_report.where("adsh == '0001492298-20-000025'").show()

In [None]:
# Number of distinct (adsh, stmt) combinations in the selected data.
df_all_selected.select('adsh','stmt').distinct().count()

585411

In [None]:
# Number of distinct (adsh, stmt) combinations with stmt = "BS".
df_all_selected.select('adsh','stmt').distinct().where('stmt = "BS"').count()

133872

In [None]:
# Number of distinct (adsh, stmt) combinations with stmt = "EQ".
df_all_selected.select('adsh','stmt').distinct().where('stmt = "EQ"').count()

86120

In [None]:
# List every column name of the cash-flow pivot copy.
cf_pivot_pd_copy.columns.tolist()

In [None]:
# Column names of the income-statement pivot that contain 'Revenues'.
# Alternative filter used before: 'Revenue' in name and 'Customer' in name.
[
    column_name
    for column_name in is_pivot_pd_copy.columns.values
    if 'Revenues' in column_name
]

['BusinessCombinationSeparatelyRecognizedTransactionsRevenuesAndGainsRecognized',
 'RelatedPartyTransactionOtherRevenuesFromTransactionsWithRelatedParty',
 'RelatedPartyTransactionRevenuesFromTransactionsWithRelatedParty',
 'Revenues',
 'RevenuesExcludingInterestAndDividends',
 'RevenuesFromExternalCustomers',
 'RevenuesFromTransactionsWithOtherOperatingSegmentsOfSameEntity',
 'RevenuesNetOfInterestExpense']

In [None]:
# Column names of the liabilities pivot that start with 'StockholdersEquity'.
[
    column_name
    for column_name in bs_pivot_liabilities_copy.columns.values
    if column_name.startswith('StockholdersEquity')
]

['StockholdersEquity',
 'StockholdersEquityAttributableToParentNotAllowableForNetCapital',
 'StockholdersEquityBeforeTreasuryStock',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterestAdjustedBalance1',
 'StockholdersEquityNoteStockSplitConversionRatio',
 'StockholdersEquityNoteStockSplitConversionRatio1',
 'StockholdersEquityNoteSubscriptionsReceivable']

In [None]:
# Shut down the Spark session that was created at the top of the notebook.
spark.stop()