In [None]:
# default_exp uniform

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is reloaded automatically whenever it is changed
%load_ext autoreload
%autoreload 2

# uniform and complete

Once the data is pivoted, we want to create a dataset that contains only columns which appear in every report.

For instance, we want to have all the columns 'Assets', 'AssetsCurrent', 'AssetsNoncurrent' to be present in every balance sheet. However, not all desired tags are present in every report, so we have to find ways to calculate and complete them. Often, only two of 'Assets', 'AssetsCurrent', 'AssetsNoncurrent' are present; however, since Assets = AssetsCurrent + AssetsNoncurrent, we can calculate the third.

Sometimes different tags are used to express the same meaning, so we have to figure out which tags belong together.

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import get_spark_session # initialze spark
from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# Folder locations are kept as plain strings because they are combined by string concatenation below.
# parquet data that is read into df_all_selected
all_pivot_selected_folder  = "D:/data/parq_pivot_select"
# root folder of the pivoted data; load_data() appends "/<stmt>/<attr>" to it
all_pivoted_folder = "D:/data/parq_pivot_split"
# target folder of the csv files; the trailing slash is required because file names are appended directly
all_processed_folder = "D:/data/parq_processed/"

# NOTE(review): the three lists below are not referenced in the visible part of this notebook
col_list =    ["stmt","cik","ticker", "adsh","period","form","tag","value","report", "line", "fp", "uom"]
pivot_attrs = ['value', 'report', 'line']
statements =  ['IS','CF','CP','BS','CI','EQ','UN']

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 00_Tools

These are different helper methods which make transformation of the data easier

In [None]:
def load_data(stmt:str, attr:str):
    """ Reads the pivoted parquet data of one statement/attribute pair into a cached spark dataframe.

        The data is read from <all_pivoted_folder>/<stmt>/<attr>.
    """
    parquet_path = "/".join([all_pivoted_folder, stmt, attr])
    pivoted = spark.read.parquet(parquet_path)
    return pivoted.cache()

In [None]:
def spark_shape(self):
    """ Returns the tuple (number of rows, number of columns) of the given dataframe,
        based on its count() method and its columns attribute.
    """
    n_rows = self.count()
    n_cols = len(self.columns)
    return (n_rows, n_cols)

In [None]:
### Get the count of null values per column of a pyspark dataframe (NaN values are not counted)
from pyspark.sql.functions import isnan, when, count, col
def get_empty_count(df):
    """ Builds a dataframe that holds, for every column of df, the number of null entries.

        Only null values (isNull) are counted; every result column keeps the name of its source column.
    """
    null_counters = []
    for name in df.columns:
        # when() without otherwise() yields null for non-matching rows, and count() skips nulls
        null_counters.append(count(when(col(name).isNull(), name)).alias(name))
    return df.select(null_counters)

In [None]:
def complete_addition(df, sumcol, addcol1, addcol2):
    """
    Completes the relation sumcol = addcol1 + addcol2 in place.
    For every row in which exactly one of the three values is missing, the missing
    value is derived from the two that are present. Rows with two or three missing
    values are left untouched.
    """
    def _rows(present_a, present_b, absent):
        # rows where both "present" columns hold a value and the "absent" column is null
        return (df[present_a].notnull()) & (df[present_b].notnull()) & (df[absent].isnull())

    # second addend missing: addcol2 = sumcol - addcol1
    need_second = _rows(sumcol, addcol1, addcol2)
    df.loc[need_second, addcol2] = df.loc[need_second, sumcol] - df.loc[need_second, addcol1]

    # first addend missing: addcol1 = sumcol - addcol2
    need_first = _rows(sumcol, addcol2, addcol1)
    df.loc[need_first, addcol1] = df.loc[need_first, sumcol] - df.loc[need_first, addcol2]

    # sum missing: sumcol = addcol1 + addcol2
    need_sum = _rows(addcol2, addcol1, sumcol)
    df.loc[need_sum, sumcol] = df.loc[need_sum, addcol1] + df.loc[need_sum, addcol2]

In [None]:
def copy_if_not_empty(df, sourcecol, targetcol, to_zero_col = None):
    """ 
    Copies the value from the sourcecol to the targetcol in every row where the sourcecol
    is not empty and the targetcol is empty. The dataframe is modified in place.

    df:          pandas dataframe to modify
    sourcecol:   name of the column the values are read from
    targetcol:   name of the column that is filled where it is empty
    to_zero_col: optional name of a column that is set to 0.0 in exactly the rows
                 where a value was copied
    """
    do_copy = (df[sourcecol].notnull()) & (df[targetcol].isnull())
    df.loc[do_copy, targetcol] = df.loc[do_copy, sourcecol]
    # identity check: "!= None" would invoke a possibly overloaded comparison operator
    if to_zero_col is not None:
        df.loc[do_copy, to_zero_col] = 0.0

In [None]:
def sum_into_empty_target(df, add1col, add2col, targetcol):
    """
    fills the targetcol in place with add1col + add2col, but only in rows where both
    addend columns hold a value and the targetcol is still empty
    """
    both_present = (df[add1col].notnull()) & (df[add2col].notnull())
    fillable = both_present & (df[targetcol].isnull())
    summed = df.loc[fillable, add1col] + df.loc[fillable, add2col]
    df.loc[fillable, targetcol] = summed

In [None]:
def sum_cols_into_new_target(df, targetcol, sumcolslist):
    """ creates (or overwrites) the targetcol with the row-wise sum of the columns in sumcolslist.
        Side effect: null values in the summed columns themselves are replaced by 0.0
        before they are added, so these columns contain no nulls afterwards.
    """
    df[targetcol] = 0.0
    for source_col in sumcolslist:
        # nulls would turn the whole row sum into NaN, so they are zeroed first
        set_to_zero_if_null(df, source_col)
        df[targetcol] = df[targetcol] + df[source_col]

In [None]:
def copy_if_not_empty_for_ticker(df, ticker, sourcecol, targetcol, to_zero_col = None):
    """ copy the not empty sourcecol to the empty targetcol for the rows of a certain ticker.
        The dataframe is modified in place.

        df:          pandas dataframe to modify; must contain a 'ticker' column
        ticker:      only rows whose 'ticker' equals this value are touched
        sourcecol:   name of the column the values are read from
        targetcol:   name of the column that is filled where it is empty
        to_zero_col: optional name of a column that is set to 0.0 in the affected rows
    """
    # the mask is built from df itself (not from an unrelated global frame), so it always
    # matches the index of the dataframe being modified
    do_copy = (df['ticker'] == ticker) & (df[sourcecol].notnull()) & (df[targetcol].isnull())

    df.loc[do_copy, targetcol] = df.loc[do_copy, sourcecol]
    if to_zero_col is not None:
        df.loc[do_copy, to_zero_col] = 0.0

In [None]:
def set_to_zero_if_null(df, col):
    """ replace, in place, every null value of the given column by 0.0
    """
    df.loc[df[col].isnull(), col] = 0.0

In [None]:
def print_null_count(df, cols):
    """ report on stdout, one line per column, how many null values each of the provided cols contains
    """
    for name in cols:
        n_missing = df[name].isnull().sum()
        print(name, ' ', n_missing)

## 00_Raw_data

In [None]:
df_all_selected = spark.read.parquet(all_pivot_selected_folder).cache()

In [None]:
# Sometimes the data cannot be associated with the right statement (BS, IS, CF, ...). In these cases, the data can appear under "UN".
# So if expected information cannot be found in the appropriate statement, we have to look in the UN statement.
un_pivot_value = load_data("UN", "value")
un_pivot_pd = un_pivot_value.toPandas()

In [None]:
un_pivot_pd.shape

(4074, 1958)

## 01_Balance_Sheet

In [None]:
bs_pivot_value = load_data("BS", "value")
spark_shape(bs_pivot_value)

(133872, 2342)

In [None]:
bs_pivot_pd = bs_pivot_value.toPandas()

In [None]:
bs_pivot_pd_copy = bs_pivot_pd.copy()

### Assets

In [None]:
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   579
AssetsNoncurrent   127026
AssetsCurrent   30421


In [None]:
# Sometimes AssetsNet is present instead of Assets, copy its content to Assets
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsNet', 'Assets')

# if one of the three provided columns is missing, calculate its content based on Assets = AssetsCurrent + AssetsNoncurrent
complete_addition(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# if Assets contains data but AssetsCurrent is still empty (after the step above, AssetsNoncurrent is then empty too),
# assume that all assets are current: copy the value from Assets to AssetsCurrent and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# if AssetsCurrent contains data but Assets is still empty (after the step above, AssetsNoncurrent is then empty too),
# assume that all assets are current: copy the value from AssetsCurrent to Assets and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsCurrent', 'Assets', 'AssetsNoncurrent')

In [None]:
# check for how many entries Assets, AssetsNoncurrent and AsstesCurrent couldn't be completed
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   419
AssetsNoncurrent   419
AssetsCurrent   419


### Liabilities

In [None]:
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   34118
LiabilitiesNoncurrent   117766
LiabilitiesCurrent   30837


In [None]:
# Completing the Liabilities columns follows the same logic as for the Assets columns

complete_addition(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')

copy_if_not_empty(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')
copy_if_not_empty(bs_pivot_pd_copy, 'LiabilitiesCurrent', 'Liabilities', 'LiabilitiesNoncurrent')

In [None]:
# check for how many entries we were not able to complete the Liabilities information
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   783
LiabilitiesNoncurrent   781
LiabilitiesCurrent   783


### Equity
In the Equity section of the balance sheet, we are interested in the StockholdersEquity and the retained earnings (tag: RetainedEarningsAccumulatedDeficit).

In [None]:
# null counts before completion; LiabilitiesAndStockholdersEquity is listed too, as in the recorded output
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit','LiabilitiesAndStockholdersEquity'])

StockholdersEquity   12883
RetainedEarningsAccumulatedDeficit   10583
LiabilitiesAndStockholdersEquity   710


In [None]:
# by definition, LiabilitiesAndStockholdersEquity has to match Assets in a balance sheet
# so if LiabilitiesAndStockholdersEquity is not set, we copy the value from the Assets column
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'LiabilitiesAndStockholdersEquity') # has to be the same

# if there is partner capital but no StockholdersEquity, we consider it the same as stockholders' equity
copy_if_not_empty(bs_pivot_pd_copy, 'PartnersCapital', 'StockholdersEquity') 

# if there is StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest instead of StockholdersEquity, we use this as StockholdersEquity
copy_if_not_empty(bs_pivot_pd_copy, 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest', 'StockholdersEquity') 

# if RetainedEarningsAccumulatedDeficit has no value, we set it to zero
set_to_zero_if_null(bs_pivot_pd_copy, 'RetainedEarningsAccumulatedDeficit')

In [None]:
# null counts after completion; LiabilitiesAndStockholdersEquity is listed too, as in the recorded output
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit','LiabilitiesAndStockholdersEquity'])

StockholdersEquity   1277
RetainedEarningsAccumulatedDeficit   0
LiabilitiesAndStockholdersEquity   409


### Clean empty companies

In [None]:
bs_cols_selected = bs_pivot_pd_copy[["cik","ticker", "adsh","period","form", 
                                     'Assets','AssetsNoncurrent', 'AssetsCurrent', 
                                     'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent', 
                                     'StockholdersEquity','RetainedEarningsAccumulatedDeficit']]

In [None]:
# ciks of all companies that have at least one balance sheet row with a missing value
incomplete_ciks = bs_cols_selected.loc[bs_cols_selected.isnull().any(axis=1), 'cik'].unique()

In [None]:
# drop every report of a company that has at least one incomplete report;
# the mask is built from the frame that is filtered, so it cannot get out of sync with its index
bs_cols_cleaned = bs_cols_selected[~bs_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
bs_cols_cleaned.shape

(124520, 13)

In [None]:
bs_cols_cleaned.isnull().sum(axis=1).sum()

0

### Save

In [None]:
bs_cols_cleaned.to_csv(all_processed_folder + "bs.csv", index=False)

## CashFlow

Operation
- NetIncomeLoss
- ProfitLoss
- NetCashProvidedByUsedInOperatingActivities: NetIncome plus other positions yields this position

Investing
- NetCashProvidedByUsedInInvestingActivities

Financing activities
- PaymentsForRepurchaseOfCommonStock: stock buybacks
- PaymentsOfDividends
- NetCashProvidedByUsedInFinancingActivities

Change in cash balance
- CashAndCashEquivalentsPeriodIncreaseDecrease: increase/decrease in cash

In [None]:
cf_pivot_value = load_data("CF", "value")
spark_shape(cf_pivot_value)

(133811, 3052)

In [None]:
cf_empty_count = get_empty_count(cf_pivot_value)

In [None]:
cf_pivot_pd = cf_pivot_value.toPandas()

In [None]:
cf_pivot_pd_copy = cf_pivot_pd.copy()

In [None]:
cf_pivot_pd.shape

(133811, 3052)

### Cash Increase/Decrease
- 'CashAndCashEquivalentsPeriodIncreaseDecrease',
- 'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
- 'CashPeriodIncreaseDecrease',
- 'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'NetCashProvidedByUsedInContinuingOperations'

In [None]:
print_null_count(cf_pivot_pd_copy, ['CashAndCashEquivalentsPeriodIncreaseDecrease'])

CashAndCashEquivalentsPeriodIncreaseDecrease   30244


In [None]:
# select the cash increase/decrease candidates from the UN data; every column except the
# join key "adsh" gets the prefix cpy_ so that it can be told apart from the cf columns
un_cash_incdec = un_pivot_pd[["adsh",
    'CashAndCashEquivalentsPeriodIncreaseDecrease',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
#    'CashPeriodIncreaseDecrease',
#    'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
#    'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect' 
]].copy().rename(columns=lambda tag: tag if tag == "adsh" else "cpy_" + tag)

In [None]:
cf_pivot_pd_copy = pd.merge(cf_pivot_pd_copy, un_cash_incdec, how='left', on=['adsh'])
cf_pivot_pd_copy.shape

(133811, 3055)

In [None]:
# if CashAndCashEquivalentsPeriodIncreaseDecrease is not present and CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
# is present, we can replace CashAndCashEquivalentsPeriodIncreaseDecrease.
# there are only about 12 entries where both are present
copy_if_not_empty(cf_pivot_pd_copy, 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashPeriodIncreaseDecrease', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInContinuingOperations', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or


# try to find data in joined un data
copy_if_not_empty(cf_pivot_pd_copy, 'cpy_CashAndCashEquivalentsPeriodIncreaseDecrease', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'cpy_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or
copy_if_not_empty(cf_pivot_pd_copy, 'cpy_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect', 
                                    'CashAndCashEquivalentsPeriodIncreaseDecrease') # either or

In [None]:
print_null_count(cf_pivot_pd_copy, ['CashAndCashEquivalentsPeriodIncreaseDecrease'])

CashAndCashEquivalentsPeriodIncreaseDecrease   1563


### Operation
- NetIncomeLoss
- NetCashProvidedByUsedInOperatingActivities: NetIncome plus other positions yields this position

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetIncomeLoss', 'ProfitLoss', 'NetCashProvidedByUsedInOperatingActivities'])

NetIncomeLoss   58102
ProfitLoss   84716
NetCashProvidedByUsedInOperatingActivities   32125


In [None]:
# if only ProfitLoss is set, copy content to NetIncomeLoss
# if only NetIncomeLoss is set, copy to ProfitLoss
copy_if_not_empty(cf_pivot_pd_copy, 'ProfitLoss', 'NetIncomeLoss')
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLoss', 'ProfitLoss')
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'ProfitLoss') # certain CFs just have this position
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'NetIncomeLoss')

# use the continuing-operations value where the total operating cash flow is missing
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivitiesContinuingOperations', 'NetCashProvidedByUsedInOperatingActivities')

# last fallback: reuse the operating cash flow where both income columns are still empty
# NOTE(review): operating cash flow is a different quantity than net income - confirm that this substitution is intended
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivities', 'ProfitLoss') # certain CFs just have this position
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetIncomeLoss', 'ProfitLoss', 'NetCashProvidedByUsedInOperatingActivities'])

NetIncomeLoss   230
ProfitLoss   230
NetCashProvidedByUsedInOperatingActivities   501


### Investing
- NetCashProvidedByUsedInInvestingActivities

In [None]:
# null counts of the total and of the two partial columns it can be derived from, as in the recorded output
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInInvestingActivities',
                                    'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
                                    'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations'])

NetCashProvidedByUsedInInvestingActivities   33018
NetCashProvidedByUsedInInvestingActivitiesContinuingOperations   98645
CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations   128368


In [None]:
sum_into_empty_target(cf_pivot_pd_copy, 
                      'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
                      'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations',
                      'NetCashProvidedByUsedInInvestingActivities')

copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations', 'NetCashProvidedByUsedInInvestingActivities')
copy_if_not_empty(cf_pivot_pd_copy, 'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations', 'NetCashProvidedByUsedInInvestingActivities')

set_to_zero_if_null(cf_pivot_pd_copy, 'NetCashProvidedByUsedInInvestingActivities')

In [None]:
# null counts after completion; the two partial columns are listed too, as in the recorded output
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInInvestingActivities',
                                    'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
                                    'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations'])

NetCashProvidedByUsedInInvestingActivities   0
NetCashProvidedByUsedInInvestingActivitiesContinuingOperations   98645
CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations   128368


### Financing activities
- PaymentsForRepurchaseOfCommonStock: stock buybacks
- PaymentsOfDividends
- NetCashProvidedByUsedInFinancingActivities

('CashProvidedByUsedInDiscontinuedOperationsFinancingActivities',
 'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations',
 'NetCashProvidedByUsedInFinancingActivities',
 'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations')

#### NetCashProvidedByUsedInFinancingActivities

In [None]:
# null counts of the total and of the two partial columns it can be derived from, as in the recorded output
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInFinancingActivities',
                                    'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations',
                                    'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations'])

NetCashProvidedByUsedInFinancingActivities   33202
NetCashProvidedByUsedInFinancingActivitiesContinuingOperations   100132
CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations   130954


In [None]:
sum_into_empty_target(cf_pivot_pd_copy, 
                      'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations',
                      'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations',
                      'NetCashProvidedByUsedInFinancingActivities')

copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations', 'NetCashProvidedByUsedInFinancingActivities')
copy_if_not_empty(cf_pivot_pd_copy, 'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations', 'NetCashProvidedByUsedInFinancingActivities')

set_to_zero_if_null(cf_pivot_pd_copy, 'NetCashProvidedByUsedInFinancingActivities')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInFinancingActivities'])

NetCashProvidedByUsedInFinancingActivities   0


#### PaymentsOfDividends
Simply set to 0.0 if no data is present
 'PaymentsOfDividends',
 'PaymentsOfDividendsCommonStock',
 'PaymentsOfDividendsMinorityInterest',
 'PaymentsOfDividendsPreferredStockAndPreferenceStock',
 'PaymentsOfOrdinaryDividends',

In [None]:
sum_cols_into_new_target(cf_pivot_pd_copy, 'PaymentsOfDividendsTotal', 
                                 ['PaymentsOfDividends',
                                 'PaymentsOfDividendsCommonStock',
                                 'PaymentsOfDividendsMinorityInterest',
                                 'PaymentsOfDividendsPreferredStockAndPreferenceStock',
                                 'PaymentsOfOrdinaryDividends'])

#### PaymentsForRepurchaseOfCommonStock: Stock buybacks
 'PaymentsForRepurchaseOfCommonStock',
 'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
 'PaymentsForRepurchaseOfConvertiblePreferredStock',
 'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
 'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
 'PaymentsForRepurchaseOfRedeemablePreferredStock'

In [None]:
sum_cols_into_new_target(cf_pivot_pd_copy, 'PaymentsForRepurchaseOfStockTotal', 
                                 ['PaymentsForRepurchaseOfCommonStock',
                                     'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
                                     'PaymentsForRepurchaseOfConvertiblePreferredStock',
                                     'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
                                     'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
                                     'PaymentsForRepurchaseOfRedeemablePreferredStock'])

### Clean empty companies

In [None]:
cf_cols_selected = cf_pivot_pd_copy[["cik","ticker", "adsh","period","form", 
                                     'CashAndCashEquivalentsPeriodIncreaseDecrease',
                                     'NetIncomeLoss', 'ProfitLoss', 
                                     'NetCashProvidedByUsedInOperatingActivities', 
                                     'NetCashProvidedByUsedInInvestingActivities',
                                     'NetCashProvidedByUsedInFinancingActivities']]

In [None]:
# ciks of all companies that have at least one cash flow row with a missing value
incomplete_ciks = cf_cols_selected.loc[cf_cols_selected.isnull().any(axis=1), 'cik'].unique()

In [None]:
len(incomplete_ciks)

514

In [None]:
# drop every report of a company that has at least one incomplete report;
# the mask is built from the frame that is filtered, so it cannot get out of sync with its index
cf_cols_cleaned = cf_cols_selected[~cf_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
cf_cols_cleaned.shape

(117038, 11)

In [None]:
cf_cols_cleaned.isnull().sum(axis=1).sum()

0

### Save

In [None]:
cf_cols_cleaned.to_csv(all_processed_folder + "cf.csv", index=False)

# xx_trials

In [None]:
# NOTE(review): this filters the cash flow frame on balance-sheet style tags (StockholdersEquity, ...);
# presumably bs_pivot_pd_copy was intended - confirm before relying on this cell
cf_pivot_pd_copy[(cf_pivot_pd_copy.StockholdersEquity.isnull())  ][["cik","ticker", "adsh","period", "StockholdersEquityBeforeTreasuryStock", "LiabilitiesAndStockholdersEquity", "RetainedEarningsAccumulatedDeficit"]].sort_values(by=['period'])

In [None]:
# count the FXE rows in which AssetsCurrent holds a value while Assets is empty
selected = bs_pivot_pd_copy['ticker'] == 'FXE'
sel_df = bs_pivot_pd_copy.loc[selected]
do_copy = (sel_df['AssetsCurrent'].notnull()) & (sel_df['Assets'].isnull())
do_copy.sum()

34

In [None]:
pd.set_option('display.max_rows', 40)
cf_pivot_pd_copy[(cf_pivot_pd_copy.Dividends.notnull())  \
#              &(cf_pivot_pd_copy.PaymentsOfDividendsCommonStock.notnull())  \
#                &(cf_pivot_pd_copy.CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations.notnull()) 
                ] \
  [["cik","ticker", "adsh","period", 
'Dividends',
'PaymentsOfDividends',
 'PaymentsOfDividendsCommonStock',
 'PaymentsOfDividendsMinorityInterest',
 'PaymentsOfDividendsPreferredStockAndPreferenceStock',
 'PaymentsOfOrdinaryDividends'

    ]] \
    .sort_values(by=['period'])

Unnamed: 0,cik,ticker,adsh,period,Dividends,PaymentsOfDividends,PaymentsOfDividendsCommonStock,PaymentsOfDividendsMinorityInterest,PaymentsOfDividendsPreferredStockAndPreferenceStock,PaymentsOfOrdinaryDividends
65024,91576,KEY,0000950123-10-018789,2009-12-31,0.0,213000000.0,,,,
88525,813920,CEC,0001140361-11-025400,2011-03-31,-4001000.0,,,,,
131975,892482,QUMU,0000897101-11-001346,2011-06-30,-950000.0,,,,,
63359,1401521,UIHC,0001401521-11-000020,2011-06-30,0.0,,,,,
102842,1273801,NRF,0001047469-11-007027,2011-06-30,-27860000.0,,,18336000.0,,
...,...,...,...,...,...,...,...,...,...,...
104588,912593,SUI,0000912593-20-000137,2020-06-30,7781000.0,,,,,
18311,928340,CWCO,0001558370-20-010684,2020-06-30,1288154.0,,2564980.0,,5738.0,
99044,906345,CPT,0000906345-20-000052,2020-06-30,84138000.0,165085000.0,,,,
83351,706863,UNB,0000706863-20-000091,2020-06-30,2863000.0,,2844000.0,,,


In [None]:
# balance sheet rows that still lack StockholdersEquity, ordered by period
# (bs_pivot_liabilities_copy is not defined in this notebook; bs_pivot_pd_copy is the balance sheet working copy)
bs_pivot_pd_copy[(bs_pivot_pd_copy.StockholdersEquity.isnull())][["cik","ticker", "adsh","period", "Assets", "StockholdersEquity", "LiabilitiesAndStockholdersEquity", "RetainedEarningsAccumulatedDeficit"]].sort_values(by=['period'])

In [None]:
# NOTE(review): bs_pivot_liabilities_copy is not defined anywhere in the visible notebook, and it is unclear
# which frame provides the "report" and "line" columns - confirm the intended frame before running this cell
bs_pivot_liabilities_copy[(bs_pivot_liabilities_copy.adsh == '0001104659-09-029605')][["cik","ticker", "adsh","period","report", "line"]].sort_values(by=['period'])

['cik',
 'ticker',
 'adsh',
 'form',
 'period',
 'fp',
 'AcceleratedShareRepurchaseProgramAdjustment',
 'AcceleratedShareRepurchasesSettlementPaymentOrReceipt',
 'AccountsAndNotesReceivableNet',
 'AccountsAndOtherReceivablesNetCurrent',
 'AccountsPayableAndAccruedLiabilities',
 'AccountsPayableAndAccruedLiabilitiesCurrentAndNoncurrent',
 'AccountsPayableAndOtherAccruedLiabilities',
 'AccountsPayableCurrent',
 'AccountsPayableCurrentAndNoncurrent',
 'AccountsPayableRelatedPartiesCurrentAndNoncurrent',
 'AccountsPayableRelatedPartiesNoncurrent',
 'AccountsReceivableNet',
 'AccountsReceivableNetCurrent',
 'AccretionAmortizationOfDiscountsAndPremiumsInvestments',
 'AccretionExpense',
 'AccretionExpenseIncludingAssetRetirementObligations',
 'AccretionOfDiscount',
 'AccrualForEnvironmentalLossContingenciesPayments',
 'AccrualForEnvironmentalLossContingenciesPeriodIncreaseDecrease',
 'AccruedInvestmentIncomeReceivable',
 'AccruedLiabilitiesAndOtherLiabilities',
 'AccruedProfessionalFeesCurren

In [None]:
cf_empty_pd = cf_empty_count.toPandas()

In [None]:
cf_empty_pd.shape

(1, 3052)

In [None]:
# one row per tag; 'Count' is the number of null entries of that tag
cf_melt_pd = cf_empty_pd.melt(var_name = 'Tag', value_name = "Count")
# number of filled entries per tag: total number of cash flow rows minus the null count
cf_melt_pd['diff'] = len(cf_pivot_pd) - cf_melt_pd['Count']

In [None]:
# tags that may carry the cash increase/decrease of the period
candidates =  ['CashAndCashEquivalentsPeriodIncreaseDecrease','cpy_CashAndCashEquivalentsPeriodIncreaseDecrease',
 'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
 'CashPeriodIncreaseDecrease',
 'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect']

cf_melt_pd[cf_melt_pd['Tag'].isin(candidates)]

Unnamed: 0,Tag,Count,diff
350,CashAndCashEquivalentsPeriodIncreaseDecrease,30244,103567
351,CashAndCashEquivalentsPeriodIncreaseDecreaseEx...,132424,1387
360,CashCashEquivalentsRestrictedCashAndRestricted...,130603,3208
361,CashCashEquivalentsRestrictedCashAndRestricted...,119033,14778
382,CashPeriodIncreaseDecrease,132236,1575
383,CashPeriodIncreaseDecreaseExcludingExchangeRat...,133766,45


In [None]:
# the 100 tags with the fewest null entries; the result is not named "sorted" because that
# would replace the builtin sorted() for every cell executed afterwards
cf_least_empty = cf_melt_pd.sort_values('Count', ascending=True)[:100].reset_index(drop = True)
cf_least_empty.plot.bar(x = 'Tag', y='Count', figsize = (15,10))

In [None]:
empty_count = get_empty_count(bs_pivot_value)

In [None]:
empty_pd = empty_count.toPandas()

In [None]:
melt_pd = empty_pd.melt(var_name = 'Tag', value_name = "Count")
# df2 = pd.melt(df, id_vars=["location", "name"], var_name="Date", value_name="Value")

In [None]:
melt_pd.columns

Index(['Tag', 'Count'], dtype='object')

In [None]:
pd_frame = df_all_selected.where("adsh == '0000906345-20-000052' and stmt == 'CF'").toPandas()
#print(pd_frame.sort_values(['report', 'line']))

In [None]:
pd.set_option('display.max_rows', pd_frame.shape[0]+1)
pd_frame[['tag', 'value', 'stmt', 'report', 'line']].sort_values(['report', 'line'])

Unnamed: 0,tag,value,stmt,report,line
12,ProfitLoss,61978000.0,CF,6,2
13,ProfitLoss,17511000.0,CF,6,2
3,DepreciationDepletionAndAmortization,184662000.0,CF,6,4
33,GainLossOnDispositionOfAssets1,382000.0,CF,6,5
0,EquityMethodInvestmentDividendsOrDistributions,4039000.0,CF,6,6
15,IncomeLossFromEquityMethodInvestments,3755000.0,CF,6,7
16,IncomeLossFromEquityMethodInvestments,1600000.0,CF,6,7
8,ShareBasedCompensation,7046000.0,CF,6,8
14,OtherOperatingActivitiesCashFlowStatement,-3414000.0,CF,6,10
19,NetCashProvidedByUsedInOperatingActivities,250174000.0,CF,6,11


In [None]:
# Scratch notes: tag names collected while inspecting reports.
# They are kept as comments because bare identifiers raise a NameError when the cell is executed.
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents

In [None]:
# NOTE(review): bs_pivot_report is not defined anywhere in the visible notebook; presumably it was
# created with load_data("BS", "report") - confirm and define it before running this cell
bs_pivot_report.where("adsh == '0001492298-20-000025'").show()

In [None]:
df_all_selected.select('adsh','stmt').distinct().count()

585411

In [None]:
df_all_selected.select('adsh','stmt').distinct().where('stmt = "BS"').count()

133872

In [None]:
df_all_selected.select('adsh','stmt').distinct().where('stmt = "EQ"').count()

86120

In [None]:
cf_pivot_pd_copy.columns.tolist()

In [None]:
#[x for x in cf_pivot_pd_copy.columns.values if ('Dividends' in x)]
[x for x in cf_pivot_pd_copy.columns.values if ('Repurchase' in x) and ('Stock' in x)]

['EmployeeStockOwnershipPlanESOPRepurchaseObligationAmount',
 'PaymentsForRepurchaseOfCommonStock',
 'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
 'PaymentsForRepurchaseOfConvertiblePreferredStock',
 'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
 'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
 'PaymentsForRepurchaseOfRedeemablePreferredStock',
 'ProceedsFromRepurchaseOfRedeemablePreferredStock',
 'StockRepurchaseProgramRemainingAuthorizedRepurchaseAmount',
 'StockRepurchasedAndRetiredDuringPeriodShares',
 'StockRepurchasedAndRetiredDuringPeriodValue',
 'StockRepurchasedDuringPeriodShares',
 'StockRepurchasedDuringPeriodValue',
 'TreasuryStockReissuedAtLowerThanRepurchasePrice',
 'WeightedAverageNumberOfSharesCommonStockSubjectToRepurchaseOrCancellation']

In [None]:
# all balance sheet tags that start with 'StockholdersEquity'
# (bs_pivot_liabilities_copy is not defined in this notebook; bs_pivot_pd_copy is the balance sheet working copy)
[x for x in bs_pivot_pd_copy.columns.values if x.startswith('StockholdersEquity')]

['StockholdersEquity',
 'StockholdersEquityAttributableToParentNotAllowableForNetCapital',
 'StockholdersEquityBeforeTreasuryStock',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterestAdjustedBalance1',
 'StockholdersEquityNoteStockSplitConversionRatio',
 'StockholdersEquityNoteStockSplitConversionRatio1',
 'StockholdersEquityNoteSubscriptionsReceivable']