In [None]:
# default_exp uniform

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# ensures that the core library is reloaded automatically whenever it is modified
%load_ext autoreload
%autoreload 2

# todos


- The CF statement still contains entries with several different qtrs values. Check whether all entries have qtrs=1.


# uniform and complete

Once the data is pivoted, we want to create a dataset that contains only columns which appear in every report.

For instance, we want to have all the columns 'Assets', 'AssetsCurrent', 'AssetsNoncurrent' to be present in every balancesheet. However, not all desired tags are present and we have to find ways to calculate and complete them. Often, only 2 of 'Assets', 'AssetsCurrent', 'AssetsNoncurrent' are present, however, since Assets = AssetsCurrent + AssetsNoncurrent we can calculate the third.

Sometimes different tags are used to express the same meaning, so we have to figure out which tags belong together.

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import * 
from pathlib import Path
from typing import List, Tuple, Union, Set
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

In [None]:
# Folders of the datasets; absolute paths on the local machine.
all_pivot_selected_folder  = "D:/data/parq_pivot_select"  # complete unpivoted dataset, read into df_all_selected
all_pivoted_folder = "D:/data/parq_pivot_split"  # presumably the folder load_data() reads the pivoted statements from -- TODO confirm
all_processed_folder = "D:/data/parq_processed/"  # target folder of the CSV exports; file names are appended by string concatenation, so the trailing slash is required

# NOTE(review): col_list is not referenced in the visible part of this notebook -- confirm it is still needed
col_list =    ["stmt","cik","ticker", "adsh","period","filed", "form","tag","value","report", "line", "fp", "uom"]
# key columns of a pivoted row; used as merge keys for the UN data and subtracted when counting value columns
pivot_group = ["cik","ticker","adsh","form","period","fp", "qtrs"]
# presumably the attribute names accepted as second argument of load_data() -- only 'value' is used here
pivot_attrs = ['value', 'report', 'line']
# statement codes; "UN", "BS", "CF" and "IS" are passed to load_data() in this notebook
statements =  ['IS','CF','CP','BS','CI','EQ','UN']

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

## 00_Raw_data

In [None]:
# load the complete unpivoted dataset - only needed for debugging
df_all_selected = spark.read.parquet(all_pivot_selected_folder).cache()

In [None]:
# Sometimes the data cannot be associated with the right statement (BS, IS, CF, ...). In these cases, the data can appear under "UN".
# So if expected information cannot be found in the appropriate statement, we have to look in the UN statement.
un_pivot_value = load_data("UN", "value")
un_pivot_pd = un_pivot_value.toPandas()

In [None]:
un_pivot_pd.shape

(5989, 1774)

In [None]:
def prepare_un_values(df_to_merge_into, attr_list):
    """Left-join selected columns of the UN dataset onto ``df_to_merge_into``.

    The key columns listed in the module-level ``pivot_group`` plus the columns
    named in ``attr_list`` are taken from the module-level ``un_pivot_pd``.
    Every non-key column is renamed with the prefix ``cpy_`` so that it can be
    told apart from a column of the same name in the target frame.

    Args:
        df_to_merge_into: pandas DataFrame containing all ``pivot_group`` columns.
        attr_list: names of the ``un_pivot_pd`` columns to attach.

    Returns:
        A new DataFrame: ``df_to_merge_into`` left-merged with the prefixed UN
        columns on ``pivot_group``.
    """
    # build the column selection on a fresh list, the global pivot_group stays untouched
    wanted_columns = list(pivot_group) + list(attr_list)
    un_selection = un_pivot_pd[wanted_columns].copy()

    def _with_prefix(column_name):
        # key columns keep their name, otherwise the merge keys would no longer match
        if column_name in pivot_group:
            return column_name
        return "cpy_" + column_name

    un_selection = un_selection.rename(columns=_with_prefix)

    return pd.merge(df_to_merge_into, un_selection, how='left', on=pivot_group)

## 01_Balance_Sheet

In [None]:
bs_pivot_value = load_data("BS", "value")
spark_shape(bs_pivot_value)

(117868, 2243)

In [None]:
bs_pivot_pd = bs_pivot_value.toPandas()

In [None]:
bs_pivot_pd_copy = bs_pivot_pd.copy()

### Assets

In [None]:
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   1801
AssetsNoncurrent   111944
AssetsCurrent   28053


In [None]:
# Sometimes AssetsNet is present instead of Assets; copy its content to Assets
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsNet', 'Assets')

# if one of the three columns is missing, calculate its content based on Assets = AssetsCurrent + AssetsNoncurrent
complete_addition(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# if Assets contains data but AssetsCurrent and AssetsNoncurrent are empty, assume that all assets are current assets:
# copy the value from Assets to AssetsCurrent and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'AssetsCurrent', 'AssetsNoncurrent')

# if AssetsCurrent contains data but Assets and AssetsNoncurrent are empty, assume that all assets are current assets:
# copy the value from AssetsCurrent to Assets and set AssetsNoncurrent to 0.0
copy_if_not_empty(bs_pivot_pd_copy, 'AssetsCurrent', 'Assets', 'AssetsNoncurrent')

In [None]:
# check for how many entries Assets, AssetsNoncurrent and AssetsCurrent could not be completed
print_null_count(bs_pivot_pd_copy, ['Assets','AssetsNoncurrent','AssetsCurrent'])

Assets   1675
AssetsNoncurrent   1675
AssetsCurrent   1675


### Liabilities

In [None]:
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   1983
LiabilitiesNoncurrent   1982
LiabilitiesCurrent   1983


In [None]:
# Completing the Liabilities columns follows the same logic as for the Assets columns:
# first Liabilities = LiabilitiesCurrent + LiabilitiesNoncurrent is used to calculate a missing column,
# then a lone Liabilities or LiabilitiesCurrent value is copied into the other column.
# NOTE(review): the recorded null counts before and after this cell are identical (1983 / 1982 / 1983) -- confirm that the completion has any effect here

complete_addition(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')

copy_if_not_empty(bs_pivot_pd_copy, 'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent')
copy_if_not_empty(bs_pivot_pd_copy, 'LiabilitiesCurrent', 'Liabilities', 'LiabilitiesNoncurrent')

In [None]:
# check for how many entries we were not able to complete the Liabilities information
print_null_count(bs_pivot_pd_copy, ['Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent'])

Liabilities   1983
LiabilitiesNoncurrent   1982
LiabilitiesCurrent   1983


### Equity
In the Equity section of the balance sheet, we are interested in the StockholdersEquity and the earnings (tag: RetainedEarningsAccumulatedDeficit).

In [None]:
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit'])

StockholdersEquity   11898
RetainedEarningsAccumulatedDeficit   10396


In [None]:
# by definition, LiabilitiesAndStockholdersEquity has to match Assets in a balance sheet,
# so if LiabilitiesAndStockholdersEquity is not set, we copy the value from the Assets column
copy_if_not_empty(bs_pivot_pd_copy, 'Assets', 'LiabilitiesAndStockholdersEquity') # has to be the same

# if there is PartnersCapital but no StockholdersEquity, we consider it the same as StockholdersEquity
copy_if_not_empty(bs_pivot_pd_copy, 'PartnersCapital', 'StockholdersEquity') 

# if there is StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest instead of StockholdersEquity, we use this as StockholdersEquity
copy_if_not_empty(bs_pivot_pd_copy, 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest', 'StockholdersEquity') 

# if RetainedEarningsAccumulatedDeficit has no value, we set it to zero
set_to_zero_if_null(bs_pivot_pd_copy, 'RetainedEarningsAccumulatedDeficit')

In [None]:
print_null_count(bs_pivot_pd_copy, ['StockholdersEquity','RetainedEarningsAccumulatedDeficit'])

StockholdersEquity   2423
RetainedEarningsAccumulatedDeficit   0


### Save

In [None]:
# export the key columns and the completed balance sheet columns as CSV,
# before the companies with incomplete reports are removed
bs_pivot_pd_copy[["cik","ticker", "adsh","period","filed","form", "qtrs","fp",
                 'Assets','AssetsNoncurrent', 'AssetsCurrent', 
                 'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent', 
                 'StockholdersEquity','RetainedEarningsAccumulatedDeficit']] \
        .to_csv(all_processed_folder + "bs_not_cleaned.csv", index=False)

### Clean empty companies

In [None]:
# Select the key columns and the completed balance sheet columns that are checked for completeness.
# Every column name has to be separated by a comma: adjacent string literals are concatenated by Python,
# which would silently turn "fp" 'Assets' into the non-existent column "fpAssets".
bs_cols_selected = bs_pivot_pd_copy[["cik","ticker", "adsh","period","form", "qtrs","fp",
                                     'Assets','AssetsNoncurrent', 'AssetsCurrent', 
                                     'Liabilities','LiabilitiesNoncurrent','LiabilitiesCurrent', 
                                     'StockholdersEquity','RetainedEarningsAccumulatedDeficit']]

In [None]:
# ciks of all companies that have at least one report with a missing value in one of the selected columns
incomplete_ciks = bs_cols_selected.loc[bs_cols_selected.isnull().any(axis=1), 'cik'].unique()

In [None]:
# keep only the reports of companies whose reports are all complete;
# the mask is built from the frame that is being filtered, so mask and frame are guaranteed to share one index
bs_cols_cleaned = bs_cols_selected[~bs_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
bs_cols_cleaned.shape

(124520, 13)

In [None]:
bs_cols_cleaned.isnull().sum(axis=1).sum()

0

In [None]:
bs_cols_cleaned.to_csv(all_processed_folder + "bs.csv", index=False)

## 02_CashFlow

Operation
- NetIncomeLoss
- ProfitLoss
- NetCashProvidedByUsedInOperatingActivities: NetIncome + other positions yields this position

Investing
- NetCashProvidedByUsedInInvestingActivities

Financing activities
- PaymentsForRepurchaseOfCommonStock: stock buybacks
- PaymentsOfDividends
- NetCashProvidedByUsedInFinancingActivities

Change in cash balance
- CashAndCashEquivalentsPeriodIncreaseDecrease: increase/decrease in cash

In [None]:
cf_pivot_value = load_data("CF", "value")
spark_shape(cf_pivot_value)

(305903, 2909)

In [None]:
#cf_empty_count = get_empty_count(cf_pivot_value)

In [None]:
cf_pivot_pd = cf_pivot_value.toPandas()

In [None]:
cf_pivot_pd_copy = cf_pivot_pd.copy()

In [None]:
cf_pivot_pd.shape

(305903, 2909)

### Cash Increase/Decrease
- 'CashAndCashEquivalentsPeriodIncreaseDecrease',
- 'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
- 'CashPeriodIncreaseDecrease',
- 'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect',
- 'NetCashProvidedByUsedInContinuingOperations'

In [None]:
print_null_count(cf_pivot_pd_copy, ['CashAndCashEquivalentsPeriodIncreaseDecrease'])

CashAndCashEquivalentsPeriodIncreaseDecrease   217223


In [None]:
# merge relevant columns from the UN dataset
cf_pivot_pd_copy = prepare_un_values(cf_pivot_pd_copy, [
    'CashAndCashEquivalentsPeriodIncreaseDecrease',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect'
])
cf_pivot_pd_copy.shape

(305903, 2912)

In [None]:
# CashAndCashEquivalentsPeriodIncreaseDecrease is completed from alternative tags: whenever it is not present
# but one of the alternatives is, the alternative replaces it (either-or). The alternatives are offered in
# the order listed below. There are only about 12 entries where the target and
# CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
# are both present.
cash_change_target = 'CashAndCashEquivalentsPeriodIncreaseDecrease'

for cash_change_source in (
    # alternative tags of the cash flow statement itself
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'CashPeriodIncreaseDecrease',
    'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'NetCashProvidedByUsedInContinuingOperations',
    # try to find data in the joined UN data
    'cpy_CashAndCashEquivalentsPeriodIncreaseDecrease',
    'cpy_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'cpy_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
):
    copy_if_not_empty(cf_pivot_pd_copy, cash_change_source, cash_change_target)

In [None]:
print_null_count(cf_pivot_pd_copy, ['CashAndCashEquivalentsPeriodIncreaseDecrease'])

CashAndCashEquivalentsPeriodIncreaseDecrease   188780


### Operation
- NetIncomeLoss
- NetCashProvidedByUsedInOperatingActivities: NetIncome + other positions yields this position

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetIncomeLoss', 'ProfitLoss', 'NetCashProvidedByUsedInOperatingActivities'])

NetIncomeLoss   195814
ProfitLoss   240347
NetCashProvidedByUsedInOperatingActivities   218636


In [None]:
# if only ProfitLoss is set, copy its content to NetIncomeLoss
# if only NetIncomeLoss is set, copy its content to ProfitLoss
copy_if_not_empty(cf_pivot_pd_copy, 'ProfitLoss', 'NetIncomeLoss')
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLoss', 'ProfitLoss')
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'ProfitLoss') # certain CFs just have this position
copy_if_not_empty(cf_pivot_pd_copy, 'NetIncomeLossAvailableToCommonStockholdersBasic', 'NetIncomeLoss')

# use the continuing-operations tag where NetCashProvidedByUsedInOperatingActivities itself is missing
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivitiesContinuingOperations', 'NetCashProvidedByUsedInOperatingActivities')

# NOTE(review): the operating cash flow is used as a last fallback for ProfitLoss / NetIncomeLoss here, although
# the section notes describe it as NetIncome plus other positions -- confirm that this substitution is intended
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivities', 'ProfitLoss') # certain CFs just have this position
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInOperatingActivities', 'NetIncomeLoss')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetIncomeLoss', 'ProfitLoss', 'NetCashProvidedByUsedInOperatingActivities'])

NetIncomeLoss   119998
ProfitLoss   119998
NetCashProvidedByUsedInOperatingActivities   187825


### Investing
- NetCashProvidedByUsedInInvestingActivities

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInInvestingActivities'])

NetCashProvidedByUsedInInvestingActivities   219410


In [None]:
# presumably: where the total is empty, build it as continuing operations + discontinued operations
# (sum_into_empty_target is defined in the core library -- TODO confirm its exact semantics)
sum_into_empty_target(cf_pivot_pd_copy, 
                      'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations',
                      'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations',
                      'NetCashProvidedByUsedInInvestingActivities')

# fall back to a single part, continuing operations first
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInInvestingActivitiesContinuingOperations', 'NetCashProvidedByUsedInInvestingActivities')
copy_if_not_empty(cf_pivot_pd_copy, 'CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations', 'NetCashProvidedByUsedInInvestingActivities')

# whatever is still missing is set to zero
# NOTE(review): the recorded outputs show 219410 nulls before and 0 after this cell -- confirm that missing data should be treated as 0.0
set_to_zero_if_null(cf_pivot_pd_copy, 'NetCashProvidedByUsedInInvestingActivities')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInInvestingActivities'])

NetCashProvidedByUsedInInvestingActivities   0


### Financing activities
- PaymentsForRepurchaseOfCommonStock: stock buybacks
- PaymentsOfDividends
- NetCashProvidedByUsedInFinancingActivities

('CashProvidedByUsedInDiscontinuedOperationsFinancingActivities',
 'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations',
 'NetCashProvidedByUsedInFinancingActivities',
 'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations')

#### NetCashProvidedByUsedInFinancingActivities

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInFinancingActivities'])

NetCashProvidedByUsedInFinancingActivities   219518


In [None]:
# presumably: where the total is empty, build it as continuing operations + discontinued operations
# (sum_into_empty_target is defined in the core library -- TODO confirm its exact semantics)
sum_into_empty_target(cf_pivot_pd_copy, 
                      'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations',
                      'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations',
                      'NetCashProvidedByUsedInFinancingActivities')

# fall back to a single part, continuing operations first
copy_if_not_empty(cf_pivot_pd_copy, 'NetCashProvidedByUsedInFinancingActivitiesContinuingOperations', 'NetCashProvidedByUsedInFinancingActivities')
copy_if_not_empty(cf_pivot_pd_copy, 'CashProvidedByUsedInFinancingActivitiesDiscontinuedOperations', 'NetCashProvidedByUsedInFinancingActivities')

# whatever is still missing is set to zero
# NOTE(review): the recorded outputs show 219518 nulls before and 0 after this cell -- confirm that missing data should be treated as 0.0
set_to_zero_if_null(cf_pivot_pd_copy, 'NetCashProvidedByUsedInFinancingActivities')

In [None]:
print_null_count(cf_pivot_pd_copy, ['NetCashProvidedByUsedInFinancingActivities'])

NetCashProvidedByUsedInFinancingActivities   0


#### PaymentsOfDividends
Simply set to 0.0 if no data is present
 'PaymentsOfDividends',
 'PaymentsOfDividendsCommonStock',
 'PaymentsOfDividendsMinorityInterest',
 'PaymentsOfDividendsPreferredStockAndPreferenceStock',
 'PaymentsOfOrdinaryDividends',

In [None]:
# build the new column PaymentsOfDividendsTotal from all dividend payment tags
dividend_payment_tags = [
    'PaymentsOfDividends',
    'PaymentsOfDividendsCommonStock',
    'PaymentsOfDividendsMinorityInterest',
    'PaymentsOfDividendsPreferredStockAndPreferenceStock',
    'PaymentsOfOrdinaryDividends',
]
cf_pivot_pd_copy = sum_cols_into_new_target(cf_pivot_pd_copy, 'PaymentsOfDividendsTotal', dividend_payment_tags)

#### PaymentsForRepurchaseOfCommonStock: Stock buybacks
 'PaymentsForRepurchaseOfCommonStock',
 'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
 'PaymentsForRepurchaseOfConvertiblePreferredStock',
 'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
 'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
 'PaymentsForRepurchaseOfRedeemablePreferredStock'

In [None]:
# build the new column PaymentsForRepurchaseOfStockTotal from all stock repurchase tags
stock_repurchase_tags = [
    'PaymentsForRepurchaseOfCommonStock',
    'PaymentsForRepurchaseOfCommonStockForEmployeeTaxWithholdingObligations',
    'PaymentsForRepurchaseOfConvertiblePreferredStock',
    'PaymentsForRepurchaseOfPreferredStockAndPreferenceStock',
    'PaymentsForRepurchaseOfRedeemableConvertiblePreferredStock',
    'PaymentsForRepurchaseOfRedeemablePreferredStock',
]
cf_pivot_pd_copy = sum_cols_into_new_target(cf_pivot_pd_copy, 'PaymentsForRepurchaseOfStockTotal', stock_repurchase_tags)

### Save

In [None]:
# export the key columns and the completed cash flow columns as CSV,
# before the companies with incomplete reports are removed
cf_pivot_pd_copy[["cik","ticker", "adsh","period","form", "qtrs","fp",
                                     'CashAndCashEquivalentsPeriodIncreaseDecrease',
                                     'NetIncomeLoss', 
                                     'ProfitLoss', 
                                     'NetCashProvidedByUsedInOperatingActivities', 
                                     'NetCashProvidedByUsedInInvestingActivities',
                                     'NetCashProvidedByUsedInFinancingActivities',
                                     'PaymentsOfDividendsTotal',
                                     'PaymentsForRepurchaseOfStockTotal']] \
                  .to_csv(all_processed_folder + "cf_not_cleaned.csv", index=False)

### Clean empty companies

In [None]:
# Select the key columns and the completed cash flow columns that are checked for completeness.
# Every column name has to be followed by a comma: a '.' is a syntax error and a missing separator
# makes Python concatenate two adjacent names into one non-existent column name.
cf_cols_selected = cf_pivot_pd_copy[["cik","ticker", "adsh","period","form", 
                                     'CashAndCashEquivalentsPeriodIncreaseDecrease',
                                     'NetIncomeLoss', 'ProfitLoss', 
                                     'NetCashProvidedByUsedInOperatingActivities', 
                                     'NetCashProvidedByUsedInInvestingActivities',
                                     'NetCashProvidedByUsedInFinancingActivities',
                                     'PaymentsOfDividendsTotal',
                                     'PaymentsForRepurchaseOfStockTotal']]

In [None]:
# ciks of all companies that have at least one report with a missing value in one of the selected columns
incomplete_ciks = cf_cols_selected.loc[cf_cols_selected.isnull().any(axis=1), 'cik'].unique()

In [None]:
len(incomplete_ciks)

514

In [None]:
# keep only the reports of companies whose reports are all complete;
# the mask is built from the frame that is being filtered, so mask and frame are guaranteed to share one index
cf_cols_cleaned = cf_cols_selected[~cf_cols_selected.cik.isin(incomplete_ciks)]

In [None]:
cf_cols_cleaned.shape

(117038, 11)

In [None]:
cf_cols_cleaned.isnull().sum(axis=1).sum()

0

In [None]:
cf_cols_cleaned.to_csv(all_processed_folder + "cf.csv", index=False)

## 03_IncomeStatement

Gross Margin
- Net Sales
- Cost of Sales
- Gross Margin -> NetSales - CostOfSales

Operating Expenses
- R&D
- Selling, general and admin
- Total op expenses = R&D + Selling, general and admin

- Operating Income = Gross Margin - Total op expenses      -> OperatingIncomeLoss
- other income
- Income before provision for income taxes = operating income + other income

- Provision for income taxes
- Net income = Income before taxes -taxes                   -> NetIncomeLoss -> also available in CF(!)

Earning per share
- Basic
- Diluted

Shares used in computing earnings per share:
- basic
- diluted


In [None]:
is_pivot_value = load_data("IS", "value")
spark_shape(is_pivot_value)

(116298, 1927)

In [None]:
# is_empty_count = get_empty_count(is_pivot_value)

In [None]:
is_pivot_pd = is_pivot_value.toPandas()

In [None]:
# Create a column that contains the number of not-null value columns of each row.
# Only non-key columns are counted. The key columns (pivot_group) are excluded by name instead of
# subtracting their number, so a row with an empty key column is not undercounted, and a value_count
# column left over from a previous run of this cell is ignored, so re-running the cell gives the same result.
_value_columns = [c for c in is_pivot_pd.columns if c not in pivot_group and c != 'value_count']
is_pivot_pd['value_count'] = is_pivot_pd.notnull()[_value_columns].sum(axis=1)

In [None]:
is_pivot_pd.shape

(116298, 1928)

In [None]:
is_pivot_pd_copy = is_pivot_pd.copy()

In [None]:
# merge relevant columns from the UN dataset
is_pivot_pd_copy = prepare_un_values(is_pivot_pd_copy, [
    'NetIncomeLoss',
    'NetIncomeLossAvailableToCommonStockholdersBasic',
    'NetIncomeLossAllocatedToLimitedPartners',
    'ProfitLoss',
    'Revenues',
    'SalesRevenueNet',
    'RevenueFromContractWithCustomerExcludingAssessedTax', 
    'RevenueFromContractWithCustomerIncludingAssessedTax', 
    'CostOfGoodsAndServicesSold',
    'CostOfGoodsSold',
    'CostOfRevenue',
    'CostOfServices',
    'CostsAndExpenses',
    'OperatingIncomeLoss',
    'IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments',
    'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest',
    
    'GrossProfit',
])
is_pivot_pd_copy.shape

(116298, 1945)

In [None]:
# If fewer than 5 columns contain values, it is likely that this is not a complete statement.
# Often this indicates that the real information is inside the ComprehensiveIncome statement and not in an IncomeStatement.
# The explicit copy makes the filtered frame independent of the unfiltered one, so the columns that are
# added to it in the following cells are assigned to a real frame and not to a slice (no SettingWithCopyWarning).
is_pivot_pd_copy = is_pivot_pd_copy[is_pivot_pd_copy['value_count'] > 4].copy()
is_pivot_pd_copy.shape

(115899, 1945)

#### shares

In [None]:
print_null_count(is_pivot_pd_copy, [ 'EarningsPerShareBasic',
 'EarningsPerShareBasicAndDiluted',
 'EarningsPerShareDiluted',
 'EarningsPerShareBasicDistributed',   
 'EarningsPerShareDilutedDistributed'])

EarningsPerShareBasic   28706
EarningsPerShareBasicAndDiluted   94567
EarningsPerShareDiluted   29540
EarningsPerShareBasicDistributed   115882
EarningsPerShareDilutedDistributed   115882


In [None]:
# EarningsPerShare_hj starts empty and is offered the earnings-per-share tags in the order listed
is_pivot_pd_copy['EarningsPerShare_hj'] = None
for eps_source in ('EarningsPerShareBasic',
                   'EarningsPerShareBasicAndDiluted',
                   'EarningsPerShareBasicDistributed',
                   'EarningsPerShareDiluted',
                   'EarningsPerShareDilutedDistributed'):
    copy_if_not_empty(is_pivot_pd_copy, eps_source, 'EarningsPerShare_hj')

In [None]:
print_null_count(is_pivot_pd_copy, ['EarningsPerShare_hj'])

EarningsPerShare_hj   7138


In [None]:
print_null_count(is_pivot_pd_copy, ['WeightedAverageNumberOfSharesOutstandingBasic','WeightedAverageNumberOfDilutedSharesOutstanding'])

WeightedAverageNumberOfSharesOutstandingBasic   42151
WeightedAverageNumberOfDilutedSharesOutstanding   42378


In [None]:
# SharesOutstanding_hj starts empty and is offered the basic share count first, then the diluted one
is_pivot_pd_copy['SharesOutstanding_hj'] = None
for shares_source in ('WeightedAverageNumberOfSharesOutstandingBasic',
                      'WeightedAverageNumberOfDilutedSharesOutstanding'):
    copy_if_not_empty(is_pivot_pd_copy, shares_source, 'SharesOutstanding_hj')

In [None]:
print_null_count(is_pivot_pd_copy, ['SharesOutstanding_hj'])

SharesOutstanding_hj   40947


#### NetIncome

In [None]:
print_null_count(is_pivot_pd_copy, [ 'NetIncomeLoss', 'NetIncomeLossAvailableToCommonStockholdersBasic', 'ProfitLoss'])

NetIncomeLoss   15045
NetIncomeLossAvailableToCommonStockholdersBasic   91896
ProfitLoss   76463


In [None]:
is_pivot_pd_copy['NetIncomeLoss_hj'] = None

net_income_tags = ('NetIncomeLoss',
                   'NetIncomeLossAvailableToCommonStockholdersBasic',
                   'NetIncomeLossAllocatedToLimitedPartners',
                   'ProfitLoss')

# step 1: complete the tags of the income statement with the values found in the joined UN data (cpy_ columns)
for net_income_tag in net_income_tags:
    copy_if_not_empty(is_pivot_pd_copy, 'cpy_' + net_income_tag, net_income_tag)

# step 2: offer the tags to NetIncomeLoss_hj in the order listed above
for net_income_tag in net_income_tags:
    copy_if_not_empty(is_pivot_pd_copy, net_income_tag, 'NetIncomeLoss_hj')

In [None]:
print_null_count(is_pivot_pd_copy, [ 'NetIncomeLoss_hj', 'NetIncomeLoss', 'ProfitLoss'])

NetIncomeLoss_hj   621
NetIncomeLoss   15026
ProfitLoss   76441


#### NetSales / Revenues

In [None]:
print_null_count(is_pivot_pd_copy, [ 
    'Revenues',
    'SalesRevenueNet',
    'RevenueFromContractWithCustomerExcludingAssessedTax', # Sales
    'RevenueFromContractWithCustomerIncludingAssessedTax', # Sales
])

Revenues   72861
SalesRevenueNet   85708
RevenueFromContractWithCustomerExcludingAssessedTax   105818
RevenueFromContractWithCustomerIncludingAssessedTax   112696


In [None]:
is_pivot_pd_copy['Revenues_hj'] = None

# step 1: complete the revenue tags of the income statement with the values found in the joined UN data (cpy_ columns)
for revenue_tag in ('Revenues',
                    'SalesRevenueNet',
                    'RevenueFromContractWithCustomerExcludingAssessedTax',
                    'RevenueFromContractWithCustomerIncludingAssessedTax'):
    copy_if_not_empty(is_pivot_pd_copy, 'cpy_' + revenue_tag, revenue_tag)
# RevenuesExcludingInterestAndDividends is not part of the columns merged from the UN data,
# so there is no cpy_ column to complete it from

# step 2: offer the candidate tags to Revenues_hj in the order listed
for revenue_source in ('Revenues',
                       'SalesRevenueNet',
                       'RevenueFromContractWithCustomerExcludingAssessedTax',
                       'RevenueFromContractWithCustomerIncludingAssessedTax',
                       'RevenuesExcludingInterestAndDividends',
                       'RegulatedAndUnregulatedOperatingRevenue'):
    copy_if_not_empty(is_pivot_pd_copy, revenue_source, 'Revenues_hj')

In [None]:
# some companies provide NonInterestIncome and InterestAndDividendIncomeOperating instead of a Revenue
sum_into_empty_target(is_pivot_pd_copy,  
                      'InterestAndDividendIncomeOperating',
                      'NoninterestIncome',
                      'Revenues_hj')

sum_into_empty_target(is_pivot_pd_copy,  
                      'InterestIncomeExpenseNet',
                      'NoninterestIncome',
                      'Revenues_hj')

In [None]:
print_null_count(is_pivot_pd_copy, [ 'Revenues_hj'])

Revenues_hj   19673


#### CostOfSales

In [None]:
print_null_count(is_pivot_pd_copy, [ 
    'CostOfGoodsAndServicesSold',
    'CostOfGoodsSold',
    'CostOfRevenue',
    'CostOfServices',
])

CostOfGoodsAndServicesSold   92834
CostOfGoodsSold   92088
CostOfRevenue   97276
CostOfServices   106588


In [None]:
is_pivot_pd_copy['CostOfRevenue_hj'] = None

# step 1: complete the cost tags of the income statement with the values found in the joined UN data (cpy_ columns)
for cost_tag in ('CostOfGoodsAndServicesSold',
                 'CostOfGoodsSold',
                 'CostOfRevenue',
                 'CostOfServices'):
    copy_if_not_empty(is_pivot_pd_copy, 'cpy_' + cost_tag, cost_tag)

# step 2: the tags that cover the whole cost of revenue are offered first
for total_cost_source in ('CostOfRevenue', 'CostOfGoodsAndServicesSold'):
    copy_if_not_empty(is_pivot_pd_copy, total_cost_source, 'CostOfRevenue_hj')

# step 3: otherwise combine goods and services ...
sum_into_empty_target(is_pivot_pd_copy, 'CostOfGoodsSold', 'CostOfServices', 'CostOfRevenue_hj')

# step 4: ... or use one of the two on its own
for partial_cost_source in ('CostOfGoodsSold', 'CostOfServices'):
    copy_if_not_empty(is_pivot_pd_copy, partial_cost_source, 'CostOfRevenue_hj')

In [None]:
print_null_count(is_pivot_pd_copy, ['CostOfRevenue_hj'])

CostOfRevenue_hj   52385


#### OperatingIncomeLoss

In [None]:
print_null_count(is_pivot_pd_copy, ['OperatingIncomeLoss',
                                   'IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments',
                                   'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest'])

OperatingIncomeLoss   28924
IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments   59817
IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest   70434


In [None]:
is_pivot_pd_copy['OperatingIncomeLoss_hj'] = None

operating_income_tags = (
    'OperatingIncomeLoss',
    'IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments',
    'IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest',
)

# step 1: complete the tags of the income statement with the values found in the joined UN data (cpy_ columns)
for operating_income_tag in operating_income_tags:
    copy_if_not_empty(is_pivot_pd_copy, 'cpy_' + operating_income_tag, operating_income_tag)

# step 2: offer the tags to OperatingIncomeLoss_hj in the order listed above
for operating_income_tag in operating_income_tags:
    copy_if_not_empty(is_pivot_pd_copy, operating_income_tag, 'OperatingIncomeLoss_hj')

In [None]:
print_null_count(is_pivot_pd_copy, ['OperatingIncomeLoss_hj'])

OperatingIncomeLoss_hj   4073


#### Other

In [None]:
copy_if_not_empty(is_pivot_pd_copy, 'cpy_CostsAndExpenses', 'CostsAndExpenses')

#### Save

In [None]:
# export the key columns and the completed income statement columns as CSV;
# the *_hj columns are the consolidated columns built in the cells above
is_pivot_pd_copy[["cik","ticker", "adsh","period","form", "qtrs","fp",
                    'Revenues_hj',
                    'CostOfRevenue_hj',
                    'OperatingIncomeLoss_hj',
                    'CostsAndExpenses',
                    'NetIncomeLoss_hj', 'NetIncomeLoss', 'ProfitLoss',
                    'SharesOutstanding_hj',
                    'EarningsPerShare_hj'
                 ]] \
                  .to_csv(all_processed_folder + "is_not_cleaned.csv", index=False)

# xx_trials

In [None]:
# Scratch checks: how many filings exist per form type and how the `qtrs`
# values are distributed. Only the last statement is active; the numbers in the
# trailing comments are results noted from earlier runs.
# index = is_pivot_pd_copy.form == '10-K'
# print('10-Ks', len(pd.unique(is_pivot_pd_copy[index].adsh))) # 32283

#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '4')].count() # 32188
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '0')].count() # 96

# len(pd.unique(is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') &
#                                (is_pivot_pd_copy.fp == 'FY') &
#                                (is_pivot_pd_copy.qtrs.isin(['0','4']) )].adsh) )# 32207

# index = is_pivot_pd_copy.form == '10-Q'
# print('10-Qs', len(pd.unique(is_pivot_pd_copy[index].adsh))) # 101521
# NOTE(review): the same figure 101378 is noted for qtrs '1', '2' and '3';
# it was probably only measured for '1' - confirm before relying on it.
# is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '1')].count() # 101378
# is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '2')].count() # 101378
# is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '3')].count() # 101378
# Non-null count per column for 10-Q rows with qtrs == '4'.
is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-Q') & (is_pivot_pd_copy.qtrs == '4')].count() # recorded output: 200 for the key columns (a previous note here said 101378)

cik                                                                        200
ticker                                                                     200
adsh                                                                       200
form                                                                       200
period                                                                     200
                                                                          ... 
WeightedAverageNumberOfSharesRestrictedStock                                 0
WeightedAverageNumberOfSharesTreasuryStock                                   0
WeightedAverageNumerDilutedLimitedPartnershipUnitsOutstandingAdjustment      0
WellServiceExpense                                                           0
WriteOffOfDeferredDebtIssuanceCost                                           0
Length: 2328, dtype: int64

In [None]:
pd.set_option('display.max_rows', 40)

# Rows that have neither a combined operating income nor a pre-tax income from
# continuing operations.
#
# Earlier variants of this exploration filtered on (and displayed) other tag
# groups instead; they are listed here for reference:
#   costs:      CostsAndExpenses, CostOfGoodsAndServicesSold, CostOfGoodsSold,
#               CostOfRevenue, CostOfServices, GrossProfit
#   per share:  EarningsPerShareBasic, WeightedAverageNumberOfSharesOutstandingBasic
#   net income: NetIncomeLoss, ProfitLoss,
#               NetIncomeLossAvailableToCommonStockholdersBasic
#   revenue:    Revenues, SalesRevenueNet,
#               RevenueFromContractWithCustomerExcludingAssessedTax,
#               RevenueFromContractWithCustomerIncludingAssessedTax,
#               InterestAndDividendIncomeOperating, NoninterestIncome,
#               RevenuesExcludingInterestAndDividends,
#               RegulatedAndUnregulatedOperatingRevenue,
#               OperatingLeasesIncomeStatementLeaseRevenue,
#               RevenueFromCollaborativeArrangementExcludingRevenueFromContractWithCustomer,
#               RevenueNotFromContractWithCustomer,
#               RevenueNotFromContractWithCustomerExcludingInterestIncome,
#               RevenueNotFromContractWithCustomerOther,
#               RevenuesFromExternalCustomers
no_operating_income = (
    is_pivot_pd_copy.OperatingIncomeLoss_hj.isnull()
    & is_pivot_pd_copy.IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments.isnull()
)

# Identifying columns followed by the tags under inspection.
shown_columns = [
    "cik", "ticker", "adsh", "period", "form", "fp", "qtrs", "value_count",
    'OperatingIncomeLoss',
    'IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments',
    'CostsAndExpenses',
]

is_pivot_pd_copy[no_operating_income][shown_columns].sort_values(by=['period'])

Unnamed: 0,cik,ticker,adsh,period,form,fp,qtrs,value_count,OperatingIncomeLoss,IncomeLossFromContinuingOperationsBeforeIncomeTaxesMinorityInterestAndIncomeLossFromEquityMethodInvestments,CostsAndExpenses
31205,798359,IRET,0000798359-12-000015,2012-01-31,10-Q,Q3,1,27,,,4.226400e+07
74534,65172,MSB,0001104659-12-025296,2012-01-31,10-K,FY,4,8,,,
58282,1328598,FXE,0001193125-12-110009,2012-01-31,10-Q,Q1,1,8,,,
71016,14177,BRID,0001437749-12-001969,2012-01-31,10-Q,Q1,1,8,,,
58391,799850,CRMT,0001171843-12-000715,2012-01-31,10-Q,Q3,1,18,,,9.362500e+07
...,...,...,...,...,...,...,...,...,...,...,...
49394,799850,CRMT,0001171843-20-006280,2020-07-31,10-Q,Q1,1,19,,,1.623720e+08
81583,65172,MSB,0001558370-20-011056,2020-07-31,10-Q,Q2,1,7,,,
88069,808450,NAV,0000808450-20-000098,2020-07-31,10-Q,Q3,1,19,,,1.703000e+09
100464,72333,JWN,0000072333-20-000195,2020-07-31,10-Q,Q2,1,9,,,


In [None]:
# Show one filing, keeping only the columns that actually hold a value.
is_pivot_pd_copy.loc[is_pivot_pd_copy.adsh == "0000082166-20-000130"].dropna(axis='columns', how='all')

Unnamed: 0,cik,ticker,adsh,form,period,fp,qtrs,AntidilutiveSecuritiesExcludedFromComputationOfEarningsPerShareAmount,NetIncomeLoss,cpy_NetIncomeLoss
19921,82166,RAVN,0000082166-20-000130,10-Q,2020-07-31,Q2,1,321613.0,5819000.0,5819000.0


In [None]:
#is_pivot_pd_copy[(is_pivot_pd_copy.qtrs == '4') & (is_pivot_pd_copy.fp != 'FY')]
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '4')].count() # 32188
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY')].count() #54080
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') ].count() #54100
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp != 'FY')].count() #20
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '0')].count() # 96
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '1')].count() # 21363
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '2')].count() # 69
#is_pivot_pd_copy[(is_pivot_pd_copy.form == '10-K') & (is_pivot_pd_copy.fp == 'FY') & (is_pivot_pd_copy.qtrs == '3')].count() # 56

cik                                                                        21363
ticker                                                                     21363
adsh                                                                       21363
form                                                                       21363
period                                                                     21363
                                                                           ...  
WeightedAverageNumberOfSharesRestrictedStock                                   0
WeightedAverageNumberOfSharesTreasuryStock                                     0
WeightedAverageNumerDilutedLimitedPartnershipUnitsOutstandingAdjustment        0
WellServiceExpense                                                             1
WriteOffOfDeferredDebtIssuanceCost                                            13
Length: 2328, dtype: int64

In [None]:
# Non-null cells per row of one filing, minus the number of grouping columns.
is_pivot_pd_copy.loc[is_pivot_pd_copy.adsh == '0001401521-20-000018'].notna().sum(axis='columns').sub(len(pivot_group))

128918    7
dtype: int64

In [None]:
# Count the rows whose period covers 0 or more than 4 quarters and that hold no
# value apart from the identifying columns.
#is_pivot_pd_copy[is_pivot_pd_copy.adsh == '0001401521-20-000018'].isnull().sum(axis=1)

# `qtrs` is stored as text. Comparing it with `> '4'` is lexicographic and
# misses values such as '10' or '12', so compare numerically instead.
qtrs_numeric = pd.to_numeric(is_pivot_pd_copy.qtrs)
null_counts = is_pivot_pd_copy[(qtrs_numeric == 0) | (qtrs_numeric > 4)].isnull().sum(axis=1)

# A row counts as empty when every column except the identifying ones is null.
# Derived from the frame width instead of the former hard-coded 2320.
# NOTE(review): 8 is presumably the 7 pivot_group columns plus value_count - confirm.
n_identifying_columns = 8
selection = null_counts == (is_pivot_pd_copy.shape[1] - n_identifying_columns)
selection.sum()

1556

In [None]:
# Bring the Spark result `cf_empty_count` (computed outside this section) into pandas.
cf_empty_pd = cf_empty_count.toPandas()

In [None]:
# Dimensions of the converted frame; the recorded run shows (1, 3052).
cf_empty_pd.shape

(1, 3052)

In [None]:
# Turn the wide frame into one (Tag, Count) row per column and add `diff`,
# the gap between 133811 and each count.
# NOTE(review): 133811 is presumably the total number of cash-flow reports - confirm.
cf_melt_pd = (
    cf_empty_pd
    .melt(var_name='Tag', value_name="Count")
    .assign(diff=lambda melted: 133811 - melted['Count'])
)

In [None]:
# Tags that may all describe the net change in cash during the period.
canditates = [
    'CashAndCashEquivalentsPeriodIncreaseDecrease',
    'cpy_CashAndCashEquivalentsPeriodIncreaseDecrease',
    'CashAndCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseExcludingExchangeRateEffect',
    'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect',
    'CashPeriodIncreaseDecrease',
    'CashPeriodIncreaseDecreaseExcludingExchangeRateEffect',
]

# Show the counts for exactly these tags.
cf_melt_pd.loc[cf_melt_pd['Tag'].isin(canditates)]

Unnamed: 0,Tag,Count,diff
350,CashAndCashEquivalentsPeriodIncreaseDecrease,30244,103567
351,CashAndCashEquivalentsPeriodIncreaseDecreaseEx...,132424,1387
360,CashCashEquivalentsRestrictedCashAndRestricted...,130603,3208
361,CashCashEquivalentsRestrictedCashAndRestricted...,119033,14778
382,CashPeriodIncreaseDecrease,132236,1575
383,CashPeriodIncreaseDecreaseExcludingExchangeRat...,133766,45


In [None]:
# Bar chart of the 100 tags with the smallest `Count`.
# The frame is no longer stored under the name `sorted`: that shadowed the
# builtin `sorted()` for every cell executed afterwards. The in-place index
# reset is replaced by a chained call so re-running the cell is side-effect free.
cf_lowest_count_pd = (
    cf_melt_pd
    .sort_values('Count', ascending=True)
    .head(100)
    .reset_index(drop=True)
)
cf_lowest_count_pd.plot.bar(x='Tag', y='Count', figsize=(15, 10))

In [None]:
# NOTE(review): get_empty_count and bs_pivot_value are defined outside this
# section; presumably this yields the number of empty entries per column - confirm.
empty_count = get_empty_count(bs_pivot_value)

In [None]:
# Bring the Spark result into pandas for further inspection.
empty_pd = empty_count.toPandas()

In [None]:
# Reshape the wide frame into long format: one row per original column, with
# the column name in `Tag` and its value in `Count`.
melt_pd = empty_pd.melt(var_name = 'Tag', value_name = "Count")
# Generic pd.melt usage example, kept for reference:
# df2 = pd.melt(df, id_vars=["location", "name"], var_name="Date", value_name="Value")

In [None]:
# Confirm the column names produced by the melt.
melt_pd.columns

Index(['Tag', 'Count'], dtype='object')

In [None]:
# Pull the income-statement rows (stmt 'IS', qtrs '1') of a single filing into
# pandas for inspection. The commented lines select other filings without the
# statement/qtrs restriction.
pd_frame = df_all_selected.where("adsh == '0001564590-20-043606' and stmt == 'IS' and qtrs=='1'").toPandas()
#pd_frame = df_all_selected.where("adsh == '0001628279-20-000210'").toPandas()
#pd_frame = df_all_selected.where("adsh == '0001193125-20-213555'").toPandas()
#print(pd_frame.sort_values(['report', 'line']))

In [None]:
# Raise the display limit so that every row of the filing is rendered.
pd.set_option('display.max_rows', len(pd_frame) + 1)

# Show the relevant columns in the order the lines appear in the report.
shown_fields = ['fp', 'cik', 'tag', 'value', 'stmt', 'report', 'line', 'period', 'qtrs']
pd_frame[shown_fields].sort_values(by=['report', 'qtrs', 'line'])

Unnamed: 0,fp,cik,tag,value,stmt,report,line,period,qtrs
14,Q1,1048911,RevenueFromContractWithCustomerExcludingAssess...,19321000000.0,IS,4,1,2020-08-31,1
8,Q1,1048911,LaborAndRelatedExpense,6852000000.0,IS,4,3,2020-08-31,1
11,Q1,1048911,DepreciationDepletionAndAmortization,926000000.0,IS,4,6,2020-08-31,1
9,Q1,1048911,FuelCosts,565000000.0,IS,4,7,2020-08-31,1
15,Q1,1048911,CostOfPropertyRepairsAndMaintenance,806000000.0,IS,4,8,2020-08-31,1
4,Q1,1048911,OtherCostAndExpenseOperating,2669000000.0,IS,4,9,2020-08-31,1
1,Q1,1048911,CostsAndExpenses,17731000000.0,IS,4,10,2020-08-31,1
17,Q1,1048911,OperatingIncomeLoss,1590000000.0,IS,4,11,2020-08-31,1
7,Q1,1048911,InterestIncomeExpenseNonoperatingNet,-184000000.0,IS,4,13,2020-08-31,1
6,Q1,1048911,NetPeriodicDefinedBenefitsExpenseReversalOfExp...,-201000000.0,IS,4,14,2020-08-31,1


In [None]:
# Scratch notes: tag names collected while exploring the cash positions.
# They were bare identifiers in a code cell, which raises a NameError when the
# notebook is run top to bottom, so they are kept as comments instead.
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect
#
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents
# gaap_CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents

In [None]:
# Print the rows of a single filing from `bs_pivot_report` (defined outside this section).
bs_pivot_report.where("adsh == '0001492298-20-000025'").show()

In [None]:
# Number of distinct (adsh, stmt) combinations, i.e. statements across all filings.
df_all_selected.select('adsh','stmt').distinct().count()

585411

In [None]:
# Number of filings that contain a 'BS' statement.
df_all_selected.select('adsh', 'stmt').filter('stmt = "BS"').distinct().count()

133872

In [None]:
# Number of filings that contain an 'EQ' statement.
df_all_selected.select('adsh', 'stmt').filter('stmt = "EQ"').distinct().count()

86120

In [None]:
# Cost tags to look for in the column list (scratch notes). They used to follow
# the expression as bare names, which would raise a NameError and prevented the
# list from being the cell's displayed value:
#   CostOfGoodsSold
#   CostOfRevenue
#   CostOfServices
cf_pivot_pd_copy.columns.tolist()

In [None]:
# All column names that contain 'CostOf'.
[tag for tag in is_pivot_pd_copy.columns if 'CostOf' in tag]
# Alternative search for customer-revenue tags:
#[x for x in is_pivot_pd_copy.columns.values if ('Revenue' in x) and ('Customer' in x)]

['CompensationExpenseExcludingCostOfGoodAndServiceSold',
 'CostOfChemicals',
 'CostOfCoalProductsAndServices',
 'CostOfDomesticRegulatedElectric',
 'CostOfDomesticRegulatedGasRevenue',
 'CostOfGoldProductsAndServices',
 'CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization',
 'CostOfGoodsAndServicesEnergyCommoditiesAndServices',
 'CostOfGoodsAndServicesSold',
 'CostOfGoodsAndServicesSoldAmortization',
 'CostOfGoodsAndServicesSoldDepreciation',
 'CostOfGoodsAndServicesSoldDepreciationAndAmortization',
 'CostOfGoodsAndServicesSoldOverhead',
 'CostOfGoodsSold',
 'CostOfGoodsSoldAmortization',
 'CostOfGoodsSoldDepreciation',
 'CostOfGoodsSoldDepreciationAndAmortization',
 'CostOfGoodsSoldDepreciationDepletionAndAmortization',
 'CostOfGoodsSoldDirectLabor',
 'CostOfGoodsSoldDirectMaterials',
 'CostOfGoodsSoldElectric',
 'CostOfGoodsSoldExcludingDepreciationDepletionAndAmortization',
 'CostOfGoodsSoldOilAndGas',
 'CostOfGoodsSoldOverhead',
 'CostOfGoodsSoldSalesTypeLease',
 'Co

In [None]:
# All column names that begin with 'StockholdersEquity'.
[tag for tag in bs_pivot_liabilities_copy.columns if tag.startswith('StockholdersEquity')]

['StockholdersEquity',
 'StockholdersEquityAttributableToParentNotAllowableForNetCapital',
 'StockholdersEquityBeforeTreasuryStock',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterestAdjustedBalance1',
 'StockholdersEquityNoteStockSplitConversionRatio',
 'StockholdersEquityNoteStockSplitConversionRatio1',
 'StockholdersEquityNoteSubscriptionsReceivable']

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()