In [None]:
# default_exp complete

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# makes sure that the core library is reloaded whenever it is changed
%load_ext autoreload
%autoreload 2

# Complete

## Basic Settings

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import * 
from pathlib import Path
from typing import List, Tuple, Union, Set

import pandas as pd

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [None]:
# folder from which load_data reads the "04_<stmt>_unified.csv" files
# NOTE(review): absolute, machine-specific paths - adapt them before running on another machine
all_processed_folder       = "D:/data_mt/04_unified/"
# folder into which the "05_<stmt>_completed.csv" files are written
all_completed_folder       = "D:/data_mt/05_completed/"

# folder (under git control) that receives a copy of every completed csv file
all_data_local_folder = "./data/"

# columns that count_null excludes from the value columns
pivot_group = ["cik","ticker","adsh","period","filed","form","qtrs","fp"]
# helper columns that count_null excludes as well and that are dropped again before saving
analyze_fields = ["value_count","null_count","difference"]

## 00_Tools

In [None]:
def count_null(df: pd.DataFrame):
    """
    add the helper columns 'value_count' and 'null_count' to df (in place) and
    print the number of rows that have at least one missing value.

    value columns are all columns of df that are neither listed in pivot_group
    nor in analyze_fields; for every row 'value_count' holds the number of
    non-null and 'null_count' the number of null entries among them.
    """
    excluded = set(pivot_group) | set(analyze_fields)
    # a list is required here: indexing a DataFrame with a set raises a TypeError in pandas >= 2.0
    value_cols = [col for col in df.columns if col not in excluded]

    df['value_count'] = df[value_cols].notnull().sum(axis=1)
    df['null_count'] = df[value_cols].isnull().sum(axis=1)

    print("lines with missing values: ", df[df.null_count > 0].shape[0])

In [None]:
def load_data(stmt:str) -> pd.DataFrame:
    """
    read the file "04_<stmt>_unified.csv" from all_processed_folder and
    convert its 'period' and 'filed' columns with pd.to_datetime
    """
    file_name = "04_" + stmt + "_unified.csv"
    frame = pd.read_csv(all_processed_folder + file_name)

    for date_col in ("period", "filed"):
        frame[date_col] = pd.to_datetime(frame[date_col])

    return frame

In [None]:
def check_number_of_reports(df: pd.DataFrame):
    """
    print whether every row of df belongs to a different report, i.e. whether
    the number of distinct adsh values equals the number of rows
    """
    distinct_reports = df.adsh.nunique(dropna=False)
    total_rows = len(df)
    print("is there just one report per ADSH: ", distinct_reports == total_rows)

In [None]:
def missing_value_report(df: pd.DataFrame):
    """
    print the number of rows with at least one missing value and plot how many
    rows exist per number of missing values.
    expects the 'null_count' column that count_null adds to df.
    """
    has_missing = df.null_count > 0
    incomplete_rows = df[has_missing]
    print(len(incomplete_rows))
    sns.catplot(data=incomplete_rows, x='null_count', kind='count')

## 01_Complete BS

### load data

In [None]:
# load the unified "bs" data and report how many rows contain missing values
bs_df = load_data("bs")
count_null(bs_df)

lines with missing values:  1301


In [None]:
# (rows, columns) of the loaded data, including the helper columns added by count_null
bs_df.shape

(116519, 18)

In [None]:
# every adsh must appear only once
check_number_of_reports(bs_df)

is there just one report per ADSH:  True


### complete missing Equity

In [None]:
# complete c_Equity = Assets - Liabilities
# only rows without c_Equity but with both operands are filled
do_set_equity = (
    bs_df['c_Equity'].isna()
    & bs_df['Assets'].notna()
    & bs_df['Liabilities'].notna()
)
equity_operands = bs_df.loc[do_set_equity, ['Assets', 'Liabilities']]
bs_df.loc[do_set_equity, 'c_Equity'] = equity_operands['Assets'] - equity_operands['Liabilities']

In [None]:
# recount the rows with missing values after completing c_Equity
count_null(bs_df)

lines with missing values:  559


### complete missing liabilities

In [None]:
# complete Liabilities = Assets - c_Equity, but only for rows in which none of the
# liability positions (total, current, noncurrent) is reported
do_set_liab = (
    bs_df['Assets'].notna()
    & bs_df['c_Equity'].notna()
    & bs_df['Liabilities'].isna()
    & bs_df['LiabilitiesCurrent'].isna()
    & bs_df['LiabilitiesNoncurrent'].isna()
)
derived_liabilities = bs_df.loc[do_set_liab, 'Assets'] - bs_df.loc[do_set_liab, 'c_Equity']
bs_df.loc[do_set_liab, 'Liabilities'] = derived_liabilities
# no current/noncurrent split is known: the whole amount is booked as current
bs_df.loc[do_set_liab, 'LiabilitiesCurrent'] = bs_df.loc[do_set_liab, 'Liabilities']
bs_df.loc[do_set_liab, 'LiabilitiesNoncurrent'] = 0.0

In [None]:
# recount the rows with missing values after completing the liabilities
count_null(bs_df)

lines with missing values:  282


### complete missing assets

In [None]:
# complete Assets = Liabilities + c_Equity, but only for rows in which none of the
# asset positions (total, current, noncurrent) is reported
do_set_assets = (bs_df.Liabilities.notnull()  & bs_df.c_Equity.notnull()  & bs_df.Assets.isnull() & bs_df.AssetsCurrent.isnull() & bs_df.AssetsNoncurrent.isnull())
bs_df.loc[do_set_assets, 'Assets'] = bs_df.loc[do_set_assets, 'Liabilities'] + bs_df.loc[do_set_assets, 'c_Equity']
# no current/noncurrent split is known: the whole derived total is booked as current assets, mirroring
# the liabilities completion (copying the Liabilities value here would break
# AssetsCurrent + AssetsNoncurrent == Assets whenever c_Equity is not zero)
bs_df.loc[do_set_assets, 'AssetsCurrent'] = bs_df.loc[do_set_assets, 'Assets']
bs_df.loc[do_set_assets, 'AssetsNoncurrent'] = 0.0

In [None]:
# recount the rows with missing values after completing the assets
count_null(bs_df)

lines with missing values:  274


### save

In [None]:
# the helper columns added by count_null must not end up in the saved file
fields_to_drop = set(analyze_fields) & set(bs_df.columns)
bs_completed_file = "05_bs_completed.csv"
bs_df.drop(columns=fields_to_drop).to_csv(all_completed_folder + bs_completed_file, index=False)

# make copy to directory under git control
shutil.copy(all_completed_folder + bs_completed_file, all_data_local_folder + bs_completed_file)

'./data/05_bs_completed.csv'

## 02_Complete CF

### load data

In [None]:
# load the unified "cf" data and report how many rows contain missing values
cf_df = load_data("cf")
count_null(cf_df)

lines with missing values:  8631


In [None]:
# (rows, columns) of the loaded data, including the helper columns added by count_null
cf_df.shape

(116187, 16)

In [None]:
# every adsh must appear only once
check_number_of_reports(cf_df)

is there just one report per ADSH:  True


### complete CashIncreaseDecrease

In [None]:
# per definition CashIncreaseDecrease = NetCashProvidedByUsedInOperatingActivities + NetCashProvidedByUsedInInvestingActivities + NetCashProvidedByUsedInFinancingActivities
# only rows without c_CashIncreaseDecrease but with all three cash flows are filled
do_set_cshinc = (
    cf_df['c_CashIncreaseDecrease'].isna()
    & cf_df['c_CashFromOperating'].notna()
    & cf_df['c_CashFromInvesting'].notna()
    & cf_df['c_CashFromFinancing'].notna()
)
cash_from_operating = cf_df.loc[do_set_cshinc, 'c_CashFromOperating']
cash_from_investing = cf_df.loc[do_set_cshinc, 'c_CashFromInvesting']
cash_from_financing = cf_df.loc[do_set_cshinc, 'c_CashFromFinancing']
cf_df.loc[do_set_cshinc, 'c_CashIncreaseDecrease'] = cash_from_operating + cash_from_investing + cash_from_financing

In [None]:
# recount the rows with missing values after completing c_CashIncreaseDecrease
count_null(cf_df)

lines with missing values:  377


### complete c_CashFromOperating

In [None]:
# per definition CashIncreaseDecrease = CashFromOperating + CashFromInvesting + CashFromFinancing
# hence CashFromOperating = CashIncreaseDecrease - (CashFromInvesting + CashFromFinancing)
do_set_netop = (
    cf_df['c_CashFromOperating'].isna()
    & cf_df['c_CashIncreaseDecrease'].notna()
    & cf_df['c_CashFromInvesting'].notna()
    & cf_df['c_CashFromFinancing'].notna()
)
cash_increase = cf_df.loc[do_set_netop, 'c_CashIncreaseDecrease']
other_cash_flows = cf_df.loc[do_set_netop, 'c_CashFromInvesting'] + cf_df.loc[do_set_netop, 'c_CashFromFinancing']
cf_df.loc[do_set_netop, 'c_CashFromOperating'] = cash_increase - other_cash_flows

In [None]:
# recount the rows with missing values after completing c_CashFromOperating
count_null(cf_df)

lines with missing values:  312


### invert values for c_PaymentsOfDividendsTotal and c_PaymentsForRepurchaseOfStockTotal
Since PaymentsOfDividends and PaymentsForRepurchaseOfStock are outflows, they appear as negative numbers. However, it is more intuitive to have them as positive values.

In [None]:
# flip the sign of both payment columns
# attention: running this cell a second time flips the signs back again
for payment_col in ['c_PaymentsOfDividendsTotal', 'c_PaymentsForRepurchaseOfStockTotal']:
    cf_df[payment_col] = -cf_df[payment_col]

### save

In [None]:
# the helper columns added by count_null must not end up in the saved file
fields_to_drop = set(analyze_fields) & set(cf_df.columns)
cf_completed_file = "05_cf_completed.csv"
cf_df.drop(columns=fields_to_drop).to_csv(all_completed_folder + cf_completed_file, index=False)

# make copy to directory under git control
shutil.copy(all_completed_folder + cf_completed_file, all_data_local_folder + cf_completed_file)

'./data/05_cf_completed.csv'

## 03_Complete IS

### load data

In [None]:
# load the unified "is" data and report how many rows contain missing values
is_df = load_data("is")
count_null(is_df)

lines with missing values:  116275


In [None]:
# (rows, columns) of the loaded data, including the helper columns added by count_null
is_df.shape

(116298, 17)

In [None]:
# every adsh must appear only once
check_number_of_reports(is_df)

is there just one report per ADSH:  True


### complete GrossProfit

In [None]:
# per definition Grossprofit = Revenues - CostOfRevenue
# only rows without GrossProfit but with both operands are filled
do_set_grossp = (
    is_df['GrossProfit'].isna()
    & is_df['c_Revenues'].notna()
    & is_df['c_CostOfRevenue'].notna()
)
known_revenues = is_df.loc[do_set_grossp, 'c_Revenues']
known_cost_of_revenue = is_df.loc[do_set_grossp, 'c_CostOfRevenue']
is_df.loc[do_set_grossp, 'GrossProfit'] = known_revenues - known_cost_of_revenue

In [None]:
# per definition: grossprofit - opexpenses = op-income -> grossprofit = opincome + opexpenses
# only rows that still lack GrossProfit but have both operands are filled
do_set_grossp1 = (
    is_df['GrossProfit'].isna()
    & is_df['OperatingExpenses'].notna()
    & is_df['c_OperatingIncomeLoss'].notna()
)
known_op_income = is_df.loc[do_set_grossp1, 'c_OperatingIncomeLoss']
known_op_expenses = is_df.loc[do_set_grossp1, 'OperatingExpenses']
is_df.loc[do_set_grossp1, 'GrossProfit'] = known_op_income + known_op_expenses

### complete Revenues

In [None]:
# per definition Grossprofit = Revenues - CostOfRevenue -> Revenues = Grossprofit + CostOfRevenue
# only rows without c_Revenues but with both operands are filled
do_set_rev = (is_df.c_Revenues.isnull()  & is_df.c_CostOfRevenue.notnull()  & is_df.GrossProfit.notnull())
is_df.loc[do_set_rev, 'c_Revenues'] = is_df.loc[do_set_rev, 'c_CostOfRevenue'] + is_df.loc[do_set_rev, 'GrossProfit']
# NOTE(review): set_to_zero_if_null is not defined in this notebook (presumably it comes from the
# star import of the core module); judging by its name it replaces the c_Revenues values that are
# still null with 0 - confirm against the core module
set_to_zero_if_null(is_df,'c_Revenues')

### complete CostOfRevenue

In [None]:
# per definition Grossprofit = Revenues - CostOfRevenue -> CostOfRevenue = Revenues - Grossprofit
# only rows without c_CostOfRevenue but with both operands are filled
do_set_cstrev = (
    is_df['c_CostOfRevenue'].isna()
    & is_df['c_Revenues'].notna()
    & is_df['GrossProfit'].notna()
)
revenues_for_cost = is_df.loc[do_set_cstrev, 'c_Revenues']
gross_profit_for_cost = is_df.loc[do_set_cstrev, 'GrossProfit']
is_df.loc[do_set_cstrev, 'c_CostOfRevenue'] = revenues_for_cost - gross_profit_for_cost

### complete OperatingIncomeLoss

In [None]:
# per definition: opIncomeLoss = grossprofit - opexpenses
# only rows without c_OperatingIncomeLoss but with both operands are filled
do_set_opinc = (
    is_df['c_OperatingIncomeLoss'].isna()
    & is_df['GrossProfit'].notna()
    & is_df['OperatingExpenses'].notna()
)
gross_profit_for_opinc = is_df.loc[do_set_opinc, 'GrossProfit']
op_expenses_for_opinc = is_df.loc[do_set_opinc, 'OperatingExpenses']
is_df.loc[do_set_opinc, 'c_OperatingIncomeLoss'] = gross_profit_for_opinc - op_expenses_for_opinc

In [None]:
# number of rows that still have no c_OperatingIncomeLoss
(is_df.c_OperatingIncomeLoss.isnull()).sum()

4251

### estimate c_OperatingIncomeLoss based on GrossProfit and NetIncomeLoss

In [None]:
# if OperatingIncomeLoss is not set, we estimate it from its average relative position between
# GrossProfit and NetIncomeLoss

# reference rows: all three values are known. rows where NetIncomeLoss equals GrossProfit are left out,
# since they would cause a division by zero
calc_mean_opinc_avg = (
    is_df['GrossProfit'].notna()
    & is_df['c_NetIncomeLoss'].notna()
    & is_df['c_OperatingIncomeLoss'].notna()
    & (is_df['c_NetIncomeLoss'] != is_df['GrossProfit'])
)
selected_cols = is_df.loc[calc_mean_opinc_avg, ['c_NetIncomeLoss','GrossProfit','c_OperatingIncomeLoss']].copy()

# mean of (GrossProfit - OperatingIncomeLoss) / (GrossProfit - NetIncomeLoss) over the reference rows
gross_minus_opinc = selected_cols['GrossProfit'] - selected_cols['c_OperatingIncomeLoss']
gross_minus_netinc = selected_cols['GrossProfit'] - selected_cols['c_NetIncomeLoss']
avg_opinc = (gross_minus_opinc / gross_minus_netinc).mean()

# rows to estimate: GrossProfit and c_NetIncomeLoss are known, c_OperatingIncomeLoss is missing
do_updated_opinc = (
    is_df['c_OperatingIncomeLoss'].isna()
    & is_df['GrossProfit'].notna()
    & is_df['c_NetIncomeLoss'].notna()
)

gross_profit_known = is_df.loc[do_updated_opinc, 'GrossProfit']
net_income_known = is_df.loc[do_updated_opinc, 'c_NetIncomeLoss']
is_df.loc[do_updated_opinc, 'c_OperatingIncomeLoss'] = gross_profit_known - avg_opinc * (gross_profit_known - net_income_known)

In [None]:
# number of rows that still have no c_OperatingIncomeLoss after the GrossProfit based estimate
(is_df.c_OperatingIncomeLoss.isnull()).sum()

3944

### estimate c_OperatingIncomeLoss based on Revenues and NetIncomeLoss

In [None]:
# if OperatingIncomeLoss is not set, we can try to estimate it based on the average compared to Revenues and NetIncomeLoss

# based on existing c_OperatingIncomeLoss values, we calculate the average position of c_OperatingIncomeLoss
# between Revenues and NetIncomeLoss (in percent)
#  we may only consider entries where NetIncomeLoss isn't equal to Revenues, since this would cause a
#  division by zero: the resulting inf ratio would turn the mean (and with it every estimate) into inf/NaN
calc_mean_opinc_avg = (
    (is_df.c_Revenues > 0.0)
    & is_df.c_NetIncomeLoss.notnull()
    & is_df.c_OperatingIncomeLoss.notnull()
    & (is_df.c_NetIncomeLoss != is_df.c_Revenues)
)
selected_cols = is_df[calc_mean_opinc_avg][['c_NetIncomeLoss','c_Revenues','c_OperatingIncomeLoss']].copy()

# mean of (Revenues - OperatingIncomeLoss) / (Revenues - NetIncomeLoss) over the reference rows
avg_opinc = ((selected_cols.c_Revenues-selected_cols.c_OperatingIncomeLoss)/(selected_cols.c_Revenues-selected_cols.c_NetIncomeLoss)).mean()

# if we have revenue and c_NetIncomeLoss, we try to estimate OperatingIncomeLoss based on the average position
do_updated_opinc = (is_df.c_Revenues.notnull() & is_df.c_OperatingIncomeLoss.isnull() & is_df.c_NetIncomeLoss.notnull())

is_df.loc[do_updated_opinc,'c_OperatingIncomeLoss'] = is_df.loc[do_updated_opinc,'c_Revenues'] \
                                    - avg_opinc * (is_df.loc[do_updated_opinc,'c_Revenues'] - is_df.loc[do_updated_opinc,'c_NetIncomeLoss'])

In [None]:
# number of rows that still have no c_OperatingIncomeLoss after the Revenues based estimate
(is_df.c_OperatingIncomeLoss.isnull()).sum()

297

### estimate GrossProfit based on Revenue and OperatingIncomeLoss

In [None]:
# if Grossprofit is not set, we can try to estimate it based on the average compared to revenue and operatingIncomeLoss

# based on existing GrossProfit values, we calculate the average position of GrossProfit
# between Revenues and OperatingIncomeLoss (in percent)
#  we may only consider entries where OperatingIncomeLoss isn't equal to Revenues, since this would cause a
#  division by zero: the resulting inf ratio would turn the mean (and with it every estimate) into inf/NaN
calc_mean_gp_selection = (
    (is_df.c_Revenues > 0.0)
    & is_df.GrossProfit.notnull()
    & is_df.c_OperatingIncomeLoss.notnull()
    & (is_df.c_OperatingIncomeLoss != is_df.c_Revenues)
)
selected_cols = is_df[calc_mean_gp_selection][['c_Revenues','GrossProfit','c_OperatingIncomeLoss']].copy()

# mean of (Revenues - GrossProfit) / (Revenues - OperatingIncomeLoss) over the reference rows
avg_gross = ((selected_cols.c_Revenues-selected_cols.GrossProfit)/(selected_cols.c_Revenues-selected_cols.c_OperatingIncomeLoss)).mean()

# if we have revenue and operatingincome, we try to estimate grossprofit based on the average position
do_updated_grossprofit = (is_df.c_Revenues.notnull() & is_df.c_OperatingIncomeLoss.notnull() & is_df.GrossProfit.isnull())

is_df.loc[do_updated_grossprofit,'GrossProfit'] = is_df.loc[do_updated_grossprofit,'c_Revenues'] \
                                    - avg_gross * (is_df.loc[do_updated_grossprofit,'c_Revenues'] - is_df.loc[do_updated_grossprofit,'c_OperatingIncomeLoss'])

In [None]:
# number of rows in which at least one of the four columns that get saved is still missing
(is_df.c_Revenues.isnull() | is_df.GrossProfit.isnull() | is_df.c_OperatingIncomeLoss.isnull() | is_df.c_NetIncomeLoss.isnull()).sum()

879

In [None]:
# columns written to the completed file: the pivot_group columns followed by the four completed value columns
fields = pivot_group + ['c_Revenues','GrossProfit','c_OperatingIncomeLoss','c_NetIncomeLoss']

### save

In [None]:
# only the columns listed in fields are saved
is_completed_file = "05_is_completed.csv"
is_df[fields].to_csv(all_completed_folder + is_completed_file, index=False)

# make copy to directory under git control
shutil.copy(all_completed_folder + is_completed_file, all_data_local_folder + is_completed_file)

'./data/05_is_completed.csv'

## XX_Trials

In [None]:
# exploratory: rows that have revenues, gross profit and net income but no c_OperatingIncomeLoss
is_df[((is_df.c_Revenues.notnull() & is_df.GrossProfit.notnull() & is_df.c_OperatingIncomeLoss.isnull() & is_df.c_NetIncomeLoss.notnull() ))] \
  [['cik','adsh','c_Revenues','GrossProfit','c_OperatingIncomeLoss', 'c_NetIncomeLoss', 'OperatingExpenses','CostsAndExpenses']] #721

In [None]:
# exploratory: rows in which at least one of revenues, gross profit, operating income or net income is missing
is_df[((is_df.c_Revenues.isnull() | is_df.GrossProfit.isnull() | is_df.c_OperatingIncomeLoss.isnull() | is_df.c_NetIncomeLoss.isnull() ))] \
  [['cik','adsh','c_Revenues','GrossProfit','c_OperatingIncomeLoss', 'c_NetIncomeLoss', 'OperatingExpenses','CostsAndExpenses']] #721

In [None]:
# exploratory: rows that report CostsAndExpenses but neither GrossProfit nor c_OperatingIncomeLoss
is_df[((is_df.CostsAndExpenses.notnull() & is_df.GrossProfit.isnull() & is_df.c_OperatingIncomeLoss.isnull()))][['cik','adsh','CostsAndExpenses', 'OperatingExpenses','GrossProfit','c_OperatingIncomeLoss']] #721
#is_df[((is_df.OperatingExpenses.notnull() & is_df.GrossProfit.isnull() & is_df.c_OperatingIncomeLoss.notnull()))][['cik','adsh','OperatingExpenses','GrossProfit','c_OperatingIncomeLoss']] #11618

In [None]:
# exploratory: number of missing values per column
is_df.isnull().sum(axis=0)

In [None]:
# exploratory: number of rows per adsh
is_df.adsh.value_counts()

In [None]:
# exploratory: count and plot the cf rows with missing values, grouped by number of missing values
missing_value_report(cf_df)

In [None]:
# exploratory: same uniqueness check that check_number_of_reports prints
len(bs_df.adsh.unique()) == bs_df.shape[0]

True

In [None]:
# exploratory: column types of the cf data
cf_df.dtypes

In [None]:
# exploratory: number of bs rows with qtrs greater than 0
sum(bs_df.qtrs>0)

0

In [None]:
# exploratory: cf rows with exactly four missing values and how they are spread over the tickers
# (relies on the 'null_count' column that count_null adds)
df = cf_df[(cf_df.null_count == 4)]
print(df.shape[0])
print(df.ticker.value_counts())

In [None]:
# exploratory: column names of the cf data, including the helper columns added by count_null
cf_df.columns

Index(['cik', 'ticker', 'adsh', 'period', 'filed', 'form', 'qtrs', 'fp',
       'c_CashIncreaseDecrease', 'c_CashFromOperating', 'c_CashFromInvesting',
       'c_CashFromFinancing', 'c_PaymentsOfDividendsTotal',
       'c_PaymentsForRepurchaseOfStockTotal', 'value_count', 'null_count'],
      dtype='object')

In [None]:
# exploratory: number of missing values in each of the three cash flow columns
print(cf_df.c_CashFromOperating.isnull().sum())
print(cf_df.c_CashFromInvesting.isnull().sum())
print(cf_df.c_CashFromFinancing.isnull().sum())

312
0
0
