In [1]:
import yaml
import pyodbc
from market_growth_analysis.etl.stagging import *
import pandas as pd

# Parse the local YAML configuration into `config`.
# safe_load builds only plain Python objects (dicts, lists, scalars).
with open('../../conf/local.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [2]:
# Load the three raw financial statements; the first CSV column becomes the index.
# The financial-ratios and prices extracts are not loaded in this notebook.
balance_sheet_df, cash_flow_statement_df, income_statement_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../data/raw_01/balance-sheet.csv",
        "../../data/raw_01/cash-flow-statement.csv",
        "../../data/raw_01/income-statement.csv",
    )
]

In [3]:
# Parse the 'Date' column of every statement into datetime64 so the frames
# can be joined on it and the year can be extracted later.
for statement_df in (balance_sheet_df, cash_flow_statement_df, income_statement_df):
    statement_df['Date'] = pd.to_datetime(statement_df['Date'])


In [4]:
# Combine the three statements into one wide frame. Both joins are inner joins
# on (ticker, Date), so only company/date pairs present in all three survive.
# Non-key columns shared between frames, if any, receive pandas' default
# '_x' / '_y' suffixes.
merged_df = (
    balance_sheet_df
    .merge(cash_flow_statement_df, on=['ticker', 'Date'], how='inner')
    .merge(income_statement_df, on=['ticker', 'Date'], how='inner')
)

# Duplicate (ticker, Year) rows are inspected in the cells that follow.

In [6]:
# Derive the calendar year of each statement date. Together with 'ticker' it is
# the key used by the duplicate check and the de-duplication further down.
merged_df['Year'] = merged_df['Date'].dt.year

In [7]:
# Flag rows whose (ticker, Year) pair occurs more than once.
# keep=False marks every member of a duplicated group, not just the repeats.
duplicated_rows = merged_df.duplicated(subset=['ticker', 'Year'], keep=False)

# Display the key columns of the flagged rows for inspection.
merged_df.loc[duplicated_rows, ['ticker', 'Year', 'Date']]

Unnamed: 0,ticker,Year,Date
2462,INDT,2020,2020-12-31
2463,INDT,2020,2020-11-30
2535,GMRE,2014,2014-12-31
2536,GMRE,2014,2014-08-31
2539,GMRE,2011,2011-12-31
...,...,...,...
54381,CPTN,2021,2021-03-31
54383,CPTN,2019,2019-12-31
54384,CPTN,2019,2019-03-31
54902,MTVC,2021,2021-12-31


In [8]:
# Distinct tickers that have more than one statement in the same calendar year.
merged_df.loc[duplicated_rows, 'ticker'].unique()

array(['INDT', 'GMRE', 'CORR', 'JEF', 'CLBK', 'LC', 'GRNQ', 'LGHL', 'CSR',
       'BRT', 'GYRO', 'AFBI', 'DFS', 'HVBC', 'EBMT', 'HASI', 'CZWI',
       'FNWB', 'SSIC', 'RGEN', 'ALKS', 'VCEL', 'MYGN', 'ME', 'MDXG',
       'AUTL', 'CMPX', 'EYPT', 'XAIR', 'ENOB', 'EIGR', 'APTO', 'HEPA',
       'ORGS', 'SNCE', 'ABVC', 'ELOX', 'EDSA', 'ONVO', 'PULM', 'PTIX',
       'SONN', 'AYTU', 'TENX', 'KTRA', 'ARTL', 'THMO', 'PRGO', 'EGRX',
       'ORMP', 'KERN', 'NVIV', 'NEPT', 'ABCM', 'LIVN', 'KRMD', 'AXDX',
       'EKSO', 'NSPR', 'HSDT', 'IMGN', 'SMMT', 'TARO', 'PROC', 'KALV',
       'CTXR', 'VXRT', 'RVPH', 'RLMD', 'ADMP', 'PTE', 'LBPS', 'AMEH',
       'DMTK', 'BTCY', 'NVOS', 'ACST', 'FCUV', 'UXIN', 'CXDO', 'IRNT',
       'WDAY', 'AZPN', 'ASAN', 'ETWO', 'PL', 'ZFOX', 'PHUN', 'MRDB',
       'VERB', 'TGAN', 'NOW', 'SCWX', 'RNLX', 'LICY', 'MG', 'GTEC', 'AEY',
       'MSCI', 'HEAR', 'CLRO', 'AKTS', 'ASUR', 'CBAT', 'KLR', 'ESTE',
       'TDW', 'CEQP', 'KLXE', 'DWSN', 'RNW', 'CRGY', 'CSAN', 'SEDG',
       '

In [9]:
# Per-row count of missing values across all columns currently in the frame;
# used as a completeness score when choosing between duplicate rows.
merged_df['n_nan'] = merged_df.isnull().sum(axis='columns').astype(int)

In [10]:
# Sort by 'ticker', 'Year' and the count of missing values (all ascending), so
# that within each ticker/Year group the most complete row comes first.
# NOTE(review): rows tied on n_nan have no explicit tie-breaker here; which one
# ends up first depends on the sort's handling of ties - confirm this is acceptable.
df_sorted = merged_df.sort_values(by=['ticker', 'Year', 'n_nan'])

In [11]:
# Keep one row per (ticker, Year): drop every row that repeats a pair already
# seen earlier in df_sorted, i.e. retain only the first occurrence.
df_unique = df_sorted[~df_sorted.duplicated(subset=['ticker', 'Year'], keep='first')]

In [12]:
# Spot-check a single company: after de-duplication there should be one row per year.
df_unique.loc[df_unique['ticker'].eq('NUZE')]

Unnamed: 0,Date,Cash On Hand,Receivables,Inventory,Pre-Paid Expenses,Other Current Assets,Total Current Assets,"Property, Plant, And Equipment",Long-Term Investments,Goodwill And Intangible Assets,...,Basic Shares Outstanding,Shares Outstanding,Basic EPS,EPS - Earnings Per Share,industry,sector,company_full_name,country,Year,n_nan
32850,2010-09-30,,,,,,0.008,,,,...,,,,0.0,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2010,57
32848,2011-09-30,0.0001,0.033,,,,0.0331,,,0.0,...,,,,0.0,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2011,41
32846,2012-09-30,0.1655,,0.0732,0.0149,,0.2537,0.0025,,0.0428,...,0.2898,0.2898,-1.0491,-1.0491,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2012,28
32845,2013-09-30,1.1107,0.1529,,0.0169,,1.2804,0.0087,,,...,0.3328,0.3328,-3.1472,-3.1472,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2013,26
32844,2014-09-30,0.2382,0.0052,0.0509,0.0691,,0.3633,0.0334,,,...,0.2871,0.2871,-10.4906,-10.4906,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2014,26
32843,2015-09-30,0.1077,0.0182,0.2018,0.0215,,0.3492,0.1921,,,...,0.286,0.286,-5.2453,-5.2453,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2015,24
32842,2016-09-30,0.0406,0.0577,0.2064,,0.0657,0.3704,0.1519,,,...,0.2931,0.2931,-4.1962,-4.1962,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2016,25
32841,2017-09-30,0.3473,0.1563,0.2666,,0.1323,0.9025,0.278,0.0107,0.0171,...,0.3063,0.3063,-5.2453,-5.2453,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2017,15
32840,2018-09-30,1.8067,0.1449,0.1349,0.0947,0.0339,2.215,0.6744,,0.0171,...,0.3499,0.3499,-10.1399,-10.1399,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2018,15
32839,2019-09-30,1.326,0.5403,0.501,0.3725,0.0005,2.7403,1.8756,,,...,0.3966,0.3966,-30.7692,-30.7692,Beverages - Soft Drinks,Consumer Staples,Nuzee,United States,2019,16


In [15]:
# Persist the de-duplicated statements to the intermediate data folder.
# to_csv defaults are used, so the row index is written as the first column,
# and the helper columns 'Year' and 'n_nan' are included in the output.
df_unique.to_csv("../../data/intermediate_02/financial-sheets.csv")