In [58]:
import yaml
import pyodbc
from market_growth_analysis.etl.stagging import *
import pandas as pd
import numpy as np

# Load the YAML file
with open('../../conf/global.yml', 'r') as f:
    columns = yaml.safe_load(f)

financial = pd.read_csv("../../data/processed_03/financial-prices-ratios.csv", index_col=0)

# Clean Ratios

In [29]:
columns_ratios = columns['column_id'] + columns['columns_ratios']
ratios_df = financial[columns_ratios]

In [30]:
ratios_df.describe()

Unnamed: 0,PK,Enterprise Value (EV),EV / Revenue,EV / EBITDA,EV / EBIT,EV / Invested Capital,Free Cash Flow (FCF),EV / Free Cash Flow,EV / FCF,P/E,P/S,P/CF
count,56445.0,48095.0,45461.0,43383.0,43380.0,20679.0,54596.0,46968.0,46968.0,48663.0,45703.0,48120.0
mean,28222.0,50332020.0,2093133.0,-16353040.0,-16114150.0,,244.345771,inf,inf,inf,5527528.0,inf
std,16294.412309,8144591000.0,271926700.0,1853450000.0,1852664000.0,,6207.402514,,,,506842900.0,
min,0.0,-9816.426,-16800.51,-276507100000.0,-276382400000.0,-inf,-341822.0,-3643247000000.0,-3643247000000.0,-222353000000.0,-16541.14,-6388385000000.0
25%,14111.0,312.3743,1.379789,-0.7940197,-3.38737,4.735112,-60.766525,-13.46549,-13.46549,-2.913847,0.7663726,-0.6856935
50%,28222.0,1741.0,3.082043,10.13171,12.5765,16.49107,1.759,1.562426,1.562426,9.963503,1.752046,6.753109
75%,42333.0,8255.493,8.737657,16.86291,23.81735,75.53815,146.71675,25.7246,25.7246,20.51324,3.878814,13.8922
max,56444.0,1694839000000.0,52766380000.0,3113391000.0,6005913000.0,inf,287656.0,inf,inf,inf,75867410000.0,inf


In [31]:
# Replace -inf and inf with None
ratios_df = ratios_df.replace([-np.inf, np.inf], None)

# Make sure types are correct
ratios_df[columns['columns_ratios']] = ratios_df[columns['columns_ratios']].astype(float)
ratios_df[columns['column_id']] = ratios_df[columns['column_id']].astype(int)


In [32]:
ratios_df.describe()

Unnamed: 0,PK,Enterprise Value (EV),EV / Revenue,EV / EBITDA,EV / EBIT,EV / Invested Capital,Free Cash Flow (FCF),EV / Free Cash Flow,EV / FCF,P/E,P/S,P/CF
count,56445.0,48095.0,45461.0,43383.0,43380.0,20674.0,54596.0,46949.0,46949.0,48300.0,45703.0,48082.0
mean,28222.0,50332020.0,2093133.0,-16353040.0,-16114150.0,20719.08,244.345771,-83400900.0,-83400900.0,22765160.0,5527528.0,-172379800.0
std,16294.412309,8144591000.0,271926700.0,1853450000.0,1852664000.0,1721492.0,6207.402514,16858370000.0,16858370000.0,7737213000.0,506842900.0,30988740000.0
min,0.0,-9816.426,-16800.51,-276507100000.0,-276382400000.0,-934.6288,-341822.0,-3643247000000.0,-3643247000000.0,-222353000000.0,-16541.14,-6388385000000.0
25%,14111.0,312.3743,1.379789,-0.7940197,-3.38737,4.735097,-60.766525,-13.47164,-13.47164,-2.983694,0.7663726,-0.6945563
50%,28222.0,1741.0,3.082043,10.13171,12.5765,16.48186,1.759,1.525653,1.525653,9.821835,1.752046,6.742112
75%,42333.0,8255.493,8.737657,16.86291,23.81735,75.50369,146.71675,25.68955,25.68955,20.1336,3.878814,13.87185
max,56444.0,1694839000000.0,52766380000.0,3113391000.0,6005913000.0,217460200.0,287656.0,1025684000.0,1025684000.0,1674878000000.0,75867410000.0,546312300000.0


In [34]:
ratios_df.head(2)

Unnamed: 0,PK,Enterprise Value (EV),EV / Revenue,EV / EBITDA,EV / EBIT,EV / Invested Capital,Free Cash Flow (FCF),EV / Free Cash Flow,EV / FCF,P/E,P/S,P/CF
0,0,8148.062046,1.818358,38.985943,173.363022,49.988111,394.0,20.680361,20.680361,-178.004562,1.237015,13.585936
1,1,10215.772131,1.876519,13.301787,18.049067,71.942057,-456.0,-22.403009,-22.403009,11.615858,1.4612,11.07907


# Clean Income Statement

In [35]:
columns_income_statement = columns['column_id'] + columns['columns_income_statement']
income_statememnt_df = financial[columns_income_statement]

In [36]:
income_statememnt_df.describe()

Unnamed: 0,PK,Revenue,Cost Of Goods Sold,Gross Profit,Research And Development Expenses,SG&A Expenses,Other Operating Income Or Expenses,Operating Expenses,Operating Income,Total Non-Operating Income/Expense,...,Other Income,Income From Continuous Operations,Income From Discontinued Operations,Net Income,EBITDA,EBIT,Basic Shares Outstanding,Shares Outstanding,Basic EPS,EPS - Earnings Per Share
count,56445.0,53575.0,48450.0,53608.0,19840.0,54139.0,24819.0,48447.0,56254.0,49880.0,...,1518.0,56331.0,10144.0,56331.0,50180.0,50177.0,56217.0,56187.0,56227.0,56445.0
mean,28222.0,6029.650493,3961.210305,2445.864171,332.435771,1174.326932,-358.81124,1754.973439,722.611888,-96.160226,...,15.017554,484.553173,58.513503,468.086505,1055.045064,680.666596,278.365748,283.410748,-4213.136,-4197.578
std,16294.412309,21781.718669,16572.321726,8284.165102,1598.571693,4643.733376,2703.628418,6059.256523,3119.413928,888.758021,...,182.201979,2333.683881,969.368461,2327.285628,4062.618565,3038.941123,1963.137009,1988.47025,687624.2,686294.9
min,0.0,-8219.0,-13697.0,-41649.44,-18.936,-6231.506,-124834.0,-43812.16,-30363.0,-59155.36,...,-371.41,-23251.0,-8368.0,-22802.0,-24504.4,-30363.0,-2278.226,-2278.226,-159550000.0,-159550000.0
25%,14111.0,111.291,45.355775,55.50625,6.429775,24.999,-75.803,56.924,-2.56075,-60.7,...,0.0,-6.477,-2.0,-7.3705,0.33075,-5.0059,16.127,16.377,-0.36225,-0.36
50%,28222.0,657.077,362.086,283.148,29.902,112.384,-6.781,228.11,51.662,-5.719,...,0.0,28.527,0.0,26.934,99.2,52.3202,47.365,48.288,0.81,0.79
75%,42333.0,3317.35,2047.95,1264.807,107.288,529.45,0.0,902.48905,343.0,0.14,...,0.0,218.5,10.80175,211.1305,571.1999,354.3,148.2,151.0625,2.33,2.28
max,56444.0,611289.0,463721.0,225152.0,73213.0,138428.0,43812.16,212904.0,119437.0,31744.11,...,4044.055,99803.0,76486.91,99803.0,130541.0,119437.0,373456.0,373456.0,208935.5,177151.4


In [37]:
# Make sure types are correct
income_statememnt_df[columns['columns_income_statement']] = income_statememnt_df[columns['columns_income_statement']].astype(float)
income_statememnt_df[columns['column_id']] = income_statememnt_df[columns['column_id']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_statememnt_df[columns['columns_income_statement']] = income_statememnt_df[columns['columns_income_statement']].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_statememnt_df[columns['column_id']] = income_statememnt_df[columns['column_id']].astype(int)


In [38]:
income_statememnt_df.describe()

Unnamed: 0,PK,Revenue,Cost Of Goods Sold,Gross Profit,Research And Development Expenses,SG&A Expenses,Other Operating Income Or Expenses,Operating Expenses,Operating Income,Total Non-Operating Income/Expense,...,Other Income,Income From Continuous Operations,Income From Discontinued Operations,Net Income,EBITDA,EBIT,Basic Shares Outstanding,Shares Outstanding,Basic EPS,EPS - Earnings Per Share
count,56445.0,53575.0,48450.0,53608.0,19840.0,54139.0,24819.0,48447.0,56254.0,49880.0,...,1518.0,56331.0,10144.0,56331.0,50180.0,50177.0,56217.0,56187.0,56227.0,56445.0
mean,28222.0,6029.650493,3961.210305,2445.864171,332.435771,1174.326932,-358.81124,1754.973439,722.611888,-96.160226,...,15.017554,484.553173,58.513503,468.086505,1055.045064,680.666596,278.365748,283.410748,-4213.136,-4197.578
std,16294.412309,21781.718669,16572.321726,8284.165102,1598.571693,4643.733376,2703.628418,6059.256523,3119.413928,888.758021,...,182.201979,2333.683881,969.368461,2327.285628,4062.618565,3038.941123,1963.137009,1988.47025,687624.2,686294.9
min,0.0,-8219.0,-13697.0,-41649.44,-18.936,-6231.506,-124834.0,-43812.16,-30363.0,-59155.36,...,-371.41,-23251.0,-8368.0,-22802.0,-24504.4,-30363.0,-2278.226,-2278.226,-159550000.0,-159550000.0
25%,14111.0,111.291,45.355775,55.50625,6.429775,24.999,-75.803,56.924,-2.56075,-60.7,...,0.0,-6.477,-2.0,-7.3705,0.33075,-5.0059,16.127,16.377,-0.36225,-0.36
50%,28222.0,657.077,362.086,283.148,29.902,112.384,-6.781,228.11,51.662,-5.719,...,0.0,28.527,0.0,26.934,99.2,52.3202,47.365,48.288,0.81,0.79
75%,42333.0,3317.35,2047.95,1264.807,107.288,529.45,0.0,902.48905,343.0,0.14,...,0.0,218.5,10.80175,211.1305,571.1999,354.3,148.2,151.0625,2.33,2.28
max,56444.0,611289.0,463721.0,225152.0,73213.0,138428.0,43812.16,212904.0,119437.0,31744.11,...,4044.055,99803.0,76486.91,99803.0,130541.0,119437.0,373456.0,373456.0,208935.5,177151.4


# Clean balance sheet

In [40]:
columns_balance_sheet = columns['column_id'] + columns['columns_balance_sheet']
balance_sheet_df = financial[columns_balance_sheet]

In [41]:
# Make sure types are correct
balance_sheet_df[columns['columns_balance_sheet']] = balance_sheet_df[columns['columns_balance_sheet']].astype(float)
balance_sheet_df[columns['column_id']] = balance_sheet_df[columns['column_id']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balance_sheet_df[columns['columns_balance_sheet']] = balance_sheet_df[columns['columns_balance_sheet']].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balance_sheet_df[columns['column_id']] = balance_sheet_df[columns['column_id']].astype(int)


# Clean columns_cash_flow_statement

In [42]:
columns_cash_flow_statement = columns['column_id'] + columns['columns_cash_flow_statement']
cash_flow_statement_df = financial[columns_cash_flow_statement]

In [44]:
# Make sure types are correct
cash_flow_statement_df[columns['columns_cash_flow_statement']] = cash_flow_statement_df[columns['columns_cash_flow_statement']].astype(float)
cash_flow_statement_df[columns['column_id']] = cash_flow_statement_df[columns['column_id']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cash_flow_statement_df[columns['columns_cash_flow_statement']] = cash_flow_statement_df[columns['columns_cash_flow_statement']].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cash_flow_statement_df[columns['column_id']] = cash_flow_statement_df[columns['column_id']].astype(int)


# Clean prices

In [47]:
columns_prices = columns['column_id'] + columns['columns_prices']
prices_df = financial[columns_prices]

In [49]:
prices_df.describe()

Unnamed: 0,PK,longevity,Close,Volume,Growth -1,Growth +1,Growth +5,avgGrowth -10,avgGrowth -5
count,56445.0,56445.0,48663.0,48663.0,39001.0,36550.0,22868.0,19952.0,27327.0
mean,28222.0,21.231234,67762780.0,31256160.0,0.17619,0.151453,0.847271,0.2037,0.188114
std,16294.412309,18.208661,8629560000.0,161047100.0,1.78739,1.620428,3.606664,0.446257,0.757403
min,0.0,0.0,-1.205572,0.0,-0.999999,-0.999999,-28.594223,-0.658006,-0.856566
25%,14111.0,6.0,8.72,1097300.0,-0.229299,-0.23353,-0.234413,0.074884,0.013485
50%,28222.0,18.0,19.58838,5773700.0,0.043972,0.038478,0.361355,0.149183,0.125358
75%,42333.0,31.0,43.19,21320000.0,0.331499,0.317357,1.134124,0.24833,0.258335
max,56444.0,120.0,1373400000000.0,11977450000.0,198.999997,198.999997,280.645588,11.436645,56.027803


In [52]:
# Make sure types are correct
prices_df[columns['columns_prices']] = prices_df[columns['columns_prices']].astype(float)
prices_df[columns['column_id'] + ['longevity']] = prices_df[columns['column_id'] + ['longevity']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prices_df[columns['columns_prices']] = prices_df[columns['columns_prices']].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prices_df[columns['column_id'] + ['longevity']] = prices_df[columns['column_id'] + ['longevity']].astype(int)


# Create Dim table

In [61]:
columns_dim = columns['columns_dim']
dim_df = financial[columns_dim]

In [68]:
dim_grouped_df = dim_df.groupby('ticker').first().reset_index()

# Save dataframes to load

In [71]:
ratios_df.to_csv("../../data/load_04/ratios_df.csv")
income_statememnt_df.to_csv("../../data/load_04/income_statememnt_df.csv")
balance_sheet_df.to_csv("../../data/load_04/balance_sheet_df.csv")
cash_flow_statement_df.to_csv("../../data/load_04/cash_flow_statement_df.csv")
prices_df.to_csv("../../data/load_04/prices_df.csv")
dim_grouped_df.to_csv("../../data/load_04/dim_grouped_df.csv")