# [Cleaning: Financial Statements](#section-title)

In [1]:
#imports
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
from polygon import RESTClient

warnings.filterwarnings("ignore")

---

## Now pulling and cleaning the Financial Statements dataset

The __[Polygon.io Stock Financials vX API](https://polygon.io/docs/stocks/get_vx_reference_financials)__ was used to query the latest company financial statements from all US stock exchanges. Additional documentation is available through __[Polygon's GitHub](https://github.com/polygon-io/client-python)__.

In [2]:
# Must install polygon-api-client before running code
# pip install polygon-api-client

In [3]:
# Re-use the ticker universe that the earlier notebooks saved to disk
tickers_csv = "../data/cleaned_csvs_interim_steps/tickers_list.csv"
df_tickers_list = pd.read_csv(tickers_csv)

# Plain Python list of the values in the "symbol" column, one entry per row of the CSV
tickers = df_tickers_list.loc[:, "symbol"].to_list()

In [4]:
# Polygon.io API key, read from the environment so the secret is never stored in the notebook.
# Set it before launching Jupyter, e.g.  export POLYGON_API_KEY=<your key>
# NOTE: the key that used to be hard-coded in this cell has been exposed wherever this
# notebook was shared or committed -- treat it as compromised and rotate it.
# The value is None when the variable is unset.
polygonAPIkey = os.environ.get("POLYGON_API_KEY")

In [5]:
# Polygon REST client, constructed with the API key; used for the financials requests below
client = RESTClient(polygonAPIkey)

In [6]:
# Number of tickers that will be queried
len(tickers)

6171

In [7]:
# Calling the API - Part 1
# The API is queried once for every ticker in `tickers`.

# Accumulates every filing returned by the API, across all tickers.
# Re-created here so that re-running this cell starts from an empty list.
statements = []


def polygon_api_pull(ticker, filing_date_gte="2023-05-01"):
    """Fetch the financial-statement filings for one ticker and add them to `statements`.

    Parameters
    ----------
    ticker : str
        Ticker symbol passed to the API as `ticker`.
    filing_date_gte : str, optional
        Lower bound passed to the API as `filing_date_gte`. Defaults to "2023-05-01".

    Returns
    -------
    list
        The shared module-level `statements` list (the same object on every call),
        now also containing the filings returned for `ticker`.
    """
    # extend() consumes the API iterator directly instead of building a throwaway list
    # of None values via a side-effect list comprehension.
    statements.extend(
        client.vx.list_stock_financials(ticker=ticker, filing_date_gte=filing_date_gte)
    )
    return statements


for ticker in tickers:
    polygon_api_pull(ticker)

In [15]:
# Total number of filings collected in `statements` across all tickers
len(statements)

8158

In [9]:
# dir() lists the attribute names available on an object; here we inspect the first
# element of `statements` to see which fields each filing exposes
dir(statements[0])

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'cik',
 'company_name',
 'end_date',
 'filing_date',
 'financials',
 'fiscal_period',
 'fiscal_year',
 'from_dict',
 'source_filing_file_url',
 'source_filing_url',
 'start_date']

In [10]:
# Calling the API - Part 2
# Flatten every filing in `statements` into one dict per filing, then build `df_poly`.

# Filing-level fields copied as-is onto every row
metadata_fields = ["cik", "company_name", "fiscal_period", "fiscal_year", "filing_date"]

# Assigning attributes to income statement
attributes_is = [
    "basic_earnings_per_share",
    "cost_of_revenue",
    "gross_profit",
    "operating_expenses",
    "revenues" ]

# Assigning attributes to statement of comprehensive income
attributes_ci = [
    "comprehensive_income_loss",
    "comprehensive_income_loss_attributable_to_parent",
    "other_comprehensive_income_loss" ]

# Assigning attributes to cash flow statement
attributes_cfs = [
    "exchange_gains_losses",
    "net_cash_flow",
    "net_cash_flow_from_financing_activities" ]

# Assigning attributes to balance sheet
attributes_bs = [
    "assets",
    "current_assets",
    "liabilities_and_equity",
    "fixed_assets",
    "equity_attributable_to_noncontrolling_interest",
    "noncurrent_assets",
    "liabilities",
    "other_than_fixed_noncurrent_assets",
    "equity_attributable_to_parent",
    "noncurrent_liabilities",
    "current_liabilities",
    "equity" ]

# (column prefix, attribute name on `statement.financials`, line items to extract).
# The order here fixes the column order of df_poly.
section_specs = [
    ("is_", "income_statement", attributes_is),
    ("ci_", "comprehensive_income", attributes_ci),
    ("cfs_", "cash_flow_statement", attributes_cfs),
    ("bs_", "balance_sheet", attributes_bs),
]


def _extract_line_items(section, attributes, prefix):
    """Return {prefix + attr + "_unit"/"_value": ...} for each line item present in `section`.

    `section` may be None (statement not reported), a dict keyed by line-item name
    (the balance sheet is accessed with `.get`), or an object exposing the line items
    as attributes. Line items that are absent or None are skipped, so no key is
    created for them and the DataFrame shows them as missing.
    """
    items = {}
    if section is None:
        return items
    for attr in attributes:
        if isinstance(section, dict):
            data_point = section.get(attr)
        else:
            data_point = getattr(section, attr, None)
        if data_point is None:
            continue
        items[prefix + attr + "_unit"] = data_point.unit
        items[prefix + attr + "_value"] = data_point.value
    return items


def _statement_to_record(statement):
    """Flatten one filing into a single dict: filing metadata plus the selected line items."""
    record = {field: getattr(statement, field) for field in metadata_fields}
    financials = getattr(statement, "financials", None)
    for prefix, section_name, attributes in section_specs:
        # getattr with a default also covers `financials` itself being None
        section = getattr(financials, section_name, None)
        record.update(_extract_line_items(section, attributes, prefix))
    return record


# Missing sections / line items are handled explicitly above, so no blanket
# `except Exception: pass` is needed; anything else unexpected now surfaces
# instead of silently dropping a row.
list_fs = [_statement_to_record(statement) for statement in statements]

df_poly = pd.DataFrame(list_fs)

# Used ChatGPT only to help format the financials variable and call the attribute.
# Referenced Reddit API structure for dictionary and DF creation.

In [11]:
# Over 8,000 rows: the filing-date window we pulled was widened so that companies which
# were granted extensions or filed late are not missed.
# A side effect of the wider window is that two reports were captured for many companies.
df_poly

Unnamed: 0,cik,company_name,fiscal_period,fiscal_year,filing_date,is_basic_earnings_per_share_unit,is_basic_earnings_per_share_value,is_cost_of_revenue_unit,is_cost_of_revenue_value,is_gross_profit_unit,...,bs_other_than_fixed_noncurrent_assets_unit,bs_other_than_fixed_noncurrent_assets_value,bs_equity_attributable_to_parent_unit,bs_equity_attributable_to_parent_value,bs_noncurrent_liabilities_unit,bs_noncurrent_liabilities_value,bs_current_liabilities_unit,bs_current_liabilities_value,bs_equity_unit,bs_equity_value
0,0001755672,"Corteva, Inc.",Q2,2023,2023-08-04,USD / shares,1.00,USD,3.137000e+09,USD,...,USD,2.267600e+10,USD,2.622000e+10,USD,7.382000e+09,USD,1.034600e+10,USD,2.646100e+10
1,0001755672,"Corteva, Inc.",Q1,2023,2023-05-04,USD / shares,0.84,USD,2.771000e+09,USD,...,USD,2.290000e+10,USD,2.559900e+10,USD,6.430000e+09,USD,1.323100e+10,USD,2.583900e+10
2,0000037785,FMC CORP,Q2,2023,2023-08-03,USD / shares,0.24,USD,5.817000e+08,USD,...,USD,4.817000e+09,USD,3.353000e+09,USD,4.221600e+09,USD,4.352700e+09,USD,3.377400e+09
3,0000037785,FMC CORP,Q1,2023,2023-05-02,USD / shares,1.56,USD,7.630000e+08,USD,...,USD,4.847600e+09,USD,3.470700e+09,USD,0.000000e+00,USD,4.668300e+09,USD,3.494500e+09
4,0001285785,MOSAIC CO,Q2,2023,2023-08-02,USD / shares,1.11,USD,2.822900e+09,USD,...,USD,4.333900e+09,USD,1.240500e+10,USD,5.745400e+09,USD,4.839400e+09,USD,1.255540e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8153,0001434524,ClearSign Technologies Corp,Q1,2023,2023-05-15,USD / shares,-0.04,USD,7.880000e+05,USD,...,USD,7.920000e+05,USD,8.587000e+06,USD,0.000000e+00,USD,1.367000e+06,USD,8.587000e+06
8154,0001307579,LIQTECH INTERNATIONAL INC,Q2,2023,2023-08-11,USD / shares,-0.27,USD,3.827491e+06,USD,...,USD,1.142550e+07,USD,2.002448e+07,USD,1.045452e+07,USD,6.842719e+06,USD,2.002448e+07
8155,0001307579,LIQTECH INTERNATIONAL INC,Q1,2023,2023-05-11,USD / shares,-0.05,USD,3.620177e+06,USD,...,USD,1.200310e+07,USD,2.148564e+07,USD,1.064142e+07,USD,6.614674e+06,USD,2.148564e+07
8156,0000314227,"TOMI Environmental Solutions, Inc.",Q2,2023,2023-08-14,,,USD,1.074420e+06,USD,...,USD,2.083098e+06,USD,1.038196e+07,USD,7.019740e+05,USD,2.262267e+06,USD,1.038196e+07


In [12]:
# New column: net cash flow minus financing cash flow, i.e. CF - CFF ~ CFO + CFI,
# which is used as a proxy for CFO.
net_cash_flow = df_poly["cfs_net_cash_flow_value"]
financing_cash_flow = df_poly["cfs_net_cash_flow_from_financing_activities_value"]
df_poly["cfs_net_cash_flow_from_op_and_inv"] = net_cash_flow - financing_cash_flow

In [13]:
# Persist the flattened statements (without the row index) as a record and as the
# input for later notebooks
polygon_output_csv = "../data/cleaned_csvs_interim_steps/polygon_io_output.csv"
df_poly.to_csv(polygon_output_csv, index=False)

---