In [1]:
# Standard Library Imports
from contextlib import contextmanager
from datetime import datetime
import os
import sys

# Third-Party Imports
import financedatabase as fd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from prometheus_client import start_http_server, Summary, Counter, Gauge
from sqlalchemy import (
    create_engine,
    Column,
    String,
    Integer,
    Boolean,
    Date,
    Sequence,
    Float,
    Numeric,
    BigInteger,
)
from sqlalchemy.orm import declarative_base, sessionmaker
from tqdm import tqdm

# Local / Project Imports
sys.path.append("..")
from investorkit.investorkit.get_data.base import get_profile, get_financial_statements

# Initialize Environment Variables
# Load variables from a .env file (if one is found) before reading them below.
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
# NOTE: the API key is read from the FMP_SECRET_KEY variable (not FMP_API_KEY).
FMP_API_KEY = os.getenv("FMP_SECRET_KEY")
# ISO timestamp taken once at start-up; used as the `run_id` label on the metrics below.
run_id = datetime.now().isoformat()

# Initialize SQLAlchemy
# os.getenv returns None when DATABASE_URL is unset, in which case None is passed here.
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine above; commits and flushes are explicit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class for the ORM models defined further down.
Base = declarative_base()

# Initialize Prometheus Metrics
# Unlabelled summary shared by every function decorated with EXECUTION_TIME.time().
EXECUTION_TIME = Summary("function_execution_seconds", "Time spent processing.")
# The remaining metrics are labelled with `run_id`, so each run gets its own series.
NEW_SYMBOLS = Counter(
    "new_symbols_fetched", "Number of new symbols fetched.", ["run_id"]
)
SYMBOLS_LENGTH = Gauge(
    "symbols_length", "Length of the list of symbols fetched.", ["run_id"]
)
# NOTE(review): TOTAL_RECORDS is declared here but not updated anywhere in the visible code.
TOTAL_RECORDS = Gauge(
    "total_records", "Total number of records in the profiles table.", ["run_id"]
)
FETCH_PARAMS = Gauge(
    "fetch_params",
    "Parameters passed to fetch_equity_symbols",
    ["country", "market", "run_id"],
)
MISSING_SYMBOLS = Counter(
    "missing_symbols_fetched", "Number of missing new symbols.", ["run_id"]
)

# Set Pandas Display Options
# None removes the column limit so wide frames are displayed in full.
pd.set_option("display.max_columns", None)
# Database session context manager
@contextmanager
def session_scope():
    """Yield a fresh ``SessionLocal`` session inside a commit-or-rollback scope.

    The session is committed once the ``with`` body finishes normally. If the
    body or the commit raises, the session is rolled back and the exception is
    re-raised to the caller. The session is closed in every case.
    """
    db_session = SessionLocal()
    try:
        yield db_session
        db_session.commit()
    except BaseException:
        # Same reach as a bare ``except``: interrupts are rolled back too,
        # and nothing is swallowed because the exception is re-raised.
        db_session.rollback()
        raise
    finally:
        db_session.close()


# SQLAlchemy Models
class Profile(Base):
    """ORM model for the ``profiles`` table: one row per ticker symbol.

    The column names are the same ones ``fetch_and_store_profiles`` selects
    from the fetched profile frame before appending it to this table.
    """

    __tablename__ = "profiles"
    # Ticker symbol; primary key of the table (also flagged index=True).
    symbol = Column(String, primary_key=True, index=True)
    companyName = Column(String)
    # NOTE(review): Integer here, while CashFlow.cik is BigInteger — confirm the intended width.
    cik = Column(Integer)
    exchange = Column(String)
    exchangeShortName = Column(String)
    industry = Column(String)
    sector = Column(String)
    country = Column(String)
    ipoDate = Column(Date)
    defaultImage = Column(Boolean)
    isEtf = Column(Boolean)
    isActivelyTrading = Column(Boolean)


class CashFlow(Base):
    """ORM model for the ``cashflows2`` table: one cash-flow statement row per symbol and period.

    Monetary line items are BigInteger columns. The attribute spellings
    (e.g. ``otherInvestingActivites``) match the column names of the
    statement frames shown elsewhere in this notebook, so they must not be
    "corrected" here without also renaming the incoming data.
    """

    __tablename__ = "cashflows2"
    # Allow this Table to be re-declared on metadata that already contains it.
    __table_args__ = {"extend_existing": True}

    # Surrogate key filled from the "cashflow_id_seq" sequence.
    id = Column(Integer, Sequence("cashflow_id_seq"), primary_key=True, index=True)
    date = Column(String)  # Representing period[Q-DEC] as string
    symbol = Column(String, index=True)
    reportedCurrency = Column(String)
    cik = Column(BigInteger)
    fillingDate = Column(Date)
    # NOTE(review): sample rows in this notebook carry a time of day in acceptedDate
    # (e.g. "2023-08-07 16:07:36"); a Date column presumably keeps only the day — confirm.
    acceptedDate = Column(Date)
    calendarYear = Column(BigInteger)
    period = Column(String)

    # Columns changed from Integer to BigInteger
    # (values in the sample data, e.g. 19881000000, exceed the 32-bit integer range).
    netIncome = Column(BigInteger)
    depreciationAndAmortization = Column(BigInteger)
    deferredIncomeTax = Column(BigInteger)
    stockBasedCompensation = Column(BigInteger)
    changeInWorkingCapital = Column(BigInteger)
    accountsReceivables = Column(BigInteger)
    inventory = Column(BigInteger)
    accountsPayables = Column(BigInteger)
    otherWorkingCapital = Column(BigInteger)
    otherNonCashItems = Column(BigInteger)
    netCashProvidedByOperatingActivities = Column(BigInteger)
    investmentsInPropertyPlantAndEquipment = Column(BigInteger)
    acquisitionsNet = Column(BigInteger)
    purchasesOfInvestments = Column(BigInteger)
    salesMaturitiesOfInvestments = Column(BigInteger)
    otherInvestingActivites = Column(BigInteger)
    netCashUsedForInvestingActivites = Column(BigInteger)
    debtRepayment = Column(BigInteger)
    commonStockIssued = Column(BigInteger)
    commonStockRepurchased = Column(BigInteger)
    dividendsPaid = Column(BigInteger)
    otherFinancingActivites = Column(BigInteger)
    netCashUsedProvidedByFinancingActivities = Column(BigInteger)
    effectOfForexChangesOnCash = Column(BigInteger)
    netChangeInCash = Column(BigInteger)
    cashAtEndOfPeriod = Column(BigInteger)
    cashAtBeginningOfPeriod = Column(BigInteger)
    operatingCashFlow = Column(BigInteger)
    capitalExpenditure = Column(BigInteger)
    freeCashFlow = Column(BigInteger)

    # URLs of the filing index page and of the filed document.
    link = Column(String)
    finalLink = Column(String)


# Create every table declared on Base that does not exist in the database yet.
# Do NOT call Base.metadata.clear() before this: clear() removes all Table
# objects from the metadata, which leaves create_all() with nothing to create
# and the `profiles` / `cashflows2` tables are then never built from the models.
# Base is rebuilt by declarative_base() earlier in this same cell, so there is
# no stale table registration that would need clearing on a re-run.
Base.metadata.create_all(bind=engine)
def filter_bigint_range(df):
    """Keep only the rows whose numeric values all fit a signed 64-bit integer.

    Every numeric column is checked against the inclusive range
    [-2**63, 2**63 - 1]; a row is kept only if it passes for all of them.
    NaN fails both comparisons, so rows with a missing numeric value are
    dropped as well. Non-numeric columns are ignored by the check.

    Args:
        df: DataFrame to filter; it is not modified.

    Returns:
        The filtered DataFrame, or ``df`` itself when it has no numeric columns.
    """
    lower_bound = -(2**63)
    upper_bound = 2**63 - 1

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) == 0:
        return df

    # Combine the per-column range checks into one row mask.
    rows_in_range = df[numeric_columns[0]].between(lower_bound, upper_bound)
    for column_name in numeric_columns[1:]:
        rows_in_range = rows_in_range & df[column_name].between(
            lower_bound, upper_bound
        )

    return df[rows_in_range]
@EXECUTION_TIME.time()
def fetch_equity_symbols(country="United States", market="NASDAQ Global Select"):
    """Return the index labels of FinanceDatabase equities for one country and market.

    The equities selected for ``country`` are narrowed to rows whose
    ``market`` column equals ``market``; the index of that result is returned
    as a list (callers treat these labels as ticker symbols).

    Side effects: sets the SYMBOLS_LENGTH and FETCH_PARAMS gauges to the
    number of labels found, and the call duration is observed by
    EXECUTION_TIME through the decorator.
    """
    columns_to_keep = [
        "name",
        "currency",
        "sector",
        "industry_group",
        "industry",
        "exchange",
        "market",
        "market_cap",
    ]
    country_equities = fd.Equities().select(country=country)
    on_requested_market = country_equities["market"] == market
    market_equities = country_equities[on_requested_market][columns_to_keep]
    symbols = list(market_equities.index)

    # Both gauges record the same count; FETCH_PARAMS also carries the arguments as labels.
    symbol_count = len(symbols)
    SYMBOLS_LENGTH.labels(run_id=run_id).set(symbol_count)
    FETCH_PARAMS.labels(country=country, market=market, run_id=run_id).set(
        symbol_count
    )

    return symbols


@EXECUTION_TIME.time()
def get_new_symbols(list_symbols, engine):
    """Return the entries of ``list_symbols`` that have no row in ``profiles``.

    Args:
        list_symbols: candidate symbols.
        engine: connectable passed to ``pd.read_sql`` to read ``profiles``.

    Returns:
        A list of the symbols missing from the table. It is built from a set
        difference, so duplicates are removed and the order is unspecified.

    Side effects: increments the NEW_SYMBOLS counter by the size of the result.
    """
    stored = pd.read_sql("SELECT symbol FROM profiles;", con=engine)
    stored_symbols = set(stored["symbol"].tolist())
    unseen_symbols = list(set(list_symbols) - stored_symbols)

    NEW_SYMBOLS.labels(run_id=run_id).inc(len(unseen_symbols))

    return unseen_symbols


def store_to_db(df, table_name, engine):
    """Append the rows of ``df`` to ``table_name`` inside one transaction.

    The write goes through a connection obtained from ``engine.begin()``, so
    it is committed when the block exits normally and rolled back if
    ``to_sql`` raises. (An ORM session opened next to the write would not
    take part in it, because ``to_sql`` runs on the connectable it is given.)

    Args:
        df: DataFrame to append. ``None`` or an empty frame is skipped, so no
            table is created from a frame that carries no rows.
        table_name: name of the destination table; it is created by pandas if
            it does not exist yet.
        engine: SQLAlchemy engine (anything exposing ``begin()`` as a context
            manager that yields a connection pandas can write to).
    """
    if df is None or df.empty:
        return

    with engine.begin() as connection:
        df.to_sql(table_name, con=connection, if_exists="append", index=False)

def fetch_and_store_profiles(engine, api_key):
    """Fetch profiles for symbols not yet in ``profiles`` and append them.

    Steps:
      1. List candidate symbols with ``fetch_equity_symbols()`` (its defaults).
      2. Keep the ones that have no row in ``profiles``.
      3. Fetch their profiles, count the symbols that came back without one
         in MISSING_SYMBOLS, clean the frame and store it.

    Args:
        engine: SQLAlchemy engine used to read and write ``profiles``.
        api_key: API key forwarded to ``get_profile``.
    """
    # 1
    list_symbols = fetch_equity_symbols()
    # 2
    new_symbols = get_new_symbols(list_symbols, engine)
    if not new_symbols:
        return

    # 3
    df_profiles = get_profile(new_symbols, api_key)
    if df_profiles.empty:
        print("No profiles found for the new symbols.")
        MISSING_SYMBOLS.labels(run_id=run_id).inc(len(new_symbols))
        return

    missing_symbols = set(new_symbols) - set(df_profiles["symbol"])
    MISSING_SYMBOLS.labels(run_id=run_id).inc(len(missing_symbols))

    list_cols = [
        "symbol",
        "companyName",
        "cik",
        "exchange",
        "exchangeShortName",
        "industry",
        "sector",
        "country",
        "ipoDate",
        "defaultImage",
        "isEtf",
        "isActivelyTrading",
    ]

    # Work on an explicit copy: replacing in place on a column of a sliced frame
    # is chained assignment and may silently leave the frame unchanged.
    df_profiles_filtered = df_profiles[list_cols].copy()

    # Replace empty strings with missing values. The dict form is used because a
    # scalar replace with value=None is treated by older pandas as "no value
    # given" and pad-fills from the previous row instead of inserting None.
    df_profiles_filtered["ipoDate"] = df_profiles_filtered["ipoDate"].replace(
        {"": None}
    )
    df_profiles_filtered["cik"] = df_profiles_filtered["cik"].replace({"": np.nan})

    store_to_db(df_profiles_filtered, "profiles", engine)


def fetch_and_store_financial_statements(
    engine, api_key, chunk_size=100, start_date="2000-01-01"
):
    """Fetch cash-flow statements for profiled symbols missing from ``cashflows2``.

    Symbols listed in ``profiles`` that have no row in ``cashflows2`` are
    requested in chunks; each chunk's rows are passed through
    ``filter_bigint_range`` before being appended to ``cashflows2``, so
    values that do not fit the table's BigInteger columns are not sent to
    the database.

    Args:
        engine: SQLAlchemy engine used for both reads and the writes.
        api_key: API key forwarded to ``get_financial_statements``.
        chunk_size: number of symbols per request (default 100).
        start_date: earliest statement date requested (default "2000-01-01").
    """
    # Symbols that already have cash-flow rows.
    stored = pd.read_sql("SELECT DISTINCT symbol FROM cashflows2;", engine)
    existing_symbols = set(stored["symbol"])

    # Only the symbol column of profiles is needed here.
    profiles = pd.read_sql("SELECT symbol FROM profiles;", engine)
    pending_symbols = [
        symbol for symbol in profiles["symbol"] if symbol not in existing_symbols
    ]

    for offset in range(0, len(pending_symbols), chunk_size):
        chunk = pending_symbols[offset : offset + chunk_size]
        # invalid_tickers is returned by the helper but not used here.
        df, invalid_tickers = get_financial_statements(
            tickers=chunk,
            statement="cashflow",
            api_key=api_key,
            start_date=start_date,
        )

        # Nothing came back for this chunk: there is nothing to store.
        if df is None or df.empty:
            continue

        # Drop rows with values outside the signed 64-bit range, then store.
        filtered_df = filter_bigint_range(df)
        store_to_db(filtered_df, "cashflows2", engine)

# NOTE(review): the statements below run whenever this cell/module executes — they
# sit outside the `if __name__ == "__main__":` guard — and repeat the same
# fetch / chunk / store steps as fetch_and_store_financial_statements().
# They also bind the globals `query`, `existing_symbols_df`, `existing_symbols`,
# `df_profiles`, `list_symbols`, `chunks`, `df`, `invalid_tickers` and `filtered_df`.
# Fetch existing symbols from the 'cashflows2' table
query = "SELECT DISTINCT symbol FROM cashflows2;"
existing_symbols_df = pd.read_sql(query, engine)
existing_symbols = set(existing_symbols_df["symbol"])

# Fetch all profiles
query = "SELECT * FROM profiles;"
df_profiles = pd.read_sql(query, engine)

# Create list of symbols from profiles
list_symbols = list(df_profiles["symbol"])

# Filter out symbols that are already in 'cashflows2'
list_symbols = [symbol for symbol in list_symbols if symbol not in existing_symbols]

# Split symbols into chunks of 100
chunks = [list_symbols[i : i + 100] for i in range(0, len(list_symbols), 100)]


# One request per chunk; each chunk is stored before the next one is fetched.
for chunk in chunks:
    df, invalid_tickers = get_financial_statements(
        tickers=chunk,
        statement="cashflow",
        api_key=FMP_API_KEY,
        start_date="2000-01-01",
    )

    # Filter the dataframe
    # (keeps only rows whose numeric values fit a signed 64-bit integer)
    filtered_df = filter_bigint_range(df)

    # Store fetched data to 'cashflows2'
    store_to_db(filtered_df, "cashflows2", engine)


# Entry point: expose metrics, then run the profile and cash-flow jobs in order.
if __name__ == "__main__":
    # Start the Prometheus metrics HTTP endpoint on port 8000.
    start_http_server(8000)

    # Profiles first: the cash-flow job reads its symbols from the profiles table.
    fetch_and_store_profiles(engine, FMP_API_KEY)
    fetch_and_store_financial_statements(engine, FMP_API_KEY)

    # Remove all Table objects from the declarative metadata once the run is done.
    Base.metadata.clear()

In [2]:
# Load statement rows from a CSV file into `df` (this rebinds the `df` used by the first cell).
# NOTE(review): absolute, machine-specific path — the cell only runs where this file
# exists; consider a path relative to the notebook or a configurable data directory.
df = pd.read_csv(
    "/home/sam/github/data-driven-investor-platform/notebooks/financial_statements_temp_300.csv"
)

In [3]:
# Drop the "Unnamed: 1" column in place (presumably a stray index column from the CSV — confirm).
# Raises KeyError if the cell is run a second time, because the column is already gone.
df.drop(columns=["Unnamed: 1"], inplace=True)

In [4]:
# Drop the "Unnamed: 0" column in place (presumably a stray index column from the CSV — confirm).
# Raises KeyError if the cell is run a second time, because the column is already gone.
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
# Columns to leave out of conversion (identifier, label, date and URL columns).
# The first entry is "date": the statement frames have a `date` column and no
# `data` column, and every other entry is an exact column name.
list_no_conv_cols = [
    "date",
    "symbol",
    "reportedCurrency",
    "period",
    "link",
    "finalLink",
    "cik",
    "fillingDate",
    "acceptedDate",
    "calendarYear",
]

In [7]:
# Preview the first rows of the loaded statements.
df.head()

Unnamed: 0,date,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,netIncome,depreciationAndAmortization,deferredIncomeTax,stockBasedCompensation,changeInWorkingCapital,accountsReceivables,inventory,accountsPayables,otherWorkingCapital,otherNonCashItems,netCashProvidedByOperatingActivities,investmentsInPropertyPlantAndEquipment,acquisitionsNet,purchasesOfInvestments,salesMaturitiesOfInvestments,otherInvestingActivites,netCashUsedForInvestingActivites,debtRepayment,commonStockIssued,commonStockRepurchased,dividendsPaid,otherFinancingActivites,netCashUsedProvidedByFinancingActivities,effectOfForexChangesOnCash,netChangeInCash,cashAtEndOfPeriod,cashAtBeginningOfPeriod,operatingCashFlow,capitalExpenditure,freeCashFlow,link,finalLink
0,2023Q2,LANDO,USD,1495240,2023-08-07,2023-08-07 16:07:36,2023,Q2,7855000.0,9067000.0,0.0,0.0,6995000.0,0,0,0.0,0.0,-6054000.0,17863000.0,-2820000.0,0.0,0.0,0.0,8937000.0,6117000.0,-5417000.0,3019000.0,-951000.0,-11013000.0,-138000.0,-14500000.0,0.0,9480000.0,48208000.0,38728000.0,17863000.0,-2820000.0,15043000.0,https://www.sec.gov/Archives/edgar/data/149524...,https://www.sec.gov/Archives/edgar/data/149524...
1,2023Q1,LANDO,USD,1495240,2023-05-08,2023-05-08 16:05:59,2023,Q1,1750000.0,9142000.0,0.0,0.0,-7963000.0,0,0,-6367000.0,-1596000.0,1785000.0,4714000.0,-2871000.0,0.0,0.0,0.0,-145000.0,-3016000.0,-27570000.0,93000.0,-225000.0,-10645000.0,14236000.0,-24111000.0,0.0,-22413000.0,38728000.0,61141000.0,4714000.0,-2871000.0,1843000.0,https://www.sec.gov/Archives/edgar/data/149524...,https://www.sec.gov/Archives/edgar/data/149524...
2,2022Q4,LANDO,USD,1495240,2023-02-21,2023-02-21 17:16:42,2022,Q4,1110000.0,9504000.0,0.0,0.0,8336000.0,0,0,6109000.0,2227000.0,1765000.0,20715000.0,-4054000.0,0.0,0.0,0.0,-2803000.0,-6857000.0,-13391000.0,0.0,-371000.0,-10276000.0,34315000.0,10277000.0,0.0,24135000.0,61141000.0,37006000.0,20715000.0,-4054000.0,16661000.0,https://www.sec.gov/Archives/edgar/data/149524...,https://www.sec.gov/Archives/edgar/data/149524...
3,2022Q3,LANDO,USD,1495240,2022-11-08,2022-11-08 16:02:41,2022,Q3,1806000.0,9168000.0,0.0,0.0,-9082000.0,0,0,-1797000.0,-7285000.0,1826000.0,3718000.0,-5954000.0,0.0,0.0,0.0,-37526000.0,-43480000.0,-22335000.0,0.0,-419000.0,-9592000.0,60670000.0,28324000.0,0.0,-11438000.0,37006000.0,48444000.0,3718000.0,-5954000.0,-2236000.0,https://www.sec.gov/Archives/edgar/data/149524...,https://www.sec.gov/Archives/edgar/data/149524...
4,2022Q2,LANDO,USD,1495240,2022-08-09,2022-08-09 16:10:06,2022,Q2,613000.0,8415000.0,0.0,0.0,2219000.0,0,0,1710000.0,509000.0,555000.0,11802000.0,-6600000.0,0.0,0.0,0.0,-24975000.0,-31575000.0,-3015000.0,0.0,-7722000.0,-8830000.0,38403000.0,18836000.0,0.0,-937000.0,48444000.0,49381000.0,11802000.0,-6600000.0,5202000.0,https://www.sec.gov/Archives/edgar/data/149524...,https://www.sec.gov/Archives/edgar/data/149524...


In [23]:
# GOOG rows only, ordered by `date` descending.
df_goog = df.loc[df["symbol"].eq("GOOG")].sort_values("date", ascending=False)

In [29]:
# Rescale netIncome by 1/1000.
# NOTE(review): not idempotent — every re-run of this cell divides by 1000 again.
df_goog["netIncome"] = df_goog["netIncome"] / 1000

In [None]:
# Display the GOOG subset (the whole frame is rendered; use .head() for a shorter view).
df_goog

In [32]:
# Filing index URL of the first GOOG row dated 2023Q2 (IndexError if there is none).
df_goog.loc[df_goog["date"] == "2023Q2", "link"].values[0]

'https://www.sec.gov/Archives/edgar/data/1652044/000165204423000070/0001652044-23-000070-index.htm'

In [20]:
# Display all rows ordered by netIncome, largest first (the result is not stored).
df.sort_values(by=["netIncome"], ascending=False)

Unnamed: 0,date,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,netIncome,depreciationAndAmortization,deferredIncomeTax,stockBasedCompensation,changeInWorkingCapital,accountsReceivables,inventory,accountsPayables,otherWorkingCapital,otherNonCashItems,netCashProvidedByOperatingActivities,investmentsInPropertyPlantAndEquipment,acquisitionsNet,purchasesOfInvestments,salesMaturitiesOfInvestments,otherInvestingActivites,netCashUsedForInvestingActivites,debtRepayment,commonStockIssued,commonStockRepurchased,dividendsPaid,otherFinancingActivites,netCashUsedProvidedByFinancingActivities,effectOfForexChangesOnCash,netChangeInCash,cashAtEndOfPeriod,cashAtBeginningOfPeriod,operatingCashFlow,capitalExpenditure,freeCashFlow,link,finalLink
345,2022Q1,ITHX,USD,1828852,2022-05-09,2022-05-09 16:31:47,2022,Q1,3.387190e+11,0.000000e+00,0.000000e+00,0.000000e+00,-7.275410e+05,0,0,0.000000e+00,0.000000e+00,-3.387186e+11,-2.946750e+05,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,-4.140000e+05,2.305290e+05,6.445290e+05,-2.946750e+05,0.000000e+00,-2.946750e+05,https://www.sec.gov/Archives/edgar/data/182885...,https://www.sec.gov/Archives/edgar/data/182885...
9397,2021Q4,GOOG,USD,1652044,2022-02-02,2022-02-01 21:08:02,2021,Q4,2.064200e+10,3.439000e+09,1.616000e+09,3.954000e+09,-2.225000e+09,-5819000000,5438000000,1.157000e+09,-3.001000e+09,-2.492000e+09,2.493400e+10,-6.383000e+09,-3.850000e+08,-4.086000e+10,3.651200e+10,1.000000e+08,-1.101600e+10,-6.365000e+09,0.000000e+00,-1.347300e+10,0.000000e+00,3.327000e+09,-1.651100e+10,-181000000.0,-2.774000e+09,2.094500e+10,2.371900e+10,2.493400e+10,-6.383000e+09,1.855100e+10,https://www.sec.gov/Archives/edgar/data/165204...,https://www.sec.gov/Archives/edgar/data/165204...
9398,2021Q3,GOOG,USD,1652044,2021-10-27,2021-10-26 19:47:06,2021,Q3,1.893600e+10,3.304000e+09,-1.287000e+09,3.874000e+09,2.806000e+09,-2409000000,1664000000,2.380000e+08,3.313000e+09,-2.094000e+09,2.553900e+10,-6.819000e+09,-2.590000e+08,-3.515300e+10,3.179300e+10,3.880000e+08,-1.005000e+10,-6.392000e+09,0.000000e+00,-1.261000e+10,0.000000e+00,3.748000e+09,-1.525400e+10,-146000000.0,8.900000e+07,2.371900e+10,2.363000e+10,2.553900e+10,-6.819000e+09,1.872000e+10,https://www.sec.gov/Archives/edgar/data/165204...,https://www.sec.gov/Archives/edgar/data/165204...
9399,2021Q2,GOOG,USD,1652044,2021-07-28,2021-07-27 20:16:30,2021,Q2,1.852500e+10,2.945000e+09,3.790000e+08,3.803000e+09,-8.710000e+08,-3661000000,4005000000,-1.300000e+08,-1.085000e+09,-2.891000e+09,2.189000e+10,-5.496000e+09,-3.080000e+08,-2.494900e+10,2.165600e+10,2.300000e+07,-9.074000e+09,-7.741000e+09,0.000000e+00,-1.279600e+10,0.000000e+00,4.546000e+09,-1.599100e+10,183000000.0,-2.992000e+09,2.363000e+10,2.662200e+10,2.189000e+10,-5.496000e+09,1.639400e+10,https://www.sec.gov/Archives/edgar/data/165204...,https://www.sec.gov/Archives/edgar/data/165204...
9391,2023Q2,GOOG,USD,1652044,2023-07-26,2023-07-25 19:13:26,2023,Q2,1.836800e+10,3.397000e+09,-2.415000e+09,5.774000e+09,2.936000e+09,-2948000000,130000000,1.119000e+09,4.635000e+09,6.060000e+08,2.866600e+10,-6.888000e+09,-2.980000e+08,-2.224900e+10,1.886700e+10,-2.320000e+08,-1.080000e+10,-1.255000e+09,0.000000e+00,-1.496900e+10,0.000000e+00,-1.611000e+09,-1.783500e+10,-26000000.0,5.000000e+06,2.592900e+10,2.592400e+10,2.866600e+10,-6.888000e+09,2.177800e+10,https://www.sec.gov/Archives/edgar/data/165204...,https://www.sec.gov/Archives/edgar/data/165204...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9413,2017Q4,GOOG,USD,1652044,2018-02-06,2018-02-05 20:46:29,2017,Q4,-3.020000e+09,2.026000e+09,1.600000e+07,1.847000e+09,9.327000e+09,-3049000000,2465000000,6.730000e+08,9.238000e+09,7.200000e+07,1.026800e+10,-4.307000e+09,-1.400000e+07,-1.436000e+10,1.168900e+10,1.800000e+07,-6.974000e+09,-1.615000e+09,-8.000000e+08,-2.101000e+09,-8.000000e+08,2.138000e+09,-3.178000e+09,18000000.0,1.340000e+08,1.071500e+10,1.058100e+10,1.026800e+10,-4.307000e+09,5.961000e+09,https://www.sec.gov/Archives/edgar/data/165204...,https://www.sec.gov/Archives/edgar/data/165204...
10971,2022Q2,DISCK,USD,1437107,2022-08-05,2022-08-04 22:01:59,2022,Q2,-3.408000e+09,7.884000e+09,-7.970000e+08,1.500000e+08,-3.594000e+09,0,0,1.320000e+08,-3.726000e+09,7.760000e+08,1.011000e+09,-2.220000e+08,2.419000e+09,0.000000e+00,2.200000e+08,-6.600000e+07,2.351000e+09,-3.590000e+09,0.000000e+00,0.000000e+00,0.000000e+00,2.000000e+07,-3.570000e+09,-61000000.0,-2.690000e+08,3.896000e+09,4.165000e+09,1.011000e+09,-2.220000e+08,7.890000e+08,https://www.sec.gov/Archives/edgar/data/143710...,https://www.sec.gov/Archives/edgar/data/143710...
11198,2001Q3,VCTR,USD,1570827,2022-11-03,2022-11-03 16:31:57,2001,Q3,-6.641000e+09,4.377000e+09,0.000000e+00,5.583700e+10,0.000000e+00,0,0,0.000000e+00,0.000000e+00,-5.357300e+10,5.357300e+10,0.000000e+00,2.400000e+09,0.000000e+00,0.000000e+00,0.000000e+00,2.400000e+09,0.000000e+00,0.000000e+00,0.000000e+00,-1.500000e+05,1.500000e+05,-1.500000e+05,0.0,5.597285e+10,5.597285e+10,0.000000e+00,5.357300e+10,0.000000e+00,5.357300e+10,https://www.sec.gov/Archives/edgar/data/157082...,https://www.sec.gov/Archives/edgar/data/157082...
7669,2022Q4,INTC,USD,50863,2023-01-27,2023-01-26 18:31:20,2022,Q4,-8.320000e+09,-3.209000e+09,0.000000e+00,-8.630000e+08,7.923000e+09,-61000000,993000000,3.390000e+08,6.652000e+09,5.472000e+09,1.003000e+09,-5.699000e+09,-6.579000e+09,1.302600e+10,-1.650000e+10,1.479300e+10,-9.590000e+08,-8.700000e+07,0.000000e+00,0.000000e+00,-1.477000e+09,8.572000e+09,7.008000e+09,-437000000.0,6.615000e+09,1.114400e+10,4.529000e+09,1.003000e+09,-5.699000e+09,-4.696000e+09,https://www.sec.gov/Archives/edgar/data/50863/...,https://www.sec.gov/Archives/edgar/data/50863/...


In [9]:
# Convert the quarter labels in `date` to the timestamp at the start of each quarter.
# pd.PeriodIndex accepts the "2023Q2"-style strings that read_csv produces as well
# as values that already are quarterly periods, whereas the `.dt` accessor raises
# AttributeError on a plain string column.
df["date"] = pd.PeriodIndex(df["date"], freq="Q").to_timestamp()

In [13]:
# Parse the filing and acceptance columns of `df` into datetimes.
# NOTE(review): the recorded output of this cell is KeyError: 'fillingDate', i.e. the
# `df` bound when it last ran had no such column — check which frame `df` refers to.
df["fillingDate"] = pd.to_datetime(df["fillingDate"])
df["acceptedDate"] = pd.to_datetime(df["acceptedDate"])

KeyError: 'fillingDate'

In [14]:
from sqlalchemy import select, MetaData, Table

In [15]:
# Read every row currently stored in the `cashflows` table.
# NOTE(review): this is `cashflows`, not the `cashflows2` table the CashFlow model maps to.
query = "SELECT * FROM cashflows;"
existing_records = pd.read_sql(query, engine)

In [16]:
# Display the rows read from the database (the recorded output shows headers only, no rows).
existing_records

Unnamed: 0,id,date,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,netIncome,depreciationAndAmortization,deferredIncomeTax,stockBasedCompensation,changeInWorkingCapital,accountsReceivables,inventory,accountsPayables,otherWorkingCapital,otherNonCashItems,netCashProvidedByOperatingActivities,investmentsInPropertyPlantAndEquipment,acquisitionsNet,purchasesOfInvestments,salesMaturitiesOfInvestments,otherInvestingActivites,netCashUsedForInvestingActivites,debtRepayment,commonStockIssued,commonStockRepurchased,dividendsPaid,otherFinancingActivites,netCashUsedProvidedByFinancingActivities,effectOfForexChangesOnCash,netChangeInCash,cashAtEndOfPeriod,cashAtBeginningOfPeriod,operatingCashFlow,capitalExpenditure,freeCashFlow,link,finalLink


In [17]:
# Parse `date` into datetimes so it can be compared with the `date` column of `df` in the merge below.
existing_records["date"] = pd.to_datetime(existing_records["date"])

In [18]:
# Parse the filing and acceptance columns of the stored rows into datetimes.
existing_records["fillingDate"] = pd.to_datetime(existing_records["fillingDate"])
existing_records["acceptedDate"] = pd.to_datetime(existing_records["acceptedDate"])

In [19]:
# Anti-join: keep the rows of `df` whose key fields match no row already stored.
# Columns present on both sides but not in the key come back suffixed _x / _y.
comparison_fields = [
    "date",
    "symbol",
    "cik",
    "fillingDate",
    "acceptedDate",
    "calendarYear",
    "period",
]
df_to_append = (
    df.merge(existing_records, how="left", on=comparison_fields, indicator=True)
    .loc[lambda merged: merged["_merge"] == "left_only"]
    .drop(columns="_merge")
)

KeyError: 'date'

In [93]:
# Drop the right-hand ("_y") copies that the merge produced for overlapping columns.
df_to_append.drop(
    [col for col in df_to_append.columns if col.endswith("_y")], axis=1, inplace=True
)

# Rename '_x' columns to their original names.
# The suffix is sliced off rather than removed with str.rstrip("_x"): rstrip strips
# any run of trailing '_' / 'x' characters, so "deferredIncomeTax_x" would lose the
# final "x" of its own name as well and become "deferredIncomeTa".
df_to_append.rename(
    columns={
        col: col[: -len("_x")] for col in df_to_append.columns if col.endswith("_x")
    },
    inplace=True,
)

In [94]:
# Drop the 'id' column if it exists
# Remove the 'id' column in place when present; a frame without it is left untouched.
df_to_append.drop(columns="id", errors="ignore", inplace=True)

In [95]:
# Restore the full column name if it was truncated to "deferredIncomeTa" while the
# merge suffixes were stripped; this is a no-op when no such column exists.
df_to_append.rename(columns={"deferredIncomeTa": "deferredIncomeTax"}, inplace=True)

In [10]:
# Append `df` to the `cashflows` table.
# NOTE(review): the recorded output is "integer out of range" with values such as
# 19881000000 — presumably `cashflows` has 32-bit INTEGER columns; confirm the schema.
# NOTE(review): this stores `df`, not the de-duplicated `df_to_append` — confirm intent.
store_to_db(df, "cashflows", engine)

DataError: (psycopg2.errors.NumericValueOutOfRange) integer out of range

[SQL: INSERT INTO cashflows (date, symbol, "reportedCurrency", cik, "fillingDate", "acceptedDate", "calendarYear", period, "netIncome", "depreciationAndAmortization", "deferredIncomeTax", "stockBasedCompensation", "changeInWorkingCapital", "accountsReceiva ... 105776 characters truncated ... ngCashFlow__94)s, %(capitalExpenditure__94)s, %(freeCashFlow__94)s, %(link__94)s, %(finalLink__94)s)]
[parameters: {'changeInWorkingCapital__0': -2009000000, 'stockBasedCompensation__0': 2617000000, 'otherWorkingCapital__0': -3974000000, 'finalLink__0': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000077/aapl-20230701.htm', 'effectOfForexChangesOnCash__0': 0, 'netCashUsedProvidedByFinancingActivities__0': -24048000000, 'reportedCurrency__0': 'USD', 'freeCashFlow__0': 24287000000, 'accountsPayables__0': 3974000000, 'acceptedDate__0': '2023-08-03 18:04:43', 'accountsReceivables__0': -1987000000, 'otherNonCashItems__0': 3447000000, 'link__0': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000077/0000320193-23-000077-index.htm', 'inventory__0': -22000000, 'calendarYear__0': 2023, 'dividendsPaid__0': -3849000000, 'period__0': 'Q3', 'commonStockRepurchased__0': -19863000000, 'acquisitionsNet__0': 0, 'otherFinancingActivites__0': -2438000000, 'netCashUsedForInvestingActivites__0': 437000000, 'cashAtEndOfPeriod__0': 29898000000, 'salesMaturitiesOfInvestments__0': 12795000000, 'operatingCashFlow__0': 26380000000, 'cik__0': 320193, 'investmentsInPropertyPlantAndEquipment__0': -2093000000, 'otherInvestingActivites__0': -506000000, 'deferredIncomeTax__0': -608000000, 'date__0': datetime.datetime(2023, 7, 1, 0, 0), 'depreciationAndAmortization__0': 3052000000, 'commonStockIssued__0': 9602000000, 'netIncome__0': 19881000000, 'capitalExpenditure__0': -2093000000, 'symbol__0': 'AAPL', 'cashAtBeginningOfPeriod__0': 27129000000, 'netChangeInCash__0': 2769000000, 'netCashProvidedByOperatingActivities__0': 26380000000, 'purchasesOfInvestments__0': -9759000000, 'debtRepayment__0': -7500000000, 'fillingDate__0': '2023-08-04', 'changeInWorkingCapital__1': 231000000, 'stockBasedCompensation__1': 2686000000, 'otherWorkingCapital__1': 10340000000, 'finalLink__1': 'https://www.sec.gov/Archives/edgar/data/320193/000032019323000064/aapl-20230401.htm', 'effectOfForexChangesOnCash__1': 0, 'netCashUsedProvidedByFinancingActivities__1': -25724000000, 
'reportedCurrency__1': 'USD', 'freeCashFlow__1': 25644000000, 'accountsPayables__1': -14689000000, 'acceptedDate__1': '2023-05-04 18:03:52' ... 3700 parameters truncated ... 'commonStockIssued__93': 30000000, 'netIncome__93': 233000000, 'capitalExpenditure__93': -27000000, 'symbol__93': 'AAPL', 'cashAtBeginningOfPeriod__93': 1586000000, 'netChangeInCash__93': 76000000, 'netCashProvidedByOperatingActivities__93': 61000000, 'purchasesOfInvestments__93': -1366000000, 'debtRepayment__93': 0, 'fillingDate__93': '2000-05-11', 'changeInWorkingCapital__94': 237000000, 'stockBasedCompensation__94': 0, 'otherWorkingCapital__94': 81000000, 'finalLink__94': 'https://www.sec.gov/Archives/edgar/data/320193/000091205700003201/0000912057-00-003201.txt', 'effectOfForexChangesOnCash__94': 0, 'netCashUsedProvidedByFinancingActivities__94': -24000000, 'reportedCurrency__94': 'USD', 'freeCashFlow__94': 335000000, 'accountsPayables__94': 362000000, 'acceptedDate__94': '2000-02-01 00:00:00', 'accountsReceivables__94': -211000000, 'otherNonCashItems__94': -131000000, 'link__94': 'https://www.sec.gov/Archives/edgar/data/320193/000091205700003201/0000912057-00-003201-index.htm', 'inventory__94': 5000000, 'calendarYear__94': 2000, 'dividendsPaid__94': 0, 'period__94': 'Q1', 'commonStockRepurchased__94': -41000000, 'acquisitionsNet__94': 0, 'otherFinancingActivites__94': 0, 'netCashUsedForInvestingActivites__94': -89000000, 'cashAtEndOfPeriod__94': 1586000000, 'salesMaturitiesOfInvestments__94': 655000000, 'operatingCashFlow__94': 373000000, 'cik__94': 320193, 'investmentsInPropertyPlantAndEquipment__94': -38000000, 'otherInvestingActivites__94': -13000000, 'deferredIncomeTax__94': 64000000, 'date__94': datetime.datetime(2000, 1, 1, 0, 0), 'depreciationAndAmortization__94': 20000000, 'commonStockIssued__94': 17000000, 'netIncome__94': 183000000, 'capitalExpenditure__94': -38000000, 'symbol__94': 'AAPL', 'cashAtBeginningOfPeriod__94': 1326000000, 'netChangeInCash__94': 260000000, 
'netCashProvidedByOperatingActivities__94': 373000000, 'purchasesOfInvestments__94': -693000000, 'debtRepayment__94': 0, 'fillingDate__94': '2000-02-01'}]
(Background on this error at: https://sqlalche.me/e/20/9h9h)

In [68]:
# Largest value of each numeric column, to see which ones exceed the 32-bit integer range.
max_values = df_to_append.select_dtypes(include=["number"]).max()
print(max_values)

cik                                             1318605
calendarYear                                       2023
netIncome                                   20505000000
depreciationAndAmortization                  3979000000
deferredIncomeTax                           13669000000
stockBasedCompensation                       3232000000
changeInWorkingCapital                      27009000000
accountsReceivables                         11729000000
inventory                                    1654000000
accountsPayables                             6029000000
otherWorkingCapital                         31153000000
otherNonCashItems                           16400000000
netCashProvidedByOperatingActivities        28770000000
investmentsInPropertyPlantAndEquipment          -144000
acquisitionsNet                               213523000
purchasesOfInvestments                                0
salesMaturitiesOfInvestments                57702000000
otherInvestingActivites                      214