In [4]:
import numpy as np
import pandas as pd
import matplotlib as plt
import torch 
import torch.nn as nn
import yfinance as yf
import keras 
import os
import tensorflow
import datetime 
import requests
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Activation
from keras import optimizers
from keras.models import Model
from keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

In [5]:
#takes any ticker and returns X as past # days and Y as tmr's price, split into train and test
def DataPrep(ticker='aapl',daysago=100,split=0.9):
    df = yf.Ticker(ticker.upper()).history(period='max')
    if df.shape[1] == 7:
        df = df.drop(columns=['Dividends','Stock Splits'])
    df = df.drop(columns=['Open','High','Low','Volume'])
    for i in range(daysago,-2,-1):
        df['{}daysago'.format(i)] = df['Close'].shift(i)
        df = df.copy()
    df = df.drop(columns=['Close'])
    df = df.dropna()
    sc = MinMaxScaler(feature_range=(0,1))
    df_scaled = sc.fit_transform(df)
    X,Y = df_scaled[:,:daysago+1],df_scaled[:,-1]
    split = int(len(X)*split)
    X_train, X_test, Y_train, Y_test = X[:split], X[split:], Y[:split], Y[split:]
    X_train, X_test = X_train.reshape((-1,daysago+1,1)), X_test.reshape((-1,daysago+1,1))
    Y_train, Y_test = Y_train.reshape((-1,1)), Y_test.reshape((-1,1))
    return X_train, X_test, Y_train, Y_test

#takes any list of tickers, compiles x_train, x_test, y_train, y_test, @ a set daysago and split
def CompileData(tickers=['aapl','nvda'],daysago=100,split=0.9):
    X_train,X_test,Y_train,Y_test = [],[],[],[]
    X_train,X_test,Y_train,Y_test = np.array(X_train).reshape(-1,daysago+1,1),np.array(X_test).reshape(-1,daysago+1,1), np.array(Y_train).reshape(-1,1), np.array(Y_test).reshape(-1,1)
    for ticker in tickers:
        dataprep = DataPrep(ticker,daysago,split)
        X_train = np.append(X_train,dataprep[0],axis=0)
        X_test = np.append(X_test,dataprep[1],axis=0)
        Y_train = np.append(Y_train,dataprep[2],axis=0)
        Y_test = np.append(Y_test,dataprep[3],axis=0)
    return X_train, X_test, Y_train, Y_test
CompileData()

(array([[[3.09576982e-04],
         [2.83417229e-04],
         [2.48533336e-04],
         ...,
         [2.51314511e-04],
         [2.40308644e-04],
         [2.45811514e-04]],
 
        [[2.83417229e-04],
         [2.48533336e-04],
         [2.59433459e-04],
         ...,
         [2.40308644e-04],
         [2.45811514e-04],
         [2.49480094e-04]],
 
        [[2.48533336e-04],
         [2.59433459e-04],
         [2.72517295e-04],
         ...,
         [2.45811514e-04],
         [2.49480094e-04],
         [2.40308644e-04]],
 
        ...,
 
        [[2.45865174e-01],
         [2.43914339e-01],
         [2.40433014e-01],
         ...,
         [1.64214632e-01],
         [1.67514750e-01],
         [1.61415468e-01]],
 
        [[2.43914339e-01],
         [2.40433014e-01],
         [2.38686916e-01],
         ...,
         [1.67514750e-01],
         [1.61415468e-01],
         [1.68015674e-01]],
 
        [[2.40433014e-01],
         [2.38686916e-01],
         [2.41931176e-01],
         

In [6]:
def ReadIndex(filename='SP500',blacklist=['BRK.B','BF.B','WRK']):
    df = pd.read_csv('{}.csv'.format(filename))
    ListOfTickers = df['Symbol'].tolist()
    for ticker in blacklist:
        ListOfTickers.remove(ticker)
    return ListOfTickers

In [7]:
# function to get the CIK of a company from the SEC database
def get_cik(ticker):
    ticker_symbol = ticker.upper()

    # getting all the companies ticker and CIK from the SEC database as a dictionary
    companyTickers = requests.get('https://www.sec.gov/files/company_tickers.json', headers={'User-Agent': "testing@gmail.com"}).json().values()

    # returns {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'} for each company
    for company in companyTickers:
        if company['ticker'] == ticker_symbol:
            return(company)

    return('Ticker CIK not found in SEC database')

# function to get the SEC financial reports of a company
def sec_filings(ticker):

    ticker_symbol = ticker.upper()

    # grab CIK and fill leading zeroes
    cik = str(get_cik(ticker_symbol)['cik_str']).zfill(10)

    filingData = requests.get(f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',headers={'User-Agent': "testing@gmail.com"}).json()

    return filingData

    # used to check the key names for category of data
    # print(filingData['facts']['us-gaap'].keys())



In [197]:
# function to get the actual SEC financial metrics from the filing data
def metrics(ticker,metrics):
    metricList = []
    filingData = sec_filings(ticker)
    index = pd.date_range(start='1900-01-01', end=datetime.datetime.now(), freq='D')
    for metric in metrics:
        df = pd.DataFrame(columns=[f"{ticker.upper()}'s {metric}"],index=index)
        for i in filingData['facts']['us-gaap'][f'{metric}']['units']['USD']:
            if i['form'] == '10-K' or (i['fp'] == 'Q3' or i['fp'] == 'Q1' or i['fp'] == 'Q2'):
                df.at[i['filed'], f"{ticker.upper()}'s {metric}"] = i['val']
        df = df.infer_objects(copy=False).ffill().fillna(0)
        metricList.append(df)
    return metricList


CommonMetrics =['NetCashProvidedByUsedInFinancingActivities', 'NetCashProvidedByUsedInInvestingActivities', 'Assets', 'LiabilitiesAndStockholdersEquity']

metrics('Abnb',CommonMetrics)   

[            ABNB's NetCashProvidedByUsedInFinancingActivities
 1900-01-01                                       0.000000e+00
 1900-01-02                                       0.000000e+00
 1900-01-03                                       0.000000e+00
 1900-01-04                                       0.000000e+00
 1900-01-05                                       0.000000e+00
 ...                                                       ...
 2024-08-06                                       2.945000e+09
 2024-08-07                                       2.945000e+09
 2024-08-08                                       2.945000e+09
 2024-08-09                                       2.945000e+09
 2024-08-10                                       2.945000e+09
 
 [45513 rows x 1 columns],
             ABNB's NetCashProvidedByUsedInInvestingActivities
 1900-01-01                                                0.0
 1900-01-02                                                0.0
 1900-01-03               

In [198]:
#this was to find the intersection of two lists, for finding CommonMetrics
def find_intersection(list1, list2):
    intersection = list(set(list1) & set(list2))
    return intersection

# f = list(sec_filings('MMM'))

# for ticker in ReadIndex():
#     print(ticker)
#     e = list(sec_filings(ticker))
#     intersect = find_intersection(e,f)
#     f = intersect
#     print(f)

#this is the list of common metrics between all SEC filings, can be used to compare companies
CommonMetrics =['NetCashProvidedByUsedInFinancingActivities', 'NetCashProvidedByUsedInInvestingActivities', 'Assets', 'LiabilitiesAndStockholdersEquity']

#loop for all tickers in the S&P500
for ticker in ReadIndex():
    print(ticker)
    e = metrics(ticker,CommonMetrics)


   

MMM
AOS
ABT
ABBV
ACN
ADBE
AMD
AES
AFL
A
APD
ABNB
AKAM
ALB
ARE
ALGN
ALLE
LNT
ALL
GOOGL
GOOG
MO
AMZN
AMCR
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
AME
AMGN
APH
ADI
ANSS
AON
APA
AAPL
AMAT
APTV
ACGL
ADM
ANET
AJG
AIZ
T
ATO
ADSK
ADP
AZO
AVB
AVY
AXON
BKR
BALL
BAC
BK
BBWI
BAX
BDX
BBY
BIO
TECH
BIIB
BLK
BX
BA
BKNG
BWA
BXP
BSX
BMY
AVGO
BR
BRO
BLDR
BG
CDNS
CZR
CPT
CPB
COF
CAH
KMX
CCL
CARR
CTLT
CAT
CBOE
CBRE
CDW
CE
COR
CNC
CNP
CF
CHRW
CRL
SCHW
CHTR
CVX
CMG
CB
CHD
CI
CINF
CTAS
CSCO
C
CFG
CLX
CME
CMS
KO
CTSH
CL
CMCSA
CMA
CAG
COP
ED
STZ
CEG
COO
CPRT
GLW
CPAY
CTVA
CSGP
COST
CTRA
CCI
CSX
CMI
CVS
DHR
DRI
DVA
DAY
DECK
DE
DAL
DVN
DXCM
FANG
DLR
DFS
DG
DLTR
D
DPZ
DOV
DOW
DHI
DTE
DUK
DD
EMN
ETN
EBAY
ECL
EIX
EW
EA
ELV
LLY
EMR
ENPH
ETR
EOG
EPAM
EQT
EFX
EQIX
EQR
ESS
EL
ETSY
EG
EVRG
ES
EXC
EXPE
EXPD
EXR
XOM
FFIV
FDS
FICO
FAST
FRT
FDX
FIS
FITB
FSLR
FE
FI
FMC
F
FTNT
FTV
FOXA
FOX
BEN
FCX
GRMN
IT
GE
GEHC
GEV
GEN
GNRC
GD
GIS
GM
GPC
GILD
GPN
GL
GS
HAL
HIG
HAS
HCA
DOC
HSIC
HSY
HES
HPE
HLT
HOLX
HD
HON
HRL
HST
HWM
HPQ
HUBB
HUM
HBA