In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import torch 
import torch.nn as nn
import yfinance as yf
import keras 
import os
import tensorflow
import datetime 
import requests
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Activation
from keras import optimizers
from keras.models import Model
from keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [37]:
#takes any ticker and returns X as past # days and Y as tmr's price, split into train and test
def DataPrep(ticker='aapl',daysago=100,split=0.9):
    """Download a ticker's closing prices and turn them into LSTM-ready windows.

    Every sample holds the closes from ``daysago`` days back up to the current
    day (``daysago + 1`` values, oldest first) as input and the next day's
    close as target.

    Args:
        ticker: symbol passed to yfinance (upper-cased before the request).
        daysago: number of past days included in each window besides the
            current day.
        split: fraction of the samples (in chronological order) used for
            training; the remainder is the test set.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` where the X arrays have shape
        ``(samples, daysago + 1, 1)`` and the Y arrays ``(samples, 1)``.

    Raises:
        ValueError: if no ``Close`` column comes back for the ticker, or the
            history is too short to build a single complete window.
    """
    symbol = ticker.upper()
    history = yf.Ticker(symbol).history(period='max')

    #select the close explicitly instead of dropping a fixed set of columns, so extra
    #columns in the download can never leak into the feature matrix
    if 'Close' not in history.columns:
        raise ValueError(f'No price history returned for {symbol}')
    close = history['Close']

    #one column per lag (daysago ... 0) plus the -1 shift, which is tomorrow's close;
    #built with a single concat instead of growing and copying the frame on every iteration
    lagged = pd.concat(
        {'{}daysago'.format(i): close.shift(i) for i in range(daysago, -2, -1)},
        axis=1,
    ).dropna()
    if lagged.empty:
        raise ValueError(f'Not enough price history for {symbol} to build {daysago + 1}-day windows')

    #NOTE(review): the scaler is fitted on all rows before the train/test split (and per column),
    #so test-period extremes influence the training data; kept as-is so results stay comparable
    #with previously saved models
    scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(lagged)
    X, Y = scaled[:, :daysago+1], scaled[:, -1]

    #chronological split, then reshape to (samples, timesteps, features) for the LSTM
    cut = int(len(X)*split)
    X_train, X_test = X[:cut].reshape((-1,daysago+1,1)), X[cut:].reshape((-1,daysago+1,1))
    Y_train, Y_test = Y[:cut].reshape((-1,1)), Y[cut:].reshape((-1,1))
    return X_train, X_test, Y_train, Y_test

#takes any list of tickers, compiles x_train, x_test, y_train, y_test, @ a set daysago and split
def CompileData(tickers=['aapl','nvda'],daysago=100,split=0.9):
    """Build one pooled train/test data set from several tickers.

    Calls ``DataPrep(ticker, daysago, split)`` for each ticker in order and
    stacks the four returned arrays along the sample axis.

    Args:
        tickers: iterable of ticker symbols (the default list is never mutated).
        daysago: window length forwarded to ``DataPrep``.
        split: train fraction forwarded to ``DataPrep``.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``; X arrays have shape
        ``(samples, daysago + 1, 1)`` and Y arrays ``(samples, 1)``. With no
        tickers, all four arrays are empty but keep those trailing dimensions.
    """
    #each list starts with a correctly shaped empty float array so that an empty ticker
    #list still yields arrays of the expected rank
    x_train_parts = [np.empty((0, daysago+1, 1))]
    x_test_parts = [np.empty((0, daysago+1, 1))]
    y_train_parts = [np.empty((0, 1))]
    y_test_parts = [np.empty((0, 1))]

    #collect the per-ticker pieces and join them once at the end; appending to the
    #big arrays inside the loop would re-copy everything gathered so far for every ticker
    for ticker in tickers:
        print(f'Compiling Data for {ticker}')
        x_train, x_test, y_train, y_test = DataPrep(ticker,daysago,split)
        x_train_parts.append(x_train)
        x_test_parts.append(x_test)
        y_train_parts.append(y_train)
        y_test_parts.append(y_test)

    return (
        np.concatenate(x_train_parts, axis=0),
        np.concatenate(x_test_parts, axis=0),
        np.concatenate(y_train_parts, axis=0),
        np.concatenate(y_test_parts, axis=0),
    )
#smoke test: runs every time this cell executes and downloads the full price history of the default tickers ('aapl', 'nvda')
CompileData()

#takes the name of a csv file and returns a list of tickers, with an optional blacklist
def ReadIndex(filename='SP500',blacklist=['BRK.B','BF.B','WRK','SOLV','GEV','SWKS']):
    """Read the ticker symbols of an index from ``<filename>.csv``.

    Args:
        filename: path of the CSV file without the ``.csv`` extension; the
            file must contain a ``Symbol`` column.
        blacklist: symbols to leave out of the result. Symbols that do not
            occur in the file are ignored, so the same blacklist can be used
            with any index file.

    Returns:
        List of symbols in file order, without the blacklisted ones.
    """
    symbols = pd.read_csv(f'{filename}.csv')['Symbol'].tolist()
    #filter instead of list.remove(): remove() raises ValueError as soon as one
    #blacklisted symbol is missing from the CSV
    excluded = set(blacklist)
    return [symbol for symbol in symbols if symbol not in excluded]
#smoke test: reads SP500.csv (the default filename), resolved relative to the working directory
ReadIndex()


#candidate metric names to pass to getIndexMetric one at a time (presumably keys of yf.Ticker(...).info -- TODO confirm); not referenced by the code in this cell
metricsIWannaTest = ['state', 'industryKey', 'sectorKey', 'fullTimeEmployees', 'companyOfficers', 'auditRisk', 'boardRisk', 'compensationRisk', 'shareHolderRightsRisk', 'overallRisk', 'maxAge', 'priceHint', 'previousClose', 'open', 'dayLow', 'dayHigh', 'regularMarketPreviousClose', 'regularMarketOpen', 'regularMarketDayLow', 'regularMarketDayHigh', 'dividendRate', 'dividendYield', 'payoutRatio', 'fiveYearAvgDividendYield', 'beta', 'trailingPE', 'forwardPE', 'volume', 'regularMarketVolume', 'averageVolume', 'averageVolume10days', 'averageDailyVolume10Day', 'bid', 'ask', 'bidSize', 'askSize', 'marketCap', 'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 'priceToSalesTrailing12Months', 'fiftyDayAverage', 'twoHundredDayAverage', 'trailingAnnualDividendRate', 'trailingAnnualDividendYield', 'enterpriseValue', 'profitMargins', 'floatShares', 'sharesOutstanding', 'sharesShort', 'sharesShortPriorMonth', 'sharesShortPreviousMonthDate', 'dateShortInterest', 'sharesPercentSharesOut', 'heldPercentInsiders', 'heldPercentInstitutions', 'shortRatio', 'shortPercentOfFloat', 'impliedSharesOutstanding', 'bookValue', 'priceToBook', 'mostRecentQuarter', 'earningsQuarterlyGrowth', 'netIncomeToCommon', 'trailingEps', 'forwardEps', 'pegRatio', 'lastSplitFactor', 'enterpriseToRevenue', 'enterpriseToEbitda', '52WeekChange', 'SandP52WeekChange', 'lastDividendValue', 'lastDividendDate', 'currentPrice', 'targetHighPrice', 'targetLowPrice', 'targetMeanPrice', 'targetMedianPrice', 'recommendationMean', 'numberOfAnalystOpinions', 'totalCash', 'totalCashPerShare', 'ebitda', 'totalDebt', 'quickRatio', 'currentRatio', 'totalRevenue', 'debtToEquity', 'revenuePerShare', 'returnOnAssets', 'returnOnEquity', 'freeCashflow', 'operatingCashflow', 'earningsGrowth', 'revenueGrowth', 'grossMargins', 'ebitdaMargins', 'operatingMargins', 'trailingPegRatio']
#takes the full list of tickers and a SINGLE metric, returns a grouped object of the metric, ranked by quantiles
def getIndexMetric(listOfTickers, metric):
    """Look up one numeric yfinance ``info`` metric per ticker and bucket the tickers into deciles.

    Args:
        listOfTickers: iterable of ticker symbols; they become the index of
            the underlying frame, so group members can be read back from
            ``group.index``.
        metric: name of a single key of ``yf.Ticker(ticker).info``.

    Returns:
        A pandas ``DataFrameGroupBy`` over a one-column frame (column named
        after ``metric``), grouped by ``pd.qcut(..., q=10)`` intervals.
        Tickers for which the metric is missing get NaN and therefore belong
        to no group.

    Raises:
        ValueError: if the metric values are not numeric, or ``pd.qcut``
            cannot form ten distinct bins.
    """
    column = f'{metric}'
    tickers = list(listOfTickers)

    #one .info lookup (network call) per ticker; .get() keeps a single missing
    #value from aborting the whole run after many lookups
    values = []
    for ticker in tickers:
        print(ticker)
        values.append(yf.Ticker(ticker).info.get(column))

    #index by the tickers that were actually queried (not a fresh ReadIndex() call),
    #so labels and values always line up; None becomes NaN in the float conversion
    frame = pd.DataFrame({column: np.asarray(values, dtype=float)}, index=tickers)

    #observed=False spells out the grouping behaviour used so far and avoids the
    #pandas FutureWarning about the changing default for categorical groupers
    return frame.groupby(pd.qcut(frame[column], q=10), observed=False)
#smoke test: performs one yfinance .info lookup per ticker; the grouped result is not stored and the same lookup is repeated by the testingModel call at the end of this cell
getIndexMetric(ReadIndex(), 'marketCap')

#takes the grouped object of a metric and returns a dictionary of train/test data for each quantile
def groupedIntoTrainableData(df_grouped):
    """Compile train/test data for every group of a grouped metric frame.

    Iterates ``df_grouped`` as ``(name, frame)`` pairs, passes the index
    labels of each frame to ``CompileData`` (default window and split) and
    stores the result under ``'10% box'``, ``'20% box'``, ... in iteration
    order.

    Returns:
        dict mapping the box label to the tuple returned by ``CompileData``.
    """
    return {
        f'{10 * position}% box': CompileData(members.index.tolist())
        for position, (_, members) in enumerate(df_grouped, start=1)
    }

#takes a dictionary of train/test data and trains a model on each set
def _buildLstmModel(inputShape):
    """Return a freshly initialised, compiled LSTM regressor for windows of shape ``inputShape``."""
    model = Sequential()
    model.add(InputLayer(inputShape))
    model.add(LSTM(66))
    model.add(Dense(33, 'sigmoid'))
    model.add(Dense(1, 'linear'))
    model.compile(loss='mse',optimizer=Adam(learning_rate=0.001), metrics=['mse'])
    return model

def trainingModel(dictOfData, metric):
    """Train one independent model per entry of ``dictOfData`` and checkpoint it.

    Args:
        dictOfData: mapping of box label to ``(X_train, X_test, Y_train,
            Y_test)``; only the two training arrays are used here.
        metric: metric name, used as the sub-folder of the checkpoint path.

    Side effects:
        Saves ``models/<metric>/<key>.keras`` after every epoch (5 epochs,
        batch size 32) through a ``ModelCheckpoint`` callback.
    """
    for key, dataset in dictOfData.items():
        X_train = dataset[0]
        Y_train = dataset[2]

        #a new model per box: reusing one instance would carry the weights learned on the
        #previous boxes into the next one, so the saved models would not be per-box models.
        #The input shape comes from the data, so windows other than 101 steps work as well.
        model = _buildLstmModel(X_train.shape[1:])

        #saves the model into folders
        cp1 = ModelCheckpoint(f'models/{metric}/{key}.keras', save_best_only=False)
        model.fit(X_train, Y_train, epochs=5, batch_size=32, callbacks=[cp1])

# trainingModel(groupedIntoTrainableData(getIndexMetric(ReadIndex(), 'marketCap')), 'marketCap')

#takes a dictionary of train/test data and tests a model on each set
def testingModel(dictOfData, metric):
    #loops through the dictionary of train/test data and tests the model on each set
    for key in dictOfData.keys():
        
        X_test = dictOfData[key][1]
        Y_test = dictOfData[key][3]
        model = load_model(f'model/{metric}/{key}.keras')
        y_pred = model.predict(X_test)

        #calculates the average absolute error 
        avg_error = (np.sum(np.sqrt((Y_test - y_pred)**2)))/len(Y_test)*100
        print(f"{key}'s abs avg error:{avg_error}")

        #calculates the average error
        avg_error = (np.sum(Y_test - y_pred))/len(Y_test)*100
        print(f"{key}'s non-abs avg error:{avg_error}")

#end-to-end evaluation run: fetch marketCap per ticker, group into deciles, download price data per group, then score the saved models
#requires the per-box .keras checkpoints to exist on disk already (the trainingModel call above is commented out)
testingModel(groupedIntoTrainableData(getIndexMetric(ReadIndex(), 'marketCap')),'marketCap')

Compiling Data for aapl
Compiling Data for nvda
MMM
AOS
ABT
ABBV
ACN
ADBE
AMD
AES
AFL
A
APD
ABNB
AKAM
ALB
ARE
ALGN
ALLE
LNT
ALL
GOOGL
GOOG
MO
AMZN
AMCR
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
AME
AMGN
APH
ADI
ANSS
AON
APA
AAPL
AMAT
APTV
ACGL
ADM
ANET
AJG
AIZ
T
ATO
ADSK
ADP
AZO
AVB
AVY
AXON
BKR
BALL
BAC
BK
BBWI
BAX
BDX
BBY
BIO
TECH
BIIB
BLK
BX
BA
BKNG
BWA
BXP
BSX
BMY
AVGO
BR
BRO
BLDR
BG
CDNS
CZR
CPT
CPB
COF
CAH
KMX
CCL
CARR
CTLT
CAT
CBOE
CBRE
CDW
CE
COR
CNC
CNP
CF
CHRW
CRL
SCHW
CHTR
CVX
CMG
CB
CHD
CI
CINF
CTAS
CSCO
C
CFG
CLX
CME
CMS
KO
CTSH
CL
CMCSA
CMA
CAG
COP
ED
STZ
CEG
COO
CPRT
GLW
CPAY
CTVA
CSGP
COST
CTRA
CCI
CSX
CMI
CVS
DHR
DRI
DVA
DAY
DECK
DE
DAL
DVN
DXCM
FANG
DLR
DFS
DG
DLTR
D
DPZ
DOV
DOW
DHI
DTE
DUK
DD
EMN
ETN
EBAY
ECL
EIX
EW
EA
ELV
LLY
EMR
ENPH
ETR
EOG
EPAM
EQT
EFX
EQIX
EQR
ESS
EL
ETSY
EG
EVRG
ES
EXC
EXPE
EXPD
EXR
XOM
FFIV
FDS
FICO
FAST
FRT
FDX
FIS
FITB
FSLR
FE
FI
FMC
F
FTNT
FTV
FOXA
FOX
BEN
FCX
GRMN
IT
GE
GEHC
GEN
GNRC
GD
GIS
GM
GPC
GILD
GPN
GL
GS
HAL
HIG
HAS
HCA
DOC
HSIC
HSY
HES
HPE


  metricinfo = metricinfo.groupby(pd.qcut(metricinfo[f'{metric}'], q=10))


ABBV
ACN
ADBE
AMD
AES
AFL
A
APD
ABNB
AKAM
ALB
ARE
ALGN
ALLE
LNT
ALL
GOOGL
GOOG
MO
AMZN
AMCR
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
AME
AMGN
APH
ADI
ANSS
AON
APA
AAPL
AMAT
APTV
ACGL
ADM
ANET
AJG
AIZ
T
ATO
ADSK
ADP
AZO
AVB
AVY
AXON
BKR
BALL
BAC
BK
BBWI
BAX
BDX
BBY
BIO
TECH
BIIB
BLK
BX
BA
BKNG
BWA
BXP
BSX
BMY
AVGO
BR
BRO
BLDR
BG
CDNS
CZR
CPT
CPB
COF
CAH
KMX
CCL
CARR
CTLT
CAT
CBOE
CBRE
CDW
CE
COR
CNC
CNP
CF
CHRW
CRL
SCHW
CHTR
CVX
CMG
CB
CHD
CI
CINF
CTAS
CSCO
C
CFG
CLX
CME
CMS
KO
CTSH
CL
CMCSA
CMA
CAG
COP
ED
STZ
CEG
COO
CPRT
GLW
CPAY
CTVA
CSGP
COST
CTRA
CCI
CSX
CMI
CVS
DHR
DRI
DVA
DAY
DECK
DE
DAL
DVN
DXCM
FANG
DLR
DFS
DG
DLTR
D
DPZ
DOV
DOW
DHI
DTE
DUK
DD
EMN
ETN
EBAY
ECL
EIX
EW
EA
ELV
LLY
EMR
ENPH
ETR
EOG
EPAM
EQT
EFX
EQIX
EQR
ESS
EL
ETSY
EG
EVRG
ES
EXC
EXPE
EXPD
EXR
XOM
FFIV
FDS
FICO
FAST
FRT
FDX
FIS
FITB
FSLR
FE
FI
FMC
F
FTNT
FTV
FOXA
FOX
BEN
FCX
GRMN
IT
GE
GEHC
GEN
GNRC
GD
GIS
GM
GPC
GILD
GPN
GL
GS
HAL
HIG
HAS
HCA
DOC
HSIC
HSY
HES
HPE
HLT
HOLX
HD
HON
HRL
HST
HWM
HPQ
HUBB
HUM
HBAN
HII
IBM
IEX
ID

  metricinfo = metricinfo.groupby(pd.qcut(metricinfo[f'{metric}'], q=10))


Compiling Data for ALB
Compiling Data for ALLE
Compiling Data for AAL
Compiling Data for APA
Compiling Data for AIZ
Compiling Data for BBWI
Compiling Data for BIO
Compiling Data for TECH
Compiling Data for BWA
Compiling Data for CZR
Compiling Data for CTLT
Compiling Data for CHRW
Compiling Data for CRL
Compiling Data for CMA
Compiling Data for DAY
Compiling Data for EMN
Compiling Data for EPAM
Compiling Data for ETSY
Compiling Data for FFIV
Compiling Data for FRT
Compiling Data for FMC
Compiling Data for BEN
Compiling Data for GNRC
Compiling Data for GL
Compiling Data for HAS
Compiling Data for HSIC
Compiling Data for HII
Compiling Data for IPG
Compiling Data for IVZ
Compiling Data for LW
Compiling Data for LKQ
Compiling Data for MKTX
Compiling Data for MTCH
Compiling Data for MGM
Compiling Data for MHK
Compiling Data for TAP
Compiling Data for MOS
Compiling Data for NCLH
Compiling Data for PARA
Compiling Data for PAYC
Compiling Data for PNW
Compiling Data for QRVO
Compiling Data for R

In [4]:
# function to get the CIK of a company from the SEC database
def get_cik(ticker, timeout=10):
    """Look up a company's entry in the SEC ticker/CIK table.

    Args:
        ticker: ticker symbol, matched case-insensitively (upper-cased first).
        timeout: seconds to wait for the SEC server before giving up.

    Returns:
        The matching entry, e.g. ``{'cik_str': 320193, 'ticker': 'AAPL',
        'title': 'Apple Inc.'}``, or the string
        ``'Ticker CIK not found in SEC database'`` when no entry matches.

    Raises:
        requests.HTTPError: if the SEC endpoint answers with an error status.
    """
    ticker_symbol = ticker.upper()

    # getting all the companies ticker and CIK from the SEC database as a dictionary
    # NOTE(review): the User-Agent is a placeholder address -- confirm it meets the SEC's access policy
    response = requests.get(
        'https://www.sec.gov/files/company_tickers.json',
        headers={'User-Agent': "testing@gmail.com"},
        timeout=timeout,  # without a timeout a stalled connection blocks the notebook indefinitely
    )
    # fail with the HTTP error itself instead of a JSON decoding error on an error page
    response.raise_for_status()

    # each value looks like {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'}
    for company in response.json().values():
        if company['ticker'] == ticker_symbol:
            return company

    return 'Ticker CIK not found in SEC database'

# function to get the SEC financial reports of a company
def sec_filings(ticker):
    """Download the XBRL "company facts" JSON of a company from data.sec.gov.

    Args:
        ticker: ticker symbol (case-insensitive).

    Returns:
        The decoded JSON document as a dict.

    Raises:
        ValueError: if ``get_cik`` finds no CIK for the ticker.
        requests.HTTPError: if the SEC endpoint answers with an error status.
    """
    ticker_symbol = ticker.upper()

    # get_cik returns a plain string when nothing matches; indexing that string with
    # 'cik_str' would fail with an unrelated-looking TypeError, so report it explicitly
    company = get_cik(ticker_symbol)
    if not isinstance(company, dict):
        raise ValueError(f'No SEC CIK found for ticker {ticker_symbol!r}')

    # grab CIK and fill leading zeroes
    cik = str(company['cik_str']).zfill(10)

    response = requests.get(
        f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
        headers={'User-Agent': "testing@gmail.com"},
        timeout=10,  # avoid hanging forever on a stalled connection
    )
    response.raise_for_status()
    filingData = response.json()

    return filingData

    # used to check the key names for category of data
    # print(filingData['facts']['us-gaap'].keys())

# function to get the actual SEC financial metrics from the filing data (one DataFrame per requested metric)
def metrics1(ticker,metrics):
    """Return one daily, forward-filled DataFrame per requested us-gaap metric.

    For each metric, the USD facts of the company's filing data are scanned;
    entries whose form is '10-K' or whose fiscal period is Q1/Q2/Q3 are
    written at their 'start' date into a frame indexed by every day from
    1900-01-01 until now. Gaps are forward-filled; days before the first
    entry stay NaN.

    Args:
        ticker: ticker symbol passed to ``sec_filings``.
        metrics: iterable of us-gaap tag names.

    Returns:
        List of single-column DataFrames (column ``"<TICKER>'s <metric>"``),
        in the order of ``metrics``.
    """
    frames = []
    filings = sec_filings(ticker)
    days = pd.date_range(start='1900-01-01', end=datetime.datetime.now(), freq='D')
    quarterly_periods = ('Q3', 'Q1', 'Q2')

    for metric in metrics:
        label = f"{ticker.upper()}'s {metric}"
        frame = pd.DataFrame(columns=[label],index=days)
        entries = filings['facts']['us-gaap'][f'{metric}']['units']['USD']
        for entry in entries:
            # annual reports are accepted by form type, quarterly figures by fiscal period
            if entry['form'] == '10-K' or entry['fp'] in quarterly_periods:
                frame.at[entry['start'], label] = entry['val']
        # the column starts out as object dtype; infer a numeric dtype before filling forward
        frames.append(frame.infer_objects(copy=False).ffill())
    return frames

def metrics2(ticker,metrics):
    """Return one compact DataFrame per requested us-gaap metric.

    For each metric, the USD facts of the company's filing data are loaded
    into a frame, reduced to rows whose form is '10-K' or whose fiscal period
    is Q1/Q2/Q3, cut down to the 'start' and 'val' columns, sorted by 'start'
    and the value column renamed to ``"<TICKER>'s <metric>"``. The original
    row labels are kept.

    Args:
        ticker: ticker symbol passed to ``sec_filings``.
        metrics: iterable of us-gaap tag names.

    Returns:
        List of DataFrames in the order of ``metrics``.
    """
    filings = sec_filings(ticker)
    frames = []
    for metric in metrics:
        facts = pd.DataFrame(filings['facts']['us-gaap'][f'{metric}']['units']['USD'])
        # annual reports by form type, quarterly figures by fiscal period
        wanted = (facts['form'] == '10-K') | facts['fp'].isin(['Q3', 'Q1', 'Q2'])
        frames.append(
            facts[wanted]
            .loc[:, ['start','val']]
            .sort_values('start')
            .rename(columns={'val': f"{ticker.upper()}'s {metric}"})
        )
    return frames

# us-gaap tags to extract; only one is active here (a longer four-tag list is kept commented out in a later cell)
CommonMetrics =['NetCashProvidedByUsedInFinancingActivities']

# sample call against the live SEC API: shows the frame for the first (and only) requested metric
metrics2('Amzn',CommonMetrics)[0]


In [6]:
#this was to find the intersection of two lists, for finding CommonMetrics
def find_intersection(list1, list2):
    """Return the items that occur in both lists, without duplicates.

    The result follows the order of first appearance in ``list1``, so
    repeated runs give the same list (the order of a plain set intersection
    is arbitrary and, for strings, can change between interpreter sessions).
    Items must be hashable.
    """
    in_second = set(list2)
    # dict.fromkeys removes duplicates while keeping first-seen order
    return [item for item in dict.fromkeys(list1) if item in in_second]

# f = list(sec_filings('MMM'))

# for ticker in ReadIndex():
#     print(ticker)
#     e = list(sec_filings(ticker))
#     intersect = find_intersection(e,f)
#     f = intersect
#     print(f)

#this is the list of common metrics between all SEC filings, can be used to compare companies
# CommonMetrics = ['NetCashProvidedByUsedInFinancingActivities', 'NetCashProvidedByUsedInInvestingActivities', 'Assets', 'LiabilitiesAndStockholdersEquity']

# #loop for all tickers in the S&P500
# finAct = pd.DataFrame()
# invAct = pd.DataFrame()
# assets = pd.DataFrame()
# liaANDeq = pd.DataFrame()

# for ticker in ReadIndex():
#     print(ticker)
#     e = metrics2(ticker,CommonMetrics)
#     finAct = pd.concat([finAct,e[0]],axis=1)
#     invAct = pd.concat([invAct,e[1]],axis=1)
#     assets = pd.concat([assets,e[2]],axis=1)
#     liaANDeq = pd.concat([liaANDeq,e[3]],axis=1)

# finAct.fillna(0)
# finAct.columns[finAct.notna().all()]
# .plot(figsize=(10,5)) 
# invAct 
# assets 
# liaANDeq 
# finAct = finAct.dropna()
# invAct = invAct.dropna()
# assets = assets.dropna()
# liaANDeq = liaANDeq.dropna()
# getIndexMrkCap is not defined anywhere in this notebook (it only worked from stale kernel state);
# the current name of the function is getIndexMetric
e = groupedIntoTrainableData(getIndexMetric(ReadIndex(), 'marketCap'))


In [10]:
# e = list(yf.Ticker('aapl').info.keys())
# f = yf.Ticker('aapl').info
# fawk = []
# for i in range(0,len(e)):
#     print(e[i], f[e[i]])
#     yah = input('yah?')
#     if yah == '':
#         fawk.append(e[i])
#     elif yah == 'stop':
#         break
# print(fawk)

def testBaseline(listOfTickers):
    wallah = CompileData(listOfTickers)
    X_train = wallah[0]
    X_test = wallah[1]
    Y_train = wallah[2]
    Y_test = wallah[3]

    # modelwallah = Sequential()
    # modelwallah.add(InputLayer((101, 1)))
    # modelwallah.add(LSTM(66))
    # modelwallah.add(Dense(33, 'sigmoid'))
    # modelwallah.add(Dense(1, 'linear'))
    # modelwallah.compile(loss='mse',optimizer=Adam(learning_rate=0.001), metrics=['mse'])
    # cp1 = ModelCheckpoint('models/base/test_case.keras', save_best_only=False)
    # modelwallah.fit(X_train, Y_train, epochs=5, batch_size=32, callbacks=[cp1])
    modelwallah = load_model('models/base/test_case.keras')
    y_pred = modelwallah.predict(X_test)
    avg_error = (np.sum(np.sqrt((Y_test - y_pred)**2)))/len(Y_test)*100
    print(avg_error)
    avg_error = (np.sum(Y_test - y_pred))/len(Y_test)*100
    print(avg_error)
    # 0.961587031180842
    # 0.3912897285342797


MMM
AOS
ABT
ABBV
ACN
ADBE
AMD
AES
AFL
A
APD
ABNB
AKAM
ALB
ARE
ALGN
ALLE
LNT
ALL
GOOGL
GOOG
MO
AMZN
AMCR
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
AME
AMGN
APH
ADI
ANSS
AON
APA
AAPL
AMAT
APTV
ACGL
ADM
ANET
AJG
AIZ
T
ATO
ADSK
ADP
AZO
AVB
AVY
AXON
BKR
BALL
BAC
BK
BBWI
BAX
BDX
BBY
BIO
TECH
BIIB
BLK
BX
BA
BKNG
BWA
BXP
BSX
BMY
AVGO
BR
BRO
BLDR
BG
CDNS
CZR
CPT
CPB
COF
CAH
KMX
CCL
CARR
CTLT
CAT
CBOE
CBRE
CDW
CE
COR
CNC
CNP
CF
CHRW
CRL
SCHW
CHTR
CVX
CMG
CB
CHD
CI
CINF
CTAS
CSCO
C
CFG
CLX
CME
CMS
KO
CTSH
CL
CMCSA
CMA
CAG
COP
ED
STZ
CEG
COO
CPRT
GLW
CPAY
CTVA
CSGP
COST
CTRA
CCI
CSX
CMI
CVS
DHR
DRI
DVA
DAY
DECK
DE
DAL
DVN
DXCM
FANG
DLR
DFS
DG
DLTR
D
DPZ
DOV
DOW
DHI
DTE
DUK
DD
EMN
ETN
EBAY
ECL
EIX
EW
EA
ELV
LLY
EMR
ENPH
ETR
EOG
EPAM
EQT
EFX
EQIX
EQR
ESS
EL
ETSY
EG
EVRG
ES
EXC
EXPE
EXPD
EXR
XOM
FFIV
FDS
FICO
FAST
FRT
FDX
FIS
FITB
FSLR
FE
FI
FMC
F
FTNT
FTV
FOXA
FOX
BEN
FCX
GRMN
IT
GE
GEHC
GEN
GNRC
GD
GIS
GM
GPC
GILD
GPN
GL
GS
HAL
HIG
HAS
HCA
DOC
HSIC
HSY
HES
HPE
HLT
HOLX
HD
HON
HRL
HST
HWM
HPQ
HUBB
HUM
HBAN
HI

  metricinfo = metricinfo.groupby(pd.qcut(metricinfo[f'{metric}'], q=10))


Compiling Data for AOS
Compiling Data for ALB
Compiling Data for ALLE
Compiling Data for AAL
Compiling Data for APA
Compiling Data for AIZ
Compiling Data for BBWI
Compiling Data for BIO
Compiling Data for TECH
Compiling Data for BWA
Compiling Data for CZR
Compiling Data for CTLT
Compiling Data for CHRW
Compiling Data for CRL
Compiling Data for CMA
Compiling Data for DAY
Compiling Data for EMN
Compiling Data for EPAM
Compiling Data for ETSY
Compiling Data for FFIV
Compiling Data for FRT
Compiling Data for FMC
Compiling Data for BEN
Compiling Data for GNRC
Compiling Data for GL
Compiling Data for HAS
Compiling Data for HSIC
Compiling Data for HII
Compiling Data for IPG
Compiling Data for IVZ
Compiling Data for LW
Compiling Data for LKQ
Compiling Data for MKTX
Compiling Data for MTCH
Compiling Data for MGM
Compiling Data for MHK
Compiling Data for TAP
Compiling Data for MOS
Compiling Data for NCLH
Compiling Data for PARA
Compiling Data for PAYC
Compiling Data for PNW
Compiling Data for QR