In [119]:
import os
import numpy as np
import pandas as pd
import requests
import json
import time

# Stock scraping

### Functions

In [120]:
def get_sp500_list():
    """
    Return the ticker symbols of the S&P 500 constituents as a list.

    The symbols are read from the "Symbol" column of the first HTML table
    that pandas parses out of the Wikipedia "List of S&P 500 companies" page.
    """
    wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    # read_html returns one DataFrame per <table>; the first one is used here.
    constituents = pd.read_html(wiki_url)[0]
    return constituents["Symbol"].tolist()

### Constants

In [121]:
# API credentials are read from the environment so that they never live in the
# notebook or its version history. Export ALPHAVANTAGE_API_KEY and FMP_API_KEY
# before starting the kernel; a missing variable resolves to an empty string,
# in which case the API will reject the request.
# NOTE(review): the keys that were previously committed in this cell should be
# treated as exposed and rotated.
API_KEY_ALPHAVANTAGE = os.environ.get('ALPHAVANTAGE_API_KEY', '')
API_KEY_FAM = os.environ.get('FMP_API_KEY', '')

BASE_URL_FAM = 'https://financialmodelingprep.com/api/v3'

# Request budget: the download loop stops once fewer than three requests
# remain between req_count and REQUEST_LIMIT.
REQUEST_LIMIT = 250
# Starting value of the running request counter.
# NOTE(review): presumably the number of requests already spent against the
# current quota window when this cell was last edited; confirm before a run.
req_count = 12
# Passed as the `limit` query parameter of each statement request.
YEARS = 15

# Directory the per-ticker statement CSV files are written to.
DATA_DIR = '../financial_data'

# List of required metrics
required_metrics = [
    "GrossProfit",
    "Revenues",
    "NetIncomeLoss",
    "StockholdersEquity",
    "Liabilities",
    "AssetsNoncurrent",
    "NetCashProvidedByUsedInOperatingActivities"
]

## Data collection

In [122]:
# Tickers used to seed the "free" progress table and the download loop.
# NOTE(review): presumably the symbols reachable with a free
# Financial Modeling Prep plan; confirm against the provider's current list.
FAM_free_access = [
    'AAPL', 'TSLA', 'AMZN', 'MSFT', 'NVDA', 'GOOGL', 'META', 'NFLX', 'JPM', 'V',
    'BAC', 'AMD', 'PYPL', 'DIS', 'T', 'PFE', 'COST', 'INTC', 'KO', 'TGT',
    'NKE', 'SPY', 'BA', 'BABA', 'XOM', 'WMT', 'GE', 'CSCO', 'VZ', 'JNJ',
    'CVX', 'PLTR', 'SQ', 'SHOP', 'SBUX', 'SOFI', 'HOOD', 'RBLX', 'SNAP', 'UBER',
    'FDX', 'ABBV', 'ETSY', 'MRNA', 'LMT', 'GM', 'F', 'RIVN', 'LCID', 'CCL',
    'DAL', 'UAL', 'AAL', 'TSM', 'SONY', 'ET', 'NOK', 'MRO', 'COIN', 'SIRI',
    'RIOT', 'CPRX', 'VWO', 'SPYG', 'ROKU', 'VIAC', 'ATVI', 'BIDU', 'DOCU', 'ZM',
    'PINS', 'TLRY', 'WBA', 'MGM', 'NIO', 'C', 'GS', 'WFC', 'ADBE', 'PEP',
    'UNH', 'CARR', 'FUBO', 'HCA', 'TWTR', 'BILI', 'RKT',
]

In [123]:
def _load_or_create_progress(csv_path, tickers):
    """Load a download-progress table from `csv_path`, creating it on first use.

    The table is indexed by ticker and has three boolean columns
    (`income_done`, `balance_done`, `cashflow_done`). When the file does not
    exist yet, a table with every flag set to False is built for `tickers`,
    written to `csv_path` and returned.
    """
    if os.path.exists(csv_path):
        print(f"{csv_path} already exists, therefore import it.")
        return pd.read_csv(csv_path, index_col='ticker')

    progress = pd.DataFrame(
        False,
        index=pd.Index(tickers, name='ticker'),
        columns=['income_done', 'balance_done', 'cashflow_done'],
    )
    progress.to_csv(csv_path)
    return progress


# Create .csv files to keep track of the company data that I got
if not os.path.exists("sp500_tickers.csv"):
    sp500_list = get_sp500_list()
    df_sp500 = pd.DataFrame(sp500_list)
    df_sp500.to_csv("sp500_tickers.csv", index=False)
else:
    print("sp500_tickers.csv already exists, therefore import it.")
    df_sp500 = pd.read_csv("sp500_tickers.csv")
    # Rebuild the list from the cached file (a single column of symbols) so
    # that sp500_list is defined on every path, not only on the first run.
    sp500_list = df_sp500.iloc[:, 0].tolist()

# Both progress tables get the same name regardless of whether they were
# freshly created or loaded from disk.
df_data_progress_sp500 = _load_or_create_progress("data_progress_sp500.csv", sp500_list)
df_data_progress_free = _load_or_create_progress("data_progress_free.csv", FAM_free_access)

sp500_tickers.csv already exists, therefore import it.
data_progress_sp500.csv already exists, therefore import it.


In [124]:
# Quick look at the free-access progress table.
print(df_data_progress_free)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df_data_progress_free.info()

        income_done  balance_done  cashflow_done
ticker                                          
AAPL          False         False          False
TSLA          False         False          False
AMZN          False         False          False
MSFT          False         False          False
NVDA          False         False          False
...             ...           ...            ...
FUBO          False         False          False
HCA           False         False          False
TWTR          False         False          False
BILI          False         False          False
RKT           False         False          False

[87 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, AAPL to RKT
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   income_done    87 non-null     bool 
 1   balance_done   87 non-null     bool 
 2   cashflow_done  87 non-null     bool 
dtypes: bool(3)
memory usage:

In [125]:

# Tickers whose statements will be downloaded (the index of the progress table).
list_tickers = df_data_progress_free.index.tolist()

# check if the available tickers is in the S&P500 list
# The comparison is made against the symbols stored in df_sp500 (its single
# column); comparing FAM_free_access with list_tickers would be a no-op because
# list_tickers is derived from FAM_free_access itself.
sp500_symbols = set(df_sp500.iloc[:, 0].astype(str))
free_not_in_sp500 = [ticker for ticker in FAM_free_access if ticker not in sp500_symbols]
print(f"{len(free_not_in_sp500)} of {len(FAM_free_access)} free-access tickers are not in the s&p500 list")


In [None]:
# Column names grouped per statement type; the names suggest they are meant to
# be removed from the downloaded statements before saving.
drop_income_cols = [
    'reportedCurrency', 'cik', 'fillingDate', 'acceptedDate', 'calendarYear',
    'link', 'finalLink',
    'eps', 'epsdiluted',
    'weightedAverageShsOut', 'weightedAverageShsOutDil',
    'grossProfitRatio', 'ebitdaratio', 'operatingIncomeRatio',
    'incomeBeforeTaxRatio',
]

drop_balance_columns = [
    'reportedCurrency', 'cik', 'fillingDate', 'acceptedDate',
    'link', 'calendarYear', 'finalLink',
    'commonStockSharesOutstanding',
]

drop_cashflow_columns = [
    'reportedCurrency', 'cik', 'fillingDate', 'acceptedDate',
    'link', 'calendarYear', 'finalLink',
]


In [None]:

# Optional cap on how many tickers are worked on in one run. Set it to a small
# integer for a trial run; None means "continue until the request budget is
# used up or every ticker is done".
MAX_TICKERS_PER_RUN = None
# Seconds to wait for the API before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 30

# One entry per statement type:
# (progress column, API endpoint, output file suffix, columns to drop, label)
STATEMENT_SPECS = [
    ('income_done', 'income-statement', 'income', drop_income_cols, 'income statement'),
    ('balance_done', 'balance-sheet-statement', 'balance', drop_balance_columns, 'balance sheet'),
    ('cashflow_done', 'cash-flow-statement', 'cashflow', drop_cashflow_columns, 'cash flow statement'),
]


def _redact_api_key(text):
    """Return str(text) with the API key masked.

    The key travels in the query string, so exception messages and error
    bodies may echo it; masking keeps it out of the saved notebook output.
    """
    text = str(text)
    return text.replace(API_KEY_FAM, '***') if API_KEY_FAM else text


count = 0          # number of tickers worked on during this run
abort_run = False  # set when a request fails, to stop the whole run

# to_csv does not create missing directories.
os.makedirs(DATA_DIR, exist_ok=True)

try:
    # loop through the tickers and get their income, balance and cashflow statements
    for ticker in list_tickers:

        if MAX_TICKERS_PER_RUN is not None and count >= MAX_TICKERS_PER_RUN:
            break

        # Statement types still missing for this ticker.
        pending = [
            spec for spec in STATEMENT_SPECS
            if not df_data_progress_free.loc[ticker, spec[0]]
        ]
        if not pending:
            continue

        # Ensure the correct amount of API calls: a ticker needs up to one
        # request per statement type, so stop while that many still remain.
        if REQUEST_LIMIT - req_count < len(STATEMENT_SPECS):
            print(f"Request limit reached, curr count: {req_count}")
            break

        for done_col, endpoint, suffix, drop_cols, label in pending:
            print(f"Getting {label} for {ticker}...")
            try:
                response = requests.get(
                    f"{BASE_URL_FAM}/{endpoint}/{ticker}",
                    params={'period': 'annual', 'limit': YEARS, 'apikey': API_KEY_FAM},
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )
            except requests.RequestException as e:
                print(f"Request failed for {ticker}: {_redact_api_key(e)}")
                abort_run = True
                break
            # Only requests that actually received a response are counted.
            req_count += 1

            print(f"Status Code: {response.status_code}")
            if response.status_code != 200:
                print(f"Error fetching {ticker}: {_redact_api_key(response.text)}")
                abort_run = True
                break

            try:
                data = response.json()
            except ValueError:
                print(f"Response for {ticker} is not valid JSON")
                abort_run = True
                break

            # A non-empty list of records is the only payload that is saved;
            # anything else (empty list, error object) is reported as no data.
            if isinstance(data, list) and data:
                df = pd.DataFrame(data).drop(columns=drop_cols, errors='ignore')
                df.to_csv(f"{DATA_DIR}/{ticker}_{suffix}.csv", index=False)
                df_data_progress_free.loc[ticker, done_col] = True
                print(f"{label.capitalize()} downloaded for {ticker}")
            else:
                print(f"No data returned for {ticker}")

            # Pause between consecutive requests.
            time.sleep(1)

        if abort_run:
            break

        print()
        count += 1
finally:
    # Persist progress even when the loop is interrupted or raises, so that
    # completed downloads are not repeated on the next run.
    df_data_progress_free.to_csv("data_progress_free.csv")

# Data Cleaning

# Machine learning