In [1]:
import pandas as pd
sp500_company_profiles_file_path="resources/sp500_company_profiles.csv"

def get_sp500_list() -> pd.DataFrame:
  # URL to the Wikipedia page for S&P 500 constituents
  url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

  # Fetch the table of S&P 500 companies
  tables = pd.read_html(url)
  sp500_table = tables[0]  # The first table contains the list of companies

  # Return the symbol and the comany name
  return sp500_table[["Symbol", "Security"]]

sp500_list = get_sp500_list()


In [105]:
def get_us_big_tech() -> str:
  url = "https://en.wikipedia.org/wiki/List_of_largest_technology_companies_by_revenue"
  tables = pd.read_html(url)
  big_tech = tables[1]
  us_big_tech = big_tech[big_tech["Country (origin)"] == "US"]
  return "|".join(us_big_tech["Company"]).replace("Inc.", "Inc")

us_big_tech_str_list = get_us_big_tech()
print(us_big_tech_str_list)


Amazon|Apple|Alphabet|Microsoft|AT&T|Meta|Dell Technologies|Intel|HP Inc|Nvidia|IBM


In [111]:
from pathlib import Path

def utilise_sp500_company_profiles_csv(sp500_list: pd.DataFrame):
  file_path = Path(sp500_company_profiles_file_path)
  if(file_path.exists()):
    previous_sp500_company_profiles = pd.read_csv(sp500_company_profiles_file_path)
    company_symbols = previous_sp500_company_profiles["Symbol"]
    new_sp500_list = sp500_list[~sp500_list["Symbol"].isin(company_symbols)]
    existing_sp500_company_profile = previous_sp500_company_profiles[company_symbols.isin(sp500_list["Symbol"])]
    return {'sp500_list': new_sp500_list, 'sp500_company_profile': existing_sp500_company_profile}
  return {'sp500_list': sp500_list, 'sp500_company_profile': None}
dist_sp500_list = utilise_sp500_company_profiles_csv(sp500_list)
print(dist_sp500_list["sp500_list"])

Empty DataFrame
Columns: [Symbol, Security]
Index: []


In [107]:
from concurrent.futures import ThreadPoolExecutor
import time
import finnhub
import asyncio

from settings import FINNHUB_API_KEY

finnhub_client = finnhub.Client(api_key=FINNHUB_API_KEY)

def process_sp500_row(row: pd.Series, trail: int=0):
  try:
    company_profile_json = finnhub_client.company_profile2(symbol=row["Symbol"])
    company_profile_json["Symbol"] = row["Symbol"]
    print(f"{row["Security"]}: success")

  except finnhub.FinnhubAPIException as e:
    if e.status_code == 429 and trail <= 4:
      #Wait for the specified duration before retrying
      retry_after = 30+(trail*5)
      print(f"Rate limit exceeded. Retry after {retry_after} seconds")
      time.sleep(retry_after)
      company_profile_json = process_sp500_row(row, trail+1)
      print(f"{row['Security']}: {trail}")
    else:
      print(f"Could not retrieve the company profile for {row["Security"]} in {trail} trails")
      company_profile_json = None
  return company_profile_json

# async def async_process_sp500_row(
#     row: pd.Series,
#     loop: asyncio.AbstractEventLoop,
#     executor: ThreadPoolExecutor):
#   result = await loop.run_in_executor(executor, process_sp500_row, row)
#   return result


# async def async_process_sp500_company_profiles(sp500_list: pd.DataFrame):
#   loop = asyncio.get_running_loop()
#   with ThreadPoolExecutor() as executor:
#     sp500_list = sp500_list.head(5)
#     tasks = [async_process_sp500_row(row, loop, executor) for index, row in sp500_list.iterrows()]
#     sp500_company_profiles = await asyncio.gather(*tasks)
#   return sp500_company_profiles

async def async_process_sp500_row(
    row: pd.Series,
    semaphore: asyncio.Semaphore):
  async with semaphore:
    return process_sp500_row(row)

async def async_process_sp500_company_profiles(sp500_list: pd.DataFrame, concurrency_limit: int = 3):
  semaphore = asyncio.Semaphore(concurrency_limit)
  tasks = [async_process_sp500_row(row, semaphore) for _, row in sp500_list.iterrows()]
  sp500_company_profiles = await asyncio.gather(*tasks)
  return sp500_company_profiles

current_sp500_list = dist_sp500_list["sp500_list"]
sp500_company_profiles_json = await async_process_sp500_company_profiles(current_sp500_list)
fetched_sp500_company_profiles = pd.DataFrame(sp500_company_profiles_json)
sp500_company_profiles = pd.concat([dist_sp500_list["sp500_company_profile"], fetched_sp500_company_profiles], ignore_index=True)
# print(sp500_company_profiles)

sp500_company_profiles.to_csv(sp500_company_profiles_file_path, na_rep="N/A", index=False)
nasdaq_sp500 = sp500_company_profiles[sp500_company_profiles["name"].str.contains(us_big_tech_str_list, case=False, na=False)]
nasdaq_sp500.shape
# sp500_company_profiles = pd.DataFrame(sp500_company_profiles_json)
# print(sp500_company_profiles)

# if __name__ == "__main__":
#   # sp500_company_profiles = asyncio.run(async_process_sp500_company_profiles(sp500_list)) # Uncomment if not in Jupyter notebooks. When running in Juptyer notebooks, it already has a running event loop thus no need to run a new event loop. This means we only need to wait on the async task
#   sp500_company_profiles = await async_process_sp500_company_profiles(sp500_list) # Comment if in Jupyter notebooks
#   sp500_company_profiles = pd.DataFrame(sp500_company_profiles_json)
#   print(sp500_company_profiles.dtypes)
#   print(sp500_company_profiles["ticker"])


(11, 14)

In [None]:
ticker_list = nasdaq_sp500["ticker"]
import websocket

def on_message(ws, message):
    print(message)

def on_error(ws, error):
    print(error)

def on_close(ws):
    print("### closed ###")

def on_open(ws):
    ticker_list_5 = ticker_list.head(100)
    for ticker in ticker_list_5:
        # ticker="GOOGL"
        ws.send(f'{{"type":"subscribe", "symbol":"{ticker}"}}')

if __name__ == "__main__":
    websocket.enableTrace(True)
    ws = websocket.WebSocketApp("wss://ws.finnhub.io?token=cugj791r01qr6jncvar0cugj791r01qr6jncvarg",
                              on_message = on_message,
                              on_error = on_error,
                              on_close = on_close)
    ws.on_open = on_open
    ws.run_forever()


In [127]:
# df = pd.DataFrame({
#     "id": [1, 2, 3],
#     "name": ["Alice", "Bob", "Charlie"],
#     "age": [25, 30, 35]
# })
x = {
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35]
}
id, age, name = x.values()

print(name)


[25, 30, 35]


In [118]:
import os
import sys
print(os.path.realpath(os.path.dirname(sys.argv[0])))

/Users/malemichael/miniconda3/envs/finnhubDPL/lib/python3.13/site-packages
