In [1]:
import torch
from torch_geometric.data import HeteroData

import pandas as pd
import numpy as np

import os
import json

import re

from utils.utils import *    # import custom functions from utils module for cleaning up name strings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lift pandas' display truncation so wide frames are rendered with every row and column.
# Nothing is truncated after this, so keep previews small (e.g. `.head(1)`).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Root folder holding the input data and every artifact this notebook writes
DATA_PATH = './data'

# Output folders: entity name -> integer id maps, and one edge index tensor per relation
ID_MAPPING = os.path.join(DATA_PATH, 'entity_id_map')
EDGE_INDEX = os.path.join(DATA_PATH, 'edge_index')

# Create both output folders up front; exist_ok keeps the cell safe to re-run
for output_dir in (ID_MAPPING, EDGE_INDEX):
    os.makedirs(output_dir, exist_ok=True)

# 1. Grab entities
Companies, symbols, mutual funds, institutions, C-level board

Relations:
* `company` (shortName) --- has symbol ---> `stock symbol`
* `stock symbol` --- listed on ---> `exchange`
* `company` --- belongs to ---> `industry`
* `industry` --- is part of ---> `sector`
* `company` --- employs C-level officer ---> `officer`
* `mutual fund` --- has symbol ---> `mutualfund symbol`
* `institution` --- holds ---> `stock symbol`
* `mutual fund` --- holds ---> `stock symbol`

In [4]:
# Load the raw stock info table; the entity lists and edge lists below are all built from it
stocks = pd.read_parquet(os.path.join(DATA_PATH, 'stocks_info.parquet'))

In [5]:
# TODO: load the mutual fund data here (placeholder, not implemented yet)

## 1.1. Company, stock symbol, exchange, sector, industry

In [6]:
# Preview a single row to see which columns the stock info table provides
stocks.head(1)

Unnamed: 0,address1,city,state,zip,country,phone,fax,website,industry,industryKey,industryDisp,sector,sectorKey,sectorDisp,longBusinessSummary,fullTimeEmployees,companyOfficers,compensationAsOfEpochDate,executiveTeam,maxAge,priceHint,previousClose,open,dayLow,dayHigh,regularMarketPreviousClose,regularMarketOpen,regularMarketDayLow,regularMarketDayHigh,exDividendDate,payoutRatio,beta,forwardPE,volume,regularMarketVolume,averageVolume,averageVolume10days,averageDailyVolume10Day,bid,ask,bidSize,askSize,marketCap,fiftyTwoWeekLow,fiftyTwoWeekHigh,allTimeHigh,allTimeLow,priceToSalesTrailing12Months,fiftyDayAverage,twoHundredDayAverage,trailingAnnualDividendRate,trailingAnnualDividendYield,currency,tradeable,enterpriseValue,profitMargins,floatShares,sharesOutstanding,sharesShort,sharesShortPriorMonth,sharesShortPreviousMonthDate,dateShortInterest,sharesPercentSharesOut,heldPercentInsiders,heldPercentInstitutions,shortRatio,shortPercentOfFloat,impliedSharesOutstanding,bookValue,lastFiscalYearEnd,nextFiscalYearEnd,mostRecentQuarter,netIncomeToCommon,trailingEps,forwardEps,lastSplitFactor,lastSplitDate,enterpriseToRevenue,enterpriseToEbitda,52WeekChange,SandP52WeekChange,lastDividendValue,lastDividendDate,quoteType,currentPrice,targetHighPrice,targetLowPrice,targetMeanPrice,targetMedianPrice,recommendationMean,recommendationKey,numberOfAnalystOpinions,totalCash,totalCashPerShare,ebitda,totalDebt,quickRatio,currentRatio,totalRevenue,revenuePerShare,returnOnAssets,grossProfits,freeCashflow,operatingCashflow,revenueGrowth,grossMargins,ebitdaMargins,operatingMargins,financialCurrency,symbol,language,region,typeDisp,quoteSourceName,triggerable,customPriceAlertConfidence,marketState,corporateActions,preMarketTime,regularMarketTime,exchange,messageBoardId,exchangeTimezoneName,exchangeTimezoneShortName,gmtOffSetMilliseconds,market,esgPopulated,regularMarketChangePercent,regularMarketPrice,hasPrePostMarketData,firstTradeDateMilliseconds,preMarketChange,preMarketChangePercent,preMarket
Price,regularMarketChange,regularMarketDayRange,fullExchangeName,averageDailyVolume3Month,fiftyTwoWeekLowChange,fiftyTwoWeekLowChangePercent,fiftyTwoWeekRange,fiftyTwoWeekHighChange,fiftyTwoWeekHighChangePercent,fiftyTwoWeekChangePercent,dividendDate,earningsTimestampStart,earningsTimestampEnd,earningsCallTimestampStart,earningsCallTimestampEnd,isEarningsDateEstimate,epsTrailingTwelveMonths,epsForward,fiftyDayAverageChange,fiftyDayAverageChangePercent,twoHundredDayAverageChange,twoHundredDayAverageChangePercent,priceToBook,sourceInterval,exchangeDataDelayedBy,averageAnalystRating,cryptoTradeable,shortName,longName,displayName,trailingPegRatio,address2,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,governanceEpochDate,debtToEquity,returnOnEquity,earningsTimestamp,epsCurrentYear,priceEpsCurrentYear,ipoExpectedDate,dividendRate,dividendYield,fiveYearAvgDividendYield,trailingPE,earningsQuarterlyGrowth,earningsGrowth,prevName,nameChangeDate,irWebsite,openInterest,pegRatio,newListingDate,prevTicker,tickerChangeDate,prevExchange,exchangeTransferDate,industrySymbol
0,9655 Maroon Circle,Englewood,CO,80112,United States,303 703 4906,800 495 6695,https://www.zynex.com,Medical Distribution,medical-distribution,Medical Distribution,Healthcare,healthcare,Healthcare,"Zynex, Inc., together with its subsidiaries, d...",1000.0,"[{'age': 65.0, 'exercisedValue': 0, 'fiscalYea...",1735603000.0,[],86400,4,0.7374,0.721,0.695,2.1,0.7374,0.721,0.695,2.1,1641341000.0,0.0,1.01,3.3125,136585146.0,136585146.0,4407053.0,27279780.0,27279780.0,1.29,2.06,2.0,2.0,48317932.0,0.38,8.72,27.027273,0.054545,0.446553,1.2252,2.4123,0.0,0.0,USD,False,106676928.0,-0.68352,15791758.0,30388635.0,3552328.0,3438928.0,1760486000.0,1763078000.0,0.1169,0.48163,0.13781,18.37,0.2247,30388635.0,-1.34,1735603000.0,1767139000.0,1759190000.0,-73958000.0,-2.42,0.48,11:10,1641341000.0,0.986,-3.295,-0.808894,0.12934,0.1,1641341000.0,EQUITY,1.59,3.7,3.7,3.7,3.7,1.0,strong_buy,1.0,13259000.0,0.436,-32378000.0,71618000.0,0.253,0.469,108202000.0,3.489,-0.27107,77912000.0,-2242375.0,-20566000.0,-0.733,0.72006,-0.29924,-0.98967,USD,ZYXI,en-US,US,Equity,Nasdaq Real Time Price,True,HIGH,PRE,[],1764337000.0,1764190801,NMS,finmb_3103657,America/New_York,EST,-18000000,us_market,False,115.622,1.59,True,1077719000000.0,-0.036,-2.264152,1.554,0.8526,0.695 - 2.1,NasdaqGS,4407053.0,1.21,3.184211,0.38 - 8.72,-7.13,-0.817661,-80.88942,1642723000.0,1763068000.0,1763068000.0,1763474000.0,1763474000.0,False,-2.42,0.48,0.3648,0.297747,-0.8223,-0.340878,-1.186567,15,0,1.0 - Strong Buy,False,"Zynex, Inc.","Zynex, Inc.",Zynex,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# Collect the distinct, non-null values of each entity column.
# Order follows first appearance in `stocks`; the id maps built later rely on that order.
companies, stock_symbols, exchanges, industries, sectors = (
    stocks[column].dropna().unique()
    for column in ('shortName', 'symbol', 'exchange', 'industryKey', 'sectorKey')
)

In [13]:
# Build one name -> integer id lookup per entity type
def _index_by_position(entities):
    """Map each entity in `entities` to its position (0..n-1) in the sequence."""
    return dict(zip(entities, range(len(entities))))


company2id = _index_by_position(companies)
stocksymbol2id = _index_by_position(stock_symbols)
exchange2id = _index_by_position(exchanges)
industry2id = _index_by_position(industries)
sector2id = _index_by_position(sectors)

In [14]:
# Persist every id map as pretty-printed JSON inside the ID_MAPPING folder,
# so later cells can reload them from disk
id_maps_by_file = {
    'company2id.json': company2id,
    'stocksymbol2id.json': stocksymbol2id,
    'exchange2id.json': exchange2id,
    'industry2id.json': industry2id,
    'sector2id.json': sector2id,
}

for file_name, id_map in id_maps_by_file.items():
    with open(os.path.join(ID_MAPPING, file_name), 'w') as f:
        json.dump(id_map, f, indent=2)

## 1.2. Company and employed officers

In [15]:
# Pair each company name with its raw `companyOfficers` records and add an
# `officerNames` column produced by the project helper `extract_officer_names`
# (presumably a collection of cleaned names per company - confirm in utils)
tmp = stocks[['shortName', 'companyOfficers']].assign(
    officerNames=lambda frame: frame['companyOfficers'].apply(extract_officer_names)
)


In [16]:
# Show the first row as a raw array: company name, original officer records, extracted officer names
tmp.head(1).values

array([['Zynex, Inc.',
        array([{'age': 65.0, 'exercisedValue': 0, 'fiscalYear': 2024.0, 'maxAge': 1, 'name': 'Mr. Thomas  Sandgaard', 'title': 'Founder, President & Chairman', 'totalPay': 879352.0, 'unexercisedValue': 22462, 'yearBorn': 1959.0},
               {'age': 51.0, 'exercisedValue': 0, 'fiscalYear': 2024.0, 'maxAge': 1, 'name': 'Dr. Steven Lewis Dyson Ph.D.', 'title': 'CEO & Director', 'totalPay': None, 'unexercisedValue': 0, 'yearBorn': 1973.0},
               {'age': None, 'exercisedValue': 0, 'fiscalYear': 2024.0, 'maxAge': 1, 'name': 'Mr. Vikram  Bajaj', 'title': 'Chief Financial Officer', 'totalPay': None, 'unexercisedValue': 0, 'yearBorn': None},
               {'age': 51.0, 'exercisedValue': 0, 'fiscalYear': 2024.0, 'maxAge': 1, 'name': 'Mr. John T. Bibb', 'title': 'Chief Legal Officer', 'totalPay': None, 'unexercisedValue': 0, 'yearBorn': 1973.0},
               {'age': None, 'exercisedValue': 0, 'fiscalYear': 2024.0, 'maxAge': 1, 'name': 'Mr. Ajay  Gopal', 'tit

In [17]:
# Flatten the per-company officer name collections into one array of distinct, non-null names
officers = tmp['officerNames'].explode().dropna().unique()

# Give each officer name an integer id equal to its position in `officers`
officer2id = dict(zip(officers, range(len(officers))))

# Persist the officer id map next to the other entity id maps
officer_map_path = os.path.join(ID_MAPPING, 'officer2id.json')
with open(officer_map_path, 'w') as f:
    json.dump(officer2id, f, indent=2)

## 1.3. Mutual fund, mutualfund symbol, institutions

# 2. Build edge lists for each relation

## 2.1. `company` --- has symbol ---> `stock symbol`

In [18]:
# Reload the saved name -> id lookups for both endpoint types
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
    stocksymbol2id = json.load(f)

# One (company id, symbol id) pair per stock row; rows missing either the
# company shortName or the symbol contribute no edge
comp2sym_pairs = [
    (company2id[row.shortName], stocksymbol2id[row.symbol])
    for row in stocks.itertuples()
    if not (pd.isna(row.shortName) or pd.isna(row.symbol))
]

# Split the pairs into the parallel source / target lists used for the edge index
comp2sym_src = [company_id for company_id, _ in comp2sym_pairs]
comp2sym_dst = [symbol_id for _, symbol_id in comp2sym_pairs]

In [19]:
# Sanity check: every edge needs both endpoints, so the source and target
# lists must have the same length before they are stacked into a tensor
assert len(comp2sym_src) == len(comp2sym_dst)

In [20]:
# Stack sources over targets into a 2 x num_edges int64 tensor and write it
# to the edge index folder
comp2sym_path = os.path.join(EDGE_INDEX, "comp2sym.pt")
edge_index = torch.tensor([comp2sym_src, comp2sym_dst], dtype=torch.long)
torch.save(edge_index, comp2sym_path)

## 2.2. `stock symbol` --- listed on ---> `exchange`

In [21]:
# Reload the saved name -> id lookups for both endpoint types
with open(os.path.join(ID_MAPPING, 'exchange2id.json')) as f:
    exchange2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
    stocksymbol2id = json.load(f)

# Parallel lists: sym2ex_src[i] is the symbol id and sym2ex_dst[i] the exchange id of edge i
sym2ex_src, sym2ex_dst = [], []

for row in stocks.itertuples():
    # only rows that carry both a symbol and an exchange produce an edge
    if pd.notna(row.exchange) and pd.notna(row.symbol):
        sym2ex_src.append(stocksymbol2id[row.symbol])
        sym2ex_dst.append(exchange2id[row.exchange])

In [22]:
# Sanity check: the symbol (source) and exchange (target) lists must line up one-to-one
assert len(sym2ex_src) == len(sym2ex_dst)

In [23]:
# Stack sources over targets into a 2 x num_edges int64 tensor and write it
# to the edge index folder
sym2ex_path = os.path.join(EDGE_INDEX, "sym2ex.pt")
edge_index = torch.tensor([sym2ex_src, sym2ex_dst], dtype=torch.long)
torch.save(edge_index, sym2ex_path)

## 2.3. `company` --- belongs to ---> `industry`

In [24]:
# Reload the saved name -> id lookups for both endpoint types
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'industry2id.json')) as f:
    industry2id = json.load(f)

# Parallel lists: comp2ind_src[i] is the company id and comp2ind_dst[i] the industry id of edge i
comp2ind_src = []
comp2ind_dst = []

# Walk the two relevant columns side by side, one stock row at a time.
# NOTE(review): a shortName that appears on several rows yields one edge per row - confirm intended.
for short_name, industry_key in zip(stocks['shortName'], stocks['industryKey']):
    # a row missing the company shortName or the industry key contributes no edge
    if pd.isna(short_name) or pd.isna(industry_key):
        continue
    comp2ind_src.append(company2id[short_name])
    comp2ind_dst.append(industry2id[industry_key])

In [25]:
# Sanity check: the company (source) and industry (target) lists must line up one-to-one
assert len(comp2ind_src) == len(comp2ind_dst)

In [26]:
# Stack sources over targets into a 2 x num_edges int64 tensor and write it
# to the edge index folder
comp2ind_path = os.path.join(EDGE_INDEX, "comp2ind.pt")
edge_index = torch.tensor([comp2ind_src, comp2ind_dst], dtype=torch.long)
torch.save(edge_index, comp2ind_path)

## 2.4. `industry` --- is part of ---> `sector`

In [27]:
# Reload the saved name -> id lookups for both endpoint types
with open(os.path.join(ID_MAPPING, 'industry2id.json')) as f:
    industry2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'sector2id.json')) as f:
    sector2id = json.load(f)

# `stocks` has one row per listed stock, so the same (industry, sector) pair
# shows up once for every company in that industry. Emitting an edge per row
# would fill the edge index with parallel duplicate edges, so keep each pair
# only once. Rows missing either key are dropped; first-appearance order is kept.
ind2sec_pairs = (
    stocks[['industryKey', 'sectorKey']]
    .dropna()
    .drop_duplicates()
)

# Parallel lists: ind2sec_src[i] is the industry id and ind2sec_dst[i] the sector id of edge i
ind2sec_src = [industry2id[industry_key] for industry_key in ind2sec_pairs['industryKey']]
ind2sec_dst = [sector2id[sector_key] for sector_key in ind2sec_pairs['sectorKey']]

In [28]:
# Sanity check: the industry (source) and sector (target) lists must line up one-to-one
assert len(ind2sec_src) == len(ind2sec_dst)

In [29]:
# Stack sources over targets into a 2 x num_edges int64 tensor and write it
# to the edge index folder
ind2sec_path = os.path.join(EDGE_INDEX, "ind2sec.pt")
edge_index = torch.tensor([ind2sec_src, ind2sec_dst], dtype=torch.long)
torch.save(edge_index, ind2sec_path)

## 2.5. `company` --- employs C-level officer ---> `officer`

In [31]:
# Parallel lists: comp2off_src[i] is the company id and comp2off_dst[i] the officer id of edge i
comp2off_src = []
comp2off_dst = []

# Reload the saved name -> id lookups for both endpoint types
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'officer2id.json')) as f:
    officer2id = json.load(f)

# Build one row per (company, officer): extract the officer names from each
# company's `companyOfficers` records, then explode that column into rows.
# `extract_officer_names` is a project helper; presumably it returns a
# collection of names per company - confirm in utils.
tmp = stocks[['shortName', 'companyOfficers']].copy()
tmp['officerNames'] = tmp['companyOfficers'].apply(extract_officer_names)
tmp = tmp.explode('officerNames')

# NOTE(review): a shortName that appears on several stock rows repeats its
# officer edges - confirm whether those should be de-duplicated.
for row in tmp.itertuples():
    # skip rows where either the company shortName or the officer name is missing
    if pd.isna(row.shortName) or pd.isna(row.officerNames):
        continue
    comp2off_src.append(company2id[row.shortName])
    comp2off_dst.append(officer2id[row.officerNames])

In [33]:
# Sanity check: the company (source) and officer (target) lists must line up one-to-one
assert len(comp2off_src) == len(comp2off_dst)

In [35]:
# Stack sources over targets into a 2 x num_edges int64 tensor and write it
# to the edge index folder
comp2off_path = os.path.join(EDGE_INDEX, "comp2off.pt")
edge_index = torch.tensor([comp2off_src, comp2off_dst], dtype=torch.long)
torch.save(edge_index, comp2off_path)