In [137]:
import torch
from torch_geometric.data import HeteroData

import pandas as pd
import numpy as np

import os
import json

import re
import ast

from utils.utils import *    # import custom functions from utils module for cleaning up name strings
from utils.kge import build_global_id_map, build_global_triples, build_hetero_graph

import warnings
warnings.filterwarnings('ignore')

In [138]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [139]:
DATA_PATH = './data'

# folder to save entity id mapping
ID_MAPPING = os.path.join(DATA_PATH, 'entity_id_map')
os.makedirs(ID_MAPPING, exist_ok=True)

# folder to save edge index
EDGE_INDEX = os.path.join(DATA_PATH, 'edge_index')
os.makedirs(EDGE_INDEX, exist_ok=True)

# folder to save graph data
GRAPH_DATA = os.path.join(DATA_PATH, 'graph_data')
os.makedirs(GRAPH_DATA, exist_ok=True)

# 1. Grab entities
Companies, symbols, mutual funds, institutions, C-level board

Relations:
* `company` (shortName) --- has symbol ---> `stock symbol`
* `stock symbol`--- listed on ---> `exchange`
* `company` --- belongs to ---> `industry`
* `industry`--- is part of ---> `sector`
* `company` --- employs C-level member ---> `person`
* `mutual fund`--- has symbol ---> `mutualfund symbol`
* `institution` --- holds ---> `stock symbol`
* `mutual fund` --- holds ---> `stock symbol`
* `stock symbol` --- co-mentioned in news with ---> `stock symbol`

In [140]:
stocks = pd.read_parquet(os.path.join(DATA_PATH, 'stocks_info.parquet'))

In [141]:
inst = pd.read_parquet(os.path.join(DATA_PATH, 'institutional_holders.parquet'))

In [142]:
funds = pd.read_parquet(os.path.join(DATA_PATH, 'mutualfund_holders.parquet'))

In [143]:
funds_symbol = pd.read_parquet(os.path.join(DATA_PATH, 'funds_symbol.parquet'))

In [144]:
funds_symbol.head()

Unnamed: 0,fund_name,symbols
0,vanguard total stock market index fund,VTI
1,vanguard total stock market index fund,VTI.MX
2,vanguard extended market index fund,VXF
3,vanguard extended market index fund,VEMPX
4,vanguard extended market index fund,VEXAX


## 1.1. Company, stock symbol, exchange, sector, industry

In [145]:
stocks.head(1)

Unnamed: 0,address1,city,state,zip,country,phone,fax,website,industry,industryKey,industryDisp,sector,sectorKey,sectorDisp,longBusinessSummary,fullTimeEmployees,companyOfficers,compensationAsOfEpochDate,executiveTeam,maxAge,priceHint,previousClose,open,dayLow,dayHigh,regularMarketPreviousClose,regularMarketOpen,regularMarketDayLow,regularMarketDayHigh,exDividendDate,payoutRatio,beta,forwardPE,volume,regularMarketVolume,averageVolume,averageVolume10days,averageDailyVolume10Day,bid,ask,bidSize,askSize,marketCap,fiftyTwoWeekLow,fiftyTwoWeekHigh,allTimeHigh,allTimeLow,priceToSalesTrailing12Months,fiftyDayAverage,twoHundredDayAverage,trailingAnnualDividendRate,trailingAnnualDividendYield,currency,tradeable,enterpriseValue,profitMargins,floatShares,sharesOutstanding,sharesShort,sharesShortPriorMonth,sharesShortPreviousMonthDate,dateShortInterest,sharesPercentSharesOut,heldPercentInsiders,heldPercentInstitutions,shortRatio,shortPercentOfFloat,impliedSharesOutstanding,bookValue,lastFiscalYearEnd,nextFiscalYearEnd,mostRecentQuarter,netIncomeToCommon,trailingEps,forwardEps,lastSplitFactor,lastSplitDate,enterpriseToRevenue,enterpriseToEbitda,52WeekChange,SandP52WeekChange,lastDividendValue,lastDividendDate,quoteType,currentPrice,targetHighPrice,targetLowPrice,targetMeanPrice,targetMedianPrice,recommendationMean,recommendationKey,numberOfAnalystOpinions,totalCash,totalCashPerShare,ebitda,totalDebt,quickRatio,currentRatio,totalRevenue,revenuePerShare,returnOnAssets,grossProfits,freeCashflow,operatingCashflow,revenueGrowth,grossMargins,ebitdaMargins,operatingMargins,financialCurrency,symbol,language,region,typeDisp,quoteSourceName,triggerable,customPriceAlertConfidence,marketState,corporateActions,preMarketTime,regularMarketTime,exchange,messageBoardId,exchangeTimezoneName,exchangeTimezoneShortName,gmtOffSetMilliseconds,market,esgPopulated,regularMarketChangePercent,regularMarketPrice,hasPrePostMarketData,firstTradeDateMilliseconds,preMarketChange,preMarketChangePercent,preMarketPrice,regularMarketChange,regularMarketDayRange,fullExchangeName,averageDailyVolume3Month,fiftyTwoWeekLowChange,fiftyTwoWeekLowChangePercent,fiftyTwoWeekRange,fiftyTwoWeekHighChange,fiftyTwoWeekHighChangePercent,fiftyTwoWeekChangePercent,dividendDate,earningsTimestampStart,earningsTimestampEnd,earningsCallTimestampStart,earningsCallTimestampEnd,isEarningsDateEstimate,epsTrailingTwelveMonths,epsForward,fiftyDayAverageChange,fiftyDayAverageChangePercent,twoHundredDayAverageChange,twoHundredDayAverageChangePercent,priceToBook,sourceInterval,exchangeDataDelayedBy,averageAnalystRating,cryptoTradeable,shortName,longName,displayName,trailingPegRatio,address2,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,governanceEpochDate,debtToEquity,returnOnEquity,earningsTimestamp,epsCurrentYear,priceEpsCurrentYear,ipoExpectedDate,dividendRate,dividendYield,fiveYearAvgDividendYield,trailingPE,earningsQuarterlyGrowth,earningsGrowth,prevName,nameChangeDate,irWebsite,openInterest,pegRatio,newListingDate,prevTicker,tickerChangeDate,prevExchange,exchangeTransferDate,industrySymbol
0,9655 Maroon Circle,Englewood,CO,80112,United States,303 703 4906,800 495 6695,https://www.zynex.com,Medical Distribution,medical-distribution,Medical Distribution,Healthcare,healthcare,Healthcare,"Zynex, Inc., together with its subsidiaries, d...",1000.0,"[{'age': 65.0, 'exercisedValue': 0, 'fiscalYea...",1735603000.0,[],86400,4,0.7374,0.721,0.695,2.1,0.7374,0.721,0.695,2.1,1641341000.0,0.0,1.01,3.3125,136585146.0,136585146.0,4407053.0,27279780.0,27279780.0,1.29,2.06,2.0,2.0,48317932.0,0.38,8.72,27.027273,0.054545,0.446553,1.2252,2.4123,0.0,0.0,USD,False,106676928.0,-0.68352,15791758.0,30388635.0,3552328.0,3438928.0,1760486000.0,1763078000.0,0.1169,0.48163,0.13781,18.37,0.2247,30388635.0,-1.34,1735603000.0,1767139000.0,1759190000.0,-73958000.0,-2.42,0.48,11:10,1641341000.0,0.986,-3.295,-0.808894,0.12934,0.1,1641341000.0,EQUITY,1.59,3.7,3.7,3.7,3.7,1.0,strong_buy,1.0,13259000.0,0.436,-32378000.0,71618000.0,0.253,0.469,108202000.0,3.489,-0.27107,77912000.0,-2242375.0,-20566000.0,-0.733,0.72006,-0.29924,-0.98967,USD,ZYXI,en-US,US,Equity,Nasdaq Real Time Price,True,HIGH,PRE,[],1764337000.0,1764190801,NMS,finmb_3103657,America/New_York,EST,-18000000,us_market,False,115.622,1.59,True,1077719000000.0,-0.036,-2.264152,1.554,0.8526,0.695 - 2.1,NasdaqGS,4407053.0,1.21,3.184211,0.38 - 8.72,-7.13,-0.817661,-80.88942,1642723000.0,1763068000.0,1763068000.0,1763474000.0,1763474000.0,False,-2.42,0.48,0.3648,0.297747,-0.8223,-0.340878,-1.186567,15,0,1.0 - Strong Buy,False,"Zynex, Inc.","Zynex, Inc.",Zynex,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [146]:
# grab entities from the data
companies = stocks['shortName'].dropna().unique()
stock_symbols = stocks['symbol'].dropna().unique()
# exchanges = stocks['exchange'].dropna().unique()
industries = stocks['industryKey'].replace('', np.nan).dropna().unique()
sectors = stocks['sectorKey'].replace('', np.nan).dropna().unique()

In [147]:
# build id maps
company2id = {name: i for i, name in enumerate(companies)}
stocksymbol2id = {name: i for i, name in enumerate(stock_symbols)}
# exchange2id = {name: i for i, name in enumerate(exchanges)}
industry2id = {name: i for i, name in enumerate(industries)}
sector2id = {name: i for i, name in enumerate(sectors)}

In [148]:
# save to json
with open(os.path.join(ID_MAPPING, 'company2id.json'), 'w') as f:
    json.dump(company2id, f, indent=2)

with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json'), 'w') as f:
    json.dump(stocksymbol2id, f, indent=2)

# with open(os.path.join(ID_MAPPING, 'exchange2id.json'), 'w') as f:
#     json.dump(exchange2id, f, indent=2)

with open(os.path.join(ID_MAPPING, 'industry2id.json'), 'w') as f:
    json.dump(industry2id, f, indent=2)

with open(os.path.join(ID_MAPPING, 'sector2id.json'), 'w') as f:
    json.dump(sector2id, f, indent=2)

## 1.2. Company and employed officers (removed)

In [149]:
# # get list of officers
# tmp = stocks[['shortName', 'companyOfficers']].copy()

# tmp['officerNames'] = tmp['companyOfficers'].apply(extract_officer_names)


In [150]:
# tmp.head(1).values

In [151]:
# # get list of officers
# officers = tmp['officerNames'].explode().dropna().unique()
# # build id map
# officer2id = {name: i for i, name in enumerate(officers)}
# # save id map
# with open(os.path.join(ID_MAPPING, 'officer2id.json'), 'w') as f:
#     json.dump(officer2id, f, indent=2)

## 1.3. Institutions

In [152]:
# get list of institution
inst['holderNames'] = inst['holders'].apply(extract_institution_names)

institutions = inst['holderNames'].explode().dropna().unique()

# build id map
institution2id = {name: i for i, name in enumerate(institutions)}
with open(os.path.join(ID_MAPPING, 'institution2id.json'), 'w') as f:
    json.dump(institution2id, f, indent=2)

# institution2id

## 1.4. Mutual funds & mutual funds symbol

In [153]:
# funds.iloc[0].values

# get list of mutual funds
funds['fundNames'] = funds['holders'].apply(extract_mutualfund_names)

mutualfunds = funds['fundNames'].explode().dropna().unique()

# build id map
mutualfund2id = {name: i for i, name in enumerate(mutualfunds)}
with open(os.path.join(ID_MAPPING, 'mutualfund2id.json'), 'w') as f:
    json.dump(mutualfund2id, f, indent=2)

In [154]:
# get list of fund symbols
symbols = funds_symbol['symbols'].dropna().unique()

# build id map
fundsymbol2id = {name: i for i, name in enumerate(symbols)}
with open(os.path.join(ID_MAPPING, 'fundsymbol2id.json'), 'w') as f:
    json.dump(fundsymbol2id, f, indent=2)

# 2. Build edge lists for each relation

## 2.1. `company` --- has symbol ---> `stock symbol`

In [155]:
comp2sym_src = []
comp2sym_dst = []

# load id mappings
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
    stocksymbol2id = json.load(f)

edges = set()
for row in stocks.itertuples():
    if pd.isna(row.shortName) or pd.isna(row.symbol):   # skip rows where either company shortName or symbol is missing
        continue
    h = company2id[row.shortName]
    t = stocksymbol2id[row.symbol]
    edges.add((h, t))

comp2sym_src, comp2sym_dst = zip(*edges)

In [156]:
# sanity check - src and dst should have same length
assert len(comp2sym_src) == len(comp2sym_dst)

In [157]:
# save edge index as torch tensor
edge_index = torch.tensor([comp2sym_src, comp2sym_dst], dtype=torch.long)
torch.save(edge_index, os.path.join(EDGE_INDEX, "comp2sym.pt"))

## 2.2 `stock symbol` --- listed on ---> `exchange` (removed)

In [158]:
# sym2ex_src = []
# sym2ex_dst = []

# # load id mappings
# with open(os.path.join(ID_MAPPING, 'exchange2id.json')) as f:
#     exchange2id = json.load(f)

# with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
#     stocksymbol2id = json.load(f)

# edges = set()
# for row in stocks.itertuples():
#     if pd.isna(row.exchange) or pd.isna(row.symbol):   # skip rows where either company shortName or symbol is missing
#         continue
#     h = stocksymbol2id[row.symbol]
#     t = exchange2id[row.exchange]
#     edges.add((h, t))

# sym2ex_src, sym2ex_dst = zip(*edges)

In [159]:
# assert len(sym2ex_src) == len(sym2ex_dst)

In [160]:
# # save edge index as torch tensor
# edge_index = torch.tensor([sym2ex_src, sym2ex_dst], dtype=torch.long)
# torch.save(edge_index, os.path.join(EDGE_INDEX, "sym2ex.pt"))

## 2.3. `company` --- belongs to ---> `industry`

In [161]:
comp2ind_src = []
comp2ind_dst = []

# load id mappings
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'industry2id.json')) as f:
    industry2id = json.load(f)

edges = set()
for row in stocks.itertuples():
    if pd.isna(row.shortName) or pd.isna(row.industryKey) or row.industryKey=='':   # skip rows where either company shortName or industry key is missing
        continue
    # comp2ind_src.append(company2id[row.shortName])
    # comp2ind_dst.append(industry2id[row.industryKey])
    h = company2id[row.shortName]
    t = industry2id[row.industryKey]
    edges.add((h, t))

comp2ind_src, comp2ind_dst = zip(*edges)

In [162]:
assert len(comp2ind_src) == len(comp2ind_dst)

In [163]:
# save edge index as torch tensor
edge_index = torch.tensor([comp2ind_src, comp2ind_dst], dtype=torch.long)
torch.save(edge_index, os.path.join(EDGE_INDEX, "comp2ind.pt"))

## 2.4. `industry` --- is part of ---> `sector`

In [164]:
ind2sec_src = []
ind2sec_dst = []

# load id mappings
with open(os.path.join(ID_MAPPING, 'industry2id.json')) as f:
    industry2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'sector2id.json')) as f:
    sector2id = json.load(f)

edges = set()
for row in stocks.itertuples():
    if pd.isna(row.industryKey) or pd.isna(row.sectorKey) or row.industryKey=='' or row.sectorKey=='':   # skip rows where either sectorKey or industry key is missing
        continue
    h = industry2id[row.industryKey]
    t = sector2id[row.sectorKey]
    edges.add((h, t))

ind2sec_src, ind2sec_dst = zip(*edges)

In [165]:
assert len(ind2sec_src) == len(ind2sec_dst)

In [166]:
# save edge index as torch tensor
edge_index = torch.tensor([ind2sec_src, ind2sec_dst], dtype=torch.long)
torch.save(edge_index, os.path.join(EDGE_INDEX, "ind2sec.pt"))

## 2.5. `company` --- employs C-level officer ---> `officer` (removed)

In [167]:
# comp2off_src = []
# comp2off_dst = []

# # load id mappings
# with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
#     company2id = json.load(f)

# with open(os.path.join(ID_MAPPING, 'officer2id.json')) as f:
#     officer2id = json.load(f)

# # get list of officers and corresponding company names
# tmp = stocks[['shortName', 'companyOfficers']].copy()
# tmp['officerNames'] = tmp['companyOfficers'].apply(extract_officer_names)
# tmp = tmp.explode('officerNames')
# tmp.head()

# edges = set()
# for row in tmp.itertuples():
#     if pd.isna(row.shortName) or pd.isna(row.officerNames):   # skip rows where either sectorKey or industry key is missing
#         continue
#     h = company2id[row.shortName]
#     t = officer2id[row.officerNames]
#     edges.add((h, t))

# comp2off_src, comp2off_dst = zip(*edges)

In [168]:
# assert len(comp2off_src) == len(comp2off_dst)

In [169]:
# # save edge index as torch tensor
# edge_index = torch.tensor([comp2off_src, comp2off_dst], dtype=torch.long)
# torch.save(edge_index, os.path.join(EDGE_INDEX, "comp2off.pt"))

## 2.6. `institution` --- holds ---> `stock symbol`

In [170]:
inst2sym_src = []
inst2sym_dst = []

# load id mappings
with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
    stocksymbol2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'institution2id.json')) as f:
    institution2id = json.load(f)

# get list of officers and corresponding company names
tmp = inst[['symbol', 'holders']].copy()
tmp['holderNames'] = tmp['holders'].apply(extract_institution_names)
tmp = tmp.explode('holderNames')
tmp.head()

edges = set()
for row in tmp.itertuples():
    if pd.isna(row.symbol) or pd.isna(row.holderNames):   # skip rows where either sectorKey or industry key is missing
        continue
    h = institution2id[row.holderNames]
    t = stocksymbol2id[row.symbol]
    edges.add((h, t))

inst2sym_src, inst2sym_dst = zip(*edges)

In [171]:
assert len(inst2sym_src) == len(inst2sym_dst)

In [172]:
# save edge index as torch tensor
edge_index = torch.tensor([inst2sym_src, inst2sym_dst], dtype=torch.long)
torch.save(edge_index, os.path.join(EDGE_INDEX, "inst2sym.pt"))

## 2.7. `stock symbol` --- co-mentioned in news with ---> `stock symbol` (removed)

In [173]:
# comention = pd.read_parquet(os.path.join(DATA_PATH, 'stocks_related_tickers.parquet'))

# comention.head()

## 2.8. `mutual fund` --- holds ---> `stock symbol`

In [174]:
fund2sym_src = []
fund2sym_dst = []

# load id mappings
with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
    stocksymbol2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'mutualfund2id.json')) as f:
    mutualfund2id = json.load(f)

# get list of officers and corresponding company names
tmp = funds[['symbol', 'holders']].copy()
tmp['holderNames'] = tmp['holders'].apply(extract_mutualfund_names)
tmp = tmp.explode('holderNames')
tmp.head()

edges = set()
for row in tmp.itertuples():
    if pd.isna(row.symbol) or pd.isna(row.holderNames):   # skip rows where either sectorKey or industry key is missing
        continue
    h = mutualfund2id[row.holderNames]
    t = stocksymbol2id[row.symbol]
    edges.add((h, t))

fund2sym_src, fund2sym_dst = zip(*edges)

In [175]:
assert len(fund2sym_src) == len(fund2sym_dst)

In [176]:
# save edge index as torch tensor
edge_index = torch.tensor([fund2sym_src, fund2sym_dst], dtype=torch.long)
torch.save(edge_index, os.path.join(EDGE_INDEX, "fund2stocksym.pt"))

## 2.9. `mutual fund` --- has symbol ---> `fund symbol`

In [177]:
fund2sym_src = []
fund2sym_dst = []

# load id mappings
with open(os.path.join(ID_MAPPING, 'mutualfund2id.json')) as f:
    mutualfund2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'fundsymbol2id.json')) as f:
    fundsymbol2id = json.load(f)

edges = set()
for row in funds_symbol.itertuples():
    if pd.isna(row.fund_name) or pd.isna(row.symbols):   # skip rows where either company shortName or symbol is missing
        continue
    h = mutualfund2id[row.fund_name]
    t = fundsymbol2id[row.symbols]
    edges.add((h, t))

fund2sym_src, fund2sym_dst = zip(*edges)

In [178]:
# sanity check - src and dst should have same length
assert len(fund2sym_src) == len(fund2sym_dst)

In [179]:
# save edge index as torch tensor
edge_index = torch.tensor([fund2sym_src, fund2sym_dst], dtype=torch.long)
torch.save(edge_index, os.path.join(EDGE_INDEX, "fund2fundsym.pt"))

# 3. Build knowledge graph
Steps:
1. Convert local IDs of each entity type to a global ID
2. Convert every typed edge_index (src, dst) to triples (h, r, t)

## 3.1. Step 1: Build global ID map for entities

In [180]:
# grab entity id mappings
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'stocksymbol2id.json')) as f:
    stocksymbol2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'industry2id.json')) as f:
    industry2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'sector2id.json')) as f:
    sector2id = json.load(f)

# with open(os.path.join(ID_MAPPING, 'exchange2id.json')) as f:
#     exchange2id = json.load(f)

# with open(os.path.join(ID_MAPPING, 'officer2id.json')) as f:
#     officer2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'institution2id.json')) as f:
    institution2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'mutualfund2id.json')) as f:
    mutualfund2id = json.load(f)

with open(os.path.join(ID_MAPPING, 'fundsymbol2id.json')) as f:
    fundsymbol2id = json.load(f)

In [181]:
entity_id_maps = [
    company2id, stocksymbol2id, industry2id, sector2id, 
    # exchange2id, officer2id, 
    institution2id, mutualfund2id, fundsymbol2id
]

entity_types = [
    'company', 'stock_symbol', 'industry', 'sector',
    # 'exchange', 'officer', 
    'institution', 'fund', 'fund_symbol'
]

global_map, type_map, offsets = build_global_id_map(
    entity_id_maps=entity_id_maps, entity_types=entity_types
)

In [182]:
# save mappings
with open(os.path.join(ID_MAPPING, 'global_id.json'), 'w') as f:
    json.dump(global_map, f, indent=2)

with open(os.path.join(ID_MAPPING, 'global_type_map.json'), 'w') as f:
    json.dump(type_map, f, indent=2)

with open(os.path.join(ID_MAPPING, 'global_entity_id_offsets.json'), 'w') as f:
    json.dump(offsets, f, indent=2)

## 3.2. Build global edge index

In [183]:
# load local edge indices (edge indices of each relation type using local entity id mappings)
comp2ind = torch.load(os.path.join(EDGE_INDEX, 'comp2ind.pt'), weights_only=False)   # company belongs to industry
# comp2off = torch.load(os.path.join(EDGE_INDEX, 'comp2off.pt'), weights_only=False)   # company employs officer
comp2sym = torch.load(os.path.join(EDGE_INDEX, 'comp2sym.pt'), weights_only=False)   # company has stock symbol
ind2sec = torch.load(os.path.join(EDGE_INDEX, 'ind2sec.pt'), weights_only=False)     # industry belongs to sector
inst2sym = torch.load(os.path.join(EDGE_INDEX, 'inst2sym.pt'), weights_only=False)   # institution holds stock symbol
# sym2ex = torch.load(os.path.join(EDGE_INDEX, 'sym2ex.pt'), weights_only=False)       # stock symbol listed on exchange
fund2stocksym = torch.load(os.path.join(EDGE_INDEX, 'fund2stocksym.pt'), weights_only=False)   # mutual fund holds stock symbol
fund2fundsym = torch.load(os.path.join(EDGE_INDEX, 'fund2fundsym.pt'), weights_only=False)   # mutual fund has fund symbol


In [184]:
# create edge indices list for input 
edge_indices_list = [
    {
        "relation": "has_symbol",
        "head_type": "company",
        "tail_type": "stock_symbol",
        "edge_index": comp2sym,  # tensor [2, num_edges]
    },
    # {
    #     "relation": "is_listed_on",
    #     "head_type": "stock_symbol",
    #     "tail_type": "exchange",
    #     "edge_index": sym2ex,
    # },
    {
        "relation": "belongs_to",
        "head_type": "company",
        "tail_type": "industry",
        "edge_index": comp2ind,
    },
    {
        "relation": "is_part_of",
        "head_type": "industry",
        "tail_type": "sector",
        "edge_index": ind2sec,
    },
    # {
    #     "relation": "employs",
    #     "head_type": "company",
    #     "tail_type": "officer",
    #     "edge_index": comp2off,
    # },
    {
        "relation": "holds",
        "head_type": "institution",
        "tail_type": "stock_symbol",
        "edge_index": inst2sym,
    },
    {
        "relation": "holds",
        "head_type": "fund",
        "tail_type": "stock_symbol",
        "edge_index": fund2stocksym,
    },
    {
        "relation": "holds",
        "head_type": "fund",
        "tail_type": "fund_symbol",
        "edge_index": fund2fundsym,
    },
]

# grab global entity id mappings created earlier
with open(os.path.join(ID_MAPPING, 'global_id.json')) as f:
    global_id = json.load(f)

In [185]:
# create global triples
global_triples = build_global_triples(edge_indices_list, global_id)

# store
with open(os.path.join(EDGE_INDEX, 'global_triples.json'), 'w') as f:
    json.dump(global_triples, f, indent=2)

Building global triples for (company has_symbol stock_symbol)
Building global triples for (company belongs_to industry)
Building global triples for (industry is_part_of sector)
Building global triples for (institution holds stock_symbol)
Building global triples for (fund holds stock_symbol)
Building global triples for (fund holds fund_symbol)
Building global triples for (fund holds fund_symbol)


## 3.3. Create HeteroData object

In [186]:
# load global triples and global entity type maps
with open(os.path.join(EDGE_INDEX, 'global_triples.json')) as f:
    global_triples = json.load(f)

with open(os.path.join(ID_MAPPING, 'global_type_map.json')) as f:
    type_map = json.load(f)

In [187]:
# create hetero data object
hetero_data = build_hetero_graph(global_triples, type_map)

In [188]:
hetero_data

HeteroData(
  (company, has_symbol, stock_symbol)={ edge_index=[2, 4508] },
  (company, belongs_to, industry)={ edge_index=[2, 3998] },
  (industry, is_part_of, sector)={ edge_index=[2, 144] },
  (institution, holds, stock_symbol)={ edge_index=[2, 39512] },
  (fund, holds, stock_symbol)={ edge_index=[2, 37098] },
  (fund, holds, fund_symbol)={ edge_index=[2, 6851] }
)

In [189]:
# save data object
torch.save(hetero_data, os.path.join(GRAPH_DATA, 'yfinance_kge.pt'))

# 4. Add Node Features for companies
Load financial features and attach them to company nodes

In [190]:
# Load raw stocks data to get additional columns for derived features
stocks_raw = pd.read_parquet(os.path.join(DATA_PATH, 'stocks_info.parquet'))

# Create derived features
print("=== Creating Derived Features ===")

# 1. bidAskSpread: (ask - bid) / ((bid + ask)/2) - scaled by midpoint
stocks_raw['bidAskSpread'] = (stocks_raw['ask'] - stocks_raw['bid']) / ((stocks_raw['bid'] + stocks_raw['ask']) / 2)

# 2. fiftyTwoWeekSpread: (high - low) / (high + low) - scaled by sum
stocks_raw['fiftyTwoWeekSpread'] = (stocks_raw['fiftyTwoWeekHigh'] - stocks_raw['fiftyTwoWeekLow']) / (stocks_raw['fiftyTwoWeekHigh'] + stocks_raw['fiftyTwoWeekLow'])

# 3. fiftyDayAvgRatio: fiftyDayAverage / currentPrice
stocks_raw['fiftyDayAvgRatio'] = stocks_raw['fiftyDayAverage'] / stocks_raw['currentPrice']

# 4. twoHundredDayAvgRatio: twoHundredDayAverage / currentPrice
stocks_raw['twoHundredDayAvgRatio'] = stocks_raw['twoHundredDayAverage'] / stocks_raw['currentPrice']

# 5. targetPriceSpread: (targetHighPrice - targetLowPrice) / ((targetHighPrice + targetLowPrice)/2)
stocks_raw['targetPriceSpread'] = (stocks_raw['targetHighPrice'] - stocks_raw['targetLowPrice']) / ((stocks_raw['targetHighPrice'] + stocks_raw['targetLowPrice']) / 2)

# 6 & 7: Additional features from raw data (already exist, just select them)
# averageDailyVolume3Month and quickRatio

# Select the derived features + additional features
derived_features = ['bidAskSpread', 'fiftyTwoWeekSpread', 'fiftyDayAvgRatio', 
                    'twoHundredDayAvgRatio', 'targetPriceSpread', 
                    'averageDailyVolume3Month', 'quickRatio']

# Create a dataframe with symbol and derived features
derived_df = stocks_raw[['symbol'] + derived_features].copy()

print(f"Derived features created: {derived_features}")
print(f"Shape: {derived_df.shape}")
print(f"\nSample values:")
print(derived_df[derived_features].describe())

=== Creating Derived Features ===
Derived features created: ['bidAskSpread', 'fiftyTwoWeekSpread', 'fiftyDayAvgRatio', 'twoHundredDayAvgRatio', 'targetPriceSpread', 'averageDailyVolume3Month', 'quickRatio']
Shape: (4607, 8)

Sample values:
       bidAskSpread  fiftyTwoWeekSpread  fiftyDayAvgRatio  \
count   3502.000000         4594.000000       4312.000000   
mean      -0.033934            0.286565          1.027247   
std        1.166118            0.198172          0.208498   
min       -2.000000            0.000000          0.395398   
25%        0.002928            0.139921          0.959283   
50%        0.024509            0.244033          1.002566   
75%        0.386300            0.387535          1.052372   
max        2.000000            1.000000          7.196430   

       twoHundredDayAvgRatio  targetPriceSpread  averageDailyVolume3Month  \
count            4312.000000        3152.000000              4.481000e+03   
mean                1.030873           0.390093         

In [191]:
# Load features CSV
features_df = pd.read_csv(os.path.join(DATA_PATH, 'stocks_info_features.csv'))

# Merge with derived features created in previous cell
features_df = features_df.merge(derived_df, on='symbol', how='left')

print(f"Features shape (after adding derived features): {features_df.shape}")
print(f"Features columns: {features_df.columns.tolist()}")
print(f"Missing values: {features_df.isna().sum().sum()} total NaN values")

# Prepare feature data
# Remove 'symbol' column and keep only numeric features
feature_cols = [col for col in features_df.columns if col != 'symbol']
X = features_df[['symbol'] + feature_cols].copy()

print(f"\nNumber of features (original + derived): {len(feature_cols)}")

# Handle missing values and infinities
for col in feature_cols:
    # Replace infinities with NaN first
    X[col] = X[col].replace([np.inf, -np.inf], np.nan)
    # Then impute with mean
    X[col].fillna(X[col].mean(), inplace=True)
    # If still has NaN (e.g., all values were inf/nan), fill with 0
    X[col].fillna(0, inplace=True)

print(f"Missing values after imputation: {X[feature_cols].isna().sum().sum()}")
print(f"Infinities after handling: {np.isinf(X[feature_cols]).sum().sum()}")

# Normalize features (zero-mean, unit variance)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_normalized = X.copy()
X_normalized[feature_cols] = scaler.fit_transform(X[feature_cols])

print(f"\nFeatures normalized (mean ≈ 0, std ≈ 1)")
print(f"Mean of first feature: {X_normalized[feature_cols[0]].mean():.6f}")
print(f"Std of first feature: {X_normalized[feature_cols[0]].std():.6f}")

# Load company2id mapping and symbol-to-company mapping
with open(os.path.join(ID_MAPPING, 'company2id.json')) as f:
    company2id = json.load(f)

# Reload stocks info to get company names
stocks_df = pd.read_parquet(os.path.join(DATA_PATH, 'stocks_info.parquet'))

# Create a mapping from symbol to company name (shortName)
symbol_to_company = dict(zip(stocks_df['symbol'], stocks_df['shortName']))

print(f"\nTotal companies in mapping: {len(company2id)}")
print(f"Total symbols in features: {len(features_df)}")

Features shape (after adding derived features): (4607, 39)
Features columns: ['fullTimeEmployees', 'payoutRatio', 'beta', 'forwardPE', 'averageDailyVolume10Day', 'bid', 'ask', 'marketCap', 'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 'fiftyDayAverage', 'twoHundredDayAverage', 'profitMargins', 'sharesOutstanding', 'shortRatio', 'bookValue', 'trailingEps', 'forwardEps', 'enterpriseToEbitda', 'currentPrice', 'targetHighPrice', 'targetLowPrice', 'revenuePerShare', 'revenueGrowth', 'ebitdaMargins', 'symbol', 'epsTrailingTwelveMonths', 'epsForward', 'priceToBook', 'debtToEquity', 'dividendRate', 'trailingPE', 'bidAskSpread', 'fiftyTwoWeekSpread', 'fiftyDayAvgRatio', 'twoHundredDayAvgRatio', 'targetPriceSpread', 'averageDailyVolume3Month', 'quickRatio']
Missing values: 25679 total NaN values

Number of features (original + derived): 38
Missing values after imputation: 0
Infinities after handling: 0

Features normalized (mean ≈ 0, std ≈ 1)
Mean of first feature: -0.000000
Std of first feature: 1.000

In [192]:
# Create feature tensor for company nodes
# Initialize with zeros: [num_companies, num_features]
num_companies = len(company2id)
company_features = torch.zeros((num_companies, len(feature_cols)), dtype=torch.float32)

# Fill the tensor using symbol -> company mapping
matched_count = 0
unmatched_symbols = []

for row in X_normalized.itertuples():
    symbol = row.symbol
    
    # Map symbol to company name
    if symbol not in symbol_to_company:
        unmatched_symbols.append(symbol)
        continue
    
    company_name = symbol_to_company[symbol]
    
    # Map company name to company ID
    if company_name not in company2id:
        unmatched_symbols.append(symbol)
        continue
    
    company_idx = company2id[company_name]
    feature_values = torch.tensor([getattr(row, col) for col in feature_cols], dtype=torch.float32)
    company_features[company_idx] = feature_values
    matched_count += 1

print(f"Matched companies: {matched_count}/{len(X_normalized)}")
print(f"Unmatched symbols: {len(unmatched_symbols)}")
print(f"Company features shape: {company_features.shape}")


Matched companies: 4508/4607
Unmatched symbols: 99
Company features shape: torch.Size([4171, 38])


In [193]:
hetero_data

HeteroData(
  (company, has_symbol, stock_symbol)={ edge_index=[2, 4508] },
  (company, belongs_to, industry)={ edge_index=[2, 3998] },
  (industry, is_part_of, sector)={ edge_index=[2, 144] },
  (institution, holds, stock_symbol)={ edge_index=[2, 39512] },
  (fund, holds, stock_symbol)={ edge_index=[2, 37098] },
  (fund, holds, fund_symbol)={ edge_index=[2, 6851] }
)

In [194]:
# Attach features to the HeteroData object
hetero_data['company'].x = company_features

# Verify attachment
print(f"HeteroData updated with company features:")
print(hetero_data['company'])
print(f"Company node features shape: {hetero_data['company'].x.shape}")


HeteroData updated with company features:
{'x': tensor([[-3.0125e-01, -1.3510e-01,  1.7179e-02,  ..., -1.5132e+00,
          2.3509e-01, -2.4738e-01],
        [-3.1398e-01, -1.3510e-01,  1.8747e-02,  ...,  4.5530e-01,
         -1.4851e-01,  4.0132e-01],
        [-2.7535e-01, -9.1958e-02,  1.6212e-02,  ..., -8.0794e-01,
         -1.7128e-01, -9.8636e-02],
        ...,
        [ 2.0455e+00, -1.3510e-01,  1.8547e-02,  ...,  1.0729e+00,
          7.8621e+00, -2.3438e-01],
        [-7.8489e-02, -1.2362e-01,  2.2445e-02,  ...,  2.1084e-01,
          5.1817e-01, -1.9316e-01],
        [-7.6895e-03, -1.0674e-01,  1.8600e-02,  ..., -7.2817e-01,
         -6.5892e-02, -1.3219e-01]])}
Company node features shape: torch.Size([4171, 38])


In [195]:
# Save the updated graph with company features
torch.save(hetero_data, os.path.join(GRAPH_DATA, 'yfinance_kge.pt'))
print("Graph saved with company features!")

# Verify by reloading the graph
graph_with_features = torch.load(os.path.join(GRAPH_DATA, 'yfinance_kge.pt'), weights_only=False)
print(f"\nLoaded graph verification:")
print(graph_with_features)
print(f"\nCompany node features shape: {graph_with_features['company'].x.shape}")
print(f"Company feature tensor (first 5 companies):\n{graph_with_features['company'].x[:5]}")


Graph saved with company features!

Loaded graph verification:
HeteroData(
  company={ x=[4171, 38] },
  (company, has_symbol, stock_symbol)={ edge_index=[2, 4508] },
  (company, belongs_to, industry)={ edge_index=[2, 3998] },
  (industry, is_part_of, sector)={ edge_index=[2, 144] },
  (institution, holds, stock_symbol)={ edge_index=[2, 39512] },
  (fund, holds, stock_symbol)={ edge_index=[2, 37098] },
  (fund, holds, fund_symbol)={ edge_index=[2, 6851] }
)

Company node features shape: torch.Size([4171, 38])
Company feature tensor (first 5 companies):
tensor([[-3.0125e-01, -1.3510e-01,  1.7179e-02, -9.7521e-02,  3.4044e+00,
         -1.8315e-02, -1.7871e-02, -1.5632e-01, -1.9690e-02, -2.0792e-02,
         -2.0895e-02, -2.0503e-02, -4.3673e-01, -2.8442e-01,  3.7927e+00,
         -4.4071e-02, -2.6516e-02, -3.3475e-02, -4.0287e-02, -2.1256e-02,
         -2.9527e-02, -2.8409e-02, -7.4037e-02, -9.8131e-02, -5.2578e-01,
         -2.6405e-02, -3.3423e-02, -3.1352e-02,  0.0000e+00, -1.0697e-1

In [196]:
# Diagnostic: Check for duplicate company entities and company coverage
print("=== Company Entity Analysis ===")
print(f"Total companies in company2id: {len(company2id)}")
print(f"Unique company short names in stocks_df: {stocks_df['shortName'].nunique()}")
print(f"Total rows in stocks_df: {len(stocks_df)}")

# Check if there are duplicate shortNames (which shouldn't happen)
duplicates = stocks_df[stocks_df['shortName'].duplicated(keep=False)].sort_values('shortName')
if len(duplicates) > 0:
    print(f"\n⚠️ Found {len(duplicates)} rows with duplicate shortNames:")
    print(duplicates[['symbol', 'shortName']].head(10))
else:
    print("\n✓ No duplicate shortNames found")

# Check coverage of features
print(f"\n=== Feature Coverage ===")
print(f"Companies in graph with features: {(company_features.sum(dim=1) != 0).sum().item()}")
print(f"Companies in graph with zero features: {(company_features.sum(dim=1) == 0).sum().item()}")
print(f"Total companies in graph: {company_features.shape[0]}")


=== Company Entity Analysis ===
Total companies in company2id: 4171
Unique company short names in stocks_df: 4171
Total rows in stocks_df: 4607

⚠️ Found 634 rows with duplicate shortNames:
       symbol                        shortName
43     YCY-UN  AA Mission Acquisition Corp. II
44        YCY  AA Mission Acquisition Corp. II
1944  MITT-PA  AG Mortgage Investment Trust, I
1947     MITP  AG Mortgage Investment Trust, I
1948     MITN  AG Mortgage Investment Trust, I
1945     MITT  AG Mortgage Investment Trust, I
1943  MITT-PB  AG Mortgage Investment Trust, I
1942  MITT-PC  AG Mortgage Investment Trust, I
4479    AGNCZ  AGNC Investment Corp. - Deposit
4480    AGNCP  AGNC Investment Corp. - Deposit

=== Feature Coverage ===
Companies in graph with features: 4171
Companies in graph with zero features: 0
Total companies in graph: 4171


# 5. Add Node Features for fund symbols
Load financial features and attach them to fund symbol nodes

In [197]:
# Load raw funds data to create derived features
funds_raw = pd.read_csv(os.path.join(DATA_PATH, 'funds_info_features.csv'))

# Convert columns to numeric (some may have string values)
numeric_cols = ['ask', 'bid', 'fiftyTwoWeekHigh', 'fiftyTwoWeekLow', 
                'fiftyDayAverage', 'twoHundredDayAverage', 'navPrice',
                'fiveYearAverageReturn', 'threeYearAverageReturn']
for col in numeric_cols:
    if col in funds_raw.columns:
        funds_raw[col] = pd.to_numeric(funds_raw[col], errors='coerce')

print("=== Creating Derived Features for Funds ===")

# 1. bidAskSpread: (ask - bid) / ((bid + ask)/2)
funds_raw['bidAskSpread'] = (funds_raw['ask'] - funds_raw['bid']) / ((funds_raw['bid'] + funds_raw['ask']) / 2)

# 2. fiftyTwoWeekSpread: (high - low) / (high + low)
funds_raw['fiftyTwoWeekSpread'] = (funds_raw['fiftyTwoWeekHigh'] - funds_raw['fiftyTwoWeekLow']) / (funds_raw['fiftyTwoWeekHigh'] + funds_raw['fiftyTwoWeekLow'])

# 3. fiftyDayAvgRatio: fiftyDayAverage / navPrice (using navPrice instead of currentPrice for funds)
funds_raw['fiftyDayAvgRatio'] = funds_raw['fiftyDayAverage'] / funds_raw['navPrice']

# 4. twoHundredDayAvgRatio: twoHundredDayAverage / navPrice
funds_raw['twoHundredDayAvgRatio'] = funds_raw['twoHundredDayAverage'] / funds_raw['navPrice']

# 5. returnSpread: difference between long-term and medium-term returns (fund-specific)
funds_raw['returnSpread'] = funds_raw['fiveYearAverageReturn'] - funds_raw['threeYearAverageReturn']

# Select derived features
derived_fund_features = ['bidAskSpread', 'fiftyTwoWeekSpread', 'fiftyDayAvgRatio', 
                         'twoHundredDayAvgRatio', 'returnSpread']

# Create dataframe with symbol and derived features
derived_funds_df = funds_raw[['symbol'] + derived_fund_features].copy()

print(f"Derived features created: {derived_fund_features}")
print(f"Shape: {derived_funds_df.shape}")
print(f"\nSample values:")
print(derived_funds_df[derived_fund_features].describe())

=== Creating Derived Features for Funds ===
Derived features created: ['bidAskSpread', 'fiftyTwoWeekSpread', 'fiftyDayAvgRatio', 'twoHundredDayAvgRatio', 'returnSpread']
Shape: (6851, 6)

Sample values:
       bidAskSpread  fiftyTwoWeekSpread  fiftyDayAvgRatio  \
count   1195.000000         6850.000000        907.000000   
mean       0.069752            0.151237               inf   
std        0.545384            0.126740               NaN   
min       -2.000000           -1.000000          0.143513   
25%        0.002319            0.095917          0.985399   
50%        0.013724            0.145879          0.995338   
75%        0.067160            0.181614          1.008696   
max        2.000000            1.000000               inf   

       twoHundredDayAvgRatio  returnSpread  
count             907.000000    454.000000  
mean                     inf     -0.083616  
std                      NaN      0.119652  
min                 0.125476     -0.776405  
25%                 0.

In [198]:
# Load features CSV
features_df = pd.read_csv(os.path.join(DATA_PATH, 'funds_info_features.csv'))
# remove the ones without quoteType
features_df = features_df[~features_df['quoteType'].isnull()]
# cast binary type for quoteType (0: ETF/Equity, 1: Mutual fund)
features_df['quoteType'] = (features_df['quoteType'] == 'MUTUALFUND').astype(int)
# calculate fund age (from fund inception date to present)
features_df['fundInceptionDate'] = features_df['fundInceptionDate'].apply(pd.to_numeric, errors='coerce')
features_df['fundAge'] = features_df['fundInceptionDate'].apply(years_since_timestamp)
features_df.drop('fundInceptionDate', axis=1, inplace=True)
# cast object cols to numeric
object_cols = [col for col in features_df.select_dtypes(include=['object']).columns.tolist() if col !='symbol']
features_df[object_cols] = features_df[object_cols].apply(pd.to_numeric, errors="coerce")

# Merge with derived features created in previous cell
features_df = features_df.merge(derived_funds_df, on='symbol', how='left')

print(f"Features shape (after adding derived features): {features_df.shape}")
print(f"Features columns: {features_df.columns.tolist()}")
print(f"Missing values: {features_df.isna().sum().sum()} total NaN values")

# Prepare feature data
# quoteType: Map
# Remove 'symbol' column and keep only numeric features
feature_cols = features_df.select_dtypes(include=['float64', 'int']).columns.tolist()
X = features_df[['symbol'] + feature_cols].copy()

print(f"\nNumber of features (original + derived): {len(feature_cols)}")

# Handle missing values and infinities
for col in feature_cols:
    # Replace infinities with NaN first
    X[col] = X[col].replace([np.inf, -np.inf], np.nan)
    # Then impute with mean
    X[col].fillna(X[col].mean(), inplace=True)
    # If still has NaN (e.g., all values were inf/nan), fill with 0
    X[col].fillna(0, inplace=True)

print(f"Missing values after imputation: {X[feature_cols].isna().sum().sum()}")
print(f"Infinities after handling: {np.isinf(X[feature_cols]).sum().sum()}")

# Normalize features (zero-mean, unit variance)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_normalized = X.copy()
X_normalized[feature_cols] = scaler.fit_transform(X[feature_cols])

print(f"\nFeatures normalized (mean ≈ 0, std ≈ 1)")
print(f"Mean of first feature: {X_normalized[feature_cols[0]].mean():.6f}")
print(f"Std of first feature: {X_normalized[feature_cols[0]].std():.6f}")

# Load fundsymbol2id mapping
with open(os.path.join(ID_MAPPING, 'fundsymbol2id.json')) as f:
    fundsymbol2id = json.load(f)

print(f"\nTotal symbols in mapping: {len(fundsymbol2id)}")
print(f"Total symbols in features: {len(features_df)}")

Features shape (after adding derived features): (8666, 30)
Features columns: ['trailingPE', 'averageDailyVolume10Day', 'bid', 'ask', 'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 'fiftyDayAverage', 'twoHundredDayAverage', 'trailingAnnualDividendRate', 'navPrice', 'ytdReturn', 'beta3Year', 'threeYearAverageReturn', 'fiveYearAverageReturn', 'quoteType', 'symbol', 'averageDailyVolume3Month', 'trailingThreeMonthReturns', 'epsTrailingTwelveMonths', 'netExpenseRatio', 'priceToBook', 'morningStarOverallRating', 'morningStarRiskRating', 'dividendRate', 'fundAge', 'bidAskSpread', 'fiftyTwoWeekSpread', 'fiftyDayAvgRatio', 'twoHundredDayAvgRatio', 'returnSpread']
Missing values: 110796 total NaN values

Number of features (original + derived): 29
Missing values after imputation: 0
Infinities after handling: 0
Missing values after imputation: 0
Infinities after handling: 0

Features normalized (mean ≈ 0, std ≈ 1)
Mean of first feature: 0.000000
Std of first feature: 1.000058

Total symbols in mapping: 63

In [199]:
# Create feature tensor for fund symbol nodes
# Initialize with zeros: [num_companies, num_features]
num_funds = len(fundsymbol2id)
fundsym_features = torch.zeros((num_funds, len(feature_cols)), dtype=torch.float32)

# Fill the tensor using symbol -> company mapping
matched_count = 0
unmatched_symbols = []

for row in X_normalized.itertuples():
    symbol = row.symbol
    
    # # Map symbol to company name
    # if symbol not in symbol_to_fund:
    #     unmatched_symbols.append(symbol)
    #     continue
    
    # fund_name = symbol_to_fund[symbol]
    
    # check if fund symbol is included in mapping
    if symbol not in fundsymbol2id:
        unmatched_symbols.append(symbol)
        continue
    
    symbol_idx = fundsymbol2id[symbol]
    feature_values = torch.tensor([getattr(row, col) for col in feature_cols], dtype=torch.float32)
    fundsym_features[symbol_idx] = feature_values
    matched_count += 1

print(f"Matched funds: {matched_count}/{len(X_normalized)}")
print(f"Unmatched symbols: {len(unmatched_symbols)}")
print(f"Company features shape: {fundsym_features.shape}")


Matched funds: 8666/8666
Unmatched symbols: 0
Company features shape: torch.Size([6340, 29])


In [200]:
# Attach features to the HeteroData object
hetero_data['fund_symbol'].x = fundsym_features

# Verify attachment
print(f"HeteroData updated with fund features attached to symbols:")
print(hetero_data['fund_symbol'])
print(f"Company node features shape: {hetero_data['fund_symbol'].x.shape}")

HeteroData updated with fund features attached to symbols:
{'x': tensor([[ 1.2002e+00,  1.3130e+00,  2.9353e-01,  ..., -9.9602e-01,
         -1.0013e+00, -6.0524e-02],
        [-2.2922e+00, -8.2931e-02,  1.4576e+01,  ...,  4.4703e+00,
          4.5337e+00,  4.8292e-16],
        [ 3.6770e-02,  6.9794e-02, -2.1879e-02,  ..., -9.9954e-01,
         -1.0035e+00, -3.2729e-01],
        ...,
        [ 5.8709e-16, -8.3173e-02,  7.1521e-17,  ...,  2.7935e-16,
          0.0000e+00,  4.8292e-16],
        [ 5.8709e-16, -8.3173e-02,  7.1521e-17,  ...,  2.7935e-16,
          0.0000e+00,  4.8292e-16],
        [ 5.8709e-16, -8.3173e-02,  7.1521e-17,  ...,  2.7935e-16,
          0.0000e+00,  4.8292e-16]])}
Company node features shape: torch.Size([6340, 29])


In [201]:
# Save the updated graph with company features
torch.save(hetero_data, os.path.join(GRAPH_DATA, 'yfinance_kge.pt'))
print("Graph saved with fund features!")

# Verify by reloading the graph
graph_with_features = torch.load(os.path.join(GRAPH_DATA, 'yfinance_kge.pt'), weights_only=False)
print(f"\nLoaded graph verification:")
print(graph_with_features)
print(f"\Fund symbol node features shape: {graph_with_features['fund_symbol'].x.shape}")
print(f"Fund symbol feature tensor (first 5 companies):\n{graph_with_features['fund_symbol'].x[:5]}")


Graph saved with fund features!

Loaded graph verification:
HeteroData(
  company={ x=[4171, 38] },
  fund_symbol={ x=[6340, 29] },
  (company, has_symbol, stock_symbol)={ edge_index=[2, 4508] },
  (company, belongs_to, industry)={ edge_index=[2, 3998] },
  (industry, is_part_of, sector)={ edge_index=[2, 144] },
  (institution, holds, stock_symbol)={ edge_index=[2, 39512] },
  (fund, holds, stock_symbol)={ edge_index=[2, 37098] },
  (fund, holds, fund_symbol)={ edge_index=[2, 6851] }
)
\Fund symbol node features shape: torch.Size([6340, 29])
Fund symbol feature tensor (first 5 companies):
tensor([[ 1.2002e+00,  1.3130e+00,  2.9353e-01,  3.2889e-01,  9.2340e-02,
          1.7472e-02,  1.7613e-02,  2.9850e-02,  2.4844e+00,  4.6631e+00,
          6.1465e-02,  8.3342e-02,  1.0879e+00,  1.8865e+00, -2.2062e+00,
          1.7442e+00,  2.8359e-01, -2.8027e-02, -1.0817e+00, -1.5290e-01,
          0.0000e+00, -3.6188e-16, -2.8016e-17,  1.0698e+00, -3.2055e-01,
          3.0145e-01, -9.9602e-01,

In [202]:
# # Diagnostic: Check for duplicate company entities and company coverage
# print("=== Fund Entity Analysis ===")
# print(f"Total fund symbol in fundsymbol2id: {len(fundsymbol2id)}")
# print(f"Unique fund names in funds_df: {funds_df['fund_name'].nunique()}")
# print(f"Total rows in funds_df: {len(funds_df)}")

# # Check if there are duplicate shortNames (which shouldn't happen)
# duplicates = funds_df[funds_df['fund_name'].duplicated(keep=False)].sort_values('fund_name')
# if len(duplicates) > 0:
#     print(f"\n⚠️ Found {len(duplicates)} rows with duplicate fund_name:")
#     print(duplicates[['symbols', 'fund_name']].head(10))
# else:
#     print("\n✓ No duplicate shortNames found")

# # Check coverage of features
# print(f"\n=== Feature Coverage ===")
# print(f"Funds in graph with features: {(fundsym_features.sum(dim=1) != 0).sum().item()}")
# print(f"Funds in graph with zero features: {(fund_features.sum(dim=1) == 0).sum().item()}")
# print(f"Total funds in graph: {fund_features.shape[0]}")


# 6. Add node features for sectors

In [203]:
# Load features CSV
features_df = pd.read_csv(os.path.join(DATA_PATH, 'sectors.csv'))

print(f"Features shape: {features_df.shape}")
print(f"Features columns: {features_df.columns.tolist()}")
print(f"Missing values: {features_df.isna().sum().sum()} total NaN values")

# Prepare feature data
# quoteType: Map
# Remove 'symbol' column and keep only numeric features
feature_cols = features_df.select_dtypes(include=['float64', 'int']).columns.tolist()
X = features_df[['sector'] + feature_cols].copy()

print(f"\nNumber of features: {len(feature_cols)}")

# Handle missing values and infinities
for col in feature_cols:
    # Replace infinities with NaN first
    X[col] = X[col].replace([np.inf, -np.inf], np.nan)
    # Then impute with mean
    X[col].fillna(X[col].mean(), inplace=True)
    # If still has NaN (e.g., all values were inf/nan), fill with 0
    X[col].fillna(0, inplace=True)

print(f"Missing values after imputation: {X[feature_cols].isna().sum().sum()}")
print(f"Infinities after handling: {np.isinf(X[feature_cols]).sum().sum()}")

# Normalize features (zero-mean, unit variance)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_normalized = X.copy()
X_normalized[feature_cols] = scaler.fit_transform(X[feature_cols])

print(f"\nFeatures normalized (mean ≈ 0, std ≈ 1)")
print(f"Mean of first feature: {X_normalized[feature_cols[0]].mean():.6f}")
print(f"Std of first feature: {X_normalized[feature_cols[0]].std():.6f}")

# Load sector2id mapping
with open(os.path.join(ID_MAPPING, 'sector2id.json')) as f:
    sector2id = json.load(f)

# # Reload funds info to get company names
# funds_df = pd.read_parquet(os.path.join(DATA_PATH, 'funds_symbol.parquet'))

# # Create a mapping from symbol to company name (shortName)
# symbol_to_fund = dict(zip(funds_df['symbols'], funds_df['fund_name']))

print(f"\nTotal symbols in mapping: {len(sector2id)}")
print(f"Total symbols in features: {len(features_df)}")


Features shape: (11, 8)
Features columns: ['sector', 'marketCap', 'marketWeight', 'companies', 'ytdExcessReturn', 'oneYearExcessReturn', 'threeYearExcessReturn', 'fiveYearExcessReturn']
Missing values: 0 total NaN values

Number of features: 7
Missing values after imputation: 0
Infinities after handling: 0

Features normalized (mean ≈ 0, std ≈ 1)
Mean of first feature: -0.000000
Std of first feature: 1.048809

Total symbols in mapping: 11
Total symbols in features: 11


In [204]:
X_normalized.head(1)

Unnamed: 0,sector,marketCap,marketWeight,companies,ytdExcessReturn,oneYearExcessReturn,threeYearExcessReturn,fiveYearExcessReturn
0,healthcare,0.041146,0.040429,1.410455,-0.023412,-0.151976,-0.851793,-0.993362


In [205]:
# Create feature tensor for fund symbol nodes
# Initialize with zeros: [num_companies, num_features]
num_entities = len(sector2id)
entity_features = torch.zeros((num_entities, len(feature_cols)), dtype=torch.float32)

# Fill the tensor using symbol -> company mapping
matched_count = 0
unmatched_entities = []

for row in X_normalized.itertuples():
    entity = row.sector
    
    # check if entity is included in mapping
    if entity not in sector2id:
        unmatched_entities.append(entity)
        continue
    
    entity_idx = sector2id[entity]
    feature_values = torch.tensor([getattr(row, col) for col in feature_cols], dtype=torch.float32)
    entity_features[entity_idx] = feature_values
    matched_count += 1

print(f"Matched funds: {matched_count}/{len(X_normalized)}")
print(f"Unmatched symbols: {len(unmatched_entities)}")
print(f"Company features shape: {entity_features.shape}")


Matched funds: 11/11
Unmatched symbols: 0
Company features shape: torch.Size([11, 7])


In [206]:
# Attach features to the HeteroData object
hetero_data['sector'].x = entity_features

# Verify attachment
print(f"HeteroData updated with fund features attached to symbols:")
print(hetero_data['sector'])
print(f"Company node features shape: {hetero_data['sector'].x.shape}")

HeteroData updated with fund features attached to symbols:
{'x': tensor([[ 0.0411,  0.0404,  1.4105, -0.0234, -0.1520, -0.8518, -0.9934],
        [-0.0984, -0.0989,  0.4471,  0.1979, -0.0751, -0.1304,  0.0468],
        [-0.5461, -0.5462, -0.7455, -1.1497, -0.9734, -0.7673, -1.0697],
        [ 0.2070,  0.2051,  0.0916, -0.8074, -0.2144,  0.3680, -0.3991],
        [ 2.7294,  2.7316,  0.5897,  0.5992,  0.9654,  1.8801,  1.9297],
        [ 0.1816,  0.1798, -0.7151,  2.0337,  2.4479,  1.9994,  0.2939],
        [ 0.7185,  0.7157,  2.1587, -0.1144, -0.0184,  0.1110,  0.3525],
        [-0.8225, -0.8224, -0.6964,  1.5673,  0.6810, -0.3117, -0.2331],
        [-0.6496, -0.6438, -0.7478, -0.7991, -0.9533, -1.0112,  1.7723],
        [-0.8842, -0.8845, -1.0916, -0.3968, -0.5199, -0.4989, -0.5919],
        [-0.8768, -0.8769, -0.7011, -1.1073, -1.1877, -0.7873, -1.1079]])}
Company node features shape: torch.Size([11, 7])


In [207]:
# Save the updated graph with company features
torch.save(hetero_data, os.path.join(GRAPH_DATA, 'yfinance_kge.pt'))
print("Graph saved with sector features!")

# Verify by reloading the graph
graph_with_features = torch.load(os.path.join(GRAPH_DATA, 'yfinance_kge.pt'), weights_only=False)
print(f"\nLoaded graph verification:")
print(graph_with_features)
print(f"\Sector node features shape: {graph_with_features['sector'].x.shape}")
print(f"Sector feature tensor (first 5 companies):\n{graph_with_features['sector'].x[:5]}")


Graph saved with sector features!

Loaded graph verification:
HeteroData(
  company={ x=[4171, 38] },
  fund_symbol={ x=[6340, 29] },
  sector={ x=[11, 7] },
  (company, has_symbol, stock_symbol)={ edge_index=[2, 4508] },
  (company, belongs_to, industry)={ edge_index=[2, 3998] },
  (industry, is_part_of, sector)={ edge_index=[2, 144] },
  (institution, holds, stock_symbol)={ edge_index=[2, 39512] },
  (fund, holds, stock_symbol)={ edge_index=[2, 37098] },
  (fund, holds, fund_symbol)={ edge_index=[2, 6851] }
)
\Sector node features shape: torch.Size([11, 7])
Sector feature tensor (first 5 companies):
tensor([[ 0.0411,  0.0404,  1.4105, -0.0234, -0.1520, -0.8518, -0.9934],
        [-0.0984, -0.0989,  0.4471,  0.1979, -0.0751, -0.1304,  0.0468],
        [-0.5461, -0.5462, -0.7455, -1.1497, -0.9734, -0.7673, -1.0697],
        [ 0.2070,  0.2051,  0.0916, -0.8074, -0.2144,  0.3680, -0.3991],
        [ 2.7294,  2.7316,  0.5897,  0.5992,  0.9654,  1.8801,  1.9297]])


# 7. Add node features for industries

In [208]:
# Load features CSV
features_df = pd.read_csv(os.path.join(DATA_PATH, 'industries.csv'))

print(f"Features shape: {features_df.shape}")
print(f"Features columns: {features_df.columns.tolist()}")
print(f"Missing values: {features_df.isna().sum().sum()} total NaN values")

# Prepare feature data
# quoteType: Map
# Remove 'symbol' column and keep only numeric features
feature_cols = features_df.select_dtypes(include=['float64', 'int']).columns.tolist()
X = features_df[['industry'] + feature_cols].copy()

print(f"\nNumber of features: {len(feature_cols)}")

# Handle missing values and infinities
for col in feature_cols:
    # Replace infinities with NaN first
    X[col] = X[col].replace([np.inf, -np.inf], np.nan)
    # Then impute with mean
    X[col].fillna(X[col].mean(), inplace=True)
    # If still has NaN (e.g., all values were inf/nan), fill with 0
    X[col].fillna(0, inplace=True)

print(f"Missing values after imputation: {X[feature_cols].isna().sum().sum()}")
print(f"Infinities after handling: {np.isinf(X[feature_cols]).sum().sum()}")

# Normalize features (zero-mean, unit variance)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_normalized = X.copy()
X_normalized[feature_cols] = scaler.fit_transform(X[feature_cols])

print(f"\nFeatures normalized (mean ≈ 0, std ≈ 1)")
print(f"Mean of first feature: {X_normalized[feature_cols[0]].mean():.6f}")
print(f"Std of first feature: {X_normalized[feature_cols[0]].std():.6f}")

# Load sector2id mapping
with open(os.path.join(ID_MAPPING, 'industry2id.json')) as f:
    industry2id = json.load(f)

# # Reload funds info to get company names
# funds_df = pd.read_parquet(os.path.join(DATA_PATH, 'funds_symbol.parquet'))

# # Create a mapping from symbol to company name (shortName)
# symbol_to_fund = dict(zip(funds_df['symbols'], funds_df['fund_name']))

print(f"\nTotal symbols in mapping: {len(industry2id)}")
print(f"Total symbols in features: {len(features_df)}")


Features shape: (144, 10)
Features columns: ['industry', 'beta', 'debtToEquity', 'effectiveTaxRate', 'unleveredBeta', 'cashToFirmValue', 'unleveredBetaCashCorrected', 'hiloRisk', 'stdevEquity', 'stdevOperatingIncome']
Missing values: 118 total NaN values

Number of features: 9
Missing values after imputation: 0
Infinities after handling: 0

Features normalized (mean ≈ 0, std ≈ 1)
Mean of first feature: -0.000000
Std of first feature: 1.003490

Total symbols in mapping: 144
Total symbols in features: 144


In [209]:
X_normalized.head(1)

Unnamed: 0,industry,beta,debtToEquity,effectiveTaxRate,unleveredBeta,cashToFirmValue,unleveredBetaCashCorrected,hiloRisk,stdevEquity,stdevOperatingIncome
0,advertising-agencies,1.42394,-0.404704,-0.188945,1.269378,0.380622,1.365462,1.228793,1.146955,-0.73318


In [210]:
# Create feature tensor for fund symbol nodes
# Initialize with zeros: [num_companies, num_features]
num_entities = len(industry2id)
entity_features = torch.zeros((num_entities, len(feature_cols)), dtype=torch.float32)

# Fill the tensor using symbol -> company mapping
matched_count = 0
unmatched_entities = []

for row in X_normalized.itertuples():
    entity = row.industry
    
    # check if entity is included in mapping
    if entity not in industry2id:
        unmatched_entities.append(entity)
        continue
    
    entity_idx = industry2id[entity]
    feature_values = torch.tensor([getattr(row, col) for col in feature_cols], dtype=torch.float32)
    entity_features[entity_idx] = feature_values
    matched_count += 1

print(f"Matched funds: {matched_count}/{len(X_normalized)}")
print(f"Unmatched symbols: {len(unmatched_entities)}")
print(f"Company features shape: {entity_features.shape}")


Matched funds: 144/144
Unmatched symbols: 0
Company features shape: torch.Size([144, 9])


In [211]:
# Attach features to the HeteroData object
hetero_data['industry'].x = entity_features

# Verify attachment
print(f"HeteroData updated with fund features attached to industries:")
print(hetero_data['industry'])
print(f"Industry node features shape: {hetero_data['industry'].x.shape}")

HeteroData updated with fund features attached to industries:
{'x': tensor([[ 0.1455, -0.6856, -1.2984,  ...,  1.2327,  1.5029,  0.0633],
        [ 1.0644, -0.6758, -1.4401,  ...,  1.3996,  2.4966, -0.2095],
        [-0.2540, -0.6095, -0.3458,  ...,  0.8935,  0.2735, -0.3552],
        ...,
        [ 0.3053, -0.9246,  1.5272,  ..., -0.6422, -0.2618, -0.4697],
        [ 0.1455, -0.6856, -1.2984,  ...,  1.2327,  1.5029,  0.0633],
        [ 0.8646,  0.1997,  0.9035,  ..., -0.5981, -0.0419, -0.6371]])}
Industry node features shape: torch.Size([144, 9])


In [212]:
# Save the updated graph with company features
torch.save(hetero_data, os.path.join(GRAPH_DATA, 'yfinance_kge.pt'))
print("Graph saved with industry features!")

# Verify by reloading the graph
graph_with_features = torch.load(os.path.join(GRAPH_DATA, 'yfinance_kge.pt'), weights_only=False)
print(f"\nLoaded graph verification:")
print(graph_with_features)
print(f"\Industry node features shape: {graph_with_features['industry'].x.shape}")
print(f"Industry feature tensor (first 5 companies):\n{graph_with_features['industry'].x[:5]}")


Graph saved with industry features!

Loaded graph verification:
HeteroData(
  company={ x=[4171, 38] },
  fund_symbol={ x=[6340, 29] },
  sector={ x=[11, 7] },
  industry={ x=[144, 9] },
  (company, has_symbol, stock_symbol)={ edge_index=[2, 4508] },
  (company, belongs_to, industry)={ edge_index=[2, 3998] },
  (industry, is_part_of, sector)={ edge_index=[2, 144] },
  (institution, holds, stock_symbol)={ edge_index=[2, 39512] },
  (fund, holds, stock_symbol)={ edge_index=[2, 37098] },
  (fund, holds, fund_symbol)={ edge_index=[2, 6851] }
)
\Industry node features shape: torch.Size([144, 9])
Industry feature tensor (first 5 companies):
tensor([[ 0.1455, -0.6856, -1.2984,  0.4470,  0.1311,  0.4755,  1.2327,  1.5029,
          0.0633],
        [ 1.0644, -0.6758, -1.4401,  1.2234,  0.0088,  1.2542,  1.3996,  2.4966,
         -0.2095],
        [-0.2540, -0.6095, -0.3458,  0.0526, -0.9047, -0.0436,  0.8935,  0.2735,
         -0.3552],
        [-1.6524, -0.5973, -0.3269, -1.1282, -0.3656, -1.