In [1]:
# Imports
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os

from theme_attention.rnd import marketsTool as tools

import warnings

from base.sql import factset as fs
from theme_attention.pipeline import dbTool as dbt 

import yaml
from pathlib import Path
import utils
import importlib 
importlib.reload(utils) 

import requests
from io import StringIO
import yfinance as yf
import pickle

Found the config file in: /config.json


In [None]:
# Adjust Pandas display settings to show all rows and columns
#pd.set_option('display.max_rows', None)  # Show all rows
#pd.set_option('display.max_columns', None)  # Show all columns
#pd.set_option('display.width', 1000)  # Adjust the width to fit the output

# Reset Display Settings
#pd.reset_option('display.max_rows')
#pd.reset_option('display.max_columns')
#pd.reset_option('display.width')

In [None]:
# --- TICKERS (initial idea, not used at the moment) ---
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch HTML content
html = requests.get(
    "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
    headers=headers
).text

# Read tables
tables = pd.read_html(StringIO(html))
sp500 = tables[1]

# Extract raw tickers
tickers_500 = sp500["Symbol"].tolist()

# Tickers of the top 20 companies by market capitalization in the S&P 500
# If "tickers_500" is too big to process, use "tickers_20" instead
tickers_20 = ["NVDA","AAPL","MSFT","AMZN","GOOGL","AVGO","GOOG","META","TSLA","BRK.B","LLY","JPM","V","XOM","JNJ","WMT","NFLX","MA","ABBV","COST"]

# Bloomberg tickers (US equities)
bb_tickers = [
    f"{t.replace('.', '/').replace(':', '/').replace('-', '/')} US"
    for t in tickers_20
]

# Map Yahoo exchange codes to readable names
yahoo_to_exch = {
    "NMS": "NASDAQ",  
    "NGM": "NASDAQ",
    "NGS": "NASDAQ",
    "NSQ": "NASDAQ",
    "NYQ": "NYSE",    
    "NYS": "NYSE",
    "PCX": "AMEX",    
}

def get_exchange(symbol):
    t = yf.Ticker(symbol)
    ex = t.info.get("exchange")  # e.g. "NMS", "NYQ"
    if ex is None:
        return "UNKNOWN"
    return yahoo_to_exch.get(ex, ex)  # map if known, else keep raw

# Build market tickers: EXCHANGE:SYMBOL (e.g. NASDAQ:AAPL)
exchanges = [get_exchange(sym) for sym in tickers_20]
mkt_tickers = [f"{ex}:{sym}" for ex, sym in zip(exchanges, tickers_20)]

# Create dataframe
df = pd.DataFrame({
    "symbol": tickers_20,
    "bb_ticker": bb_tickers,
    "mkt_ticker": mkt_tickers
})

df


In [None]:
# Adjust the mkt_ticker for Berkshire Hathaway manually
df.loc[df["symbol"] == "BRK.B", "mkt_ticker"] = "NYSE:BRK.B"

df

In [None]:
# Call existing ticker mapping function to get fsym_id and other info
mapT = fs.getFsymIDsFromBBTicker(bb_tickers)

masterT = fs.getFactsetCoverage(mapT['fsym_id'].to_frame())
masterT = masterT.merge(mapT, on='fsym_id')

# Merge Bloomberg ticker, and FactSet ID into one DataFrame
companiesDB = df[['bb_ticker', 'mkt_ticker']].merge(mapT, left_on='bb_ticker', right_on='bbg_ticker', how='left').merge(masterT, left_on='fsym_id', right_on='fsym_id', how='left')

# Final 
companiesMapping = companiesDB[['bb_ticker', 'mkt_ticker', 'fsym_id', 'proper_name']].drop_duplicates().reset_index(drop=True)

companiesMapping


In [2]:
# --- TICKERS ---
# Region Codes (EOD Mapping)
# US  -> United States
# RDM -> Rest of Developed Markets (e.g., Europe, Japan, Australia, etc.)
# EM  -> Emerging Markets (e.g., China, India, Brazil, etc.)

fL = dbt.get_master_mapping_df(region=['US'])
# len(fL["sub_industry"].unique())
fL

Unnamed: 0,fsym_id,fsym_security_id,proper_name,CCY,fref_exchange_code,fref_security_type,price_id,sector,industry_group,industry,sub_industry,Code,isin,bbg_id,bbg_ticker,entry_date,exit_date,region,bm_only_flag
3,B19ST9-R,W38FV3-S,Eli Lilly and Company,USD,USA,SHARE,B19ST9-R,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Pharmaceuticals,Pharmaceuticals,LLY.US,US5324571083,BBG000BNBDC2,LLY US,2019-12-01,2025-12-18,US,0
5,B38P5P-R,VD6QXJ-S,AppLovin Corp. Class A,USD,USA,SHARE,B38P5P-R,Information Technology,Software & Services,Software,Application Software,APP.US,US03831W1080,BBG006HFPX77,APP US,2025-01-01,2025-12-18,US,0
6,B3DVXC-R,MMRHSQ-S,"Freeport-McMoRan, Inc.",USD,USA,SHARE,B3DVXC-R,Materials,Materials,Metals & Mining,Copper,FCX.US,US35671D8570,BBG000BJDB15,FCX US,2019-12-01,2025-12-18,US,0
8,B4PGFJ-R,CP765L-S,DXC Technology Co.,USD,USA,SHARE,B4PGFJ-R,Information Technology,Software & Services,IT Services,IT Consulting & Other Services,DXC.US,US23355L1061,BBG00FN64XT9,DXC US,2019-12-01,2025-12-18,US,0
10,B6V0BZ-R,T8H08Z-S,Pool Corporation,USD,USA,SHARE,B6V0BZ-R,Consumer Discretionary,Consumer Discretionary Distribution & Retail,Distributors,Distributors,POOL.US,US73278L1052,BBG000BCVG28,POOL US,2019-12-01,2025-12-18,US,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,XM8P7T-R,GYN3G6-S,Kinder Morgan Inc Class P,USD,USA,SHARE,XM8P7T-R,Energy,Energy,"Oil, Gas & Consumable Fuels",Oil & Gas Storage & Transportation,KMI.US,US49456B1017,BBG0019JZ882,KMI US,2019-12-01,2025-12-18,US,0
953,XMYTFJ-R,JPY35J-S,"Universal Health Services, Inc. Class B",USD,USA,SHARE,XMYTFJ-R,Health Care,Health Care Equipment & Services,Health Care Providers & Services,Health Care Facilities,UHS.US,US9139031002,BBG000CB8Q50,UHS US,2019-12-01,2025-12-18,US,0
954,XN60TY-R,H7TSNM-S,Quanterix Corporation,USD,USA,SHARE,XN60TY-R,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Life Sciences Tools & Services,Life Sciences Tools & Services,QTRX.US,US74766Q1013,BBG0027NWC69,QTRX US,2023-12-01,2025-12-18,US,0
955,XNRMXZ-R,RW4DW5-S,Coty Inc. Class A,USD,USA,SHARE,XNRMXZ-R,Consumer Staples,Household & Personal Products,Personal Care Products,Personal Care Products,COTY.US,US2220702037,BBG000F395V1,COTY US,2019-12-01,2025-12-18,US,0


In [3]:
# Convert exit_date to datetime
fL["exit_date"] = pd.to_datetime(fL["exit_date"], errors="coerce")

# Keep only rows where exit_date is in the future
today = pd.Timestamp.today().normalize()
mask_date = fL["exit_date"] >= today

# Apply filter
fL_filtered = fL[mask_date].copy()

fL_filtered

Unnamed: 0,fsym_id,fsym_security_id,proper_name,CCY,fref_exchange_code,fref_security_type,price_id,sector,industry_group,industry,sub_industry,Code,isin,bbg_id,bbg_ticker,entry_date,exit_date,region,bm_only_flag
3,B19ST9-R,W38FV3-S,Eli Lilly and Company,USD,USA,SHARE,B19ST9-R,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Pharmaceuticals,Pharmaceuticals,LLY.US,US5324571083,BBG000BNBDC2,LLY US,2019-12-01,2025-12-18,US,0
5,B38P5P-R,VD6QXJ-S,AppLovin Corp. Class A,USD,USA,SHARE,B38P5P-R,Information Technology,Software & Services,Software,Application Software,APP.US,US03831W1080,BBG006HFPX77,APP US,2025-01-01,2025-12-18,US,0
6,B3DVXC-R,MMRHSQ-S,"Freeport-McMoRan, Inc.",USD,USA,SHARE,B3DVXC-R,Materials,Materials,Metals & Mining,Copper,FCX.US,US35671D8570,BBG000BJDB15,FCX US,2019-12-01,2025-12-18,US,0
8,B4PGFJ-R,CP765L-S,DXC Technology Co.,USD,USA,SHARE,B4PGFJ-R,Information Technology,Software & Services,IT Services,IT Consulting & Other Services,DXC.US,US23355L1061,BBG00FN64XT9,DXC US,2019-12-01,2025-12-18,US,0
10,B6V0BZ-R,T8H08Z-S,Pool Corporation,USD,USA,SHARE,B6V0BZ-R,Consumer Discretionary,Consumer Discretionary Distribution & Retail,Distributors,Distributors,POOL.US,US73278L1052,BBG000BCVG28,POOL US,2019-12-01,2025-12-18,US,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,XM8P7T-R,GYN3G6-S,Kinder Morgan Inc Class P,USD,USA,SHARE,XM8P7T-R,Energy,Energy,"Oil, Gas & Consumable Fuels",Oil & Gas Storage & Transportation,KMI.US,US49456B1017,BBG0019JZ882,KMI US,2019-12-01,2025-12-18,US,0
953,XMYTFJ-R,JPY35J-S,"Universal Health Services, Inc. Class B",USD,USA,SHARE,XMYTFJ-R,Health Care,Health Care Equipment & Services,Health Care Providers & Services,Health Care Facilities,UHS.US,US9139031002,BBG000CB8Q50,UHS US,2019-12-01,2025-12-18,US,0
954,XN60TY-R,H7TSNM-S,Quanterix Corporation,USD,USA,SHARE,XN60TY-R,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Life Sciences Tools & Services,Life Sciences Tools & Services,QTRX.US,US74766Q1013,BBG0027NWC69,QTRX US,2023-12-01,2025-12-18,US,0
955,XNRMXZ-R,RW4DW5-S,Coty Inc. Class A,USD,USA,SHARE,XNRMXZ-R,Consumer Staples,Household & Personal Products,Personal Care Products,Personal Care Products,COTY.US,US2220702037,BBG000F395V1,COTY US,2019-12-01,2025-12-18,US,0


In [4]:
# Keep only needed columns
companiesMapping = fL_filtered[
    ['fsym_id', 'proper_name', 'price_id', 'Code', 'bbg_ticker']
].copy()

companiesMapping

Unnamed: 0,fsym_id,proper_name,price_id,Code,bbg_ticker
3,B19ST9-R,Eli Lilly and Company,B19ST9-R,LLY.US,LLY US
5,B38P5P-R,AppLovin Corp. Class A,B38P5P-R,APP.US,APP US
6,B3DVXC-R,"Freeport-McMoRan, Inc.",B3DVXC-R,FCX.US,FCX US
8,B4PGFJ-R,DXC Technology Co.,B4PGFJ-R,DXC.US,DXC US
10,B6V0BZ-R,Pool Corporation,B6V0BZ-R,POOL.US,POOL US
...,...,...,...,...,...
952,XM8P7T-R,Kinder Morgan Inc Class P,XM8P7T-R,KMI.US,KMI US
953,XMYTFJ-R,"Universal Health Services, Inc. Class B",XMYTFJ-R,UHS.US,UHS US
954,XN60TY-R,Quanterix Corporation,XN60TY-R,QTRX.US,QTRX US
955,XNRMXZ-R,Coty Inc. Class A,XNRMXZ-R,COTY.US,COTY US


In [5]:
# Get market capitalizations of the companies
preliminary_fsym_ids = companiesMapping['fsym_id'].unique().tolist()

market_cap = fs.getFundamentals(preliminary_fsym_ids, 'ff_mkt_val', 0, datetime.today() - relativedelta(months=3) , datetime.today())

market_cap

Unnamed: 0,fsym_id,date,af,saf,qf
0,B19ST9-R,2025-09-30,,,721676.683000
1,B38P5P-R,2025-09-30,,,243250.220360
2,B3DVXC-R,2025-09-30,,,56280.700000
3,B4PGFJ-R,2025-09-30,,,2378.456549
4,B6V0BZ-R,2025-09-30,,,11570.525299
...,...,...,...,...,...
621,XM8P7T-R,2025-09-30,,,62982.773850
622,XMYTFJ-R,2025-09-30,,,12773.919438
623,XN60TY-R,2025-09-30,,,253.488690
624,XNRMXZ-R,2025-09-30,,,3530.556000


In [6]:
# Merge market capitalization into companiesMapping
companiesMapping = companiesMapping.merge(
    market_cap[['fsym_id', 'qf']],
    on='fsym_id',
    how='left'
)
companiesMapping = companiesMapping.rename(columns={'qf': 'mkt_cap'}).dropna()

companiesMapping

Unnamed: 0,fsym_id,proper_name,price_id,Code,bbg_ticker,mkt_cap
0,B19ST9-R,Eli Lilly and Company,B19ST9-R,LLY.US,LLY US,721676.683000
1,B38P5P-R,AppLovin Corp. Class A,B38P5P-R,APP.US,APP US,243250.220360
2,B3DVXC-R,"Freeport-McMoRan, Inc.",B3DVXC-R,FCX.US,FCX US,56280.700000
3,B4PGFJ-R,DXC Technology Co.,B4PGFJ-R,DXC.US,DXC US,2378.456549
4,B6V0BZ-R,Pool Corporation,B6V0BZ-R,POOL.US,POOL US,11570.525299
...,...,...,...,...,...,...
656,XM8P7T-R,Kinder Morgan Inc Class P,XM8P7T-R,KMI.US,KMI US,62982.773850
657,XMYTFJ-R,"Universal Health Services, Inc. Class B",XMYTFJ-R,UHS.US,UHS US,12773.919438
658,XN60TY-R,Quanterix Corporation,XN60TY-R,QTRX.US,QTRX US,253.488690
659,XNRMXZ-R,Coty Inc. Class A,XNRMXZ-R,COTY.US,COTY US,3530.556000


In [7]:
FILE_PATH = "/home/azureuser/cloudfiles/code/Users/manuel.noseda/DL_project_news/WatchListNews_FULL_PERIOD.pkl"

with open(FILE_PATH, "rb") as f:
    news_dict = pickle.load(f)

news_dict

{'A.US':                                        id event_type          event_time  \
 0    73394e8a-e0b2-4c22-9a10-aab916597f85       news 2025-03-21 19:15:00   
 1    a5edd796-dcf6-4812-b549-08ffc6e9a204       news 2025-03-21 11:25:35   
 2    728621c1-ca2e-4467-aa58-7f9578bbd0e4       news 2025-03-21 09:25:32   
 3    9d56a15e-62b4-4e75-839c-b2a47c46f5f7       news 2025-03-20 14:06:37   
 4    40a1d27c-d6c2-4912-878f-b0a73b775224       news 2025-03-20 06:52:22   
 ..                                    ...        ...                 ...   
 107  6ac687b2-7a6a-43b6-9529-f6060e6fd7f3       news 2024-10-21 18:37:49   
 108  dfb03e9f-9e37-480f-af0b-a1282e59e648       news 2024-10-18 00:00:00   
 109  a70812d1-af5e-49b6-b8c0-e2b4d058beae       news 2024-10-17 00:00:00   
 110  bb211de5-4a9f-4fb1-91aa-b8e6f77687af       news 2024-10-16 00:00:00   
 111  e040d828-f454-44e3-aac7-fce1cb140f6c       news 2024-10-15 00:00:00   
 
                                             event_data  \
 0    {

In [8]:
news_dict["MSFT.US"]

Unnamed: 0,id,event_type,event_time,event_data,embedding
0,53a5ab57-7e33-4288-aec6-343e9f896149,news,2025-07-01 00:00:00,{'body': 'Cloud and data protection: How the G...,"[-0.02053012, -0.016439177, -0.030653788, 0.01..."
1,e05b8fc3-9cd6-4680-b740-e7b15fc72df4,news,2025-03-21 23:55:00,{'body': 'AI startup Perplexity on Friday (Mar...,
2,5f970aa4-5035-43be-b61b-6e6590fb963a,news,2025-03-21 23:31:22,{'body': 'Artificial intelligence (AI) startup...,
3,6ce468cb-b26f-4f9d-9bf3-5f9fa4d0563c,news,2025-03-21 22:53:50,{'body': 'Snap Inc. stock underperformed Frida...,
4,62418e84-99b7-4c5f-9a36-d5afb9e2fc6d,news,2025-03-21 22:49:42,{'body': 'Microsoft’s (MSFT) stock rallied int...,
...,...,...,...,...,...
10593,5ef64264-d235-4e17-b081-fc0ef508545d,news,2024-10-04 00:00:00,{'body': 'Google Search has always had one ser...,"[-0.046491683, -0.023189556, -0.03324587, -0.0..."
10594,3ec14878-557d-4e3c-b76c-d785024c7ce7,news,2024-10-01 00:00:00,{'body': 'The UK’s Competition and Markets Aut...,"[-0.08114711, 0.0072137765, 0.00032795008, -0...."
10595,72b59c67-4f5d-4f7f-b392-fd626dd2f94b,news,2024-10-01 00:00:00,{'body': 'The tech layoff wave is still going ...,"[-0.059671998, 0.009155298, -0.019394342, 0.01..."
10596,dcabe97b-f666-44ac-8c30-cfab74b6b33b,news,2024-10-01 00:00:00,{'body': 'Ansys has announced a collaboration ...,"[-0.025057904, 0.036574975, 0.003996938, 0.033..."


In [9]:
# keep only companies whose Code exists in news_dict
companies_with_news = companiesMapping[
    companiesMapping['Code'].isin(news_dict.keys())
]

# Get the top 20 companies by market capitalization (and drop the mkt_cap column which is no more needed for cleaner output)
top500 = (
    companies_with_news
    .sort_values('mkt_cap', ascending=False)
    .head(500)
    .drop(columns='mkt_cap')
    .reset_index(drop=True)
)

top500

Unnamed: 0,fsym_id,proper_name,price_id,Code,bbg_ticker
0,K7TPSX-R,NVIDIA Corporation,K7TPSX-R,NVDA.US,NVDA US
1,P8R3C2-R,Microsoft Corporation,P8R3C2-R,MSFT.US,MSFT US
2,MH33D6-R,Apple Inc.,MH33D6-R,AAPL.US,AAPL US
3,HTM0LK-R,Alphabet Inc. Class A,HTM0LK-R,GOOGL.US,GOOGL US
4,MCNYYL-R,"Amazon.com, Inc.",MCNYYL-R,AMZN.US,AMZN US
...,...,...,...,...,...
495,BMBP35-R,"Revolution Medicines, Inc.",BMBP35-R,RVMD.US,RVMD US
496,H8P4JZ-R,"AptarGroup, Inc.",H8P4JZ-R,ATR.US,ATR US
497,RJ022Y-R,Comerica Incorporated,RJ022Y-R,CMA.US,CMA US
498,FDS1XB-R,Federal Realty Investment Trust,FDS1XB-R,FRT.US,FRT US


In [10]:
fsym_ids = top500['fsym_id'].dropna().tolist()
price_ids = top500['price_id'].dropna().tolist()
eod_codes = top500['Code'].dropna().tolist()

In [11]:
# Build the IDdf exactly as getAdjustedPrices() expects
IDdf = pd.DataFrame({
    "fsym_id": fsym_ids,
    "price_id": price_ids   
})

IDdf

Unnamed: 0,fsym_id,price_id
0,K7TPSX-R,K7TPSX-R
1,P8R3C2-R,P8R3C2-R
2,MH33D6-R,MH33D6-R
3,HTM0LK-R,HTM0LK-R
4,MCNYYL-R,MCNYYL-R
...,...,...
495,BMBP35-R,BMBP35-R
496,H8P4JZ-R,H8P4JZ-R
497,RJ022Y-R,RJ022Y-R
498,FDS1XB-R,FDS1XB-R


In [12]:
# Format: year, month, day

startDate = datetime(2024,10,1) 
endDate = datetime(2025,8,1)

In [13]:
# Get raw prices
df_prices = fs.getAdjustedPrices(IDdf, startDate, endDate)

df_prices

Unnamed: 0,fsym_id,price_id,date,adj_price,unadj_price
0,B19ST9-R,B19ST9-R,2024-10-01,884.48,884.48
1,B19ST9-R,B19ST9-R,2024-10-02,891.07,891.07
2,B19ST9-R,B19ST9-R,2024-10-03,885.55,885.55
3,B19ST9-R,B19ST9-R,2024-10-04,887.16,887.16
4,B19ST9-R,B19ST9-R,2024-10-07,898.4,898.40
...,...,...,...,...,...
104495,XQF8XY-R,XQF8XY-R,2025-07-28,32.49,32.49
104496,XQF8XY-R,XQF8XY-R,2025-07-29,32.87,32.87
104497,XQF8XY-R,XQF8XY-R,2025-07-30,32.53,32.53
104498,XQF8XY-R,XQF8XY-R,2025-07-31,32.6,32.60


In [14]:
# Select only needed columns from prices
df_small = df_prices[["fsym_id", "date", "adj_price", "unadj_price"]]

# Build the final dictionary
companies_dict = {}

# Store companyMapping and prices for each fsym_id
for fsym, grp in df_small.groupby("fsym_id"):
    companies_dict[fsym] = {
        "companyMapping": top500[top500["fsym_id"] == fsym].copy(),
        "prices": grp.drop(columns="fsym_id").reset_index(drop=True)
    }

# Access the DataFrame of prices for the first fsym_id
first_id = fsym_ids[0]
proper_name = companies_dict[first_id]['companyMapping']['proper_name'].iloc[0]
print(f'Prices for company "{proper_name}":')
companies_dict[first_id]["prices"]

Prices for company "NVIDIA Corporation":


Unnamed: 0,date,adj_price,unadj_price
0,2024-10-01,117.0,117.00
1,2024-10-02,118.85,118.85
2,2024-10-03,122.85,122.85
3,2024-10-04,124.92,124.92
4,2024-10-07,127.72,127.72
...,...,...,...
204,2025-07-28,176.75,176.75
205,2025-07-29,175.51,175.51
206,2025-07-30,179.27,179.27
207,2025-07-31,177.87,177.87


In [15]:
config_path = Path.cwd() / "config" / "DL_project_config.yml"
with open(config_path) as f:
    config = yaml.safe_load(f)

In [16]:
# Fundamentals to keep
fundamentals_keep = [
    'eps_basic',
    'eps_dil',
    'assets',
    'shldrs_eq',
    'inven',
    'cash_st',
    'debt',
    'net_debt',
    'net_inc',
    'oper_cf'
]

# Iterate over each fsym_id, fetch fundamentals, filter (drop those used for ratios), then store
for fsym_id in fsym_ids:
    # 1) Get full fundamentals dict for this fsym_id
    fundamentals_raw = utils.get_fundamentals(fsym_id, startDate, endDate, config)

    # 2) Keep only the desired keys
    fundamentals_filtered = {
        key: df.drop(columns=["fsym_id"], errors="ignore").reset_index(drop=True)
        for key, df in fundamentals_raw.items()
        if key in fundamentals_keep
    }
    
    # 3) Ensure entry exists and assign filtered fundamentals
    if fsym_id not in companies_dict:
        companies_dict[fsym_id] = {}

    companies_dict[fsym_id]["fundamentals"] = fundamentals_filtered

# Access the DataFrame of fundamentals for the first fsym_id
first_id = fsym_ids[0]
proper_name = companies_dict[first_id]['companyMapping']['proper_name'].iloc[0]
print(f'Fundamentals for company "{proper_name}":')
companies_dict[first_id]["fundamentals"]

Fundamentals for company "NVIDIA Corporation":


{'eps_basic':         date     af   saf     ltm
 0 2024-10-31    NaN  None  2.5650
 1 2025-01-31  2.968  None  2.9689
 2 2025-04-30    NaN  None  3.1327
 3 2025-07-31    NaN  None  3.5417,
 'eps_dil':         date      af   saf     ltm
 0 2024-10-31     NaN  None  2.5387
 1 2025-01-31  2.9382  None  2.9395
 2 2025-04-30     NaN  None  3.1045
 3 2025-07-31     NaN  None  3.5135,
 'assets':         date        af   saf        qf
 0 2024-10-31       NaN  None   96013.0
 1 2025-01-31  111601.0  None  111601.0
 2 2025-04-30       NaN  None  125254.0
 3 2025-07-31       NaN  None  140740.0,
 'shldrs_eq':         date       af   saf        qf
 0 2024-10-31      NaN  None   65899.0
 1 2025-01-31  79327.0  None   79327.0
 2 2025-04-30      NaN  None   83843.0
 3 2025-07-31      NaN  None  100131.0,
 'inven':         date       af   saf       qf
 0 2024-10-31      NaN  None   7654.0
 1 2025-01-31  10080.0  None  10080.0
 2 2025-04-30      NaN  None  11333.0
 3 2025-07-31      NaN  None  14962.0,

In [17]:
# Iterate over each fsym_id and compute ratios
for fsym_id in fsym_ids:
    df = utils.get_ratios(fsym_id, startDate, endDate, config)

    # Convert index to a column named "date"
    df = df.copy()
    df["date"] = df.index
    df = df.reset_index(drop=True)

    # Put "date" as the first column
    cols = ["date"] + [c for c in df.columns if c != "date"]
    df = df[cols]

    companies_dict[fsym_id]["ratios"] = df

# Access the DataFrame of ratios for the first fsym_id
first_id = fsym_ids[0]
proper_name = companies_dict[first_id]['companyMapping']['proper_name'].iloc[0]
print(f'Ratios for company "{proper_name}":')
companies_dict[first_id]["ratios"]

Ratios for company "NVIDIA Corporation":


Unnamed: 0,date,Dividend_Yield,Net_Margin,Gross_Margin,ROE,ROA,Debt_to_Equity
0,2024-10-31,0.000195,0.550407,0.759768,0.911464,0.621963,0.172206
1,2024-11-29,0.000195,0.550407,0.759768,0.911464,0.621963,0.172206
2,2024-12-31,0.000195,0.550407,0.759768,0.911464,0.621963,0.172206
3,2025-01-31,0.000198,0.556851,0.75861,0.957131,0.656932,0.155162
4,2025-02-28,0.000198,0.556851,0.75861,0.957131,0.656932,0.155162
5,2025-03-31,0.000198,0.556851,0.75861,0.957131,0.656932,0.155162
6,2025-04-30,0.000198,0.55848,0.749895,0.918729,0.653041,0.129464
7,2025-05-30,0.000198,0.55848,0.749895,0.918729,0.653041,0.129464
8,2025-06-30,0.000198,0.55848,0.749895,0.918729,0.653041,0.129464
9,2025-07-31,0.000198,0.516944,0.701081,0.915688,0.612946,0.12267


In [18]:
# Iterate over each fsym_id and retrieve news
for fsym_id in fsym_ids:

    company_data = companies_dict[fsym_id]['companyMapping']
    eod_code = company_data['Code'].iloc[0]

    news = news_dict[eod_code]

    companies_dict[fsym_id]["news"] = news

# Access the DataFrame of news for the first fsym_id
first_id = fsym_ids[0]
proper_name = companies_dict[first_id]['companyMapping']['proper_name'].iloc[0]
print(f'News for company "{proper_name}":')
companies_dict[first_id]["news"]

News for company "NVIDIA Corporation":


Unnamed: 0,id,event_type,event_time,event_data,embedding
0,054d0ade-29f3-4310-bff5-c41b7579e89f,news,2025-03-21 23:37:35,{'body': 'In a recent filing with the Securiti...,
1,37672b2d-8cdc-4c13-9d5d-7fc2fc0d07be,news,2025-03-21 23:29:00,{'body': 'The VanEck Semiconductor ETF holds 2...,
2,18154ff6-128c-4593-a2dd-f627611f114c,news,2025-03-21 22:25:13,"{'body': 'Nvidia Corp’s (NASDAQ:NVDA), the sem...",
3,517a5075-6272-4966-96e0-fd0f97e7cb4d,news,2025-03-21 22:21:28,{'body': 'Stocks on Wall Street shook off a we...,
4,9f833f33-d230-4fd9-a75f-6e225a8579fd,news,2025-03-21 21:53:08,{'body': 'Nvidia Corp. investors used to big a...,
...,...,...,...,...,...
11815,8aaf5001-676e-4244-bd8f-4a119e266b7c,news,2024-10-03 00:00:00,{'body': 'Semiconductors star Nvidia (NASDAQ: ...,"[-0.03708184, -0.026284697, -0.0039431644, 0.0..."
11816,c9dfe309-4467-4ce8-b5f9-4561dbded601,news,2024-10-03 00:00:00,{'body': 'Semiconductors star Nvidia (NASDAQ: ...,"[-0.022499098, -0.038779855, -0.0056624617, 0...."
11817,941ba14e-974e-482d-8331-50b6b8beb035,news,2024-10-03 00:00:00,{'body': 'Semiconductor star Nvidia (NASDAQ: N...,"[-0.004705709, -0.05712639, -0.02384835, 0.033..."
11818,fb47d468-4167-4858-8fb7-3cbb587f06d3,news,2024-10-03 00:00:00,{'body': 'Semiconductors star Nvidia (NASDAQ: ...,"[-0.03708184, -0.026284697, -0.0039431644, 0.0..."


In [20]:
# Iterate over each fsym_id and retrieve earning announcements dates 
for fsym_id in fsym_ids:

    company_data = companies_dict[fsym_id]['companyMapping']
    eod_code = company_data['Code'].iloc[0]
    symbol = eod_code.split(".")[0]
    ticker = yf.Ticker(symbol)

    earnings = ticker.get_earnings_dates(limit=20)

    if earnings is None or earnings.empty:
        print(f"No earnings data for {symbol}")
        continue

    # Keep only earnings announcements
    earnings = earnings[earnings["Event Type"] == "Earnings"]

    # Convert index to a column named "Earnings Date"
    earnings = earnings.copy()
    earnings["Earnings Date"] = earnings.index
    earnings = earnings.reset_index(drop=True)

    # Put "Earnings Date" as the first column
    cols = ["Earnings Date"] + [c for c in earnings.columns if c != "Earnings Date"]
    earnings = earnings[cols]

    companies_dict[fsym_id]["earnings"] = earnings

In [21]:
# Access the DataFrame of earnings for the first fsym_id (done separately to get the output if there are no earnings data for a company in the previous cell)
first_id = fsym_ids[0]
proper_name = companies_dict[first_id]['companyMapping']['proper_name'].iloc[0]
print(f'Earnings for company "{proper_name}":')
companies_dict[first_id]["earnings"]

Earnings for company "NVIDIA Corporation":


Unnamed: 0,Earnings Date,EPS Estimate,Reported EPS,Surprise(%),Event Type
0,2025-05-28 16:20:00-04:00,0.93,0.96,2.84,Earnings
1,2025-02-26 16:20:00-05:00,0.84,0.89,5.5,Earnings
2,2024-11-20 16:20:00-05:00,0.75,0.81,8.58,Earnings
3,2024-08-28 16:20:00-04:00,0.64,0.68,6.03,Earnings
4,2024-05-22 16:22:00-04:00,0.56,0.61,9.48,Earnings
5,2024-02-21 16:22:00-05:00,4.64,5.16,11.32,Earnings
6,2023-11-21 16:20:00-05:00,3.37,4.02,19.21,Earnings
7,2023-08-23 16:20:00-04:00,2.09,2.7,29.41,Earnings
8,2023-05-24 16:20:00-04:00,0.92,1.09,18.57,Earnings
9,2023-02-22 16:50:00-05:00,0.81,0.88,8.75,Earnings


In [22]:
# Save the dictionary to a pickle file

local_path = '/home/azureuser/cloudfiles/code/Users/manuel.noseda/temp'
file_name_root = 'DL_dataset'

# create full file path
file_path = os.path.join(local_path, f"{file_name_root}.pkl")

# save the dictionary
with open(file_path, "wb") as f:
    pickle.dump(companies_dict, f)

print(f"File saved in: {file_path}")


File saved in: /home/azureuser/cloudfiles/code/Users/manuel.noseda/temp/DL_dataset.pkl


In [None]:
'''

DATASET STRUCTURE

companies_dict
│
├── fsym_id_1
│     ├── companyMapping     → DataFrame
│     ├── prices             → DataFrame
│     ├── fundamentals       → dict of DataFrames
│     ├── ratios             → DataFrame
│     ├── news               → DataFrame
│     └── earnings           → DataFrame
│
├── fsym_id_2
│     ├── companyMapping
│     ├── prices
│     ├── fundamentals
│     ├── ratios
│     ├── news
│     └── earnings
│
└── ...

companyMapping : DataFrame
    ├── fsym_id
    ├── proper_name
    ├── price_id
    ├── Code
    └── bbg_ticker

prices : DataFrame
    ├── date          
    ├── adj_price     
    └── unadj_price  

fundamentals: Dictionary
    ├── eps_basic   → DataFrame
    ├── eps_dil     → DataFrame
    ├── assets      → DataFrame
    ├── shldrs_eq   → DataFrame
    ├── inven       → DataFrame
    ├── cash_st     → DataFrame
    ├── debt        → DataFrame
    ├── net_debt    → DataFrame
    ├── net_inc     → DataFrame
    └── oper_cf     → DataFrame

    Each of these DataFrames has the same structure: date | af | qf | saf/ltm

ratios : DataFrame
    ├── date
    ├── Dividend_Yield
    ├── Net_Margin
    ├── Gross_Margin
    ├── ROE
    ├── ROA
    └── Debt_to_Equity

news : DataFrame
    ├── id
    ├── event_type
    ├── event_time 
    ├── msh_ids
    ├── event_data 
    └── tags

earnings : DataFrame
    ├── Earnings Date
    ├── EPS Estimate
    ├── Reported EPS
    ├── Surprise(%)
    └── Event Type

'''