In [14]:
import sys
import os

# Resolve the parent of the current working directory to an absolute path
project_dir = os.path.abspath(os.pardir)

# Put it on the import path, but only if it is not listed there yet
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)

In [15]:
import sys
import os

# Same path setup as the preceding cell: absolute path of the parent directory
project_dir = os.path.abspath(os.pardir)

# The membership test makes this a no-op when the path was already registered
_already_registered = project_dir in sys.path
if not _already_registered:
    sys.path.append(project_dir)

import yfinance as yf
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import os
import numpy as np
from typing import Dict
import polars as pl

from src.common.AssetData import AssetData
from src.common.AssetDataPolars import AssetDataPolars
from src.common.AssetDataService import AssetDataService
from src.common.AssetFileInOut import AssetFileInOut 

from src.featureAlchemy.FeatureMain import FeatureMain
from src.featureAlchemy.FeatureTA import FeatureTA
from src.featureAlchemy.FeatureGroupDynamic import FeatureGroupDynamic
from src.common.DataFrameTimeOperations import DataFrameTimeOperations as DOps


In [None]:
# Load the asset dictionary stored under the name "group_debug"
assets = AssetFileInOut("../src/stockGroups/bin").loadDictFromFile("group_debug")

# Convert to Polars for speedup.
# A comprehension keeps the iteration variables local: the former
# `for ticker, asset in ...` loop overwrote the notebook-level `ticker` and
# `asset` on every (re-)run, leaving `asset` bound to a non-Polars object that
# the later comparison cells would then silently pick up.
assetspl: Dict[str, AssetDataPolars] = {
    symbol: AssetDataService.to_polars(raw_asset)
    for symbol, raw_asset in assets.items()
}

In [None]:
# Single asset that the per-ticker comparison cells below inspect
ticker = "AAPL"
asset = assetspl[ticker]

# Date range handed to the feature classes (plain `datetime.date` objects)
startDate = dt.date(2020, 1, 1)
endDate = dt.date(2020, 12, 31)

# Feature-generation settings shared by FeatureMain and the per-feature classes
params = {
    'idxLengthOneMonth': 21,
    'fouriercutoff': 5,
    'multFactor': 8,
    'timesteps': 20,
    'lagList': [1, 10, 100],
    'monthHorizonList': [1, 2, 6, 12],
}

ffM = FeatureMain(assetspl, startDate, endDate, params=params)

  FeatureMain initialized with 321 assets and 262 dates.


In [18]:
# Build the tree feature set from ffM (the log below shows it iterating over every ticker).
# Unpacks into:
#   meta_tree         - structured array; its "ticker" and "date" fields are read by later cells
#   treeFeatures_np   - 2-D feature matrix, indexed as [sample, feature] further down
#   treeFeaturesNames - feature names; the next cell checks there is one per matrix column
meta_tree, treeFeatures_np, treeFeaturesNames = ffM.getTreeFeatures()

  Processing ticker AAPL (1/321)
  Processing ticker ABT (2/321)
  Processing ticker ACN (3/321)
  Processing ticker ADBE (4/321)
  Processing ticker ADI (5/321)
  Processing ticker ADP (6/321)
  Processing ticker ADSK (7/321)
  Processing ticker AEE (8/321)
  Processing ticker AEP (9/321)
  Processing ticker AES (10/321)
  Processing ticker AIG (11/321)
  Processing ticker AIZ (12/321)
  Processing ticker AJG (13/321)
  Processing ticker AKAM (14/321)
  Processing ticker ALB (15/321)
  Processing ticker ALGN (16/321)
  Processing ticker AMAT (17/321)
  Processing ticker AME (18/321)
  Processing ticker AMGN (19/321)
  Processing ticker AMP (20/321)
  Processing ticker AMT (21/321)
  Processing ticker AMZN (22/321)
  Processing ticker ANSS (23/321)
  Processing ticker AON (24/321)
  Processing ticker AOS (25/321)
  Processing ticker APA (26/321)
  Processing ticker APD (27/321)
  Processing ticker APH (28/321)
  Processing ticker ARE (29/321)
  Processing ticker ATO (30/321)
  Processi

In [19]:
# Sanity checks on the tree feature matrix and its metadata.

# Structural consistency: one name per column, three metadata fields,
# and one metadata row per sample.
print(treeFeatures_np.shape[1] == len(treeFeaturesNames))
print(len(meta_tree.dtype.names) == 3)
print(meta_tree.shape[0] == treeFeatures_np.shape[0])

num_nans = np.count_nonzero(np.isnan(meta_tree["date"]))
print(f"Number of missing (NaN) entries: {num_nans}")

total_elements = treeFeatures_np.size
print(f"Total elements in arr: {total_elements}")

# Build each mask exactly once; the matrix is large, so repeated
# np.isnan / np.isfinite passes are needlessly expensive.
nan_mask = np.isnan(treeFeatures_np)
inf_mask = np.isinf(treeFeatures_np)

num_nans = np.count_nonzero(nan_mask)
print(f"Number of missing (NaN) entries: {num_nans}")

# NaN and +/-Inf are disjoint, so their counts add up to the non-finite count.
num_invalid = num_nans + np.count_nonzero(inf_mask)
print(f"Number of non-finite entries (NaN or ±Inf): {num_invalid}")

nan_pos = np.argwhere(nan_mask)
# Use isinf here, not ~isfinite: the latter also matches NaN, which made the
# "features with Inf" listing below repeat every NaN feature.
inf_pos = np.argwhere(inf_mask)

samples_with_nan = np.unique(nan_pos[:, 0])
features_with_nan = np.unique(nan_pos[:, 1])
feature_with_inf = np.unique(inf_pos[:, 1])

for i in features_with_nan:
    print(treeFeaturesNames[i])
print("Ended printing features with NaN")

for i in feature_with_inf:
    print(treeFeaturesNames[i])
print("Ended printing features with Inf")

# Release the full-size boolean masks; only the position arrays are kept.
del nan_mask, inf_mask



True
True
True
Number of missing (NaN) entries: 0
Total elements in arr: 92664033
Number of missing (NaN) entries: 9614
Number of non-finite entries (NaN or ±Inf): 9614
FinData_quar_surprise_lagquot_qm1
FinData_quar_surprisePercentage_lagquot_qm1
FinData_quar_surprise_lagquot_qm2
FinData_quar_surprisePercentage_lagquot_qm2
FinData_quar_surprise_lagquot_qm3
FinData_quar_surprisePercentage_lagquot_qm3
FinData_quar_surprise_lagquot_qm4
FinData_quar_surprisePercentage_lagquot_qm4
FinData_quar_surprise_lagquot_qm5
FinData_quar_surprisePercentage_lagquot_qm5
FinData_quar_surprise_lagquot_qm6
FinData_quar_surprisePercentage_lagquot_qm6
FinData_quar_surprise_lagquot_qm7
FinData_quar_surprisePercentage_lagquot_qm7
FinData_quar_surprise_lagquot_qm8
FinData_quar_surprisePercentage_lagquot_qm8
FinData_quar_surprise_lagquot_qm9
FinData_quar_surprisePercentage_lagquot_qm9
FinData_quar_surprise_lagquot_qm10
FinData_quar_surprisePercentage_lagquot_qm10
FinData_quar_surprise_lagquot_qm11
FinData_quar_s

In [20]:
# Build the list of business days for the selected asset from its share-price dates.
# NOTE(review): the "+ 1" moves past the last row at or before startDate; if startDate
# is itself a trading day this presumably skips it - confirm against DOps semantics.
asset_start_idx = DOps(asset.shareprice).getNextLowerOrEqualIndex(startDate) + 1
asset_end_idx = DOps(asset.shareprice).getNextLowerOrEqualIndex(endDate)

# slice() takes (offset, length), hence the inclusive row count
_n_rows = asset_end_idx - asset_start_idx + 1
_raw_dates = asset.shareprice["Date"].slice(asset_start_idx, _n_rows).to_numpy()

# Normalise every entry to a plain date object
business_days = np.array(
    [pd.Timestamp(raw, tz="UTC").date() for raw in _raw_dates]
)

## Checks on the tree dataframe

In [None]:
# Recompute the FeatureTA tree features for the selected ticker, one business
# day at a time, and compare them with the matching slice of the full matrix.
fTA = FeatureTA(asset, startDate, endDate, params=params)
featureTANames = fTA.getFeatureNames()
featuresTA = np.zeros((len(business_days), len(featureTANames)), dtype=np.float32)
for i, date in enumerate(business_days):
    featuresTA[i] = fTA.apply(date, 1.0, None)

ticker_df = featuresTA

# Boolean column mask over the full matrix: True where the column is a TA feature.
common_cols = [a in featureTANames for a in treeFeaturesNames]
ticker_filtermask = meta_tree["ticker"] == ticker
# Filter rows first so the column selection only copies this ticker's rows
# instead of every sample in the matrix.
filtered_featurenp = treeFeatures_np[ticker_filtermask, :]
filtered_featurenp = filtered_featurenp[:, common_cols]

A = filtered_featurenp
B = ticker_df
# Without this guard a row/column count mismatch could broadcast silently
# and still print a plausible-looking number.
assert A.shape == B.shape, f"shape mismatch: matrix slice {A.shape} vs recomputed {B.shape}"
print(np.max(np.abs(A-B))/np.prod(A.shape))

0.0


In [None]:
# Recompute the FeatureGroupDynamic tree features and compare the selected
# ticker's values with the matching slice of the full matrix.
fGD = FeatureGroupDynamic(assetspl, startDate, endDate, params=params)
featureGDNames = fGD.getFeatureNames()
featuresGD = np.zeros((len(business_days), len(featureGDNames)), dtype=np.float32)
# apply() is evaluated once per date; each result is then indexed by ticker.
FGD_tick_date = {date: fGD.apply(date, None) for date in business_days}
for i, date in enumerate(business_days):
    featuresGD[i] = FGD_tick_date[date][ticker]

ticker_df = featuresGD

# Boolean column mask over the full matrix: True where the column is a group feature.
common_cols = [a in featureGDNames for a in treeFeaturesNames]
ticker_filtermask = meta_tree["ticker"] == ticker
# Filter rows first so the column selection only copies this ticker's rows
# instead of every sample in the matrix.
filtered_featuredf = treeFeatures_np[ticker_filtermask, :]
filtered_featuredf = filtered_featuredf[:, common_cols]

A = filtered_featuredf
B = ticker_df
# Without this guard a row/column count mismatch could broadcast silently
# and still print a plausible-looking number.
assert A.shape == B.shape, f"shape mismatch: matrix slice {A.shape} vs recomputed {B.shape}"
print(np.max(np.abs(A-B))/np.prod(A.shape))

0.0


## Checks on the time dataframe

In [23]:
# Build the time-series feature set from ffM (the log below shows it iterating over every ticker).
# Unpacks into:
#   meta_time         - structured array; its "ticker" and "date" fields are read by later cells
#   timeFeatures_np   - 3-D tensor, indexed as [sample, timestep, feature] further down
#   timeFeaturesNames - feature names; the next cell checks there is one per last-axis entry
meta_time, timeFeatures_np, timeFeaturesNames = ffM.getTimeFeatures()

  Processing ticker AAPL (1/321)
  Processing ticker ABT (2/321)
  Processing ticker ACN (3/321)
  Processing ticker ADBE (4/321)
  Processing ticker ADI (5/321)
  Processing ticker ADP (6/321)
  Processing ticker ADSK (7/321)
  Processing ticker AEE (8/321)
  Processing ticker AEP (9/321)
  Processing ticker AES (10/321)
  Processing ticker AIG (11/321)
  Processing ticker AIZ (12/321)
  Processing ticker AJG (13/321)
  Processing ticker AKAM (14/321)
  Processing ticker ALB (15/321)
  Processing ticker ALGN (16/321)
  Processing ticker AMAT (17/321)
  Processing ticker AME (18/321)
  Processing ticker AMGN (19/321)
  Processing ticker AMP (20/321)
  Processing ticker AMT (21/321)
  Processing ticker AMZN (22/321)
  Processing ticker ANSS (23/321)
  Processing ticker AON (24/321)
  Processing ticker AOS (25/321)
  Processing ticker APA (26/321)
  Processing ticker APD (27/321)
  Processing ticker APH (28/321)
  Processing ticker ARE (29/321)
  Processing ticker ATO (30/321)
  Processi

In [24]:
# Sanity checks on the time-series feature tensor and its metadata.

# Structural consistency: one name per feature, the configured number of
# timesteps, three metadata fields, and one metadata row per sample.
print(timeFeatures_np.shape[2] == len(timeFeaturesNames))
print(timeFeatures_np.shape[1] == params["timesteps"])
print(len(meta_time.dtype.names) == 3)
print(meta_time.shape[0] == timeFeatures_np.shape[0])

num_nans = np.count_nonzero(np.isnan(meta_time["date"]))
print(f"Number of missing (NaN) entries: {num_nans}")

total_elements = timeFeatures_np.size
print(f"Total elements in arr: {total_elements}")

# One pass over the tensor serves both the count and the positions.
nan_mask = np.isnan(timeFeatures_np)
num_nans = np.count_nonzero(nan_mask)
print(f"Number of missing (NaN) entries: {num_nans}")

nan_pos = np.argwhere(nan_mask)

samples_with_nan = np.unique(nan_pos[:, 0])
features_with_nan = np.unique(nan_pos[:, 2])

# Indices come from the time tensor's feature axis, so they must be looked up
# in timeFeaturesNames (the tree name list was used here by mistake).
for i in features_with_nan:
    print(timeFeaturesNames[i])

# Values outside [0, 1]; NaN compares False on both sides and is not counted here.
outside_mask = (timeFeatures_np < 0) | (timeFeatures_np > 1)
cnt_outside = np.sum(outside_mask)
print(f"Number of entries outside [0, 1]: {cnt_outside}")

# Collapse sample and timestep axes to find the offending features.
mask = outside_mask.any(axis=(0, 1))
for idx in np.flatnonzero(mask):
    print(f"{timeFeaturesNames[idx]}")

# Release the full-size boolean masks.
del nan_mask, outside_mask
    


True
True
True
True
Number of missing (NaN) entries: 0
Total elements in arr: 97455600
Number of missing (NaN) entries: 0
Number of entries outside [0, 1]: 0


In [None]:
# Recompute the FeatureTA time-series features for the selected ticker, one
# business day at a time, and compare them with the matching slice of the tensor.
fTA = FeatureTA(asset, startDate, endDate, params=params)
featureTANames = fTA.getTimeFeatureNames()
featuresTA = np.zeros((len(business_days), params['timesteps'], len(featureTANames)), dtype=np.float32)
for i, date in enumerate(business_days):
    featuresTA[i, :, :] = fTA.apply_timeseries(date, None)

ticker_np = featuresTA

# Boolean mask over the feature axis: True where the feature is a TA feature.
common_cols = [a in featureTANames for a in timeFeaturesNames]
ticker_filtermask = meta_time["ticker"] == ticker
# Filter samples first so the feature selection only copies this ticker's
# rows instead of the whole tensor.
filtered_featuredf = timeFeatures_np[ticker_filtermask, :, :]
filtered_featuredf = filtered_featuredf[:, :, common_cols]

A = filtered_featuredf
B = ticker_np
# Without this guard a shape mismatch could broadcast silently and still
# print a plausible-looking number.
assert A.shape == B.shape, f"shape mismatch: tensor slice {A.shape} vs recomputed {B.shape}"
print(np.max(np.abs(A-B))/np.prod(A.shape))

0.0


In [None]:
# Recompute the FeatureGroupDynamic time-series features and compare the
# selected ticker's values with the matching slice of the tensor.
fGD = FeatureGroupDynamic(assetspl, startDate, endDate, params=params)
featureGDNames = fGD.getTimeFeatureNames()
featuresGD = np.zeros((len(business_days), params['timesteps'], len(featureGDNames)), dtype=np.float32)
# apply_timeseries() is evaluated once per date; each result is then indexed by ticker.
FGD_tick_date = {date: fGD.apply_timeseries(date, None) for date in business_days}
for i, date in enumerate(business_days):
    featuresGD[i] = FGD_tick_date[date][ticker]

ticker_np = featuresGD

# Boolean mask over the feature axis: True where the feature is a group feature.
common_cols = [a in featureGDNames for a in timeFeaturesNames]
ticker_filtermask = meta_time["ticker"] == ticker
# Filter samples first so the feature selection only copies this ticker's
# rows instead of the whole tensor.
filtered_featuredf = timeFeatures_np[ticker_filtermask, :, :]
filtered_featuredf = filtered_featuredf[:, :, common_cols]

A = filtered_featuredf
B = ticker_np
# Without this guard a shape mismatch could broadcast silently and still
# print a plausible-looking number.
assert A.shape == B.shape, f"shape mismatch: tensor slice {A.shape} vs recomputed {B.shape}"
print(np.max(np.abs(A-B))/np.prod(A.shape))

0.0
