In [1]:
import os
import sys

# Absolute path of the working directory's parent; the `src` package imported
# by the following cells is expected to live there.
project_dir = os.path.abspath("..")

# Make that directory importable. It is added only when it is not registered
# yet, so re-running the cell leaves sys.path unchanged.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)

In [2]:
import os
import sys

# Parent of the working directory as an absolute path. This repeats the path
# setup of the first cell; once that cell has run, the guard below makes the
# repetition a no-op.
project_dir = os.path.abspath("..")

# Append only when absent so sys.path never collects duplicate entries.
if not any(entry == project_dir for entry in sys.path):
    sys.path.append(project_dir)

import yfinance as yf
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import os
import numpy as np
from typing import Dict
import polars as pl

from src.common.AssetData import AssetData
from src.common.AssetDataPolars import AssetDataPolars
from src.common.AssetDataService import AssetDataService
from src.common.AssetFileInOut import AssetFileInOut 

from src.featureAlchemy.MainFeature import MainFeature
from src.featureAlchemy.FeatureTA import FeatureTA
from src.featureAlchemy.FeatureGroupDynamic import FeatureGroupDynamic
from src.common.DataFrameTimeOperations import DataFrameTimeOperations as DOps

from src.featureAlchemy.FeatureFourierCoeff import FeatureFourierCoeff
from src.featureAlchemy.FeatureCategory import FeatureCategory
from src.featureAlchemy.FeatureFinancialData import FeatureFinancialData
from src.featureAlchemy.FeatureMathematical import FeatureMathematical
from src.featureAlchemy.FeatureSeasonal import FeatureSeasonal
from src.featureAlchemy.FeatureTA import FeatureTA
from src.featureAlchemy.FeatureGroupDynamic import FeatureGroupDynamic


# Feature classes handed to MainFeature below. The listing order is kept
# unchanged in case the resulting column order depends on it.
feature_classes = [
    FeatureCategory,
    FeatureFinancialData,
    FeatureMathematical,
    FeatureSeasonal,
    FeatureTA,
    FeatureGroupDynamic,
    FeatureFourierCoeff,
]


In [3]:
# Load the "group_debug" asset dictionary (ticker -> asset) from the
# stock-group directory.
assets = AssetFileInOut("../src/stockGroups/bin").loadDictFromFile("group_debug")

# Convert every asset to its Polars representation for speed.
# A comprehension is used instead of a `for` loop so the iteration variables
# do not leak into the notebook namespace: the names `ticker` and `asset` are
# bound later with a different meaning, and a stale value left over from this
# conversion would be silent hidden state.
assetspl: Dict[str, AssetDataPolars] = {
    symbol: AssetDataService.to_polars(raw_asset)
    for symbol, raw_asset in assets.items()
}

In [4]:
# Evaluation window: calendar year 2020, expressed as plain `date` objects.
startDate = pd.Timestamp(year=2020, month=1, day=1, tz="UTC").date()
endDate = pd.Timestamp(year=2020, month=12, day=31, tz="UTC").date()

# Asset that the cross-check cells further down inspect in detail.
ticker = "NVDA"
asset = assetspl[ticker]

# Settings forwarded unchanged to MainFeature and to the individual feature
# classes instantiated later in the notebook.
params = dict(
    idxLengthOneMonth=21,
    fouriercutoff=5,
    multFactor=8,
    timesteps=20,
    lagList=[1, 10, 100],
    monthHorizonList=[1, 2, 6, 12],
)

# Feature builder over all loaded assets for the window above.
ffM = MainFeature(
    assetspl,
    feature_classes=feature_classes,
    startDate=startDate,
    endDate=endDate,
    params=params,
)

In [5]:
# Build the features. The call yields three objects that the following cells
# validate: a metadata array with named fields (e.g. "date", "ticker"), the
# 2-D feature matrix, and the list of feature (column) names.
meta_tree, treeFeatures_np, treeFeaturesNames = ffM.get_features()

In [6]:
# --- Structural checks on the MainFeature output (each line should print True) ---
# One feature name per column of the feature matrix.
print(treeFeatures_np.shape[1] == len(treeFeaturesNames))
# The metadata record is expected to carry exactly five fields.
print(len(meta_tree.dtype.names) == 5)
# One metadata row per feature row.
print(meta_tree.shape[0] == treeFeatures_np.shape[0])

# Missing values in the metadata date field.
num_nans = np.count_nonzero(np.isnan(meta_tree["date"]))
print(f"Number of missing (NaN) entries: {num_nans}")

total_elements = treeFeatures_np.size
print(f"Total elements in arr: {total_elements}")

# Missing values in the feature matrix itself.
num_nans = np.count_nonzero(np.isnan(treeFeatures_np))
print(f"Number of missing (NaN) entries: {num_nans}")

# NaN and +/-Inf together.
num_invalid = np.count_nonzero(~np.isfinite(treeFeatures_np))
print(f"Number of non-finite entries (NaN or ±Inf): {num_invalid}")

# (row, column) positions of the offending entries.
# Fix: the Inf positions use np.isinf. The previous ~np.isfinite mask also
# matched NaN, so the "features with Inf" listing repeated NaN-only columns.
nan_pos = np.argwhere(np.isnan(treeFeatures_np))
inf_pos = np.argwhere(np.isinf(treeFeatures_np))

samples_with_nan = np.unique(nan_pos[:, 0])
features_with_nan = np.unique(nan_pos[:, 1])
feature_with_inf = np.unique(inf_pos[:, 1])

for i in features_with_nan:
    print(treeFeaturesNames[i])
print("Ended printing features with NaN")

for i in feature_with_inf:
    print(treeFeaturesNames[i])
print("Ended printing features with Inf")

# Rows must be ordered by non-decreasing date.
dates = meta_tree["date"]
print(np.all(dates[:-1] <= dates[1:]))



True
True
True
Number of missing (NaN) entries: 0
Total elements in arr: 6027674
Number of missing (NaN) entries: 0
Number of non-finite entries (NaN or ±Inf): 0
Ended printing features with NaN
Ended printing features with Inf
True


## Cross-checks: MainFeature output vs. individual feature classes

In [7]:
# Row selectors on the metadata: rows of the inspected ticker, and rows whose
# date lies inside [startDate, endDate]. `dates` and both masks are reused by
# the group-dynamic check that follows.
ticker_filtermask = meta_tree["ticker"] == ticker
dates_filtermask = np.logical_and(
    meta_tree["date"] >= startDate,
    meta_tree["date"] <= endDate,
)
dates = meta_tree["date"][dates_filtermask]

# Reference values: FeatureTA evaluated on its own for the same asset,
# on the distinct dates present in the window.
fTA = FeatureTA(asset, startDate, endDate, params=params)
featureTANames = fTA.getFeatureNames()
featuresTA = fTA.apply(np.unique(dates))

# Matching slice of the MainFeature matrix: selected rows, TA columns only.
common_cols = [name in featureTANames for name in treeFeaturesNames]
filtered_featurenp = treeFeatures_np[ticker_filtermask & dates_filtermask][:, common_cols]

A, B = filtered_featurenp, featuresTA
# NOTE(review): this is the largest absolute deviation divided by the element
# count, which shrinks real mismatches; confirm this is the intended metric.
# 0.0 means both paths agree exactly.
print(np.max(np.abs(A - B)) / np.prod(A.shape))

0.0


In [8]:
# Reference values: FeatureGroupDynamic evaluated on its own over all assets;
# only the inspected ticker's entry of the result is kept. Relies on `dates`,
# `ticker_filtermask` and `dates_filtermask` from the FeatureTA check.
fGD = FeatureGroupDynamic(assetspl, startDate, endDate, params=params)
featureGDNames = fGD.getFeatureNames()
featuresGD = fGD.apply(np.unique(dates))[ticker]

# Matching slice of the MainFeature matrix: selected rows, group-dynamic columns only.
common_cols = [name in featureGDNames for name in treeFeaturesNames]
filtered_featurenp = treeFeatures_np[ticker_filtermask & dates_filtermask][:, common_cols]

A, B = filtered_featurenp, featuresGD
# NOTE(review): this is the largest absolute deviation divided by the element
# count, which shrinks real mismatches; confirm this is the intended metric.
# 0.0 means both paths agree exactly.
print(np.max(np.abs(A - B)) / np.prod(A.shape))

0.0
