In [1]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)

import yfinance as yf
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import os
import numpy as np
from typing import Dict
import polars as pl

from src.common.AssetData import AssetData
from src.common.AssetDataPolars import AssetDataPolars
from src.common.AssetDataService import AssetDataService
from src.common.AssetFileInOut import AssetFileInOut 

from src.featureAlchemy.FeatureMain import FeatureMain
from src.featureAlchemy.FeatureTA import FeatureTA
from src.featureAlchemy.FeatureGroupDynamic import FeatureGroupDynamic
from src.common.DataFrameTimeOperations import DataFrameTimeOperationsPolars as DPl


In [3]:
assets=AssetFileInOut("../src/stockGroups/bin").loadDictFromFile("group_debug")

# Convert to Polars for speedup
assetspl: Dict[str, AssetDataPolars] = {}
for ticker, asset in assets.items():
    assetspl[ticker] = AssetDataService.to_polars(asset)

In [4]:
ticker = "NVDA"
asset = assetspl[ticker]
startDate = pd.Timestamp(2020, 1, 1, tz = "UTC")
endDate = pd.Timestamp(2020, 12, 31, tz = "UTC")
params = {
        'idxLengthOneMonth': 21,
        'fouriercutoff': 5,
        'multFactor': 8,
        'monthsHorizon': 13,
        'timesteps': 20,
    }
ffM = FeatureMain(
    assetspl,
    startDate, 
    endDate, 
    lagList = [1, 10, 100],
    monthHorizonList = [1,2,6,12],
    params = params
)

FeatureMain initialized with 34 assets and 253 dates.


In [5]:
feature_df = ffM.getTreeFeatures()

  Processing ticker AAPL (1/34)
Processing date 2020-01-02 00:00:00+00:00 (1/253)
Processing date 2020-01-03 00:00:00+00:00 (2/253)
Processing date 2020-01-06 00:00:00+00:00 (3/253)
Processing date 2020-01-07 00:00:00+00:00 (4/253)
Processing date 2020-01-08 00:00:00+00:00 (5/253)
Processing date 2020-01-09 00:00:00+00:00 (6/253)
Processing date 2020-01-10 00:00:00+00:00 (7/253)
Processing date 2020-01-13 00:00:00+00:00 (8/253)
Processing date 2020-01-14 00:00:00+00:00 (9/253)
Processing date 2020-01-15 00:00:00+00:00 (10/253)
Processing date 2020-01-16 00:00:00+00:00 (11/253)
Processing date 2020-01-17 00:00:00+00:00 (12/253)
Processing date 2020-01-21 00:00:00+00:00 (13/253)
Processing date 2020-01-22 00:00:00+00:00 (14/253)
Processing date 2020-01-23 00:00:00+00:00 (15/253)
Processing date 2020-01-24 00:00:00+00:00 (16/253)
Processing date 2020-01-27 00:00:00+00:00 (17/253)
Processing date 2020-01-28 00:00:00+00:00 (18/253)
Processing date 2020-01-29 00:00:00+00:00 (19/253)
Processi

In [6]:
asset_start_idx = DPl(asset.shareprice).getNextLowerOrEqualIndex(startDate)+1
asset_end_idx = DPl(asset.shareprice).getNextLowerOrEqualIndex(endDate)
business_days = asset.shareprice["Date"].slice(asset_start_idx, asset_end_idx - asset_start_idx + 1).to_numpy()  
business_days = np.array([pd.Timestamp(x, tz ="UTC") for x in business_days])

## Checks on the dataframe

In [7]:
fTA = FeatureTA(asset, startDate, endDate, [1, 10, 100], params=params)
featureTANames = fTA.getFeatureNames()
featuresTA = np.zeros((len(business_days), len(featureTANames)), dtype = np.float32)
for i, date in enumerate(business_days):
    featuresTA[i] = fTA.apply(date, 1.0, None)

ticker_df = pl.DataFrame(featuresTA, schema = featureTANames)

common_cols = [c for c in ticker_df.columns if c in feature_df.columns]
filtered_featuredf = feature_df.filter(pl.col("ticker") == ticker).select(common_cols)

A = filtered_featuredf.to_numpy().astype(np.float64)
B = ticker_df.to_numpy().astype(np.float64)
print(np.max(np.abs(A-B))/np.prod(A.shape))

0.0001388998592601103


In [8]:
fGD = FeatureGroupDynamic(assetspl, startDate, endDate, lagList=[1, 10, 100], monthHorizonList=[1,2,6,12], params=params)
featureGDNames = fGD.getFeatureNames()
featuresGD = np.zeros((len(business_days), len(featureGDNames)), dtype = np.float32)
FGD_tick_date = {date: fGD.apply(date, None) for date in (business_days)}
for i, date in enumerate(business_days):
    featuresGD[i] = FGD_tick_date[date][ticker]
    
ticker_df = pl.DataFrame(featuresGD, schema = featureGDNames)
common_cols = [c for c in ticker_df.columns if c in feature_df.columns]
filtered_featuredf = feature_df.filter(pl.col("ticker") == ticker).select(common_cols)

A = filtered_featuredf.to_numpy().astype(np.float64)
B = ticker_df.to_numpy().astype(np.float64)
print(np.max(np.abs(A-B))/np.prod(A.shape))

8.803775528307594e-07


## Checks in the time dataframe

In [9]:
meta_time, timeFeatures_np, timeFeaturesNames = ffM.getTimeFeatures()

  Processing ticker AAPL (1/34)
Processing date 2020-01-02 00:00:00+00:00 (1/253)
Processing date 2020-01-03 00:00:00+00:00 (2/253)
Processing date 2020-01-06 00:00:00+00:00 (3/253)
Processing date 2020-01-07 00:00:00+00:00 (4/253)
Processing date 2020-01-08 00:00:00+00:00 (5/253)
Processing date 2020-01-09 00:00:00+00:00 (6/253)
Processing date 2020-01-10 00:00:00+00:00 (7/253)
Processing date 2020-01-13 00:00:00+00:00 (8/253)
Processing date 2020-01-14 00:00:00+00:00 (9/253)
Processing date 2020-01-15 00:00:00+00:00 (10/253)
Processing date 2020-01-16 00:00:00+00:00 (11/253)
Processing date 2020-01-17 00:00:00+00:00 (12/253)
Processing date 2020-01-21 00:00:00+00:00 (13/253)
Processing date 2020-01-22 00:00:00+00:00 (14/253)
Processing date 2020-01-23 00:00:00+00:00 (15/253)
Processing date 2020-01-24 00:00:00+00:00 (16/253)
Processing date 2020-01-27 00:00:00+00:00 (17/253)
Processing date 2020-01-28 00:00:00+00:00 (18/253)
Processing date 2020-01-29 00:00:00+00:00 (19/253)
Processi

In [14]:
print(timeFeatures_np.shape[2] == len(timeFeaturesNames))
print(timeFeatures_np.shape[1] == params["timesteps"])
print(meta_time.shape[1] == 3)
print(meta_time.shape[0] == timeFeatures_np.shape[0])

True
True
True
True


In [12]:
fTA = FeatureTA(asset, startDate, endDate, [1, 10, 100], params=params)
featureTANames = fTA.getTimeFeatureNames()
featuresTA = np.zeros((len(business_days), params['timesteps'], len(featureTANames)), dtype = np.float32)
for i, date in enumerate(business_days):
    featuresTA[i,:,:] = fTA.apply_timeseries(date, None)

ticker_np = featuresTA

common_cols = [a in featureTANames for a in timeFeaturesNames] 
ticker_filtermask = meta_time[:,1] == ticker
filtered_featuredf = timeFeatures_np[:, :, common_cols]
filtered_featuredf = filtered_featuredf[ticker_filtermask, :, :]

A = filtered_featuredf
B = ticker_np
print(np.max(np.abs(A-B))/np.prod(A.shape))

4.6575295899301315e-08


In [16]:
fGD = FeatureGroupDynamic(assetspl, startDate, endDate, lagList=[1, 10, 100], monthHorizonList=[1,2,6,12], params=params)
featureGDNames = fGD.getTimeFeatureNames()
featuresGD = np.zeros((len(business_days), params['timesteps'], len(featureGDNames)), dtype = np.float32)
FGD_tick_date = {date: fGD.apply_timeseries(date, None) for date in (business_days)}
for i, date in enumerate(business_days):
    featuresGD[i] = FGD_tick_date[date][ticker]
    
ticker_np = featuresGD

common_cols = [a in featureGDNames for a in timeFeaturesNames] 
ticker_filtermask = meta_time[:,1] == ticker
filtered_featuredf = timeFeatures_np[:, :, common_cols]
filtered_featuredf = filtered_featuredf[ticker_filtermask, :, :]

A = filtered_featuredf
B = ticker_np
print(np.max(np.abs(A-B))/np.prod(A.shape))

0.0
