In [1]:
# General
import numpy as np
import pandas as pd
import os

# For loading data and feature engineering
from feature_set_natracker import MergedDataLoader


In [3]:
# Define file locations.
directory = r"C:\Users\victo\git_new\thesis_vri_vp\data"
factor_file = os.path.join(directory, "1estimation_index_returns.csv")
market_file = os.path.join(directory, "1new_market_data.csv")

# List of factors to process.
all_factors = ["iwf", "mtum", "qual", "size", "usmv", "vlue"]

# Dictionary to hold data loaded via MergedDataLoader.
merged_data = {}

for factor_name in all_factors:
    
    # Load data using MergedDataLoader and store in the dictionary.
    data = MergedDataLoader(
        factor_file=factor_file,
        market_file=market_file,
        ver="v2",
        factor_col=factor_name
    ).load(start_date="2002-05-31", end_date="2025-04-03")
    
    merged_data[factor_name] = data

# Load estimation_index_returns.csv normally
normal_data = pd.read_csv(factor_file, parse_dates=["date"]).dropna()
normal_data.set_index("date", inplace=True)

# Example: compare date counts using the first factor's data for the merged results.
first_factor = all_factors[0]
merged_dates = merged_data[first_factor].X.index.unique()
normal_dates = normal_data.index.unique()

print(f"Number of dates in MergedDataLoader result for '{first_factor}': {len(merged_dates)}")
print(f"Number of dates in the normal data: {len(normal_dates)}")

# # After loading your data, inspect the dropped rows but format Timestamps to dates only.
# for factor_name, data in merged_data.items():
#     print(f"\nFactor: {factor_name}")

#     print("\nDropped observations from raw files (per column):")
#     for file_type, col_drops in data.dropped_obs.items():
#         print(f"  File: {file_type}")
#         for col, dates in col_drops.items():
#             dates_only = [d.date() for d in dates]  # Convert each Timestamp to date.
#             print(f"    Column '{col}': {dates_only}")

#     print("\nDropped rows during pipeline processing:")
#     for step, dates in data.dropped_pipeline.items():
#         dates_only = [d.date() for d in dates]
#         print(f"  Step '{step}': {dates_only}")



Number of dates in MergedDataLoader result for 'iwf': 5590
Number of dates in the normal data: 5654


In [3]:
# After loading your data, inspect the dropped rows but format Timestamps to dates only.
for factor_name, data in merged_data.items():
    print(f"\nFactor: {factor_name}")

    print("\nDropped observations from raw files (per column):")
    for file_type, col_drops in data.dropped_obs.items():
        print(f"  File: {file_type}")
        for col, dates in col_drops.items():
            dates_only = [d.date() for d in dates]  # Convert each Timestamp to date.
            print(f"    Column '{col}': {dates_only}")

    print("\nDropped rows during pipeline processing:")
    for step, dates in data.dropped_pipeline.items():
        dates_only = [d.date() for d in dates]
        print(f"  Step '{step}': {dates_only}")



Factor: iwf

Dropped observations from raw files (per column):
  File: factor_file
  File: market_file

Dropped rows during pipeline processing:

Factor: mtum

Dropped observations from raw files (per column):
  File: factor_file
  File: market_file

Dropped rows during pipeline processing:

Factor: qual

Dropped observations from raw files (per column):
  File: factor_file
  File: market_file

Dropped rows during pipeline processing:

Factor: size

Dropped observations from raw files (per column):
  File: factor_file
  File: market_file

Dropped rows during pipeline processing:

Factor: usmv

Dropped observations from raw files (per column):
  File: factor_file
  File: market_file

Dropped rows during pipeline processing:

Factor: vlue

Dropped observations from raw files (per column):
  File: factor_file
  File: market_file

Dropped rows during pipeline processing:


In [None]:
# Number of dates in MergedDataLoader result for 'iwf': 4642
# Number of dates in the normal data: 5752

In [None]:
# Number of dates in MergedDataLoader result for 'iwf': 5570
# Number of dates in the normal data: 5752

In [5]:
# Define file locations.
directory = r"C:\Users\victo\git_new\thesis_vri_vp\data"
factor_file = os.path.join(directory, "1estimation_index_returns.csv")
market_file = os.path.join(directory, "1new_market_data.csv")

# List of factors to process.
all_factors = ["iwf", "mtum", "qual", "size", "usmv", "vlue"]

# Dictionary to hold data loaded via MergedDataLoader.
merged_data = {}

for factor_name in all_factors:
    # Load data using MergedDataLoader and store in the dictionary.
    data = MergedDataLoader(
        factor_file=factor_file,
        market_file=market_file,
        ver="v2",
        factor_col=factor_name
    ).load(start_date="2002-05-31", end_date="2025-04-03")
    merged_data[factor_name] = data

# Load the CSV normally.
normal_data = pd.read_csv(factor_file, parse_dates=["date"]).dropna()
normal_data.set_index("date", inplace=True)

# Pick a factor (use the first factor) to compare the date indices.
first_factor = all_factors[0]
merged_dates = merged_data[first_factor].X.index.unique()
normal_dates = normal_data.index.unique()

print(f"Number of dates in MergedDataLoader result for '{first_factor}': {len(merged_dates)}")
print(f"Number of dates in the normal data: {len(normal_dates)}")

# Convert both merged_dates and normal_dates to sets of date objects.
normal_date_set = {d.date() for d in normal_dates}
merged_date_set = {d.date() for d in merged_dates}

# Compute the non-overlapping dates.
# These are dates present in the normal CSV but missing in the merged result.
dropped_by_merged = normal_date_set - merged_date_set

# These are dates present in the merged result but missing in the normal CSV.
dropped_by_normal = merged_date_set - normal_date_set

print("\nComparison of dropped dates:")
print("Dates present in normal data but dropped by MergedDataLoader:")
print(sorted(dropped_by_merged))
print("\nDates present in MergedDataLoader but missing in normal data:")
print(sorted(dropped_by_normal))


Number of dates in MergedDataLoader result for 'iwf': 5590
Number of dates in the normal data: 5654

Comparison of dropped dates:
Dates present in normal data but dropped by MergedDataLoader:
[datetime.date(2002, 5, 30), datetime.date(2002, 5, 31), datetime.date(2002, 6, 3), datetime.date(2002, 6, 4), datetime.date(2002, 6, 5), datetime.date(2002, 6, 6), datetime.date(2002, 6, 7), datetime.date(2002, 6, 10), datetime.date(2002, 6, 11), datetime.date(2002, 6, 12), datetime.date(2002, 6, 13), datetime.date(2002, 6, 14), datetime.date(2002, 6, 17), datetime.date(2002, 6, 18), datetime.date(2002, 6, 19), datetime.date(2002, 6, 20), datetime.date(2002, 6, 21), datetime.date(2002, 6, 24), datetime.date(2002, 6, 25), datetime.date(2002, 6, 26), datetime.date(2002, 6, 27), datetime.date(2002, 6, 28), datetime.date(2002, 7, 1), datetime.date(2002, 7, 2), datetime.date(2002, 7, 3), datetime.date(2002, 7, 8), datetime.date(2002, 7, 9), datetime.date(2002, 7, 10), datetime.date(2002, 7, 11), datet

In [4]:
# Load merged data for your factors.
merged_data = {}
for factor_name in all_factors:
    data = MergedDataLoader(
        factor_file=factor_file,
        market_file=market_file,
        ver="v2",
        factor_col=factor_name
    ).load(start_date="2002-05-31", end_date="2025-04-03")
    merged_data[factor_name] = data

# Now, after all data are loaded, inspect the April 2024 features for factor 'iwf'.
df_features = merged_data['iwf'].X
april_features = df_features.loc["2024-04"]
print("DownsideDev_log_21 in April 2024 for factor 'iwf':")
print(april_features["DownsideDev_log_21"])


DownsideDev_log_21 in April 2024 for factor 'iwf':


KeyError: 'DownsideDev_log_21'