In [None]:
import pandas as pd
import numpy as np

# Cleaning script: keep only the rows with date_id >= start_date_id, impute
# gaps in the feature columns (forward-fill, then back-fill), and write the
# result to "train_cleaned.csv".

# Define the date_id threshold: rows with a smaller date_id are discarded.
start_date_id = 1006

# Columns that are never imputed: the row key plus the 3 target/label columns.
non_feature_columns = {
    'date_id',
    'forward_returns',
    'risk_free_rate',
    'market_forward_excess_returns',
}

try:
    # Load the original training data
    train_df = pd.read_csv("train.csv")
    print(f"Original dataframe shape: {train_df.shape}")

    # --- 1. Filter the DataFrame ---
    # Keep rows at or after the threshold. The explicit copy makes the later
    # column assignments operate on an independent frame, not a view.
    usable_df = train_df[train_df['date_id'] >= start_date_id].copy()

    # Forward-fill propagates the value of the *previous row*, so the rows
    # must be in chronological order for "previous" to mean "earlier date".
    # A stable sort leaves an already-ordered file unchanged and preserves the
    # relative order of rows that share a date_id.
    usable_df = usable_df.sort_values('date_id', kind='stable')
    print(f"Filtered dataframe shape (date_id >= {start_date_id}): {usable_df.shape}")

    # --- 2. Apply Forward-Fill (ffill) ---
    # Every column that is not a key/target column is treated as a feature.
    feature_columns = [col for col in usable_df.columns if col not in non_feature_columns]

    # Carry the last known value forward; whatever is still missing at the
    # very start of the window (no earlier value exists) is then back-filled
    # from the next available value.
    # NOTE(review): the back-fill copies *future* observations into earlier
    # rows. Confirm this look-ahead is acceptable for the downstream model.
    usable_df[feature_columns] = usable_df[feature_columns].ffill().bfill()

    print("\n--- Cleaned DataFrame Info ---")
    usable_df.info()

    # Verify the imputation *before* writing the file, so that the success
    # message below is not printed ahead of the check.
    remaining_nans = usable_df[feature_columns].isnull().sum().sum()
    print(f"Total remaining NaNs in feature columns: {remaining_nans}")
    if remaining_nans > 0:
        # Neither fill can repair a column that is empty in the whole window.
        print("Warning: some feature columns still contain NaNs after filling.")

    # --- 3. Save the Cleaned Data ---
    usable_df.to_csv("train_cleaned.csv", index=False)
    print("\nSuccessfully created 'train_cleaned.csv'")

except FileNotFoundError:
    print("Error: train.csv not found. Make sure it's in the same directory.")
except Exception as e:
    # Top-level boundary of the notebook cell: report the problem instead of
    # aborting the cell with a traceback.
    print(f"An error occurred: {e}")