In [17]:
import pandas as pd
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler


In [18]:
# Load the outlier-filtered, standardized statement tables plus the aligned P/E
# frame in a single pass; targets and file names are listed in matching order.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading, so
# these files are presumed to come from an earlier, trusted step of this
# project -- confirm before pointing the paths anywhere else.
(
    income_after_outlier_drop_standardized,
    balance_after_outlier_drop_standardized,
    cashflow_after_outlier_drop_standardized,
    pe_aligned,
) = map(
    pd.read_pickle,
    (
        "income_after_outlier_drop_standardized.pkl",
        "balance_after_outlier_drop_standardized.pkl",
        "cashflow_after_outlier_drop_standardized.pkl",
        "pe_aligned.pkl",
    ),
)

# Quick look at the size of the income table as (rows, columns).
print(income_after_outlier_drop_standardized.shape)

(410, 13)


In [19]:
def compute_latent_factors(df, n_factors=3, prefix="LF", random_state=0):
    """
    Extract latent factor scores (LF1, LF2, ...) from the numeric columns of a dataframe.

    The numeric columns of ``df`` are selected, rows containing any missing
    value are dropped, every remaining column is rescaled with
    ``StandardScaler`` (mean 0, unit variance), and a ``FactorAnalysis`` model
    is fitted to the rescaled matrix. Only the *inputs* are standardized; the
    returned scores are exactly what ``FactorAnalysis.fit_transform`` yields
    and are not rescaled afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Source dataframe. Non-numeric columns are ignored.
    n_factors : int, default=3
        Number of latent factors to compute. Must lie between 1 and the
        number of numeric columns in ``df`` (inclusive).
    prefix : str, default="LF"
        Prefix for naming latent factor columns (``f"{prefix}1"``, ...).
    random_state : int or None, default=0
        Seed forwarded to ``FactorAnalysis`` so repeated runs give the same
        scores.

    Returns
    -------
    latent_df : pd.DataFrame
        One column per latent factor, indexed like the rows of ``df`` that
        had no missing numeric values. Rows dropped for missing values are
        absent, so the result may be shorter than ``df``.

    Raises
    ------
    ValueError
        If no numeric column or no complete row is left to fit on, or if
        ``n_factors`` is outside ``[1, number of numeric columns]``.
    """

    # Select numeric columns only and drop rows with missing values.
    numeric_df = df.select_dtypes(include=["number"]).dropna()

    # Fail early with a readable message instead of letting the problem
    # surface later as an opaque shape error inside the estimators.
    n_rows, n_features = numeric_df.shape
    if n_rows == 0 or n_features == 0:
        raise ValueError(
            "compute_latent_factors needs at least one numeric column and one "
            f"row without missing values; got shape {numeric_df.shape} after filtering."
        )
    if not 1 <= n_factors <= n_features:
        raise ValueError(
            "n_factors must be between 1 and the number of numeric columns "
            f"({n_features}); got {n_factors}."
        )

    # Standardize: mean=0, variance=1 per column.
    standardized = StandardScaler().fit_transform(numeric_df)

    # Fit Factor Analysis on the standardized data and get per-row scores.
    fa = FactorAnalysis(n_components=n_factors, random_state=random_state)
    latent_values = fa.fit_transform(standardized)

    # Keep the surviving rows' index so the scores can be joined back to
    # other frames that share it.
    latent_df = pd.DataFrame(
        latent_values,
        columns=[f"{prefix}{i + 1}" for i in range(n_factors)],
        index=numeric_df.index,
    )

    return latent_df


In [20]:
# Run the same two-factor extraction over the income, balance-sheet and
# cash-flow tables; results are unpacked in the order the inputs are listed.
income_latent, balance_latent, cashflow_latent = (
    compute_latent_factors(statement_df, n_factors=2)
    for statement_df in (
        income_after_outlier_drop_standardized,
        balance_after_outlier_drop_standardized,
        cashflow_after_outlier_drop_standardized,
    )
)

In [21]:
# Sanity check on the income factor frame: compute_latent_factors returns one
# row per input row without missing numeric values and one column per factor.
income_latent.shape

(410, 2)