In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the four pickled inputs in one pass; the i-th file name below is bound
# to the i-th target name.
# NOTE(review): unpickling can execute arbitrary code, so these files are
# assumed to be trusted, locally produced artifacts — confirm.
(
    income_after_outlier_drop_standardized,
    balance_after_outlier_drop_standardized,
    cashflow_after_outlier_drop_standardized,
    pe_aligned,
) = map(
    pd.read_pickle,
    (
        "income_after_outlier_drop_standardized.pkl",
        "balance_after_outlier_drop_standardized.pkl",
        "cashflow_after_outlier_drop_standardized.pkl",
        "pe_aligned.pkl",
    ),
)

# Quick sanity check: (rows, columns) of the income table.
print(income_after_outlier_drop_standardized.shape)

(410, 13)


In [3]:
def compute_latent_factors_with_components_no_standardize(
    df: pd.DataFrame,
    n_factors: int = 3,
    dropna: bool = True,
    check_standardized: bool = True,
    mean_tol: float = 1e-2,
    std_tol: float = 1e-2,
):
    """
    Fit a factor-analysis model on the numeric columns of ``df`` as they are
    (no rescaling) and split the data into a factor-driven ("macro") part and
    a residual ("idiosyncratic") part.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing numeric features (and possibly id cols).
        Only columns selected by ``select_dtypes(include=["number"])`` are used.
    n_factors : int
        Number of latent factors. Must not be negative and must not exceed the
        number of numeric columns.
    dropna : bool
        If True, drop rows with any NaNs in numeric columns.
    check_standardized : bool
        If True, prints warnings if data doesn't look standardized (mean~0, std~1).
    mean_tol, std_tol : float
        Tolerances for the standardized check.

    Returns
    -------
    F_df : pd.DataFrame
        Latent factor scores (n x k), columns ``LF1`` .. ``LFk``.
    Lambda : np.ndarray
        Loadings matrix (p x k).
    macro_total_df : pd.DataFrame
        Total macro component in original feature space (n x p).
    macro_by_factor_df : dict[str, pd.DataFrame]
        Each factor's macro component in original feature space, keyed
        ``LF1_macro`` .. ``LFk_macro``.
    idio_df : pd.DataFrame
        Idiosyncratic component (X - macro_total).

    Raises
    ------
    ValueError
        If ``df`` has no numeric columns, if no rows are left to fit on, if
        ``n_factors`` is negative or larger than the number of numeric
        columns, or if the fitted model yields a different number of factors
        than requested.
    """
    numeric_df = df.select_dtypes(include=["number"])
    if dropna:
        numeric_df = numeric_df.dropna()

    # Fail early with a readable message instead of letting an empty matrix or
    # an impossible factor count surface later as an obscure shape/index error.
    n_rows, n_features = numeric_df.shape
    if n_features == 0:
        raise ValueError("df has no numeric columns to run factor analysis on.")
    if n_rows == 0:
        raise ValueError(
            "No rows left to run factor analysis on"
            + (" after dropping rows with NaNs." if dropna else ".")
        )
    if n_factors < 0 or n_factors > n_features:
        raise ValueError(
            f"n_factors must be between 0 and the number of numeric columns "
            f"({n_features}); got {n_factors}."
        )

    # Convert to numpy array without scaling
    X = numeric_df.to_numpy(dtype=float)

    if check_standardized:
        # X is guaranteed non-empty here, so the reductions below are defined.
        col_means = np.nanmean(X, axis=0)
        col_stds = np.nanstd(X, axis=0, ddof=0)

        max_abs_mean = float(np.max(np.abs(col_means)))
        max_abs_std1 = float(np.max(np.abs(col_stds - 1.0)))

        if max_abs_mean > mean_tol or max_abs_std1 > std_tol:
            print(
                f"[warn] Data may NOT be standardized: "
                f"max|mean|={max_abs_mean:.4g}, max|std-1|={max_abs_std1:.4g}. "
                f"(If you intended correlation-based FA, standardize upstream once.)"
            )

    # Factor Analysis on X as-is (fixed random_state for reproducible fits)
    fa = FactorAnalysis(n_components=n_factors, random_state=0)
    F = fa.fit_transform(X)          # (n x k)
    Lambda = fa.components_.T        # (p x k)

    # The labels and the per-factor loop below assume exactly n_factors columns.
    if F.shape[1] != n_factors:
        raise ValueError(
            f"Factor analysis produced {F.shape[1]} factor(s) but {n_factors} "
            f"were requested; reduce n_factors or provide more rows."
        )

    # Macro components.
    # NOTE: FactorAnalysis removes the per-feature means (fa.mean_) before
    # fitting, so F @ Lambda.T describes the centred data. For input that is
    # not mean-zero, those means therefore remain in the idiosyncratic part.
    X_macro_total = F @ Lambda.T     # (n x p)

    macro_by_factor_df = {}
    for j in range(n_factors):
        Fj = F[:, [j]]               # (n x 1)
        Lj = Lambda[:, [j]]          # (p x 1)
        X_macro_j = Fj @ Lj.T        # (n x p)
        macro_by_factor_df[f"LF{j+1}_macro"] = pd.DataFrame(
            X_macro_j, index=numeric_df.index, columns=numeric_df.columns
        )

    # Idiosyncratic component in same space as X
    X_idio = X - X_macro_total

    # Outputs as DataFrames, aligned to the rows/columns actually used
    F_df = pd.DataFrame(
        F, index=numeric_df.index, columns=[f"LF{i+1}" for i in range(n_factors)]
    )
    macro_total_df = pd.DataFrame(
        X_macro_total, index=numeric_df.index, columns=numeric_df.columns
    )
    idio_df = pd.DataFrame(
        X_idio, index=numeric_df.index, columns=numeric_df.columns
    )

    return F_df, Lambda, macro_total_df, macro_by_factor_df, idio_df


In [5]:
# Two-factor decomposition of each statement table. Every result is the
# 5-tuple returned by the helper:
# (F_df, Lambda, macro_total_df, macro_by_factor_df, idio_df).
income_latent = compute_latent_factors_with_components_no_standardize(
    df=income_after_outlier_drop_standardized,
    n_factors=2,
)
balance_latent = compute_latent_factors_with_components_no_standardize(
    df=balance_after_outlier_drop_standardized,
    n_factors=2,
)
cashflow_latent = compute_latent_factors_with_components_no_standardize(
    df=cashflow_after_outlier_drop_standardized,
    n_factors=2,
)