In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Input CSV files for the two datasets analysed below. The paths are relative,
# so they are resolved against the kernel's current working directory.
IN_A = Path("input_pca_set_A_.csv")
IN_B = Path("input_pca_set_B_.csv")

In [2]:
def load_numeric_df(path: Path) -> tuple[pd.DataFrame, list[str]]:
    """
    Read a CSV file and return only its usable numeric rows and columns.

    Non-numeric columns are discarded. Any row that has a missing (NaN) or
    infinite value in one of the remaining numeric columns is dropped, so the
    returned frame contains finite values only. Surviving rows keep their
    original index labels from the CSV.

    Parameters
    ----------
    path : Path
        Location of the CSV file to read.

    Returns
    -------
    tuple[pd.DataFrame, list[str]]
        The cleaned numeric frame and the list of its column names, in column
        order.
    """
    df = pd.read_csv(path)
    num = df.select_dtypes(include=[np.number])
    # dropna() alone lets +/-inf through, and the PCA step downstream rejects
    # non-finite input. Treat infinities as missing so those rows are dropped
    # together with the NaN rows. replace() returns a new frame, so no
    # explicit copy of the column selection is needed.
    num = num.replace([np.inf, -np.inf], np.nan)
    num = num.dropna(axis=0, how="any")
    return num, list(num.columns)

In [3]:
def run_pca_sklearn(X: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Fit a PCA with the maximum number of components and return its results.

    The number of components is ``min(n_samples, n_features)`` and the exact
    ("full") SVD solver is used.

    Parameters
    ----------
    X : np.ndarray
        2-D data matrix, one row per sample and one column per feature.

    Returns
    -------
    tuple of four np.ndarray
        ``(scores, loadings, explained_variance, explained_variance_ratio)``
        where ``scores`` is sample x component, ``loadings`` is
        feature x component (the transpose of ``components_``), and the two
        variance arrays have one entry per component.
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    model = PCA(n_components=min(n_rows, n_cols), svd_solver="full")
    projected = model.fit_transform(X)  # U * S, one row per sample
    return (
        projected,
        model.components_.T,  # transpose so rows are features
        model.explained_variance_,
        model.explained_variance_ratio_,
    )

In [4]:
def save_pca_outputs(prefix: str, numeric_df: pd.DataFrame, feature_names: list[str]) -> None:
    """
    Run PCA on ``numeric_df`` and write the results to three CSV files.

    Files written (each name starts with ``prefix``):

    * ``<prefix>_principal_components.csv`` - PC scores, indexed like
      ``numeric_df`` (index column labelled ``row_index``).
    * ``<prefix>_pca_loadings.csv`` - loadings, one row per entry of
      ``feature_names`` (index column labelled ``feature``).
    * ``<prefix>_explained_variance.csv`` - explained variance and its ratio,
      one row per component (index column labelled ``component``).

    A "Wrote: ..." line is printed for each file once all three are written.
    """
    scores, loadings, ev, evr = run_pca_sklearn(numeric_df.values)

    # Column labels PC1..PCk, shared by all three tables.
    n_pcs = scores.shape[1]
    pc_cols = [f"PC{i}" for i in range(1, n_pcs + 1)]

    # Each table is built and written before the next one is constructed.

    # Scores keep the row index of the input frame.
    pd.DataFrame(scores, index=numeric_df.index, columns=pc_cols).to_csv(
        f"{prefix}_principal_components.csv", index_label="row_index"
    )

    # Loadings: one row per feature, one column per component.
    pd.DataFrame(loadings, index=feature_names, columns=pc_cols).to_csv(
        f"{prefix}_pca_loadings.csv", index_label="feature"
    )

    # Explained variance and ratio, one row per component.
    variance_columns = {"explained_variance": ev, "explained_variance_ratio": evr}
    pd.DataFrame(variance_columns, index=pc_cols).to_csv(
        f"{prefix}_explained_variance.csv", index_label="component"
    )

    print(f"Wrote: {prefix}_principal_components.csv")
    print(f"Wrote: {prefix}_pca_loadings.csv")
    print(f"Wrote: {prefix}_explained_variance.csv")

In [5]:
# Load both inputs first, so a problem reading either file surfaces before
# any output file is written.
dfA, featsA = load_numeric_df(IN_A)
dfB, featsB = load_numeric_df(IN_B)

# Run PCA on each set independently and write its three CSV outputs.
save_pca_outputs("set_A", dfA, featsA)
save_pca_outputs("set_B", dfB, featsB)

Wrote: set_A_principal_components.csv
Wrote: set_A_pca_loadings.csv
Wrote: set_A_explained_variance.csv
Wrote: set_B_principal_components.csv
Wrote: set_B_pca_loadings.csv
Wrote: set_B_explained_variance.csv
