In [8]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# --- Input ---------------------------------------------------------------
# CSV read by the loading cell; every column is used as a metric.
INPUT_FILE = "input_pca_set_A_.csv"

# --- Output directory ----------------------------------------------------
# Created eagerly (including parents) so later cells can save into it.
OUT_DIR = Path("pca_set_A")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Output artefacts (all written inside OUT_DIR) -----------------------
SCREE_PNG = OUT_DIR.joinpath("scree.png")
LOADINGS_PNG = OUT_DIR.joinpath("pca_components_coefficients_plot.png")
BIPLOT_PNG = OUT_DIR.joinpath("pca_biplot_pc1_pc2.png")
CONTRIB_CSV = OUT_DIR.joinpath("pca_contributions_percent.csv")
CONTRIB_MD = OUT_DIR.joinpath("pca_contributions_percent.md")

In [9]:
# Load the metrics table and fit the PCA used by every later cell.

# Fail early with a clear message if the input CSV is missing.
if not Path(INPUT_FILE).exists():
    raise FileNotFoundError(f"Could not find {INPUT_FILE}")

# Every CSV column is treated as a metric; the float conversion raises if
# any column cannot be interpreted as numeric.
df = pd.read_csv(INPUT_FILE)
metrics = list(df.columns)
X = df.to_numpy(dtype=float)

# An empty table would only produce NaN means and an opaque PCA error.
if X.size == 0:
    raise ValueError(f"{INPUT_FILE} contains no data to decompose")

# Centre each column on its mean; no variance scaling is applied.
X = X - X.mean(axis=0, keepdims=True)

# Up to 10 PCs, but never more than min(n_samples, n_features): the full SVD
# solver rejects a larger request, which previously broke on short tables.
n_components = min(10, *X.shape)
pca = PCA(n_components=n_components, svd_solver="full", random_state=0)
scores = pca.fit_transform(X)                   # projected samples
evr = pca.explained_variance_ratio_             # length = n_components
cum_evr = np.cumsum(evr)
components = pca.components_                    # shape (n_components, n_features)
loadings = components.T                         # shape (n_features, n_components)

In [10]:
# Scree plot: per-component and cumulative explained-variance ratios,
# saved to SCREE_PNG and closed without being displayed.
xs = np.arange(1, n_components + 1)

fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(xs, evr, marker="o", label="Explained variance ratio")
ax.plot(xs, cum_evr, marker="s", label="Cumulative variance")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Ratio")
ax.set_title("Scree Plot & Cumulative Variance")
ax.set_xticks(xs)  # one tick per component
ax.legend()
fig.tight_layout()
fig.savefig(SCREE_PNG, dpi=300)
plt.close(fig)

In [11]:
# Grouped bar chart of the component coefficients: one group per metric,
# one bar per principal component. Saved to LOADINGS_PNG.
fig, ax = plt.subplots(figsize=(14, 6))
ind = np.arange(len(metrics))
width = 0.8 / n_components  # all PCs share 80% of each metric slot

# Each row of loadings.T holds one component's coefficients across metrics.
for j, pc_coefficients in enumerate(loadings.T):
    bar_positions = ind + j * width
    ax.bar(bar_positions, pc_coefficients, width, label=f"PC{j+1}")

ax.axhline(0, linewidth=1)

# Centre each tick under its group of bars.
group_centres = ind + width * (n_components - 1) / 2
tick_labels = [m.replace("_", " ") for m in metrics]
ax.set_xticks(group_centres)
ax.set_xticklabels(tick_labels, rotation=45, ha="right")

ax.set_ylabel("Coefficient value")
ax.set_title("PCA Component Coefficients (loadings directions)")
ax.legend(title="Principal Components", ncols=min(n_components, 5))
fig.tight_layout()
fig.savefig(LOADINGS_PNG, dpi=300)
plt.close(fig)

In [12]:
# Biplot: samples projected on PC1/PC2 with one arrow per original metric.
# Saved to BIPLOT_PNG.

# The plot indexes the second component throughout, so fail with a clear
# message (before creating a figure) when fewer than two PCs are available.
if scores.shape[1] < 2:
    raise ValueError("A PC1 vs PC2 biplot needs at least two principal components")

fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(scores[:, 0], scores[:, 1], marker="x", alpha=0.7, label="Samples")

# Weight each metric's PC1/PC2 coefficients by the component standard deviation.
scaled_loadings = loadings[:, :2] * np.sqrt(pca.explained_variance_[:2])

# Stretch the arrows so the longest one spans 85% of the larger score range;
# the fallback divisor avoids a division by zero when every arrow is null.
score_range = max(np.ptp(scores[:, 0]), np.ptp(scores[:, 1]))
arrow_max = np.max(np.linalg.norm(scaled_loadings, axis=1))
scale = 0.85 * score_range / (arrow_max if arrow_max > 0 else 1.0)

# Arrowhead width is expressed in data units, so tie it to the plotted range:
# a fixed value is invisible for large-valued scores and oversized for small ones.
head_width = 0.015 * score_range

for i, (lx, ly) in enumerate(scaled_loadings):
    ax.arrow(0, 0, lx * scale, ly * scale, width=0.0, head_width=head_width,
             length_includes_head=True, alpha=0.9, color="red")
    # Place the label slightly beyond the arrow tip.
    ax.text(lx * scale * 1.05, ly * scale * 1.05, metrics[i].replace("_", " "),
            color="red", ha="center", va="center")

ax.axhline(0, color="gray", linewidth=1)
ax.axvline(0, color="gray", linewidth=1)
ax.set_xlabel(f"PC1 ({evr[0]*100:.2f}%)")
ax.set_ylabel(f"PC2 ({evr[1]*100:.2f}%)")
ax.set_title("PCA Biplot (PC1 vs PC2)")
ax.legend(loc="upper right")
fig.tight_layout()
fig.savefig(BIPLOT_PNG, dpi=300)
plt.close(fig)

In [13]:
# Percentage contribution of each metric to each PC: the squared coefficient
# divided by the sum of squared coefficients of that component.
sq_loadings = np.square(loadings)                           # (features x components)
col_sums = sq_loadings.sum(axis=0, keepdims=True)           # (1 x components)
safe_col_sums = np.where(col_sums == 0, 1, col_sums)        # avoid 0/0 on a null component
contrib = sq_loadings / safe_col_sums * 100.0

# Report at most the first six components.
k = min(6, contrib.shape[1])
contrib_k = contrib[:, :k]

pc_names = [f"PC{i+1}" for i in range(k)]
contrib_df = pd.DataFrame(contrib_k, index=metrics, columns=pc_names).round(2)

contrib_df.to_csv(CONTRIB_CSV, index_label="Metric")

# Build the markdown table in memory: header, separator, then one row per metric.
md_lines = [
    "| Metric | " + " | ".join(contrib_df.columns) + " |\n",
    "|" + " --- |" * (len(contrib_df.columns) + 1) + "\n",
]
for metric, row in contrib_df.iterrows():
    escaped_name = metric.replace("_", "\\_")  # keep underscores literal in markdown
    cells = " | ".join(f"{val:.2f}" for val in row.values)
    md_lines.append(f"| **{escaped_name}** | {cells} |\n")

with open(CONTRIB_MD, "w", encoding="utf-8") as f:
    f.writelines(md_lines)

# Sanity check on the rounded table: each column should total roughly 100.
print("Column sums (%):", np.round(contrib_df.sum(axis=0).values, 2))

print("\nSaved outputs in:", OUT_DIR.resolve())
for p in (SCREE_PNG, LOADINGS_PNG, BIPLOT_PNG, CONTRIB_CSV, CONTRIB_MD):
    print(" -", p)

Column sums (%): [100.   100.   100.01  99.99 100.   100.01]

Saved outputs in: /home/max/gits/pca-notebook/pca_set_A
 - pca_set_A/scree.png
 - pca_set_A/pca_components_coefficients_plot.png
 - pca_set_A/pca_biplot_pc1_pc2.png
 - pca_set_A/pca_contributions_percent.csv
 - pca_set_A/pca_contributions_percent.md
