In [None]:
# -------------------------------
# STEP: PCA (Dimensionality Reduction)
# -------------------------------
from sklearn.decomposition import PCA

# Keep only numeric columns for PCA, but exclude the target column: when
# "Depression" is numeric it would otherwise be mixed into the principal
# components and leak the label into the saved feature set.
# errors="ignore" keeps this a no-op when the target is non-numeric (and was
# therefore already filtered out by select_dtypes).
numeric_df = (
    df.select_dtypes(include=[np.number])
      .drop(columns=["Depression"], errors="ignore")
)

# NOTE(review): PCA is sensitive to feature scale; this assumes the numeric
# columns were standardized in an earlier step -- confirm.
pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
pca_data = pca.fit_transform(numeric_df)

print("Original shape:", numeric_df.shape)
print("Reduced shape (PCA):", pca_data.shape)

# Wrap the component matrix in a DataFrame and label its columns PC1..PCk
df_pca = pd.DataFrame(pca_data)
df_pca.columns = [f"PC{k}" for k in range(1, pca_data.shape[1] + 1)]

# Re-attach the target positionally (.values sidesteps index alignment with df),
# then write the PCA-transformed dataset to disk without the index column
df_pca["Depression"] = df["Depression"].values
df_pca.to_csv("step6_pca_features.csv", index=False)


In [None]:
# -------------------------------
# STEP: PCA Visualization
# -------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Explained variance ratio: the share of total variance carried by each PC
variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)  # 1-based PC index for the x-axis

plt.figure(figsize=(8, 5))
plt.plot(component_numbers, variance_ratios, marker="o")
plt.title("Explained Variance Ratio per Principal Component")
plt.xlabel("Principal Component")
plt.ylabel("Variance Ratio")
plt.grid(True)
plt.show()

# 2. Cumulative variance explained as components are added one by one
cumulative_variance = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(cumulative_variance) + 1)  # number of components kept

plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_variance, marker="o", color="green")
plt.title("Cumulative Explained Variance")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Explained")
plt.grid(True)
plt.show()

# 3. Scatterplot of first 2 PCs, colored by Depression
# PCA(n_components=0.95) can retain a single component, in which case "PC2"
# does not exist; require both plotted components as well as the target column
# so the cell skips the plot instead of raising.
if {"PC1", "PC2", "Depression"}.issubset(df_pca.columns):
    plt.figure(figsize=(8,6))
    sns.scatterplot(
        x="PC1", y="PC2",
        hue="Depression",
        palette="Set1",
        data=df_pca,
        alpha=0.7
    )
    plt.title("PCA - First Two Principal Components")
    plt.legend(title="Depression")
    plt.show()

# 4. 3D scatterplot (optional) if more separation is needed
from mpl_toolkits.mplot3d import Axes3D  # only needed on older Matplotlib to register the "3d" projection

# Check for the exact columns used below rather than counting columns: a
# column-count test is only correct while the target column happens to be
# present, and raises (or wrongly skips the plot) otherwise.
if {"PC1", "PC2", "PC3", "Depression"}.issubset(df_pca.columns):
    fig = plt.figure(figsize=(10,7))
    ax = fig.add_subplot(111, projection="3d")
    # NOTE(review): c= maps values through the colormap, so this assumes
    # "Depression" is numerically encoded -- confirm.
    scatter = ax.scatter(
        df_pca["PC1"], df_pca["PC2"], df_pca["PC3"],
        c=df_pca["Depression"], cmap="Set1", alpha=0.7
    )
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    plt.title("3D PCA Scatterplot")
    plt.colorbar(scatter, label="Depression")
    plt.show()
