In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# single-letter color shorthands ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the project paths
# configured below (which live under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# --- Configuration ----------------------------------------------------------
# Project root on the mounted Drive; every other path is derived from it.
BASE_PATH = '/content/drive/MyDrive/StudentStressLevelMonitoring'
TARGET = 'stress_level'  # label column, kept out of the PCA inputs
N_COMPONENTS = 5  # change if you want more/less PCs

# Output folders: tabular results and EDA figures.
OUTPUTS_DRIVE = os.path.join(BASE_PATH, 'results/outputs')
OUTPUTS_DRIVE_EDA = os.path.join(BASE_PATH, 'results/eda_visualizations')

# The input dataset is read from the tabular results folder.
INPUT_CSV = os.path.join(OUTPUTS_DRIVE, 'final_selected_features_dataset.csv')  # <- your file

# Create both folders up front; exist_ok=True keeps re-runs harmless.
for _out_dir in (OUTPUTS_DRIVE, OUTPUTS_DRIVE_EDA):
    os.makedirs(_out_dir, exist_ok=True)


In [5]:
# Read the dataset at INPUT_CSV and show its shape plus the first rows.
df = pd.read_csv(INPUT_CSV)
print("Loaded:", INPUT_CSV, "| Shape:", df.shape)
display(df.head())

# Fail fast when the label column is absent: every later cell relies on it.
target_missing = TARGET not in df.columns
if target_missing:
    raise ValueError(f"Target column '{TARGET}' not found in the dataset.")


Loaded: /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/final_selected_features_dataset.csv | Shape: (793, 14)


Unnamed: 0,health_index,academic_stress,social_environment,self_esteem,headache,blood_pressure,sleep_quality,safety,basic_needs,academic_performance,teacher_student_relationship,future_career_concerns,bullying,stress_level
0,4.5,2.666667,2.333333,0.666667,0.4,0.0,0.4,0.6,0.4,0.6,0.6,0.6,0.4,1.0
1,6.75,3.333333,3.333333,0.266667,1.0,1.0,0.2,0.4,0.4,0.2,0.2,1.0,1.0,2.0
2,4.75,2.333333,2.333333,0.6,0.4,0.0,0.4,0.6,0.4,0.4,0.6,0.4,0.4,1.0
3,6.25,3.333333,3.333333,0.4,0.8,1.0,0.2,0.4,0.4,0.4,0.2,0.8,1.0,2.0
4,3.25,3.0,3.666667,0.933333,0.4,1.0,1.0,0.8,0.6,0.8,0.2,0.4,1.0,1.0


In [6]:
# Candidate PCA inputs: every numeric column except the label itself.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [col for col in numeric_cols if col != TARGET]

if len(feature_cols) == 0:
    raise ValueError("No numeric feature columns available for PCA.")

# Work on copies so later filtering never touches `df`.
X, y = df[feature_cols].copy(), df[TARGET].copy()

In [7]:
# Keep only rows that are complete in every feature and in the label.
valid_mask = X.notna().all(axis=1) & y.notna()
X, y = X.loc[valid_mask], y.loc[valid_mask]
print("After NA filter -> X:", X.shape, "| y:", y.shape)

After NA filter -> X: (793, 13) | y: (793,)


In [8]:
# Standardise the features with StandardScaler's default settings: learn the
# per-column statistics from X, then apply them to the same X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [9]:
# PCA accepts at most min(n_samples, n_features) components, so cap the
# requested count by BOTH dimensions of the scaled matrix. Capping by the
# feature count alone raises a ValueError when fewer rows than N_COMPONENTS
# survive the NA filter.
n_comps = min(N_COMPONENTS, *X_scaled.shape)
pca = PCA(n_components=n_comps, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Share of total variance captured by each component, and its running total.
expl_var = pca.explained_variance_ratio_
cum_expl_var = np.cumsum(expl_var)

print("Explained variance ratio per component:", expl_var)
print("Total explained variance by", n_comps, "components:", cum_expl_var[-1])


Explained variance ratio per component: [0.67702749 0.09993981 0.05258723 0.03612792 0.02645345]
Total explained variance by 5 components: 0.8921359051606538


In [10]:
# Name the projected columns PC1..PCn, reuse the row index of the filtered X,
# and append the label values as the last column.
pc_cols = ["PC" + str(k) for k in range(1, n_comps + 1)]
df_pca = pd.DataFrame(X_pca, index=X.index, columns=pc_cols).assign(**{TARGET: y.values})

display(df_pca.head())


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,stress_level
0,0.534055,-0.850207,0.746456,-0.631632,-0.214342,1.0
1,5.408248,1.237747,0.135709,0.303778,0.853521,2.0
2,0.475956,-1.478501,-0.202965,-0.395619,-0.113921,1.0
3,4.507427,1.29476,0.385568,0.648172,0.168732,2.0
4,0.664906,1.42673,1.168274,2.787152,-0.360303,1.0


In [11]:
def _save_variance_plot(values, ylabel, title, filename, ylim=None):
    """Plot one value per principal component and save the figure as a PNG.

    Parameters
    ----------
    values : sequence of float
        One value per component, in component order (PC1 first).
    ylabel : str
        Label for the y axis.
    title : str
        Figure title.
    filename : str
        File name to create inside OUTPUTS_DRIVE_EDA.
    ylim : tuple of (float, float), optional
        Explicit y-axis limits; matplotlib autoscaling is used when None.

    The figure is written to disk and closed without being displayed; the
    saved path is printed.
    """
    components = list(range(1, len(values) + 1))
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(components, values, marker='o')
    ax.set_xlabel("Principal Component")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(components)  # one tick per component, no fractional ticks
    if ylim is not None:
        ax.set_ylim(*ylim)
    fig.tight_layout()

    path = os.path.join(OUTPUTS_DRIVE_EDA, filename)
    fig.savefig(path, dpi=150, bbox_inches="tight")
    print("Saved:", path)
    # Close explicitly so the figure is released instead of lingering in pyplot.
    plt.close(fig)


# Scree plot: variance share of each individual component.
_save_variance_plot(expl_var, "Explained Variance Ratio", "Scree Plot", "scree_plot.png")

# Running total of explained variance; the y axis is pinned to [0, 1.01] so
# the curve is read against the full 0-100% range.
_save_variance_plot(
    cum_expl_var,
    "Cumulative Explained Variance",
    "Cumulative Variance Explained",
    "cumulative_variance.png",
    ylim=(0, 1.01),
)


Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/scree_plot.png
Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/cumulative_variance.png


In [12]:
# Scatter of the first two components, coloured by the target value.
# Skipped when fewer than two components were extracted.
if n_comps >= 2:
    fig, ax = plt.subplots(figsize=(7, 6))
    scatter = ax.scatter(df_pca["PC1"], df_pca["PC2"], c=df_pca[TARGET], cmap="viridis", alpha=0.7)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_title("PCA: PC1 vs PC2")
    cbar = fig.colorbar(scatter, ax=ax)
    cbar.set_label(TARGET)
    fig.tight_layout()

    # Write the figure to the EDA folder, then close it without displaying.
    path = os.path.join(OUTPUTS_DRIVE_EDA, "scatter_PC1_PC2.png")
    fig.savefig(path, dpi=150, bbox_inches="tight")
    print("Saved:", path)
    plt.close(fig)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/scatter_PC1_PC2.png


In [13]:
# Scatter of the first component against the fifth, coloured by the target
# value. Skipped when fewer than five components were extracted.
if n_comps >= 5:
    fig, ax = plt.subplots(figsize=(7, 6))
    scatter = ax.scatter(df_pca["PC1"], df_pca["PC5"], c=df_pca[TARGET], cmap="viridis", alpha=0.7)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC5")
    ax.set_title("PCA: PC1 vs PC5")
    cbar = fig.colorbar(scatter, ax=ax)
    cbar.set_label(TARGET)
    fig.tight_layout()

    # Write the figure to the EDA folder, then close it without displaying.
    path = os.path.join(OUTPUTS_DRIVE_EDA, "scatter_PC1_PC5.png")
    fig.savefig(path, dpi=150, bbox_inches="tight")
    print("Saved:", path)
    plt.close(fig)

Saved: /content/drive/MyDrive/StudentStressLevelMonitoring/results/eda_visualizations/scatter_PC1_PC5.png


In [14]:
# Write the component scores plus the target column to the tabular outputs
# folder. index=False means the DataFrame's row index is not written.
# NOTE(review): df_pca carries X's original row index; if the NA filter ever
# drops rows, the saved file cannot be re-aligned with the source dataset --
# confirm that downstream consumers do not need it.
PCA_CSV_DRIVE = os.path.join(OUTPUTS_DRIVE, "pca_components.csv")
df_pca.to_csv(PCA_CSV_DRIVE, index=False)
print("\nSaved:")
print(" -", PCA_CSV_DRIVE)



Saved:
 - /content/drive/MyDrive/StudentStressLevelMonitoring/results/outputs/pca_components.csv
