In [None]:
# -------------------------------
# STEP: Normalization / Scaling
# Scale numeric features:
#   Age, CGPA, Work/Study Hours, Academic Pressure, Work Pressure, Financial Stress
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler  # (Z-score)
# from sklearn.preprocessing import MinMaxScaler  # (0–1) <-- optional alternative
import joblib

# Define the intended numeric columns
numeric_features = [
    "Age",
    "CGPA",
    "Work/Study Hours",
    "Academic Pressure",
    "Work Pressure",
    "Financial Stress",
]

# Keep only columns that actually exist (prevents KeyErrors)
numeric_cols_present = [c for c in numeric_features if c in df.columns]
if not numeric_cols_present:
    raise ValueError("None of the expected numeric columns are present to scale.")

# This cell overwrites `df` in place. Running it a second time on the same frame
# would re-fit the scaler on already-standardized values and then save that
# near-identity scaler over scaler_standard.pkl, making the persisted scaler
# useless for raw test/production data. The columns scaled here are therefore
# recorded in df.attrs so that a re-run can be detected and skipped.
SCALED_ATTR = "standard_scaled_columns"
already_scaled = [c for c in numeric_cols_present if c in df.attrs.get(SCALED_ATTR, ())]

if already_scaled:
    # Re-run on an already-scaled frame: leave df, `scaler`, and the saved file as
    # they are. (`scaler` was assigned by the same run that set the marker.)
    print("Already scaled in this DataFrame, skipping re-fit:", already_scaled)
    print("Reload df from the raw data before scaling again.")
else:
    # Coerce to numeric (if any are object dtype). errors="coerce" turns values
    # that cannot be parsed into NaN, so report how many were introduced instead
    # of letting them appear silently.
    for c in numeric_cols_present:
        missing_before = df[c].isna().sum()
        df[c] = pd.to_numeric(df[c], errors="coerce")
        newly_missing = df[c].isna().sum() - missing_before
        if newly_missing:
            print(f"Warning: {newly_missing} non-numeric value(s) in '{c}' were coerced to NaN.")

    # Option A: Overwrite in place (common for modeling)
    scaler = StandardScaler()
    df[numeric_cols_present] = scaler.fit_transform(df[numeric_cols_present])

    # Mark these columns as scaled on this DataFrame object (see the re-run guard above).
    df.attrs[SCALED_ATTR] = list(df.attrs.get(SCALED_ATTR, ())) + list(numeric_cols_present)

    # (Optional) Persist the scaler for later use (e.g., on test or production data)
    joblib.dump(scaler, "scaler_standard.pkl")
    print("Scaled columns:", numeric_cols_present)
    print("Scaler saved to scaler_standard.pkl")

print("df shape after scaling:", df.shape)

# --------- OPTIONAL: Alternative that adds _scaled columns (keeps originals) ----------
# mm = MinMaxScaler()
# df[[c + "_scaled" for c in numeric_cols_present]] = mm.fit_transform(df[numeric_cols_present])
# joblib.dump(mm, "scaler_minmax.pkl")
# print("Also created MinMax-scaled copies with suffix _scaled.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 1. Basic Info
# Print the frame's dimensions, the dtype of each column, and the number of
# null entries per column, each under its own heading.
basic_info = (
    ("Dataset shape:", df.shape),
    ("\nColumn types:\n", df.dtypes),
    ("\nMissing values:\n", df.isnull().sum()),
)
for heading, summary in basic_info:
    print(heading, summary)


# 6. Correlation Heatmap
# Restrict to numeric columns before computing pairwise correlations.
numeric_df = df.select_dtypes(include=np.number)
corr_matrix = numeric_df.corr()

# Draw on an explicit Axes; the diverging colormap is centered at zero correlation.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, cmap="coolwarm", center=0, annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()