In [None]:
# -------------------------------
# STEP: Binning / Discretization
# -------------------------------
import pandas as pd

# Drop one-hot columns left over from an earlier run of this step; otherwise
# re-running it would append a second, identically named set of dummies.
stale_dummy_cols = [
    c for c in df.columns
    if str(c).startswith(("AgeGroup_", "CGPA_Level_"))
]
if stale_dummy_cols:
    df = df.drop(columns=stale_dummy_cols)

before_cols = df.shape[1]

# 1. Age → grouped bins
# pd.cut intervals are open on the left by default; include_lowest=True closes
# the first one so that Age == 15 is labelled "15-20" instead of becoming NaN.
df["AgeGroup"] = pd.cut(
    df["Age"],
    bins=[15, 20, 25, 30, 40, 60],
    labels=["15-20", "21-25", "26-30", "31-40", "41-60"],
    include_lowest=True,
)

# 2. CGPA → grouped bins
# Same reasoning as above: a CGPA of exactly 0 belongs to "Low".
df["CGPA_Level"] = pd.cut(
    df["CGPA"],
    bins=[0, 5, 7, 10],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)

# Values outside the bin edges get NaN, which the encoding below turns into
# all-zero dummies - indistinguishable from the dropped reference level. Report
# them instead of letting them pass silently.
for source_col, binned_col in (("Age", "AgeGroup"), ("CGPA", "CGPA_Level")):
    unbinned = int((df[binned_col].isna() & df[source_col].notna()).sum())
    if unbinned:
        print(
            f"Warning: {unbinned} non-missing {source_col} value(s) fall outside "
            f"the {binned_col} bin edges and were left unbinned."
        )

# 3. Convert to dummy/one-hot encoding
# The two categorical columns are replaced by their dummy columns;
# drop_first=True omits the first level of each ("15-20" and "Low"), which
# then acts as the reference category.
df = pd.get_dummies(df, columns=["AgeGroup", "CGPA_Level"], drop_first=True)

after_cols = df.shape[1]

print(f"Binning complete. Columns before: {before_cols}, after: {after_cols}")
print("New columns added:", [c for c in df.columns if "AgeGroup" in c or "CGPA_Level" in c])
print(df.head())

In [None]:
# -------------------------------
# STEP: Visualization after Binning / Discretization
# -------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# The one-hot encoding step replaces "AgeGroup" and "CGPA_Level" with dummy
# columns, so the categorical view needed by the count plots and boxplots is
# rebuilt from the raw "Age" / "CGPA" columns whenever it is no longer in df.
# The rebuilt series are used for plotting only; nothing is written back to df.
age_order = ["15-20", "21-25", "26-30", "31-40", "41-60"]
cgpa_order = ["Low", "Medium", "High"]

# binned column -> (raw source column, bin edges, ordered labels).
# Edges and labels are the ones used by the binning step.
bin_specs = {
    "AgeGroup": ("Age", [15, 20, 25, 30, 40, 60], age_order),
    "CGPA_Level": ("CGPA", [0, 5, 7, 10], cgpa_order),
}


def _binned_levels(frame, binned_col):
    """Return the categorical series for *binned_col*, or None if unavailable.

    The column is taken from *frame* when it is still present; otherwise it is
    re-derived from the raw source column listed in ``bin_specs``. None means
    neither the binned column nor its source column exists.
    """
    if binned_col in frame.columns:
        return frame[binned_col]
    source_col, edges, labels = bin_specs[binned_col]
    if source_col not in frame.columns:
        return None
    # include_lowest=True keeps a value equal to the lowest edge in the first bin.
    return pd.cut(frame[source_col], bins=edges, labels=labels, include_lowest=True)


def _show_count_plot(levels, order, title, axis_label):
    """Draw a count plot of *levels* with the categories in *order*."""
    plt.figure(figsize=(8, 5))
    sns.countplot(x=levels, order=order)
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel("Count")
    plt.show()


def _show_depression_boxplot(levels, scores, order, title, axis_label):
    """Draw a boxplot of *scores* grouped by *levels*, categories in *order*."""
    plt.figure(figsize=(8, 5))
    sns.boxplot(x=levels, y=scores, order=order)
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel("Depression Score")
    plt.show()


age_levels = _binned_levels(df, "AgeGroup")
cgpa_levels = _binned_levels(df, "CGPA_Level")


# 1. Count plot for AgeGroup (categorical view)
if age_levels is not None:
    _show_count_plot(age_levels, age_order, "Age Group Distribution", "Age Group")
else:
    print("AgeGroup column not found. Skipping Age Group distribution plot.")


# 2. Count plot for CGPA_Level
if cgpa_levels is not None:
    _show_count_plot(cgpa_levels, cgpa_order, "CGPA Level Distribution", "CGPA Level")
else:
    print("CGPA_Level column not found. Skipping CGPA Level distribution plot.")


# 3. Heatmap: correlation of dummy columns with numeric features
# Dummy columns are recognised by the "<column>_<label>" names that
# pd.get_dummies produces for "AgeGroup" and "CGPA_Level".
age_dummy_cols = [c for c in df.columns if c.startswith("AgeGroup_")]
cgpa_dummy_cols = [c for c in df.columns if c.startswith("CGPA_Level_")]
dummy_cols = age_dummy_cols + cgpa_dummy_cols

# Depending on the pandas version the dummies are stored as integers and would
# be picked up here as "numeric" too. Excluding them keeps the heatmap to
# dummy-vs-feature cells and avoids duplicate labels in the correlation frame.
num_cols = [
    c for c in df.select_dtypes(include=['number']).columns.tolist()
    if c not in dummy_cols
]

# Ensure there are dummy columns and numeric columns before attempting heatmap
if dummy_cols and num_cols:
    # The target column is left out of the numeric side of the heatmap.
    if "Depression" in num_cols:
        num_cols.remove("Depression")

    # Ensure there are still numeric columns after removing target
    if num_cols:
        plt.figure(figsize=(12, 8))  # Larger canvas so the annotations stay readable
        # Correlate only the relevant columns, then keep the dummy rows and
        # the numeric-feature columns of the resulting matrix.
        corr_data = df[dummy_cols + num_cols]
        corr_matrix = corr_data.corr()
        corr_subset = corr_matrix.loc[dummy_cols, num_cols]

        sns.heatmap(corr_subset, annot=True, cmap="coolwarm", fmt=".2f", annot_kws={"size": 8})
        plt.title("Correlation Heatmap (Dummy Features vs Numeric Features)")
        plt.show()
    else:
        print("No numeric columns (excluding target) found for heatmap.")
else:
    print("No dummy or numeric columns found for heatmap.")


# 4. Boxplot: Depression vs AgeGroup (if Depression exists)
if "Depression" not in df.columns:
    print("Depression column not found. Skipping Depression vs Age Group boxplot.")
elif age_levels is None:
    print("AgeGroup column not found. Skipping Depression vs Age Group boxplot.")
else:
    _show_depression_boxplot(
        age_levels, df["Depression"], age_order,
        "Depression Score by Age Group", "Age Group",
    )


# 5. Boxplot: Depression vs CGPA_Level
if "Depression" not in df.columns:
    print("Depression column not found. Skipping Depression vs CGPA Level boxplot.")
elif cgpa_levels is None:
    print("CGPA_Level column not found. Skipping Depression vs CGPA Level boxplot.")
else:
    _show_depression_boxplot(
        cgpa_levels, df["Depression"], cgpa_order,
        "Depression Score by CGPA Level", "CGPA Level",
    )