In [None]:
import numpy as np
import pandas as pd

before_n = len(df)

# NOTE: this cell rebinds `df` to a filtered frame, and the IQR fences in
# steps 3-4 are computed from whatever `df` currently holds. Re-running the
# cell without reloading the data filters already-filtered rows again and may
# remove more of them.

# 1) Sanity ranges for common survey scales (adjust if your dataset differs)
scale_0_10_cols = ["Academic Pressure","Work Pressure","Financial Stress","CGPA"]
for col in scale_0_10_cols:
    if col in df.columns:
        # assign() returns a new frame, so the coerced column is never written
        # into a filtered slice of an earlier frame (SettingWithCopyWarning).
        df = df.assign(**{col: pd.to_numeric(df[col], errors="coerce")})
        # between() is False for NaN, so unparseable values are dropped as well.
        df = df[df[col].between(0, 10)]

# 2) No negatives (upper caps are applied by the IQR steps below)
non_negative_cols = ["Age","Work/Study Hours"]
for col in non_negative_cols:
    if col in df.columns:
        df = df.assign(**{col: pd.to_numeric(df[col], errors="coerce")})
        # NaN >= 0 is False, so rows that failed numeric coercion are dropped too.
        df = df[df[col] >= 0]

# 3) + 4) IQR filter (robust outlier removal), clamped to hard logical limits.
# Maps column -> (hard lower limit, hard upper limit):
#   Work/Study Hours: never below 0 and never above 24 hours
#   Age (optional):   still respecting 10-60
# Columns are processed in this order; each filter sees the rows that survived
# the previous one.
iqr_hard_limits = {
    "Work/Study Hours": (0, 24),
    "Age": (10, 60),
}
for col, (hard_lower, hard_upper) in iqr_hard_limits.items():
    if col in df.columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        # Tukey fences (1.5 * IQR), tightened to the hard limits when the
        # fences would extend past them.
        lower = max(hard_lower, q1 - 1.5*iqr)
        upper = min(hard_upper, q3 + 1.5*iqr)
        df = df[df[col].between(lower, upper)]

# Filtering leaves gaps in the index; renumber rows 0..n-1.
df = df.reset_index(drop=True)

after_n = len(df)
print(f"Extra filtering removed {before_n - after_n} rows. Remaining: {after_n}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Basic Info
print("Dataset shape:", df.shape)
print("\nColumn types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())

# 2. Sleep Duration vs Depression
# The plot reads BOTH columns, so both must be present; checking only
# "Depression" lets the plot call fail when "Sleep Duration" is absent.
# When either column is missing the plot is skipped silently.
if {"Sleep Duration", "Depression"}.issubset(df.columns):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x="Sleep Duration", y="Depression", data=df, ax=ax)
    ax.set_title("Sleep Duration vs Depression")
    plt.show()