In [None]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# -------------------------------
# Clean & standardize category text
# -------------------------------
def _norm(s):
    if pd.isna(s): return np.nan
    return str(s).strip().lower()

# Standardize Sleep Duration
# Keys are the normalized (stripped, lower-cased) spellings produced by
# _norm; values are the canonical labels used by the ordinal encoder.
# NOTE: this step expects the raw text column. Re-running it after the
# column has been replaced by numeric codes would map every value to NaN.
sleep_map = {
    '<5 hours': '<5 hours',
    'less than 5 hours': '<5 hours',
    '5-6 hours': '5-6 hours',
    '6-7 hours': '6-7 hours',
    '7-8 hours': '7-8 hours',
    '>8 hours': '>8 hours',
    'more than 8 hours': '>8 hours',
    'others': np.nan,   # treat 'Others' as missing; will impute below
    'other': np.nan
}

# Series.map with a dict yields NaN for any key that is not in the dict.
df['Sleep Duration'] = df['Sleep Duration'].map(_norm).map(sleep_map)

# Belt-and-braces guard: anything that is not a canonical label becomes NaN.
valid_sleep = {'<5 hours','5-6 hours','6-7 hours','7-8 hours','>8 hours'}
df.loc[~df['Sleep Duration'].isin(valid_sleep), 'Sleep Duration'] = np.nan

# Impute missing Sleep Duration with the mode (most frequent label).
if df['Sleep Duration'].isna().any():
    mode_sleep = df['Sleep Duration'].mode(dropna=True)
    # mode() is empty when every value is missing; leave the NaNs in that case.
    if not mode_sleep.empty:
        # Assign the result back instead of fillna(inplace=True) on df[col]:
        # the chained in-place form acts on a temporary Series, which warns on
        # pandas 2.x and does not update df under Copy-on-Write.
        df['Sleep Duration'] = df['Sleep Duration'].fillna(mode_sleep.iloc[0])

# Standardize Dietary Habits
# Keys are the normalized (stripped, lower-cased) spellings produced by
# _norm; values are the canonical labels used by the ordinal encoder.
# NOTE: this step expects the raw text column. Re-running it after the
# column has been replaced by numeric codes would map every value to NaN.
diet_map = {
    'unhealthy': 'Unhealthy',
    'poor': 'Unhealthy',
    'avg': 'Average',
    'average': 'Average',
    'moderate': 'Average',
    'healthy': 'Healthy'
}
# Series.map with a dict yields NaN for any key that is not in the dict.
df['Dietary Habits'] = df['Dietary Habits'].map(_norm).map(diet_map)

# Belt-and-braces guard: anything that is not a canonical label becomes NaN,
# then missing values are imputed with the mode (most frequent label).
valid_diet = {'Unhealthy','Average','Healthy'}
df.loc[~df['Dietary Habits'].isin(valid_diet), 'Dietary Habits'] = np.nan
if df['Dietary Habits'].isna().any():
    mode_diet = df['Dietary Habits'].mode(dropna=True)
    # mode() is empty when every value is missing; leave the NaNs in that case.
    if not mode_diet.empty:
        # Assign the result back instead of fillna(inplace=True) on df[col]:
        # the chained in-place form acts on a temporary Series, which warns on
        # pandas 2.x and does not update df under Copy-on-Write.
        df['Dietary Habits'] = df['Dietary Habits'].fillna(mode_diet.iloc[0])

# -------------------------------
# Ordinal encoding (robust to leftovers)
# -------------------------------
# Category orderings, lowest to highest; list position becomes the code.
sleep_order = ["<5 hours", "5-6 hours", "6-7 hours", "7-8 hours", ">8 hours"]
diet_order  = ["Unhealthy", "Average", "Healthy"]

# Columns to encode, in the same order as the `categories` lists below.
ordinal_cols = ["Sleep Duration", "Dietary Habits"]

# Any label outside the declared orderings is encoded as -1 instead of
# raising, so stray text cannot abort the cell.
ord_enc = OrdinalEncoder(
    categories=[sleep_order, diet_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Fit and transform in one pass, then overwrite the text columns with codes.
encoded = ord_enc.fit_transform(df[ordinal_cols])
df[ordinal_cols] = encoded

# Show the distinct codes per column as a quick sanity check.
print("Encoded categories OK. Unique values now are:")
sleep_codes = df["Sleep Duration"].unique()
diet_codes = df["Dietary Habits"].unique()
print("Sleep Duration:", sorted(sleep_codes))
print("Dietary Habits:", sorted(diet_codes))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Basic Info
# Structural overview of the frame: dimensions, per-column dtypes and
# per-column count of missing values.
print("Dataset shape:", df.shape)
print("\nColumn types:\n", df.dtypes)
print("\nMissing values:\n", df.isna().sum())

# Bar chart of how many rows fall in each "Depression" value; skipped
# entirely when the column is not present in df.
if "Depression" in df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    # NOTE(review): newer seaborn releases warn when `palette` is passed
    # without `hue`; confirm against the installed version before changing.
    sns.countplot(x="Depression", data=df, palette="Set2", ax=ax)
    ax.set_title("Depression Level Counts")
    plt.show()