In [1]:
# Sleep Health & Lifestyle Analysis - Data Preprocessing & Feature Engineering
# Part 2: Cleaning, Transformation, and Feature Creation

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


In [3]:

# ===========================
# 1.A. LOAD DATA
# ===========================

df = pd.read_csv("D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\Dataset\\sleep_health_lifestyle_dataset.csv")
print("Original Dataset Shape:", df.shape)


Original Dataset Shape: (400, 13)


In [None]:

# ===========================
# 1.A. LOAD DATA
# ===========================

df = pd.read_csv("D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\Dataset\\sleep_health_lifestyle_dataset.csv")
print("Original Dataset Shape:", df.shape)

# ========================================================
# 1.B. COLUMN RENAMING (FROM PART 1) ### [NEWLY ADDED] ###
# ========================================================
# This is critical to prevent KeyErrors in the steps below.
print("\n" + "=" * 60)
print("RENAMING COLUMNS FOR CONSISTENCY")
print("=" * 60)

column_rename_map = {
    'Person ID': 'Person_ID',
    'Sleep Duration (hours)': 'Sleep Duration',
    'Quality of Sleep (scale: 1-10)': 'Quality of Sleep',
    'Physical Activity Level (minutes/day)': 'Physical Activity Level',
    'Stress Level (scale: 1-10)': 'Stress Level',
    'Blood Pressure (systolic/diastolic)': 'Blood Pressure',
    'Heart Rate (bpm)': 'Heart Rate'
}

df = df.rename(columns=column_rename_map)
print("Columns successfully renamed.")

# Create a copy for preprocessing
df_processed = df.copy()


In [4]:

# ========================================================
# 1.B. COLUMN RENAMING (FROM PART 1)
# ========================================================
print("\n" + "=" * 60)
print("RENAMING COLUMNS FOR CONSISTENCY")
print("=" * 60)

column_rename_map = {
    'Person ID': 'Person_ID',
    'Sleep Duration (hours)': 'Sleep Duration',
    'Quality of Sleep (scale: 1-10)': 'Quality of Sleep',
    'Physical Activity Level (minutes/day)': 'Physical Activity Level',
    'Stress Level (scale: 1-10)': 'Stress Level',
    'Blood Pressure (systolic/diastolic)': 'Blood Pressure',
    'Heart Rate (bpm)': 'Heart Rate'
}

df = df.rename(columns=column_rename_map)
print("Columns successfully renamed.")

# ========================================================
# 1.C. HANDLE MISSING TARGET (THE "TWIST" FIX)
# ========================================================
print("\n" + "=" * 60)
print("IMPUTING MISSING 'Sleep Disorder' VALUES")
print("=" * 60)

# Check for NaNs
missing_count = df['Sleep Disorder'].isnull().sum()
print(f"Found {missing_count} missing 'Sleep Disorder' values.")

if missing_count > 0:
    # 1. Create the "Imputation Flag" feature
    # This is our "expert twist"
    df['SleepDisorder_Imputed'] = df['Sleep Disorder'].isnull().astype(int)
    
    # 2. Fill NaNs with 'None' (as you requested)
    df['Sleep Disorder'] = df['Sleep Disorder'].fillna('None')
    
    print(f"✓ Created 'SleepDisorder_Imputed' flag column.")
    print(f"✓ Filled {missing_count} NaNs with 'None'.")
else:
    # If no NaNs, still create the flag column (all 0s)
    df['SleepDisorder_Imputed'] = 0
    print("✓ No missing 'Sleep Disorder' values found. Flag column created.")

# ========================================================
# 1.D. CREATE DATASET COPIES
# ========================================================
# Now we create the copy for preprocessing, which includes our fix.
# This single, unified dataset will work for BOTH Power BI and ML.

df_processed = df.copy()

print("\n✓ Preprocessing copy created. Ready to proceed with Part 2.")

# -------------------------------------------------------------------
# (The rest of your script, Sections 2-9, remains *exactly* the same)
# -------------------------------------------------------------------


RENAMING COLUMNS FOR CONSISTENCY
Columns successfully renamed.

IMPUTING MISSING 'Sleep Disorder' VALUES
Found 290 missing 'Sleep Disorder' values.
✓ Created 'SleepDisorder_Imputed' flag column.
✓ Filled 290 NaNs with 'None'.

✓ Preprocessing copy created. Ready to proceed with Part 2.


In [5]:

# ===========================
# 2. BLOOD PRESSURE PROCESSING
# ===========================

print("\n" + "=" * 60)
print("PROCESSING BLOOD PRESSURE")
print("=" * 60)

# Split Blood Pressure into Systolic and Diastolic
# This now works because we renamed the column to 'Blood Pressure'
df_processed[['Systolic_BP', 'Diastolic_BP']] = df_processed['Blood Pressure'].str.split('/', expand=True)
df_processed['Systolic_BP'] = pd.to_numeric(df_processed['Systolic_BP'])
df_processed['Diastolic_BP'] = pd.to_numeric(df_processed['Diastolic_BP'])

# Create Blood Pressure categories based on medical standards
def categorize_bp(systolic, diastolic):
    if systolic < 120 and diastolic < 80:
        return 'Normal'
    elif systolic < 130 and diastolic < 80:
        return 'Elevated'
    elif systolic < 140 or diastolic < 90:
        return 'High_Stage1'
    else:
        return 'High_Stage2'

df_processed['BP_Category'] = df_processed.apply(
    lambda x: categorize_bp(x['Systolic_BP'], x['Diastolic_BP']), axis=1
)

print("Blood Pressure Category Distribution:")
print(df_processed['BP_Category'].value_counts())

# Drop original Blood Pressure column
df_processed = df_processed.drop('Blood Pressure', axis=1)



PROCESSING BLOOD PRESSURE
Blood Pressure Category Distribution:
BP_Category
Normal         152
Elevated       138
High_Stage1    105
High_Stage2      5
Name: count, dtype: int64


In [6]:

# ===========================
# 3. FEATURE ENGINEERING
# ===========================

print("\n" + "=" * 60)
print("FEATURE ENGINEERING")
print("=" * 60)

# 1. Sleep Efficiency Score (Custom metric)
# These columns are now guaranteed to exist
df_processed['Sleep_Efficiency'] = (df_processed['Sleep Duration'] / 8) * df_processed['Quality of Sleep']



FEATURE ENGINEERING


In [7]:

# 2. Health Risk Score (Composite metric)
### [CORRECTED] ###
# Added 'Underweight': 1 to prevent NaNs.
# Mapped 'Underweight', 'Normal', and 'Normal Weight' to 1 (low risk).
bmi_risk_map = {
    'Underweight': 1,
    'Normal': 1,
    'Normal Weight': 1,
    'Overweight': 2,
    'Obese': 3
}

df_processed['Health_Risk_Score'] = (
    df_processed['Stress Level'] * 0.3 +
    (10 - df_processed['Physical Activity Level'] / 10) * 0.3 +
    (df_processed['Heart Rate'] / 10) * 0.2 +
    (df_processed['BMI Category'].map(bmi_risk_map)) * 0.2
)

# Check for any NaNs created by the map (should be 0)
print(f"NaNs in Health_Risk_Score: {df_processed['Health_Risk_Score'].isnull().sum()}")



NaNs in Health_Risk_Score: 0


In [8]:

# 3. Age Groups
def categorize_age(age):
    if age < 30:
        return 'Young_Adult'
    elif age < 45:
        return 'Middle_Age'
    elif age < 60:
        return 'Senior'
    else:
        return 'Elderly'

df_processed['Age_Group'] = df_processed['Age'].apply(categorize_age)


In [9]:

# 4. Sleep Duration Category
def categorize_sleep_duration(duration):
    if duration < 6:
        return 'Insufficient'
    elif duration < 7:
        return 'Below_Optimal'
    elif duration <= 9:
        return 'Optimal'
    else:
        return 'Excessive'

df_processed['Sleep_Duration_Category'] = df_processed['Sleep Duration'].apply(categorize_sleep_duration)


In [10]:

# 5. Activity Level Category
def categorize_activity(minutes):
    if minutes < 30:
        return 'Sedentary'
    elif minutes < 60:
        return 'Moderate'
    else:
        return 'Active'

df_processed['Activity_Category'] = df_processed['Physical Activity Level'].apply(categorize_activity)


In [11]:

# 6. Stress Category
def categorize_stress(stress):
    if stress <= 3:
        return 'Low'
    elif stress <= 6:
        return 'Moderate'
    else:
        return 'High'

df_processed['Stress_Category'] = df_processed['Stress Level'].apply(categorize_stress)


In [12]:

# 7. Heart Rate Category
def categorize_heart_rate(hr):
    if hr < 60:
        return 'Low'
    elif hr <= 100:
        return 'Normal'
    else:
        return 'High'

df_processed['Heart_Rate_Category'] = df_processed['Heart Rate'].apply(categorize_heart_rate)


In [13]:

# 8. Daily Steps Category (Based on fitness goals)
def categorize_steps(steps):
    if steps < 5000:
        return 'Sedentary'
    elif steps < 7500:
        return 'Low_Active'
    elif steps < 10000:
        return 'Somewhat_Active'
    else:
        return 'Active'

df_processed['Steps_Category'] = df_processed['Daily Steps'].apply(categorize_steps)


In [14]:

# 9. Sleep Quality Category
def categorize_sleep_quality(quality):
    if quality <= 4:
        return 'Poor'
    elif quality <= 6:
        return 'Fair'
    elif quality <= 8:
        return 'Good'
    else:
        return 'Excellent'

df_processed['Sleep_Quality_Category'] = df_processed['Quality of Sleep'].apply(categorize_sleep_quality)


In [15]:
# 10. Binary Sleep Disorder Flag
df_processed['Has_Sleep_Disorder'] = (df_processed['Sleep Disorder'] != 'None').astype(int)

print("\nNew Features Created:")
new_features = ['Sleep_Efficiency', 'Health_Risk_Score', 'Age_Group',
                'Sleep_Duration_Category', 'Activity_Category', 'Stress_Category',
                'Heart_Rate_Category', 'Steps_Category', 'Sleep_Quality_Category',
                'Has_Sleep_Disorder', 'Systolic_BP', 'Diastolic_BP', 'BP_Category']
for feature in new_features:
    print(f"  - {feature}")



New Features Created:
  - Sleep_Efficiency
  - Health_Risk_Score
  - Age_Group
  - Sleep_Duration_Category
  - Activity_Category
  - Stress_Category
  - Heart_Rate_Category
  - Steps_Category
  - Sleep_Quality_Category
  - Has_Sleep_Disorder
  - Systolic_BP
  - Diastolic_BP
  - BP_Category


In [16]:

# ===========================
# 4. HANDLE CATEGORICAL VARIABLES
# ===========================

print("\n" + "=" * 60)
print("ENCODING CATEGORICAL VARIABLES")
print("=" * 60)

# Create a copy for ML
df_ml = df_processed.copy()

# Label Encoding for ordinal variables
label_encoders = {}

### [CORRECTED] ###
# Added 'Underweight' to the BMI Category list to prevent NaNs.
ordinal_features = {
    'BMI Category': ['Underweight', 'Normal', 'Normal Weight', 'Overweight', 'Obese'],
    'Sleep_Duration_Category': ['Insufficient', 'Below_Optimal', 'Optimal', 'Excessive'],
    'Activity_Category': ['Sedentary', 'Moderate', 'Active'],
    'Stress_Category': ['Low', 'Moderate', 'High'],
    'Sleep_Quality_Category': ['Poor', 'Fair', 'Good', 'Excellent'],
    'BP_Category': ['Normal', 'Elevated', 'High_Stage1', 'High_Stage2']
}

for feature, order in ordinal_features.items():
    # Create a mapping based on the order
    category_map = {val: idx for idx, val in enumerate(order)}
    df_ml[feature + '_Encoded'] = df_ml[feature].map(category_map)
    # Check for any new NaNs
    if df_ml[feature + '_Encoded'].isnull().sum() > 0:
        print(f"Warning: NaNs created in {feature}_Encoded. Check map and data.")

# One-Hot Encoding for nominal variables
nominal_features = ['Gender', 'Occupation', 'Age_Group',
                    'Heart_Rate_Category', 'Steps_Category']

# We explicitly DO NOT one-hot encode 'Sleep Disorder' here
# as it is our multi-class target variable.

df_ml = pd.get_dummies(df_ml, columns=nominal_features, prefix=nominal_features, drop_first=True)

print(f"Dataset shape after encoding: {df_ml.shape}")
print(f"Total features: {df_ml.shape[1]}")



ENCODING CATEGORICAL VARIABLES
Dataset shape after encoding: (400, 38)
Total features: 38


In [17]:
# ===========================
# 5. OUTLIER DETECTION
# ===========================

print("\n" + "=" * 60)
print("OUTLIER DETECTION (IQR Method)")
print("=" * 60)

# This list is correct, as it's the original numerical features
numerical_features = ['Age', 'Sleep Duration', 'Physical Activity Level',
                      'Stress Level', 'Heart Rate', 'Daily Steps',
                      'Systolic_BP', 'Diastolic_BP']

def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return len(outliers)

for feature in numerical_features:
    outlier_count = detect_outliers_iqr(df_processed, feature)
    print(f"{feature}: {outlier_count} outliers")



OUTLIER DETECTION (IQR Method)
Age: 3 outliers
Sleep Duration: 0 outliers
Physical Activity Level: 0 outliers
Stress Level: 0 outliers
Heart Rate: 0 outliers
Daily Steps: 0 outliers
Systolic_BP: 0 outliers
Diastolic_BP: 0 outliers


In [18]:

# ===========================
# 6. FEATURE SCALING (for ML)
# ===========================

print("\n" + "=" * 60)
print("FEATURE SCALING")
print("=" * 60)

# Select numerical features to scale
features_to_scale = ['Age', 'Sleep Duration', 'Physical Activity Level',
                     'Stress Level', 'Heart Rate', 'Daily Steps',
                     'Systolic_BP', 'Diastolic_BP', 'Sleep_Efficiency', 'Health_Risk_Score']

scaler = StandardScaler()
df_ml[features_to_scale] = scaler.fit_transform(df_ml[features_to_scale])

print("Numerical features scaled using StandardScaler")



FEATURE SCALING
Numerical features scaled using StandardScaler


In [19]:

# ===========================
# 7. PREPARE DATASETS FOR DIFFERENT ML TASKS
# ===========================

print("\n" + "=" * 60)
print("PREPARING ML DATASETS")
print("=" * 60)

# --- Define Target (y) variables ---
# We get these from the *unscaled* processed DataFrame
y_quality_regression = df_processed['Quality of Sleep']
y_quality_classification = df_processed['Sleep_Quality_Category']
y_disorder_multiclass = df_processed['Sleep Disorder']
y_disorder_binary = df_processed['Has_Sleep_Disorder']


# --- Define Feature (X) sets ---

# Task 1: Sleep Quality Prediction (Regression/Classification)
# We drop the Person_ID and all target-related columns
# (Sleep Disorder AND Sleep Quality)
drop_cols_quality = [
    'Person_ID',
    'Quality of Sleep', 'Sleep_Quality_Category', 'Sleep_Quality_Category_Encoded',
    'Sleep Disorder', 'Has_Sleep_Disorder'
]
# We also drop the original categorical strings that have been encoded
for cat in list(ordinal_features.keys()) + ['Gender', 'Occupation', 'Age_Group', 'Heart_Rate_Category', 'Steps_Category']:
    if cat in df_ml.columns:
        drop_cols_quality.append(cat)

X_quality = df_ml.drop(columns=drop_cols_quality, errors='ignore')


# Task 2: Sleep Disorder Prediction (Multi-class/Binary Classification)
### [CRITICAL FIX: PREVENTING DATA LEAKAGE] ###
# We must ALSO drop the 'Quality of Sleep' columns.
# Knowing sleep quality is 'Poor' is a direct leak for predicting 'Insomnia'.
drop_cols_disorder = [
    'Person_ID',
    'Sleep Disorder', 'Has_Sleep_Disorder',
    'Quality of Sleep', 'Sleep_Quality_Category', 'Sleep_Quality_Category_Encoded'
]
# Drop original categorical strings
for cat in list(ordinal_features.keys()) + ['Gender', 'Occupation', 'Age_Group', 'Heart_Rate_Category', 'Steps_Category']:
    if cat in df_ml.columns:
        drop_cols_disorder.append(cat)
        
X_disorder = df_ml.drop(columns=drop_cols_disorder, errors='ignore')


print(f"\nSleep Quality Prediction Dataset:")
print(f"  Features shape: {X_quality.shape}")
print(f"  Target (regression) shape: {y_quality_regression.shape}")
print(f"  Target (classification) shape: {y_quality_classification.shape}")

print(f"\nSleep Disorder Prediction Dataset:")
print(f"  Features shape: {X_disorder.shape}")
print(f"  Target (multi-class) shape: {y_disorder_multiclass.shape}")
print(f"  Target (binary) shape: {y_disorder_binary.shape}")



PREPARING ML DATASETS

Sleep Quality Prediction Dataset:
  Features shape: (400, 27)
  Target (regression) shape: (400,)
  Target (classification) shape: (400,)

Sleep Disorder Prediction Dataset:
  Features shape: (400, 27)
  Target (multi-class) shape: (400,)
  Target (binary) shape: (400,)


In [20]:

# ===========================
# 8. TRAIN-TEST SPLIT
# ===========================

# For Sleep Quality Prediction (Regression)
X_train_q_reg, X_test_q_reg, y_train_q_reg, y_test_q_reg = train_test_split(
    X_quality, y_quality_regression, test_size=0.2, random_state=42
)

# For Sleep Disorder Prediction (Multi-class)
X_train_d_multi, X_test_d_multi, y_train_d_multi, y_test_d_multi = train_test_split(
    X_disorder, y_disorder_multiclass, test_size=0.2, random_state=42,
    stratify=y_disorder_multiclass # Stratify is crucial here
)

print("\n" + "=" * 60)
print("TRAIN-TEST SPLIT COMPLETED")
print("=" * 60)
print(f"Sleep Quality (Reg) - Train: {X_train_q_reg.shape[0]}, Test: {X_test_q_reg.shape[0]}")
print(f"Sleep Disorder (Multi) - Train: {X_train_d_multi.shape[0]}, Test: {X_test_d_multi.shape[0]}")



TRAIN-TEST SPLIT COMPLETED
Sleep Quality (Reg) - Train: 320, Test: 80
Sleep Disorder (Multi) - Train: 320, Test: 80


In [21]:

# ===========================
# 9. SAVE PROCESSED DATA
# ===========================

print("\n" + "=" * 60)
print("SAVING PROCESSED DATASETS")
print("=" * 60)

# Save processed data for Power BI / Tableau
# D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\\Dataset\\
# This is the human-readable version *before* scaling and one-hot encoding
df_processed.to_csv('D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\\Dataset\\sleep_health_processed_for_viz.csv', index=False)
# df_processed.to_csv('sleep_health_processed_for_viz.csv', index=False)
print("✓ Processed data for visualization saved: sleep_health_processed_for_viz.csv")

# Save the final ML-ready dataset (with all features/targets)
# This is useful for notebooks
df_ml.to_csv('D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\\Dataset\\sleep_health_ml_ready_full.csv', index=False)
# df_ml.to_csv('sleep_health_ml_ready_full.csv', index=False)

print("✓ Full ML-ready data saved: sleep_health_ml_ready_full.csv")

# Save feature names for later use
feature_names_quality = X_quality.columns.tolist()
pd.DataFrame({'feature': feature_names_quality}).to_csv('D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\\Dataset\\feature_names_quality.csv', index=False)
# pd.DataFrame({'feature': feature_names_quality}).to_csv('feature_names_quality.csv', index=False)
print("✓ Quality model feature names saved: feature_names_quality.csv")

feature_names_disorder = X_disorder.columns.tolist()
pd.DataFrame({'feature': feature_names_disorder}).to_csv('D:\\GIT_HUB\\12_Final_Projects_of_all\\01_Analysis\\Dataset\\feature_names_disorder.csv', index=False)
# pd.DataFrame({'feature': feature_names_disorder}).to_csv('feature_names_disorder.csv', index=False)

print("✓ Disorder model feature names saved: feature_names_disorder.csv")

print("\n" + "=" * 60)
print("PREPROCESSING COMPLETE!")
print("=" * 60)
print(f"""
Summary:
- Original features: {df.shape[1]}
- Engineered features: {len(new_features)}
- Final ML features (Quality model): {X_quality.shape[1]}
- Final ML features (Disorder model): {X_disorder.shape[1]}
- Records: {df_ml.shape[0]}

Files Created:
1. sleep_health_processed_for_viz.csv (for Power BI / Tableau)
2. sleep_health_ml_ready_full.csv (for ML notebooks)
3. feature_names_quality.csv (feature reference)
4. feature_names_disorder.csv (feature reference)

Ready for:
✓ Machine Learning Model Development
✓ Power BI Dashboard Creation
✓ Advanced Statistical Analysis
""")


SAVING PROCESSED DATASETS
✓ Processed data for visualization saved: sleep_health_processed_for_viz.csv
✓ Full ML-ready data saved: sleep_health_ml_ready_full.csv
✓ Quality model feature names saved: feature_names_quality.csv
✓ Disorder model feature names saved: feature_names_disorder.csv

PREPROCESSING COMPLETE!

Summary:
- Original features: 14
- Engineered features: 13
- Final ML features (Quality model): 27
- Final ML features (Disorder model): 27
- Records: 400

Files Created:
1. sleep_health_processed_for_viz.csv (for Power BI / Tableau)
2. sleep_health_ml_ready_full.csv (for ML notebooks)
3. feature_names_quality.csv (feature reference)
4. feature_names_disorder.csv (feature reference)

Ready for:
✓ Machine Learning Model Development
✓ Power BI Dashboard Creation
✓ Advanced Statistical Analysis

