In [1]:
import os
import pickle
import warnings
from getpass import getpass

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

warnings.filterwarnings("ignore")

# connecting to the database and getting the data

In [2]:
# Connection settings are read from the environment (standard libpq variable
# names) so that credentials do not have to live in the notebook. The
# non-secret settings fall back to the values this notebook was written with.
host = os.environ.get('PGHOST', '127.0.0.1')
db = os.environ.get('PGDATABASE', 'MSDS610')
user = os.environ.get('PGUSER', 'postgres')
port = os.environ.get('PGPORT', '5432')

# The password deliberately has no in-notebook default: set PGPASSWORD for
# unattended runs, otherwise it is prompted for without being echoed or saved.
pw = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

# Table produced by the cleaning step, and the schema it lives in.
table_name = 'sleep_data_cleaned'
schema = 'cleaned'
db_conn = create_engine("postgresql://{}:{}@{}:{}/{}".format(user, pw, host, port, db))

# Load the whole cleaned table into a DataFrame.
df_cleaned = pd.read_sql_table(table_name, db_conn, schema=schema)

df_cleaned

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic BP,Diastolic BP
0,1,1,27,9,6.1,6,42,6,3,77,4200,1,126,83
1,2,1,28,1,6.2,6,60,8,0,75,10000,1,125,80
2,3,1,28,1,6.2,6,60,8,0,75,10000,1,125,80
3,4,1,28,6,5.9,4,30,8,2,85,3000,2,140,90
4,5,1,28,6,5.9,4,30,8,2,85,3000,2,140,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,0,59,5,8.1,9,75,3,3,68,7000,2,140,95
370,371,0,59,5,8.0,9,75,3,3,68,7000,2,140,95
371,372,0,59,5,8.1,9,75,3,3,68,7000,2,140,95
372,373,0,59,5,8.1,9,75,3,3,68,7000,2,140,95


# Analytical Question

Can we predict sleep quality based on lifestyle factors, health metrics, and sleep patterns?

# Features chosen

In [4]:
# Build the modelling frame from a copy so df_cleaned itself is not modified.
# Two derived columns are appended:
#   Sleep Efficiency      - Sleep Duration divided by 24 (presumably hours,
#                           i.e. the fraction of a day spent asleep)
#   Stress to Sleep Ratio - Stress Level divided by Sleep Duration; the 1e-6
#                           keeps the denominator non-zero
# Finally every column label is passed through str().
df = (
    df_cleaned
    .copy()
    .assign(**{
        'Sleep Efficiency': lambda frame: frame['Sleep Duration'] / 24,
        'Stress to Sleep Ratio': lambda frame: frame['Stress Level'] / (frame['Sleep Duration'] + 1e-6),
    })
    .rename(columns=str)
)

Sleep Efficiency and Stress to Sleep Ratio were chosen because they are directly relevant to the analytical question, are likely to be good predictors of sleep quality, and offer a good balance between capturing important information and avoiding excessive redundancy. Including Stress Level and Sleep Duration alongside the ratio allows the model to capture their individual effects. Note that Sleep Efficiency is Sleep Duration divided by 24, so it is a rescaled copy of Sleep Duration and carries the same information as that column.

# Feature Selection and Data Splitting

In [5]:

# Predictors: every column of df except the target score.
X = df.drop('Quality of Sleep', axis=1)

# Binary target: 1 when Quality of Sleep is 7 or higher, otherwise 0.
y = (df['Quality of Sleep'] >= 7).astype('int64')

In [6]:
# Stage 1: hold out 30% of the rows as the test set, keeping the class
# proportions of y in both parts.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Stage 2: split the remaining 70% in half, giving roughly 35% train and
# 35% validation, again stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.5,
    random_state=42,
    stratify=y_train_val,
)


# Feature Set Experiment 

In [9]:
# Candidate feature subsets, tried in this order by the model search.
feature_sets = [
    # 7 features
    [
        'Stress Level',
        'Sleep Efficiency',
        'Sleep Duration',
        'Stress to Sleep Ratio',
        'Age',
        'Heart Rate',
        'BMI Category',
    ],
    # 5 features
    [
        'Stress Level',
        'Sleep Efficiency',
        'Sleep Duration',
        'Stress to Sleep Ratio',
        'Age',
    ],
    # 4 features
    [
        'Stress Level',
        'Sleep Efficiency',
        'Sleep Duration',
        'Stress to Sleep Ratio',
    ],
    # 5 features, with Physical Activity Level in place of Age
    [
        'Sleep Duration',
        'Stress Level',
        'Physical Activity Level',
        'Sleep Efficiency',
        'Stress to Sleep Ratio',
    ],
    # 11 features: the widest set tried
    [
        'Age',
        'Gender',
        'Occupation',
        'Sleep Duration',
        'Physical Activity Level',
        'Stress Level',
        'BMI Category',
        'Heart Rate',
        'Daily Steps',
        'Sleep Efficiency',
        'Stress to Sleep Ratio',
    ],
]

To determine the optimal combination of features for predicting sleep quality, several feature sets were evaluated. These included sets comprising the top 4, 5, and 7 most important features identified through feature importance analysis, as well as the original feature set and a set containing all available features.

# Hyperparameter Tuning and Best Model Initialization

In [7]:
# Hyperparameter grid searched for every feature set:
# 4 * 5 * 3 * 2 = 120 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    class_weight=['balanced', 'balanced_subsample'],
)

# Trackers for the best configuration found so far; accuracies start at 0
# and the remaining slots are empty until a model has been evaluated.
best_accuracy = best_test_accuracy = 0
best_model = best_features = best_hyperparams = best_scaler = None


In [None]:
# For each candidate feature set: scale, grid-search a random forest with
# 5-fold cross-validation on the training split, then score the best
# estimator on the validation and test splits and remember the winner.
for features in feature_sets:

    # Restrict every split to the current feature set.
    X_train_subset = X_train[features]
    X_val_subset = X_val[features]
    X_test_subset = X_test[features]

    # Fit the scaler on the training rows only, then reuse it unchanged for
    # the validation and test rows.
    scaler = StandardScaler()
    X_train_subset_scaled = scaler.fit_transform(X_train_subset)
    X_val_subset_scaled = scaler.transform(X_val_subset)
    X_test_subset_scaled = scaler.transform(X_test_subset)

    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_subset_scaled, y_train)

    # Best hyperparameter combination, refit on the full training split.
    model = grid_search.best_estimator_

    y_val_pred = model.predict(X_val_subset_scaled)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    y_test_pred = model.predict(X_test_subset_scaled)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"\nFeatures: {features}")
    print("Validation Accuracy:", val_accuracy)
    print("Test Accuracy:", test_accuracy) # Print test accuracy for each feature set
    print(classification_report(y_val, y_val_pred))

    # NOTE(review): the winner is chosen by TEST accuracy, so the test score
    # reported for it is optimistically biased. Selecting on val_accuracy
    # instead would keep the test set as an unbiased final check.
    if test_accuracy > best_test_accuracy:  # Compare TEST accuracies
        best_test_accuracy = test_accuracy
        # Record the selected model's validation accuracy as well, so the
        # value reported afterwards is not left at its initial 0.
        best_accuracy = val_accuracy
        best_model = model
        best_features = features
        best_hyperparams = grid_search.best_params_
        best_scaler = scaler

Fitting 5 folds for each of 120 candidates, totalling 600 fits

Features: ['Stress Level', 'Sleep Efficiency', 'Sleep Duration', 'Stress to Sleep Ratio', 'Age', 'Heart Rate', 'BMI Category']
Validation Accuracy: 0.9770992366412213
Test Accuracy: 1.0
              precision    recall  f1-score   support

           0       0.95      0.98      0.96        41
           1       0.99      0.98      0.98        90

    accuracy                           0.98       131
   macro avg       0.97      0.98      0.97       131
weighted avg       0.98      0.98      0.98       131

Fitting 5 folds for each of 120 candidates, totalling 600 fits

Features: ['Stress Level', 'Sleep Efficiency', 'Sleep Duration', 'Stress to Sleep Ratio', 'Age']
Validation Accuracy: 0.9770992366412213
Test Accuracy: 0.9911504424778761
              precision    recall  f1-score   support

           0       0.95      0.98      0.96        41
           1       0.99      0.98      0.98        90

    accuracy            

The model training and evaluation process iterates through the feature set combinations. For each combination, the features are selected from the training and validation sets, and a StandardScaler is fitted on the training subset and used to transform both subsets. A GridSearchCV object, using a RandomForestClassifier and the predefined parameter grid, performs hyperparameter tuning with 5-fold cross-validation on the training subset. The best estimator found by the grid search is then used to predict on the scaled validation set, and the validation accuracy is calculated. For each feature set, the corresponding features are also selected from the test set, scaled using the same StandardScaler, and passed to that best estimator to generate predictions, from which the test accuracy is calculated. Throughout this process, the model exhibiting the highest test accuracy, along with its corresponding features, hyperparameters, and scaler, is tracked and stored. Finally, for each iteration, the current feature set, validation accuracy, test accuracy, and a classification report for the validation set are printed to provide an overview of model performance across the different feature sets. The final model selection is based on the highest achieved test accuracy. Because the test set is used to choose the model, the reported test accuracy is an optimistic estimate rather than an unbiased measure of performance on unseen data. The provided output shows the validation and test accuracies obtained for different feature sets. For example, the combination of 'Stress Level', 'Sleep Efficiency', 'Sleep Duration', 'Stress to Sleep Ratio', 'Age', 'Heart Rate', and 'BMI Category' yielded a validation accuracy of 0.977 and a test accuracy of 1.0. Other combinations, such as the top 5 or top 4 features, also produced high validation and test accuracies. The model with the highest test accuracy (in this case, 1.0 for the top 7 features) is selected as the final model.

# best model

In [12]:
# Report the configuration kept by the search loop. The loop keeps the
# candidate with the highest test accuracy, so the heading states that
# criterion rather than validation accuracy.
print("\nBest Model (based on Test Accuracy):")
print(f"Features: {best_features}")
print(f"Hyperparameters: {best_hyperparams}")
print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Final Test Accuracy: {best_test_accuracy}") # Report the final test accuracy


Best Model (based on Validation Accuracy):
Features: ['Stress Level', 'Sleep Efficiency', 'Sleep Duration', 'Stress to Sleep Ratio', 'Age', 'Heart Rate', 'BMI Category']
Hyperparameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Validation Accuracy: 0.9770992366412213
Final Test Accuracy: 1.0



The best performing model, selected for achieving the highest test accuracy of 1.0, used the top 7 features: 'Stress Level', 'Sleep Efficiency', 'Sleep Duration', 'Stress to Sleep Ratio', 'Age', 'Heart Rate', and 'BMI Category'. Its tuned hyperparameters were {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}.

# 7. Save the Best Model and Datasets

In [14]:

# Persist the selected model together with the feature list it was trained
# on and the scaler that was fitted for it.
joblib.dump(best_model, 'sleep_quality_classifier.pkl')
joblib.dump(best_features, 'best_features.pkl')
joblib.dump(best_scaler, 'scaler.pkl')

# exist_ok=True creates the directory when it is missing and does nothing
# when it already exists, without a separate check-then-create step.
os.makedirs('datasets', exist_ok=True)

# Write the validation split (all columns of X_val, not only the selected
# features) without the index column.
X_val.to_csv('datasets/X_val.csv', index=False)
y_val.to_csv('datasets/y_val.csv', index=False)

# summary


This notebook explores the prediction of sleep quality based on lifestyle factors, health metrics, and sleep patterns using a RandomForestClassifier. Feature engineering was performed, creating 'Sleep Efficiency' and 'Stress to Sleep Ratio'. Several feature sets, including the top 4, 5, and 7 most important features, the original feature set, and a set containing all features, were evaluated. Hyperparameter tuning was carried out using GridSearchCV with 5-fold cross-validation, exploring different combinations of n_estimators, max_depth, min_samples_split, and class_weight. The model training and evaluation process iterated through these feature sets and hyperparameter combinations. For each feature set, the features were selected and scaled, and the best model from the grid search was evaluated on both the validation and test sets. Crucially, the model with the highest test accuracy was selected as the best model; because the test set was used for this selection, the reported test accuracy should be read as an optimistic estimate. The best performing model, achieving a test accuracy of 1.0, used the top 7 features: 'Stress Level', 'Sleep Efficiency', 'Sleep Duration', 'Stress to Sleep Ratio', 'Age', 'Heart Rate', and 'BMI Category', with hyperparameters {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}. This trained model was then saved using joblib, along with the validation data, for use in a subsequent notebook.