

# Importing Libraries




In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks, RandomUnderSampler

import warnings
warnings.filterwarnings('ignore')

# Dataset Load & Preprocessing

In [32]:
df = pd.read_csv("/content/Sleep_health_and_lifestyle_dataset.csv")
df.fillna("None", inplace=True)
df[['Systolic BP', 'Diastolic BP']] = df['Blood Pressure'].str.split('/', expand=True)
df.drop(['Person ID', 'Blood Pressure'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['Occupation', 'BMI Category'], drop_first=False)

label_encoder = LabelEncoder()
columns_to_encode = ['Gender', 'Sleep Disorder']
for col in columns_to_encode:
    df[col] = label_encoder.fit_transform(df[col])

X = df.drop('Sleep Disorder', axis=1)
y = df['Sleep Disorder']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# ML Model Result Storage

In [35]:
ML_Model = []
ML_Config = []
accuracy = []
f1_score = []
recall = []
precision = []
auc_roc = []  # Adding a holder for AUC-ROC

# Function to call for storing the results
def storeResults(model, config, a, b, c, d, e):
    """
    Store model performance results

    Parameters:
    model: Name of the ML model
    config: Configuration name (preprocessing steps applied)
    a: Accuracy score
    b: F1 score
    c: Recall score
    d: Precision score
    e: AUC-ROC score
    """
    ML_Model.append(model)
    ML_Config.append(config)
    accuracy.append(round(a, 6))
    f1_score.append(round(b, 6))
    recall.append(round(c, 6))
    precision.append(round(d, 6))
    auc_roc.append(round(e, 6))

# Random Forest with K-Fold, Oversampling, Undersampling, Randomsampling

In [52]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# applying k-fold cv with Random Forest
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# RandomForest classifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
cv_score = cross_val_score(rfc, X_normalized, y, cv=kfold, scoring='accuracy').mean()
cv_score = float(cv_score)
print("\nApplying K-Fold cross validation Random Forest's score is: ", cv_score)

# applying oversampling Smote & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# applying undersampling CNN & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# applying randomsampling Randomoversampling & Randomundersampling
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
  rfc.fit(X_train_cfg, y_train_cfg)

  y_train_rf = rfc.predict(X_train_cfg)
  y_test_rf = rfc.predict(X_test_cfg)
  y_train_rf_proba = rfc.predict_proba(X_train_cfg)
  y_test_rf_proba = rfc.predict_proba(X_test_cfg)

  metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_rf),
            metrics.accuracy_score(y_test, y_test_rf),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.f1_score(y_test, y_test_rf, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.recall_score(y_test, y_test_rf, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_rf, average='macro'),
            metrics.precision_score(y_test, y_test_rf, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_rf_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro'),
        ]
    }

  df_metrics = pd.DataFrame(metrics_dict)
  print("\nRandom Forest Model Performance Metrics")
  print("Configuration Name: ", name)
  print(df_metrics.to_string(index=False))



Applying K-Fold cross validation Random Forest's score is:  0.9066856330014224

Random Forest Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923843 0.921041   0.927580 0.990345
    Test  0.893617  0.843243 0.844207   0.851058 0.928146

Random Forest Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923843 0.921041   0.927580 0.990282
    Test  0.893617  0.843243 0.844207   0.851058 0.927725

Random Forest Model Performance Metrics
Configuration Name:  SMOTE
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.961382  0.961429 0.961382   0.961621 0.995024
    Test  0.914894  0.876705 0.874510   0.880688 0.919585

Random Forest Model Performance Metrics
Configuration Name:  ADASYN
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.960159  0.960070 0.960115   0.960127 

# Decision Tree with K-Fold, Oversampling, Undersampling, Randomsampling

In [51]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# applying k-fold cv with Decision Tree
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# DecisionTree classifier
dtc = DecisionTreeClassifier(random_state=42)
cv_score = cross_val_score(dtc, X_normalized, y, cv=kfold, scoring='accuracy').mean()
cv_score = float(cv_score)
print("\nApplying K-Fold cross validation Decision Tree's score is: ", cv_score)

# applying oversampling Smote & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# applying undersampling CNN & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# applying randomsampling Randomoversampling & Randomundersampling
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
  dtc.fit(X_train_cfg, y_train_cfg)

  y_train_dt = dtc.predict(X_train_cfg)
  y_test_dt = dtc.predict(X_test_cfg)
  y_train_dt_proba = dtc.predict_proba(X_train_cfg)
  y_test_dt_proba = dtc.predict_proba(X_test_cfg)

  metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_dt),
            metrics.accuracy_score(y_test, y_test_dt),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_dt, average='macro'),
            metrics.f1_score(y_test, y_test_dt, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_dt, average='macro'),
            metrics.recall_score(y_test, y_test_dt, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_dt, average='macro'),
            metrics.precision_score(y_test, y_test_dt, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_dt_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_dt_proba, multi_class='ovr', average='macro'),
        ]
    }

  df_metrics = pd.DataFrame(metrics_dict)
  print("\nDeicion Tree Model Performance Metrics")
  print("Configuration Name: ", name)
  print(df_metrics.to_string(index=False))



Applying K-Fold cross validation Decision Tree's score is:  0.8907539118065433

Deicion Tree Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.887355

Deicion Tree Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.887355

Deicion Tree Model Performance Metrics
Configuration Name:  SMOTE
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.961382  0.961431 0.961382   0.961527 0.996486
    Test  0.914894  0.876705 0.874510   0.880688 0.908142

Deicion Tree Model Performance Metrics
Configuration Name:  ADASYN
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.960159  0.960070 0.960115   0.960127 0.99

# Gradient Boosting with K-Fold, Oversampling, Undersampling, Randomsampling

In [54]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# applying k-fold cv with Gradient Boosting
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# GradientBoosting classifier
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=42)
cv_score = cross_val_score(gbc, X_normalized, y, cv=kfold, scoring='accuracy').mean()
cv_score = float(cv_score)
print("\nApplying K-Fold cross validation Gradient Boosting's score is: ", cv_score)

# applying oversampling Smote & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# applying undersampling CNN & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# applying randomsampling Randomoversampling & Randomundersampling
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
  gbc.fit(X_train_cfg, y_train_cfg)

  y_train_gb = gbc.predict(X_train_cfg)
  y_test_gb = gbc.predict(X_test_cfg)
  y_train_gb_proba = gbc.predict_proba(X_train_cfg)
  y_test_gb_proba = gbc.predict_proba(X_test_cfg)

  metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_gb),
            metrics.accuracy_score(y_test, y_test_gb),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_gb, average='macro'),
            metrics.f1_score(y_test, y_test_gb, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_gb, average='macro'),
            metrics.recall_score(y_test, y_test_gb, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_gb, average='macro'),
            metrics.precision_score(y_test, y_test_gb, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_gb_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro'),
        ]
    }

  df_metrics = pd.DataFrame(metrics_dict)
  print("\nGradien Boosting Model Performance Metrics")
  print("Configuration Name: ", name)
  print(df_metrics.to_string(index=False))



Applying K-Fold cross validation Gradient Boosting's score is:  0.9092460881934565

Gradien Boosting Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.890514

Gradien Boosting Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.890514

Gradien Boosting Model Performance Metrics
Configuration Name:  SMOTE
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.961382  0.961429 0.961382   0.961621 0.996486
    Test  0.914894  0.876705 0.874510   0.880688 0.898305

Gradien Boosting Model Performance Metrics
Configuration Name:  ADASYN
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.960159  0.960070 0.96

# Extra Trees with K-Fold, Oversampling, Undersampling, Randomsampling

In [56]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# applying k-fold cv with Extra Trees Classifier
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# ExtraTrees classifier
etc = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
cv_score = cross_val_score(etc, X_normalized, y, cv=kfold, scoring='accuracy').mean()
cv_score = float(cv_score)
print("\nApplying K-Fold cross validation Extra Trees Classifier's score is: ", cv_score)

# applying oversampling Smote & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# applying undersampling CNN & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# applying randomsampling Randomoversampling & Randomundersampling
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
  etc.fit(X_train_cfg, y_train_cfg)

  y_train_et = etc.predict(X_train_cfg)
  y_test_et = etc.predict(X_test_cfg)
  y_train_et_proba = etc.predict_proba(X_train_cfg)
  y_test_et_proba = etc.predict_proba(X_test_cfg)

  metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_et),
            metrics.accuracy_score(y_test, y_test_et),
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_et, average='macro'),
            metrics.f1_score(y_test, y_test_et, average='macro'),
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_et, average='macro'),
            metrics.recall_score(y_test, y_test_et, average='macro'),
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_et, average='macro'),
            metrics.precision_score(y_test, y_test_et, average='macro'),
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_et_proba, multi_class='ovr', average='macro'),
            metrics.roc_auc_score(pd.get_dummies(y_test), y_test_et_proba, multi_class='ovr', average='macro'),
        ]
    }

  df_metrics = pd.DataFrame(metrics_dict)
  print("\nExtraTrees Model Performance Metrics")
  print("Configuration Name: ", name)
  print(df_metrics.to_string(index=False))



Applying K-Fold cross validation Extra Trees Classifier's score is:  0.9092460881934565

ExtraTrees Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.890514

ExtraTrees Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.890514

ExtraTrees Model Performance Metrics
Configuration Name:  SMOTE
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.961382  0.961429 0.961382   0.961621 0.996486
    Test  0.914894  0.876705 0.874510   0.880688 0.898305

ExtraTrees Model Performance Metrics
Configuration Name:  ADASYN
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.960159  0.960070 0.960115   0.960127 0.9