# Preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from imblearn.over_sampling import RandomOverSampler
import numpy as np
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the sleep-efficiency CSV (path is relative to the working directory)
# and preview the first rows.
sleep_df = pd.read_csv("Sleep_Efficiency_Updated.csv")
sleep_df.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,06/03/2021 01:00,06/03/2021 07:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,05/12/2021 02:00,05/12/2021 09:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,25/05/2021 21:30,25/05/2021 05:30,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,03/11/2021 02:30,03/11/2021 08:30,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,13/03/2021 01:00,13/03/2021 09:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


In [3]:
# Remove the identifier and the two timestamp columns; they are not used as features.
sleep_df = sleep_df.drop(['ID', 'Bedtime', 'Wakeup time'], axis=1)

In [4]:
# Count the distinct values in each remaining column
sleep_df.nunique()

Age                       61
Gender                     2
Sleep duration             9
Sleep efficiency          50
REM sleep percentage      13
Deep sleep percentage     29
Light sleep percentage    29
Awakenings                 5
Caffeine consumption       6
Alcohol consumption        6
Smoking status             2
Exercise frequency         6
dtype: int64

In [5]:
# Inspect dtypes and non-null counts; columns with fewer non-null entries than
# rows contain missing values.
sleep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     618 non-null    int64  
 1   Gender                  618 non-null    object 
 2   Sleep duration          618 non-null    float64
 3   Sleep efficiency        618 non-null    float64
 4   REM sleep percentage    618 non-null    int64  
 5   Deep sleep percentage   618 non-null    int64  
 6   Light sleep percentage  618 non-null    int64  
 7   Awakenings              591 non-null    float64
 8   Caffeine consumption    582 non-null    float64
 9   Alcohol consumption     597 non-null    float64
 10  Smoking status          618 non-null    object 
 11  Exercise frequency      611 non-null    float64
dtypes: float64(6), int64(4), object(2)
memory usage: 58.1+ KB


In [6]:
# One-hot encode the object-typed columns (Gender, Smoking status);
# numeric columns are passed through unchanged.
sleep_df = pd.get_dummies(sleep_df)

In [7]:
# Display the encoded frame to confirm the new indicator columns
sleep_df

Unnamed: 0,Age,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Exercise frequency,Gender_Female,Gender_Male,Smoking status_No,Smoking status_Yes
0,65,6.0,0.88,18,70,12,0.0,0.0,0.0,3.0,1,0,0,1
1,69,7.0,0.66,19,28,53,3.0,0.0,3.0,3.0,0,1,0,1
2,40,8.0,0.89,20,70,10,1.0,0.0,0.0,3.0,1,0,1,0
3,40,6.0,0.51,23,25,52,3.0,50.0,5.0,1.0,1,0,0,1
4,57,8.0,0.76,27,55,18,3.0,0.0,3.0,3.0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
613,61,7.0,0.67,23,23,54,2.0,50.0,5.0,0.0,1,0,1,0
614,30,8.0,0.94,22,63,15,1.0,75.0,0.0,2.0,1,0,0,1
615,48,6.0,0.79,24,60,16,4.0,0.0,0.0,2.0,0,1,0,1
616,32,5.0,0.86,20,65,15,1.0,25.0,1.0,0.0,1,0,1,0


In [8]:
# Binarise the target: 1 when sleep efficiency is above 0.85, otherwise 0.
# The comparison is vectorised; the result is stored as int64.
sleep_df['Sleep efficiency'] = sleep_df['Sleep efficiency'].gt(0.85).astype('int64')

In [9]:
# Display the frame again to confirm "Sleep efficiency" is now a 0/1 label
sleep_df

Unnamed: 0,Age,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Exercise frequency,Gender_Female,Gender_Male,Smoking status_No,Smoking status_Yes
0,65,6.0,1,18,70,12,0.0,0.0,0.0,3.0,1,0,0,1
1,69,7.0,0,19,28,53,3.0,0.0,3.0,3.0,0,1,0,1
2,40,8.0,1,20,70,10,1.0,0.0,0.0,3.0,1,0,1,0
3,40,6.0,0,23,25,52,3.0,50.0,5.0,1.0,1,0,0,1
4,57,8.0,0,27,55,18,3.0,0.0,3.0,3.0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
613,61,7.0,0,23,23,54,2.0,50.0,5.0,0.0,1,0,1,0
614,30,8.0,1,22,63,15,1.0,75.0,0.0,2.0,1,0,0,1
615,48,6.0,0,24,60,16,4.0,0.0,0.0,2.0,0,1,0,1
616,32,5.0,1,20,65,15,1.0,25.0,1.0,0.0,1,0,1,0


**Create the labels set (y) from the “Sleep efficiency” column, and then create the features (X) DataFrame from the remaining columns.**

In [10]:
# Split our preprocessed data into our features and target arrays.
# NOTE: no rows have been dropped or imputed at this point, so X still contains
# the NaN values reported by sleep_df.info() (Awakenings, Caffeine consumption,
# Alcohol consumption, Exercise frequency).
y = sleep_df['Sleep efficiency'].values

X = sleep_df.drop(columns=['Sleep efficiency']).values

**Split the data into training and testing datasets by using train_test_split**

In [11]:
# Partition features and labels into train and test subsets.
# The split is stratified on y and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [12]:
# Create a StandardScaler instance and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Optimizing Model with Original Dataset (Accuracy: 59%, Loss: NaN because missing values are still present)

In [13]:
# Define the model - a deep neural net: the number of input features and the
# number of hidden nodes for each layer.

# Seed Python, NumPy and TensorFlow so that weight initialisation (and the
# shuffling later performed by `fit`) is repeatable on a fresh kernel.
tf.keras.utils.set_random_seed(1)

# One input per feature column of the training matrix
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary label
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 112       
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 163 (652.00 Byte)
Trainable params: 163 (652.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Compile the model: binary cross-entropy loss, the Adam optimizer,
# and accuracy as the reported metric
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [15]:
# Train the model on the scaled training features for 50 epochs;
# the value returned by `fit` is kept in `fit_model`
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [16]:
# Evaluate the model using the test data.
# NOTE(review): the recorded output of this cell shows `loss: nan`. The features
# used in this section still contain missing values, which is the likely
# cause - treat the accuracy reported here with caution.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5/5 - 0s - loss: nan - accuracy: 0.5871 - 300ms/epoch - 60ms/step
Loss: nan, Accuracy: 0.5870967507362366


# Optimizing Model by Dropping All NaN Values (Accuracy: 92%)

In [17]:
# Drop every row that contains at least one NaN value
sleep_df = sleep_df.dropna()

In [18]:
# Confirm that every column now has the same non-null count
sleep_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528 entries, 0 to 617
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     528 non-null    int64  
 1   Sleep duration          528 non-null    float64
 2   Sleep efficiency        528 non-null    int64  
 3   REM sleep percentage    528 non-null    int64  
 4   Deep sleep percentage   528 non-null    int64  
 5   Light sleep percentage  528 non-null    int64  
 6   Awakenings              528 non-null    float64
 7   Caffeine consumption    528 non-null    float64
 8   Alcohol consumption     528 non-null    float64
 9   Exercise frequency      528 non-null    float64
 10  Gender_Female           528 non-null    uint8  
 11  Gender_Male             528 non-null    uint8  
 12  Smoking status_No       528 non-null    uint8  
 13  Smoking status_Yes      528 non-null    uint8  
dtypes: float64(5), int64(5), uint8(4)
memory u

In [19]:
# Target vector: the binarised "Sleep efficiency" column
y = sleep_df['Sleep efficiency'].to_numpy()

# Feature matrix: every remaining column
X = sleep_df.drop('Sleep efficiency', axis=1).to_numpy()

In [20]:
# Partition the NaN-free features and labels into train and test subsets.
# The split is stratified on y and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [21]:
# Create a StandardScaler instance and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [22]:
# Define the model - a deep neural net: the number of input features and the
# number of hidden nodes for each layer.

# Seed Python, NumPy and TensorFlow so that weight initialisation (and the
# shuffling later performed by `fit`) is repeatable on a fresh kernel.
tf.keras.utils.set_random_seed(1)

# One input per feature column of the training matrix
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)


# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer: a single sigmoid unit for the binary label
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 112       
                                                                 
 dense_4 (Dense)             (None, 5)                 45        
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
Total params: 163 (652.00 Byte)
Trainable params: 163 (652.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# Compile the model: binary cross-entropy loss, the Adam optimizer,
# and accuracy as the reported metric
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [24]:
# Train the model on the scaled, NaN-free training features for 50 epochs
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [25]:
# Evaluate the model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5/5 - 0s - loss: 0.2292 - accuracy: 0.9242 - 248ms/epoch - 50ms/step
Loss: 0.22920311987400055, Accuracy: 0.9242424368858337


# Optimizing Model with More Neurons (Accuracy: 94%)

In [26]:
# Create a new neural network model with more neurons: three hidden layers of
# 80 units each (ReLU first, then two sigmoid layers) and a sigmoid output unit.

# Seed Python, NumPy and TensorFlow so that weight initialisation (and the
# shuffling later performed by `fit`) is repeatable on a fresh kernel.
tf.keras.utils.set_random_seed(1)

nn = tf.keras.models.Sequential()

# Take the input width from the training matrix this model is fitted on rather
# than relying on a variable left over from an earlier cell.
nn.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=X_train_scaled.shape[1]))

nn.add(tf.keras.layers.Dense(units=80, activation="sigmoid"))

nn.add(tf.keras.layers.Dense(units=80, activation="sigmoid"))

# Output layer: a single sigmoid unit for the binary label
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 80)                1120      
                                                                 
 dense_7 (Dense)             (None, 80)                6480      
                                                                 
 dense_8 (Dense)             (None, 80)                6480      
                                                                 
 dense_9 (Dense)             (None, 1)                 81        
                                                                 
Total params: 14161 (55.32 KB)
Trainable params: 14161 (55.32 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# Compile the wider model with the same loss, optimizer and metric as before
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [28]:
# Train the wider model on the scaled training features for 50 epochs
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
# Evaluate the wider model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5/5 - 0s - loss: 0.2738 - accuracy: 0.9394 - 296ms/epoch - 59ms/step
Loss: 0.2737595736980438, Accuracy: 0.939393937587738


# Logistic Regression

**Create the labels set (y) from the “Sleep efficiency” column, and then create the features (X) DataFrame from the remaining columns.**

In [30]:
# Separate the data into labels and features
# Separate the y variable, the labels (kept as a pandas Series)
y = sleep_df['Sleep efficiency']
# Separate the X variable, the features (converted to a NumPy array;
# note that this section uses the lowercase name `x`)
x = sleep_df.drop(columns=['Sleep efficiency']).values

In [31]:
# Peek at the first five labels of the y Series
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Sleep efficiency, dtype: int64

In [32]:
# Review the first five rows of the feature array `x` (a NumPy array, not a DataFrame)
x[:5]

array([[65.,  6., 18., 70., 12.,  0.,  0.,  0.,  3.,  1.,  0.,  0.,  1.],
       [69.,  7., 19., 28., 53.,  3.,  0.,  3.,  3.,  0.,  1.,  0.,  1.],
       [40.,  8., 20., 70., 10.,  1.,  0.,  0.,  3.,  1.,  0.,  1.,  0.],
       [40.,  6., 23., 25., 52.,  3., 50.,  5.,  1.,  1.,  0.,  0.,  1.],
       [57.,  8., 27., 55., 18.,  3.,  0.,  3.,  3.,  0.,  1.,  1.,  0.]])

**Check the balance of the labels variable (y) by using the value_counts function.**

In [33]:
# Check the balance of our target values (number of rows per class)
y.value_counts()

0    310
1    218
Name: Sleep efficiency, dtype: int64

**Split the data into training and testing datasets by using train_test_split.**

In [34]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
# NOTE(review): unlike the earlier splits in this notebook, this call does not
# pass stratify=y, so class proportions may differ between the train and test
# subsets - confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)

**Create a Logistic Regression Model with the Original Data**

**Fit a logistic regression model by using the training data (X_train and y_train).**

In [35]:
# Bring in the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the classifier: lbfgs solver, up to 200 iterations, random_state of 1
logistic_regression_model = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver='lbfgs',
)

# Train on the (unscaled) training split
logistic_regression_model.fit(X_train, y_train)

**Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.**

In [36]:
# Predict class labels for the held-out test features
predictions = logistic_regression_model.predict(X_test)

In [37]:
# Predict class labels for the held-out test features
y_predictions = logistic_regression_model.predict(X_test)

# Put the true labels and the predictions side by side, indexed like y_test
df_results = y_test.to_frame(name="y_test").assign(prediction=y_predictions)

# Inspect ten randomly chosen rows
df_results.sample(10)

Unnamed: 0,y_test,prediction
38,1,1
415,0,0
117,0,0
213,1,1
206,1,1
433,1,1
79,0,0
550,1,1
254,0,0
601,1,1


**Evaluate the model’s performance**

In [38]:
# Compute the balanced accuracy score of the model on the test split
balanced_accuracy_score(y_test, y_predictions)

0.8877995642701525

In [39]:
# Generate a confusion matrix for the model (true labels first, predictions second)
confusion_matrix(y_test, y_predictions)

array([[66, 15],
       [ 2, 49]], dtype=int64)

In [40]:
# Print the classification report for the model, labelling class 0 as
# "Bad_sleep" and class 1 as "Good_sleep"
print("Classification Report:")
print(classification_report(y_test, y_predictions, target_names = ["Bad_sleep", "Good_sleep"]))

Classification Report:
              precision    recall  f1-score   support

   Bad_sleep       0.97      0.81      0.89        81
  Good_sleep       0.77      0.96      0.85        51

    accuracy                           0.87       132
   macro avg       0.87      0.89      0.87       132
weighted avg       0.89      0.87      0.87       132



**Question:** How well does the logistic regression model predict both the `0` (bad_sleep) and `1` (good_sleep) labels?

**Answer:** The logistic regression model predicts both labels well. The number of bad-sleep samples (label `0`, 310 rows) is greater than the number of good-sleep samples (label `1`, 218 rows). The model has a good overall accuracy of 87%. The precision is 97% for label `0` (bad sleep) and 77% for label `1` (good sleep). The recall is also quite high: 81% for label `0` and 96% for label `1` (good sleep).

# RandomOverSampled Data

In [41]:
# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
r_model = RandomOverSampler(random_state=1)


# Resample the original training data with the random oversampler
# (only the training split is passed in; X_test / y_test are not resampled)
X_resampled, y_resampled = r_model.fit_resample(X_train, y_train)

In [42]:
# Tally each class in the resampled labels and report how many classes there are
labels = y_resampled.value_counts()
num_distinct_val = labels.shape[0]

print("Number of distinct values of the resampled labels : ",num_distinct_val )
print(labels)

Number of distinct values of the resampled labels :  2
0    229
1    229
Name: Sleep efficiency, dtype: int64


In [43]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_model = LogisticRegression(solver = 'lbfgs', max_iter = 200, random_state=1)

# Fit the model using the resampled training data.
# A single fit is enough: with the default warm_start=False every call to
# `fit` starts from scratch, so fitting on the original training data first
# would have no effect on the final model.
lr_model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
predictions = lr_model.predict(X_test)

In [44]:
# Compute the balanced accuracy score of the resampled-data model on the test split
balanced_accuracy_score(y_test, predictions)

0.8877995642701525

In [45]:
# Generate a confusion matrix for the resampled-data model
confusion_matrix(y_test, predictions)

array([[66, 15],
       [ 2, 49]], dtype=int64)

In [46]:
# Show the per-class precision / recall / F1 for the resampled-data model
print("Classification Report:")
classification_rep = classification_report(y_test, predictions)
print(classification_rep)

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.81      0.89        81
           1       0.77      0.96      0.85        51

    accuracy                           0.87       132
   macro avg       0.87      0.89      0.87       132
weighted avg       0.89      0.87      0.87       132



### Logistic regression trained on the RandomOverSampler-resampled data gives the same accuracy as logistic regression trained on the original data