In [40]:
pip install imblearn



In [41]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `Sleep_Efficiency_Updated.csv` data from the `Resources` folder into a Pandas DataFrame.

In [83]:
# Load the sleep-efficiency dataset into a Pandas DataFrame.
# NOTE(review): the step heading mentions a `Resources` folder, but the file is
# read from the notebook's working directory — confirm the intended location.
sleep_df = pd.read_csv('Sleep_Efficiency_Updated.csv')

# Preview the first and last rows to sanity-check the load
display(sleep_df.head(), sleep_df.tail())

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,06/03/2021 01:00,06/03/2021 07:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,05/12/2021 02:00,05/12/2021 09:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,25/05/2021 21:30,25/05/2021 05:30,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,03/11/2021 02:30,03/11/2021 08:30,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,13/03/2021 01:00,13/03/2021 09:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
613,614,61,Female,05/12/2021 02:00,05/12/2021 09:00,7.0,0.67,23,23,54,2.0,50.0,5.0,No,0.0
614,615,30,Female,05/08/2021 00:00,05/08/2021 08:00,8.0,0.94,22,63,15,1.0,75.0,0.0,Yes,2.0
615,616,48,Male,09/01/2021 02:30,09/01/2021 08:30,6.0,0.79,24,60,16,4.0,0.0,0.0,Yes,2.0
616,617,32,Female,31/12/2021 01:30,31/12/2021 06:30,5.0,0.86,20,65,15,1.0,25.0,1.0,No,0.0
617,618,52,Male,19/08/2021 22:30,19/08/2021 05:30,7.0,0.83,23,57,20,1.0,0.0,0.0,Yes,0.0


In [84]:
# Remove the row identifier and the raw bedtime / wake-up timestamp columns;
# none of them is selected as a model feature later in the notebook.
sleep_df = sleep_df.drop(labels=['ID', 'Bedtime', 'Wakeup time'], axis='columns')

In [87]:
# Discard every row that contains at least one missing value
sleep_df = sleep_df.dropna(axis=0, how='any')

In [88]:
# Count the distinct values in each column
sleep_df.nunique()

Age                       60
Gender                     2
Sleep duration             9
Sleep efficiency          50
REM sleep percentage      13
Deep sleep percentage     28
Light sleep percentage    28
Awakenings                 5
Caffeine consumption       6
Alcohol consumption        6
Smoking status             2
Exercise frequency         6
dtype: int64

In [89]:
# Summarize column dtypes and non-null counts after the missing rows were dropped
sleep_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528 entries, 0 to 617
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     528 non-null    int64  
 1   Gender                  528 non-null    object 
 2   Sleep duration          528 non-null    float64
 3   Sleep efficiency        528 non-null    float64
 4   REM sleep percentage    528 non-null    int64  
 5   Deep sleep percentage   528 non-null    int64  
 6   Light sleep percentage  528 non-null    int64  
 7   Awakenings              528 non-null    float64
 8   Caffeine consumption    528 non-null    float64
 9   Alcohol consumption     528 non-null    float64
 10  Smoking status          528 non-null    object 
 11  Exercise frequency      528 non-null    float64
dtypes: float64(6), int64(4), object(2)
memory usage: 53.6+ KB


In [90]:
# Create dummies: one-hot encode the object-typed columns (Gender, Smoking status)
# NOTE(review): the feature set built later in this notebook does not select any
# of the resulting dummy columns, so this encoding currently does not reach the
# model — confirm whether those columns were meant to be features.
sleep_df = pd.get_dummies(sleep_df)

In [91]:
# Binarize the target: 1 ("good sleep") when the efficiency is strictly above the
# threshold, otherwise 0 ("bad sleep"). A vectorized comparison replaces the
# per-element lambda; the result is cast to int64 to keep the label dtype stable.
GOOD_SLEEP_THRESHOLD = 0.85
sleep_df['Sleep efficiency'] = (sleep_df['Sleep efficiency'] > GOOD_SLEEP_THRESHOLD).astype('int64')

### Step 2: Create the labels set (`y`)  from the “Sleep efficiency” column, and then create the features (`X`) DataFrame from the remaining columns.

In [92]:
# Build the labels (y) and the features (x) from the prepared DataFrame

# Labels: the binarized "Sleep efficiency" column
y = sleep_df.loc[:, 'Sleep efficiency']

# Features: sleep duration, the three sleep-stage percentages and awakenings.
# NOTE(review): the step heading says the features come from "the remaining
# columns", but only these five are selected — confirm this subset is intended.
x = sleep_df.loc[:, ['Sleep duration', 'REM sleep percentage', 'Deep sleep percentage', 'Light sleep percentage', 'Awakenings']]

In [93]:
# Preview the first five labels
y.head(5)

0    1
1    0
2    1
3    0
4    0
Name: Sleep efficiency, dtype: int64

In [94]:
# Preview the first five rows of the features DataFrame
x.head(5)

Unnamed: 0,Sleep duration,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings
0,6.0,18,70,12,0.0
1,7.0,19,28,53,3.0
2,8.0,20,70,10,1.0
3,6.0,23,25,52,3.0
4,8.0,27,55,18,3.0


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [95]:
# Check the balance of our target values (number of rows per class label)
y.value_counts()

0    310
1    218
Name: Sleep efficiency, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [96]:
# Split the data using train_test_split (imported in the imports cell at the top
# of the notebook). No test_size is passed, so scikit-learn's default split is used.
# Assign a random_state of 1 to the function so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [97]:
# Instantiate the Logistic Regression model (LogisticRegression is imported in
# the imports cell at the top of the notebook)
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using the original (not resampled) training data
logistic_regression_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [98]:
# Make a prediction using the testing data
# NOTE(review): the next cell recomputes the same predictions as `y_predictions`,
# and the name `predictions` is reassigned later for the resampled model, so this
# value is not read by any cell in between.
predictions = logistic_regression_model.predict(X_test)

In [99]:
# Predict labels for the held-out test features with the baseline model
y_predictions = logistic_regression_model.predict(X_test)

# Put the actual and predicted labels side by side (indexed like y_test)
df_results = pd.DataFrame({
    "y_test": y_test,
    "prediction": y_predictions
})

# Show 10 random rows; the sample is seeded so a re-run displays the same rows
df_results.sample(10, random_state=1)

Unnamed: 0,y_test,prediction
74,0,0
483,0,0
47,0,1
523,1,1
226,1,1
535,0,0
213,1,1
197,0,0
238,0,0
134,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [100]:
# Balanced accuracy of the baseline model on the test set
balanced_accuracy_score(y_true=y_test, y_pred=y_predictions)

0.8816267247639797

In [101]:
# Confusion matrix of the baseline model (test labels vs. predicted labels)
confusion_matrix(y_true=y_test, y_pred=y_predictions)

array([[65, 16],
       [ 2, 49]])

In [110]:
# Build the per-class precision / recall / f1 report for the baseline model,
# labelling class 0 as "Bad_sleep" and class 1 as "Good_sleep", then print it
baseline_report = classification_report(
    y_true=y_test,
    y_pred=y_predictions,
    target_names=["Bad_sleep", "Good_sleep"],
)
print("Classification Report:")
print(baseline_report)

Classification Report:
              precision    recall  f1-score   support

   Bad_sleep       0.97      0.80      0.88        81
  Good_sleep       0.75      0.96      0.84        51

    accuracy                           0.86       132
   macro avg       0.86      0.88      0.86       132
weighted avg       0.89      0.86      0.87       132



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (bad_sleep) and `1` (good_sleep) labels?

**Answer:** The logistic regression model predicts both labels reasonably well, with an overall accuracy of 86% and a balanced accuracy of about 88%. The test set contains more bad-sleep (`0`) samples than good-sleep (`1`) samples (81 vs. 51). For the `0` (bad_sleep) label, precision is 97% and recall is 80%; for the `1` (good_sleep) label, precision is 75% and recall is 96%. In other words, the model rarely misses a good sleeper (2 false negatives), but it labels 16 bad sleepers as good, which is what lowers the precision for label `1`.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.

In [111]:
# RandomOverSampler (from imbalanced-learn) is already imported in the imports
# cell at the top of the notebook, so it is not imported again here.

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
r_model = RandomOverSampler(random_state=1)

# Fit the random oversampler to the original training data and resample it
X_resampled, y_resampled = r_model.fit_resample(X_train, y_train)

In [112]:
# Tally how many samples each class has after oversampling
labels = y_resampled.value_counts()
num_distinct_val = labels.shape[0]

# Report the number of classes followed by the per-class counts
print("Number of distinct values of the resampled labels : ", num_distinct_val)
print(labels)

Number of distinct values of the resampled labels :  2
0    229
1    229
Name: Sleep efficiency, dtype: int64


In [114]:
# Optional: uncomment the next line to inspect the resampled labels
# y_resampled

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [115]:
 #Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model once, using the resampled training data. Fitting on
# (X_train, y_train) first would be wasted work: warm_start is not enabled, so
# this fit() call does not build on any previous fit.
lr_model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
predictions = lr_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [116]:
# Balanced accuracy of the model trained on the oversampled data
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.8914306463326072

In [117]:
# Confusion matrix of the model trained on the oversampled data
# (test labels vs. predicted labels)
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[65, 16],
       [ 1, 50]])

In [118]:
# Print the classification report for the model trained on the oversampled data.
# The classes are labelled the same way as in the report for the original-data
# model (0 -> Bad_sleep, 1 -> Good_sleep) so the two reports can be compared directly.
classification_rep = classification_report(y_test, predictions, target_names=["Bad_sleep", "Good_sleep"])
print("Classification Report:")
print(classification_rep)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88        81
           1       0.76      0.98      0.85        51

    accuracy                           0.87       132
   macro avg       0.87      0.89      0.87       132
weighted avg       0.90      0.87      0.87       132



### Step 4: Answer the following question

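**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (bad_sleep) and `1` (good_sleep) labels?

**Answer:** The model fit on the oversampled training data performs marginally better than the model fit on the original data. Balanced accuracy rises from about 0.882 to 0.891 and overall accuracy from 86% to 87%. Recall for the `1` (good_sleep) label improves from 96% to 98% (one fewer good sleeper is misclassified), while recall for the `0` (bad_sleep) label stays at 80%. Precision moves from 97% to 98% for `0` and from 75% to 76% for `1`. Because the original classes were only mildly imbalanced, oversampling yields only a small gain.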