In [3]:
import pandas as pd
import numpy as np 
import plotly.express as px
import seaborn as sns
import plotly.offline as py
import plotly.graph_objects as go

# Data Collection and Preprocessing

In [4]:
# Read the raw dataset; the relative path is resolved against the current working directory.
raw_csv_path = 'Sleep_health_and_lifestyle_dataset.csv'
sleep = pd.read_csv(raw_csv_path)
sleep

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


# Data Cleaning and Preprocessing

In [5]:
# Remove the Person ID column from the working frame.
sleep = sleep.drop(columns=['Person ID'])

In [6]:
# Missing values in the target column are relabelled "No Disorder";
# the dict form restricts the fill to that one column.
sleep = sleep.fillna({'Sleep Disorder': 'No Disorder'})

In [8]:
# Distinct target labels, in order of first appearance.
pd.unique(sleep['Sleep Disorder'])

array(['No Disorder', 'Sleep Apnea', 'Insomnia'], dtype=object)

In [9]:
# Distinct Gender labels, to spot spelling or casing variants.
pd.unique(sleep['Gender'])

array(['Male', 'Female'], dtype=object)

In [10]:
# Distinct Age values, to eyeball the range for implausible entries.
pd.unique(sleep['Age'])

array([27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
       44, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])

In [11]:
# Distinct Occupation labels, to spot duplicates that differ only in wording.
pd.unique(sleep['Occupation'])

array(['Software Engineer', 'Doctor', 'Sales Representative', 'Teacher',
       'Nurse', 'Engineer', 'Accountant', 'Scientist', 'Lawyer',
       'Salesperson', 'Manager'], dtype=object)

In [12]:
# Distinct Sleep Duration values, to eyeball the range for implausible entries.
pd.unique(sleep['Sleep Duration'])

array([6.1, 6.2, 5.9, 6.3, 7.8, 6. , 6.5, 7.6, 7.7, 7.9, 6.4, 7.5, 7.2,
       5.8, 6.7, 7.3, 7.4, 7.1, 6.6, 6.9, 8. , 6.8, 8.1, 8.3, 8.5, 8.4,
       8.2])

In [13]:
# Distinct Quality of Sleep scores present in the data.
pd.unique(sleep['Quality of Sleep'])

array([6, 4, 7, 5, 8, 9])

In [14]:
# Distinct Physical Activity Level values present in the data.
pd.unique(sleep['Physical Activity Level'])

array([42, 60, 30, 40, 75, 35, 45, 50, 32, 70, 80, 55, 90, 47, 65, 85])

In [15]:
# Distinct Stress Level scores present in the data.
pd.unique(sleep['Stress Level'])

array([6, 8, 7, 4, 3, 5])

In [18]:
# Distinct BMI Category labels, to check for near-duplicate category names.
pd.unique(sleep['BMI Category'])

array(['Overweight', 'Normal', 'Obese'], dtype=object)

In [17]:
# Fold the "Normal Weight" label into "Normal" so both spellings share one category.
# NOTE(review): the execution counts suggest this cell ran before the BMI unique()
# check shown above it, which is presumably why that output already lists only 'Normal'.
bmi_aliases = {'Normal Weight': 'Normal'}
sleep['BMI Category'] = sleep['BMI Category'].replace(bmi_aliases)

The Blood Pressure column holds strings such as `126/83`, so it needs to be split into separate upper and lower values. The split produces string columns, which are then converted to integers.

In [19]:
# Split "upper/lower" strings on '/' into two new columns (labelled 0 and 1 by
# expand=True) and append them in place of the original Blood Pressure column.
sleep = pd.concat(
    [
        sleep.drop(columns='Blood Pressure'),
        sleep['Blood Pressure'].str.split('/', expand=True),
    ],
    axis=1,
)
sleep.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,0,1
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,No Disorder,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,No Disorder,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,No Disorder,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


In [20]:
# Give the split columns descriptive names: column 0 (the part before '/')
# becomes bp_upper and column 1 (the part after '/') becomes bp_lower.
sleep = sleep.rename({0: 'bp_upper', 1: 'bp_lower'}, axis='columns')
sleep.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,bp_upper,bp_lower
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,No Disorder,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,No Disorder,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,No Disorder,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


In [22]:
# The split blood-pressure parts are still strings; cast both columns to integers in one call.
sleep = sleep.astype({'bp_upper': 'int', 'bp_lower': 'int'})

In [23]:
# Distinct Heart Rate values, to eyeball the range for implausible entries.
pd.unique(sleep['Heart Rate'])

array([77, 75, 85, 82, 70, 80, 78, 69, 72, 68, 76, 81, 65, 84, 74, 67, 73,
       83, 86])

In [24]:
# Distinct Daily Steps values, to eyeball the range for implausible entries.
pd.unique(sleep['Daily Steps'])

array([ 4200, 10000,  3000,  3500,  8000,  4000,  4100,  6800,  5000,
        7000,  5500,  5200,  5600,  3300,  4800,  7500,  7300,  6200,
        6000,  3700])

In [26]:
# Distinct bp_upper values, confirming the integer conversion produced sensible numbers.
pd.unique(sleep['bp_upper'])

array([126, 125, 140, 120, 132, 130, 117, 118, 128, 131, 115, 135, 129,
       119, 121, 122, 142, 139])

In [27]:
# Distinct bp_lower values, confirming the integer conversion produced sensible numbers.
pd.unique(sleep['bp_lower'])

array([83, 80, 90, 87, 86, 76, 85, 84, 75, 88, 78, 77, 79, 82, 92, 95, 91])

# Basic Descriptive Statistics

In [28]:
# Summarise the cleaned frame: column names, non-null counts, dtypes and memory use.
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   374 non-null    object 
 1   Age                      374 non-null    int64  
 2   Occupation               374 non-null    object 
 3   Sleep Duration           374 non-null    float64
 4   Quality of Sleep         374 non-null    int64  
 5   Physical Activity Level  374 non-null    int64  
 6   Stress Level             374 non-null    int64  
 7   BMI Category             374 non-null    object 
 8   Heart Rate               374 non-null    int64  
 9   Daily Steps              374 non-null    int64  
 10  Sleep Disorder           374 non-null    object 
 11  bp_upper                 374 non-null    int64  
 12  bp_lower                 374 non-null    int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 38.1+ KB


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
sleep.describe()

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,bp_upper,bp_lower
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492,128.553476,84.649733
std,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679,7.748118,6.161611
min,27.0,5.8,4.0,30.0,3.0,65.0,3000.0,115.0,75.0
25%,35.25,6.4,6.0,45.0,4.0,68.0,5600.0,125.0,80.0
50%,43.0,7.2,7.0,60.0,5.0,70.0,7000.0,130.0,85.0
75%,50.0,7.8,8.0,75.0,7.0,72.0,8000.0,135.0,90.0
max,59.0,8.5,9.0,90.0,8.0,86.0,10000.0,142.0,95.0


In [30]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
cleaned_csv_path = 'sleep_cleaned.csv'
sleep.to_csv(cleaned_csv_path, index=False)