In [9]:
import pandas as pd
import os

df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [10]:
df.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,86.0,10000.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [12]:
df[['Systolic', 'Diastolic']] = df['Blood Pressure'].str.split('/', expand=True)

In [13]:
df['Systolic'] = pd.to_numeric(df['Systolic'])
df['Diastolic'] = pd.to_numeric(df['Diastolic'])

In [14]:

df = df.drop(['Blood Pressure', 'Person ID'], axis=1)

In [15]:
print("Data after cleaning and feature engineering:")
df.head()

Data after cleaning and feature engineering:


Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic,Diastolic
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


In [17]:
# Identify the categorical columns that need encoding
categorical_cols = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']

# Apply one-hot encoding
sleep_df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Display the first few rows of the fully encoded DataFrame
print("DataFrame after One-Hot Encoding:")
sleep_df_encoded.head()

# Check the new columns
print("\nNew columns created by encoding:")
print(sleep_df_encoded.columns)

DataFrame after One-Hot Encoding:

New columns created by encoding:
Index(['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level',
       'Stress Level', 'Heart Rate', 'Daily Steps', 'Systolic', 'Diastolic',
       'Gender_Male', 'Occupation_Doctor', 'Occupation_Engineer',
       'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Nurse',
       'Occupation_Sales Representative', 'Occupation_Salesperson',
       'Occupation_Scientist', 'Occupation_Software Engineer',
       'Occupation_Teacher', 'BMI Category_Normal Weight',
       'BMI Category_Obese', 'BMI Category_Overweight',
       'Sleep Disorder_Sleep Apnea'],
      dtype='object')


In [18]:
# Import the StandardScaler
from sklearn.preprocessing import StandardScaler

# Create a copy to store the final processed data
df_processed = sleep_df_encoded.copy()

# Identify all numerical columns in the encoded DataFrame that need to be scaled
# We exclude the one-hot encoded columns, as they are already on a 0/1 scale.
numerical_cols = [
    'Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 
    'Stress Level', 'Heart Rate', 'Daily Steps', 'Systolic', 'Diastolic'
]

# Create an instance of the scaler
scaler = StandardScaler()

# Fit the scaler to the numerical data and transform it
df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])

# Display the first few rows of the final, fully processed DataFrame
print("Final processed DataFrame after Standardization:")
df_processed.head()

Final processed DataFrame after Standardization:


Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,Systolic,Diastolic,Gender_Male,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BMI Category_Normal Weight,BMI Category_Obese,BMI Category_Overweight,Sleep Disorder_Sleep Apnea
0,-1.753096,-1.298887,-1.09828,-0.825418,0.347021,1.654719,-1.619584,-0.330002,-0.268102,True,...,False,False,False,False,True,False,False,False,True,False
1,-1.637643,-1.173036,-1.09828,0.039844,1.475592,1.170474,1.970077,-0.459239,-0.75564,True,...,False,False,False,False,False,False,False,False,False,False
2,-1.637643,-1.173036,-1.09828,0.039844,1.475592,1.170474,1.970077,-0.459239,-0.75564,True,...,False,False,False,False,False,False,False,False,False,False
3,-1.637643,-1.550588,-2.771424,-1.40226,1.475592,3.591698,-2.362273,1.479309,0.869486,True,...,False,True,False,False,False,False,False,True,False,True
4,-1.637643,-1.550588,-2.771424,-1.40226,1.475592,3.591698,-2.362273,1.479309,0.869486,True,...,False,True,False,False,False,False,False,True,False,True
