In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

sns.set()

# Importing the Raw data

In [None]:
# Load the raw dataset.
# pandas >= 2.0 adds the literal string "None" to the default NA markers, so a
# "no sleep disorder" label stored as the text "None" would be read as NaN and
# silently vanish from the count plots. Restoring the label keeps the target
# a three-class categorical on every pandas version.
# NOTE(review): presumably the CSV stores the no-disorder class as "None" --
# confirm against the file; on older pandas this fillna is a no-op.
raw_data = (
    pd.read_csv('/kaggle/input/sleep-health-and-lifestyle-dataset/Sleep_health_and_lifestyle_dataset.csv')
    .fillna({'Sleep Disorder': 'None'})
)
raw_data.head()

Let's make a copy of the raw data for further processing and preview it to confirm that it matches the original.

In [None]:
# Work on a copy so the cleaning steps below leave raw_data untouched

df = raw_data.copy()
df.head()

In [None]:
# (number of rows, number of columns) of the working copy
df.shape

The dataframe has 374 rows and 13 columns.

In [None]:
# Column names, dtypes and non-null counts
df.info()

Let's check the descriptive statistics summary table.

In [None]:
# Summary statistics for the numeric columns
df.describe()

Check for missing values:

In [None]:
# Fraction of missing entries per column (mean of the boolean null mask)
df.isnull().mean()

There are no missing values in the dataset.

# EDA of Categorical Features

Plotting some countplots for categorical variables.

Target (Sleep Disorder):

In [None]:
# Class counts of the target, drawn on an explicitly created Axes
fig, ax = plt.subplots()
sns.countplot(data=df, x='Sleep Disorder', ax=ax)
plt.show()

The dataset contains slightly more samples with no sleep disorder than with one. The numbers of Sleep Apnea and Insomnia samples are almost equal.

In [None]:
# Gender counts, split by sleep disorder
fig, ax = plt.subplots()
sns.countplot(data=df, x='Gender', hue='Sleep Disorder', ax=ax)
plt.show()

The Gender variable is well balanced. It can be observed that females tend to have higher cases of sleep disorders than males.

In [None]:
# Occupation counts (horizontal bars so the labels stay readable)
fig, ax = plt.subplots()
sns.countplot(data=df, y='Occupation', ax=ax)
plt.show()

It can be observed that a few occupations are far more prominent than the others, so the data is imbalanced. Let's combine Software Engineer, Sales Representative, Scientist and Manager into a single 'Other' category to better balance the dataset.

In [None]:
# Fold the sparsely represented occupations into a single 'Other' bucket
to_combine = ['Software Engineer', 'Sales Representative', 'Scientist', 'Manager']
occupation_map = {rare: 'Other' for rare in to_combine}
df['Occupation'] = df['Occupation'].replace(occupation_map)

In [None]:
# Remaining occupation labels after the merge
df['Occupation'].unique()

In [None]:
# Occupation counts after merging, split by sleep disorder
fig, ax = plt.subplots()
sns.countplot(data=df, y='Occupation', hue='Sleep Disorder', ax=ax)
plt.show()

Nurses have the highest number of sleep apnea cases, whereas teachers and salespersons have the highest number of insomnia cases.

In [None]:
# Distinct BMI labels before cleaning
df['BMI Category'].unique()

Let's combine 'Normal' and 'Normal Weight'.

In [None]:
# Collapse the two spellings of the normal-weight class into 'Normal'
to_combine = ['Normal', 'Normal Weight']
bmi_aliases = dict.fromkeys(to_combine, 'Normal')
df['BMI Category'] = df['BMI Category'].replace(bmi_aliases)

In [None]:
# Distinct BMI labels after the merge
df['BMI Category'].unique()

In [None]:
# BMI category counts, split by sleep disorder
fig, ax = plt.subplots()
sns.countplot(data=df, x='BMI Category', hue='Sleep Disorder', ax=ax)
plt.show()

All obese people have either sleep apnea or insomnia, most of the overweight people have a sleep disorder, and very few people with normal BMI have sleep disorders.

# Data Preprocessing

Let's start by splitting the 'Blood Pressure' column (formatted as 'upper/lower') into two columns, 'bp_upper' and 'bp_lower'.

In [None]:
# Split 'Blood Pressure' on '/' into two new columns (labelled 0 and 1 for now)
# and swap them in for the original text column
bp_parts = df['Blood Pressure'].str.split('/', expand=True)
df_bp_split = pd.concat([df.drop(columns='Blood Pressure'), bp_parts], axis=1)
df_bp_split.head()

In [None]:
# Give the two split columns descriptive names: part before '/' -> bp_upper, after -> bp_lower
bp_column_names = {0: 'bp_upper', 1: 'bp_lower'}
df_bp_split = df_bp_split.rename(columns=bp_column_names)
df_bp_split.head()

In [None]:
# str.split yields strings; convert both blood-pressure columns to integers
df_bp_split = df_bp_split.astype({'bp_upper': 'int', 'bp_lower': 'int'})

In [None]:
# Confirm bp_upper / bp_lower now have an integer dtype
df_bp_split.info()

We can drop 'Person ID' as it does not carry any information.

In [None]:
# 'Person ID' is only a row identifier, so remove it from the feature table
df_bp_split = df_bp_split.drop(columns='Person ID')
df_bp_split.head()

As only a few categories are present in each categorical column, we can apply one-hot encoding to get dummy variables.

In [None]:
# One-hot encode the categorical features and append the dummy columns
categorical_cols = ['Gender', 'Occupation', 'BMI Category']
dummies = pd.get_dummies(df_bp_split.loc[:, categorical_cols])
df_bp_split = pd.concat([df_bp_split, dummies], axis='columns')

In [None]:
# The original categorical columns are now redundant with their dummy columns
df_bp_split = df_bp_split.drop(columns=['Gender', 'Occupation', 'BMI Category'])

Let's make a copy of the data before scaling and transforming the inputs.

In [None]:
# Checkpoint of the fully encoded, not-yet-standardized table
unscaled = df_bp_split.copy()
unscaled.head()

In [None]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps ``StandardScaler`` so that only ``columns`` are scaled, while all
    other columns (for example one-hot dummies) are returned unchanged.
    The output keeps the input's column order and row index.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the wrapped ``StandardScaler`` on ``X[self.columns]``.

        Also stores the per-column mean (``mean_``) and population variance
        (``var_``) of the selected columns as pandas Series. Returns ``self``.
        """
        selected = X[self.columns]
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(selected, y)
        # Use the DataFrame reductions directly: np.mean/np.var on a DataFrame
        # forward axis=None, which recent pandas versions treat as (or warn
        # about becoming) a reduction over both axes instead of per column.
        self.mean_ = selected.mean()
        # ddof=0 matches np.var's population variance.
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized and all else untouched.

        ``y`` and ``copy`` are accepted for signature compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh RangeIndex
        # and the concat below misaligns rows (introducing NaNs) whenever X has
        # a non-default index, e.g. after filtering or a train/test split.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Restore the caller's original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [None]:
# Explicit column order: the features that get standardized first, then the
# one-hot dummy columns, and the target 'Sleep Disorder' last so that the
# inputs can be taken as "every column but the last".
columns_reordered = ['Age', 'Sleep Duration', 'Quality of Sleep',
       'Physical Activity Level', 'Stress Level', 'Heart Rate',
       'Daily Steps', 'bp_upper', 'bp_lower',
       'Gender_Female', 'Gender_Male', 'Occupation_Accountant',
       'Occupation_Doctor', 'Occupation_Engineer', 'Occupation_Lawyer',
       'Occupation_Nurse', 'Occupation_Other', 'Occupation_Salesperson',
       'Occupation_Teacher', 'BMI Category_Normal', 'BMI Category_Obese',
       'BMI Category_Overweight','Sleep Disorder']

In [None]:
# Apply the explicit column order (raises KeyError if an expected column is absent)
unscaled = unscaled[columns_reordered]
unscaled.head()

In [None]:
# Inputs = every column except the last one ('Sleep Disorder', placed last by columns_reordered)
unscaled_inputs = unscaled.iloc[:,:-1]

In [None]:
# One-hot dummy columns; these are excluded from standardization
columns_to_omit = ['Gender_Female',
       'Gender_Male', 'Occupation_Accountant', 'Occupation_Doctor',
       'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Nurse',
       'Occupation_Other', 'Occupation_Salesperson', 'Occupation_Teacher',
       'BMI Category_Normal', 'BMI Category_Obese',
       'BMI Category_Overweight']

In [None]:
# Keep, in their original order, the input columns that are not in the omit list
omitted = set(columns_to_omit)
columns_to_scale = [name for name in unscaled_inputs.columns if name not in omitted]

In [None]:
# Scaler that standardizes only the columns listed in columns_to_scale
sleep_scaler = CustomScaler(columns_to_scale)

In [None]:
# Learn the scaling statistics from the inputs.
# NOTE(review): this fits on the full dataset; if a train/test split follows,
# fit on the training portion only to avoid leaking test statistics -- confirm.
sleep_scaler.fit(unscaled_inputs)

In [None]:
# Standardize the selected columns; the dummy columns pass through unchanged
scaled_inputs = sleep_scaler.transform(unscaled_inputs)

In [None]:
# Display the scaled feature table
scaled_inputs

In [None]:
# (rows, columns) of the scaled inputs; should match unscaled_inputs.shape
scaled_inputs.shape

In [None]:
# Target labels: the 'Sleep Disorder' column of the unscaled table
targets = unscaled['Sleep Disorder']