# One-Hot Encoding and Feature Scaling (CLASS-12, Module-3)

In [17]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()

In [18]:
data = pd.read_csv('Sleep_health_dataset.csv')
data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [19]:
df = data.drop('Person ID', axis=1)
df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [20]:
to_combine = ['Software Engineer', 'Sales Representative', 'Scientist', 'Manager']
df['Occupation'] = df['Occupation'].replace(to_combine, 'Other')

In [21]:
df['Occupation'].unique()

array(['Other', 'Doctor', 'Teacher', 'Nurse', 'Engineer', 'Accountant',
       'Lawyer', 'Salesperson'], dtype=object)

In [22]:
df = pd.concat([df, df['Blood Pressure'].str.split('/', expand=True)], axis=1).drop('Blood Pressure', axis=1)
df = df.rename(columns={0: 'bp_upper', 1: 'bp_lower'})
df['bp_upper'] = df['bp_upper'].astype('int')
df['bp_lower'] = df['bp_lower'].astype('int')
df.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,bp_upper,bp_lower
0,Male,27,Other,6.1,6,42,6,Overweight,77,4200,,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,Male,28,Other,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Other,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


# One-Hot Encoding

* Various Machine Learning models do not work with categorical data and to fit this data into the machine learning model it needs to be converted into numerical data (encoding).  
* Suppose a dataset has a Gender column with categorical elements like Male and  Female. These labels have no specific order of preference and also since the data is string labels, machine learning models misinterpreted that there is some sort of hierarchy in them.  
* One approach to solve this problem can be label encoding where we will assign a numerical value to these labels for example Male and Female mapped to 0 and 1. But this can add bias in our model as it will start giving higher preference to the Female parameter as 1>0 but ideally, both labels are equally important in the dataset.  
* To deal with this issue we will use the One Hot Encoding technique.

### Disadvantages of usinf One-Hot Encoding
* It can lead to increased dimensionality, as a separate column is created for each category in the variable. This can make the model more complex and slow to train.  
* It can lead to overfitting, especially if there are many categories in the variable and the sample size is relatively small.  
* Dummy Variable Trap. 

### What is Dummy Variable Trap
* The Dummy variable trap is a scenario where there are attributes that are highly correlated (Multicollinear) and one variable predicts the value of others.
* When we use one-hot encoding for handling the categorical data, then one dummy variable (attribute) can be predicted with the help of other dummy variables. 
* Hence, one dummy variable is highly correlated with other dummy variables.

### Note: 
* Only encode **nominal** categorical variables.
* Always set 

In [23]:
df.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,bp_upper,bp_lower
0,Male,27,Other,6.1,6,42,6,Overweight,77,4200,,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,Male,28,Other,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,Male,28,Other,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


Gender, Occupation and BMI are the categorical variables.  
BMI is ordinal categorical variable. Therefore, create dummies only for Gender and Occupation.

In [24]:
dummies = pd.get_dummies(df[['Gender', 'Occupation']], drop_first=True)
df_dummies = pd.concat([df, dummies], axis = 1)

In [25]:
df_dummies.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,...,bp_upper,bp_lower,Gender_Male,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Other,Occupation_Salesperson,Occupation_Teacher
0,Male,27,Other,6.1,6,42,6,Overweight,77,4200,...,126,83,1,0,0,0,0,1,0,0
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,...,125,80,1,1,0,0,0,0,0,0
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,...,125,80,1,1,0,0,0,0,0,0
3,Male,28,Other,5.9,4,30,8,Obese,85,3000,...,140,90,1,0,0,0,0,1,0,0
4,Male,28,Other,5.9,4,30,8,Obese,85,3000,...,140,90,1,0,0,0,0,1,0,0


Now, drop the original gender and occupation columns.

In [26]:
df_dummies = df_dummies.drop(['Gender', 'Occupation'], axis = 1)

In [27]:
df_dummies.head()

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,bp_upper,bp_lower,Gender_Male,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Other,Occupation_Salesperson,Occupation_Teacher
0,27,6.1,6,42,6,Overweight,77,4200,,126,83,1,0,0,0,0,1,0,0
1,28,6.2,6,60,8,Normal,75,10000,,125,80,1,1,0,0,0,0,0,0
2,28,6.2,6,60,8,Normal,75,10000,,125,80,1,1,0,0,0,0,0,0
3,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90,1,0,0,0,0,1,0,0
4,28,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90,1,0,0,0,0,1,0,0


In [28]:
df_dummies['BMI Category'] = df_dummies['BMI Category'].map({'Obese':2,'Overweight':1, 'Normal':0})
df_dummies.head()

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,bp_upper,bp_lower,Gender_Male,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Other,Occupation_Salesperson,Occupation_Teacher
0,27,6.1,6,42,6,1.0,77,4200,,126,83,1,0,0,0,0,1,0,0
1,28,6.2,6,60,8,0.0,75,10000,,125,80,1,1,0,0,0,0,0,0
2,28,6.2,6,60,8,0.0,75,10000,,125,80,1,1,0,0,0,0,0,0
3,28,5.9,4,30,8,2.0,85,3000,Sleep Apnea,140,90,1,0,0,0,0,1,0,0
4,28,5.9,4,30,8,2.0,85,3000,Sleep Apnea,140,90,1,0,0,0,0,1,0,0


Reorder columns:

In [29]:
df_dummies.columns.values

array(['Age', 'Sleep Duration', 'Quality of Sleep',
       'Physical Activity Level', 'Stress Level', 'BMI Category',
       'Heart Rate', 'Daily Steps', 'Sleep Disorder', 'bp_upper',
       'bp_lower', 'Gender_Male', 'Occupation_Doctor',
       'Occupation_Engineer', 'Occupation_Lawyer', 'Occupation_Nurse',
       'Occupation_Other', 'Occupation_Salesperson', 'Occupation_Teacher'],
      dtype=object)

In [32]:
columns_reordered = ['Age', 'Sleep Duration', 'Quality of Sleep',
       'Physical Activity Level', 'Stress Level', 'BMI Category',
        'Heart Rate', 'Daily Steps',
       'Gender_Male', 'Occupation_Doctor', 'Occupation_Engineer',
       'Occupation_Lawyer', 'Occupation_Nurse', 'Occupation_Other',
       'Occupation_Salesperson', 'Occupation_Teacher', 'Sleep Disorder']

df_dummies_reordered = df_dummies[columns_reordered]
df_dummies_reordered.head() 

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Gender_Male,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Other,Occupation_Salesperson,Occupation_Teacher,Sleep Disorder
0,27,6.1,6,42,6,1.0,77,4200,1,0,0,0,0,1,0,0,
1,28,6.2,6,60,8,0.0,75,10000,1,1,0,0,0,0,0,0,
2,28,6.2,6,60,8,0.0,75,10000,1,1,0,0,0,0,0,0,
3,28,5.9,4,30,8,2.0,85,3000,1,0,0,0,0,1,0,0,Sleep Apnea
4,28,5.9,4,30,8,2.0,85,3000,1,0,0,0,0,1,0,0,Sleep Apnea


# Standardization (Feature Scaling)

**Note:** Do not scale the dummy variables!

In [33]:
# this is the dunction for scaling the selected features

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

Make of copy of the data before feature scaling:

In [34]:
unscaled = df_dummies_reordered.copy()
unscaled.head()

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Gender_Male,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Other,Occupation_Salesperson,Occupation_Teacher,Sleep Disorder
0,27,6.1,6,42,6,1.0,77,4200,1,0,0,0,0,1,0,0,
1,28,6.2,6,60,8,0.0,75,10000,1,1,0,0,0,0,0,0,
2,28,6.2,6,60,8,0.0,75,10000,1,1,0,0,0,0,0,0,
3,28,5.9,4,30,8,2.0,85,3000,1,0,0,0,0,1,0,0,Sleep Apnea
4,28,5.9,4,30,8,2.0,85,3000,1,0,0,0,0,1,0,0,Sleep Apnea


In [35]:
unscaled_inputs = unscaled.iloc[:,:-1]  # all input features

In [36]:
columns_to_omit = ['Gender_Male', 'Occupation_Doctor', 'Occupation_Engineer',
       'Occupation_Lawyer', 'Occupation_Nurse', 'Occupation_Other',
       'Occupation_Salesperson', 'Occupation_Teacher']

In [37]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [38]:
sleep_scaler = CustomScaler(columns_to_scale)

In [39]:
sleep_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Age', 'Sleep Duration', 'Quality of Sleep',
                      'Physical Activity Level', 'Stress Level', 'BMI Category',
                      'Heart Rate', 'Daily Steps'],
             copy=True, with_mean=True, with_std=True)

In [40]:
scaled_inputs = sleep_scaler.transform(unscaled_inputs)

In [41]:
scaled_inputs.head()

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Gender_Male,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Other,Occupation_Salesperson,Occupation_Teacher
0,-1.753096,-1.298887,-1.09828,-0.825418,0.347021,0.947287,1.654719,-1.619584,1,0,0,0,0,1,0,0
1,-1.637643,-1.173036,-1.09828,0.039844,1.475592,-0.860239,1.170474,1.970077,1,1,0,0,0,0,0,0
2,-1.637643,-1.173036,-1.09828,0.039844,1.475592,-0.860239,1.170474,1.970077,1,1,0,0,0,0,0,0
3,-1.637643,-1.550588,-2.771424,-1.40226,1.475592,2.754812,3.591698,-2.362273,1,0,0,0,0,1,0,0
4,-1.637643,-1.550588,-2.771424,-1.40226,1.475592,2.754812,3.591698,-2.362273,1,0,0,0,0,1,0,0


In [42]:
scaled_inputs.shape

(374, 16)

In [43]:
targets = unscaled['Sleep Disorder']

In [44]:
targets.shape

(374,)