# One-hot encoding and feature scaling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset

In [2]:
# Read the Titanic passenger table from a CSV file in the working directory.
data = pd.read_csv('Titanic.csv')

# Preview the first five rows.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Keep only rows that have no missing value in any column, then discard exact
# duplicate rows. The result is rebound to `data` as one chained expression.
# NOTE(review): this runs before the Name/Ticket/Cabin/PassengerId columns are
# removed further down, so rows are also discarded for gaps in columns that are
# dropped anyway -- confirm that this is intended.
data = data.dropna().drop_duplicates()

In [4]:
# Summarise the cleaned frame: number of rows, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 1 to 889
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  183 non-null    int64  
 1   Survived     183 non-null    int64  
 2   Pclass       183 non-null    int64  
 3   Name         183 non-null    object 
 4   Sex          183 non-null    object 
 5   Age          183 non-null    float64
 6   SibSp        183 non-null    int64  
 7   Parch        183 non-null    int64  
 8   Ticket       183 non-null    object 
 9   Fare         183 non-null    float64
 10  Cabin        183 non-null    object 
 11  Embarked     183 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 18.6+ KB


In [5]:
# Remove the identifier / free-text columns; the encoding and scaling steps
# below only work with the remaining columns.
# The labels are passed as a list (an ordered list-like, which is what
# DataFrame.drop documents) rather than as a set literal, and the result is
# rebound to `data` instead of mutating the frame in place.
data = data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

## One-hot encoding

In [6]:
# Encode Sex as a single 0/1 indicator column (female -> 0, male -> 1).
# get_dummies returns a DataFrame; with drop_first=True the first category
# ('female') is removed, leaving one indicator column.
# dtype is pinned to uint8 because the default dummy dtype differs between
# pandas versions (uint8 before 2.0, bool from 2.0); pinning keeps the 0/1
# integers on either version.
sex_dummies = pd.get_dummies(data['Sex'], drop_first=True, dtype=np.uint8)

# Take the remaining indicator explicitly as a Series instead of assigning the
# whole DataFrame to one column (which only works when it has exactly one column).
data['SEX'] = sex_dummies.iloc[:, 0]

In [7]:
# Check the new SEX indicator next to the original Sex column.
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SEX
1,1,1,female,38.0,1,0,71.2833,C,0
3,1,1,female,35.0,1,0,53.1,S,0
6,0,1,male,54.0,0,0,51.8625,S,1
10,1,3,female,4.0,1,1,16.7,S,0
11,1,1,female,58.0,0,0,26.55,S,0


## Converting the whole dataset

In [8]:
# One-hot encode every remaining text column (Sex, Embarked) in one call,
# keeping ALL categories. Despite its name, `data_no_dummies` is the
# dummy-encoded frame produced without `drop_first`.
# dtype is pinned to uint8 because the default dummy dtype differs between
# pandas versions (uint8 before 2.0, bool from 2.0); pinning keeps 0/1 integers.
# NOTE(review): `data` already carries the SEX indicator, so the result holds
# SEX alongside Sex_female / Sex_male -- confirm the redundancy is acceptable.
data_no_dummies = pd.get_dummies(data, dtype=np.uint8)

In [9]:
# Display the fully encoded frame (pandas truncates the rendering to the first and last rows).
data_no_dummies

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,SEX,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
1,1,1,38.0,1,0,71.2833,0,1,0,1,0,0
3,1,1,35.0,1,0,53.1000,0,1,0,0,0,1
6,0,1,54.0,0,0,51.8625,1,0,1,0,0,1
10,1,3,4.0,1,1,16.7000,0,1,0,0,0,1
11,1,1,58.0,0,0,26.5500,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,47.0,1,1,52.5542,0,1,0,0,0,1
872,0,1,33.0,0,0,5.0000,1,0,1,0,0,1
879,1,1,56.0,0,1,83.1583,0,1,0,1,0,0
887,1,1,19.0,0,0,30.0000,0,1,0,0,0,1


In [10]:
# (rows, columns) when every category gets its own indicator column.
data_no_dummies.shape

(183, 12)

In [11]:
# Same encoding, but drop the first category of each text column, so k
# categories become k-1 indicator columns (Sex -> Sex_male,
# Embarked -> Embarked_Q / Embarked_S).
# dtype is pinned to uint8 because the default dummy dtype differs between
# pandas versions (uint8 before 2.0, bool from 2.0); pinning keeps 0/1 integers.
# NOTE(review): SEX and Sex_male carry the same information here -- confirm
# whether one of them should be removed before modelling.
data_with_dummies = pd.get_dummies(data, drop_first=True, dtype=np.uint8)

# Display the result (pandas truncates the rendering to the first and last rows).
data_with_dummies

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,SEX,Sex_male,Embarked_Q,Embarked_S
1,1,1,38.0,1,0,71.2833,0,0,0,0
3,1,1,35.0,1,0,53.1000,0,0,0,1
6,0,1,54.0,0,0,51.8625,1,1,0,1
10,1,3,4.0,1,1,16.7000,0,0,0,1
11,1,1,58.0,0,0,26.5500,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
871,1,1,47.0,1,1,52.5542,0,0,0,1
872,0,1,33.0,0,0,5.0000,1,1,0,1
879,1,1,56.0,0,1,83.1583,0,0,0,0
887,1,1,19.0,0,0,30.0000,0,0,0,1


In [12]:
# (rows, columns) after dropping one indicator per encoded column.
data_with_dummies.shape

(183, 10)

In [13]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler,LabelEncoder

## Label encoder

In [14]:
# Encoder that replaces each distinct label with an integer code.
le = LabelEncoder()

In [15]:
# Current state of the frame before label-encoding Embarked.
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SEX
1,1,1,female,38.0,1,0,71.2833,C,0
3,1,1,female,35.0,1,0,53.1000,S,0
6,0,1,male,54.0,0,0,51.8625,S,1
10,1,3,female,4.0,1,1,16.7000,S,0
11,1,1,female,58.0,0,0,26.5500,S,0
...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,0
872,0,1,male,33.0,0,0,5.0000,S,1
879,1,1,female,56.0,0,1,83.1583,C,0
887,1,1,female,19.0,0,0,30.0000,S,0


In [16]:
# Fit the encoder on the Embarked labels and store the resulting integer codes
# in a new EMBARKED column; the original text column is kept.
# NOTE(review): scikit-learn documents LabelEncoder for target labels, and the
# integer codes imply an order between ports -- confirm this is acceptable for
# a feature column.
data['EMBARKED'] = le.fit_transform(data['Embarked'])

In [17]:
# Display the frame with the new EMBARKED code column.
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SEX,EMBARKED
1,1,1,female,38.0,1,0,71.2833,C,0,0
3,1,1,female,35.0,1,0,53.1000,S,0,2
6,0,1,male,54.0,0,0,51.8625,S,1,2
10,1,3,female,4.0,1,1,16.7000,S,0,2
11,1,1,female,58.0,0,0,26.5500,S,0,2
...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,0,2
872,0,1,male,33.0,0,0,5.0000,S,1,2
879,1,1,female,56.0,0,1,83.1583,C,0,0
887,1,1,female,19.0,0,0,30.0000,S,0,2


## Data Scaling or Data Normalisation

### Minmax scaler

In [18]:
# Scaler with default settings; the scaled AGE column below ranges from 0 to 1.
minmax = MinMaxScaler()

In [19]:
# Double brackets select Age as a one-column DataFrame rather than a Series.
data[['Age']]

Unnamed: 0,Age
1,38.0
3,35.0
6,54.0
10,4.0
11,58.0
...,...
871,47.0
872,33.0
879,56.0
887,19.0


In [20]:
# The same column as a 2-D NumPy array with one value per row.
data[['Age']].values

array([[38.  ],
       [35.  ],
       [54.  ],
       [ 4.  ],
       [58.  ],
       [34.  ],
       [28.  ],
       [19.  ],
       [49.  ],
       [65.  ],
       [45.  ],
       [29.  ],
       [25.  ],
       [23.  ],
       [46.  ],
       [71.  ],
       [23.  ],
       [21.  ],
       [47.  ],
       [24.  ],
       [32.5 ],
       [54.  ],
       [19.  ],
       [37.  ],
       [24.  ],
       [36.5 ],
       [22.  ],
       [61.  ],
       [56.  ],
       [50.  ],
       [ 1.  ],
       [ 3.  ],
       [44.  ],
       [58.  ],
       [ 2.  ],
       [40.  ],
       [31.  ],
       [32.  ],
       [38.  ],
       [35.  ],
       [44.  ],
       [37.  ],
       [29.  ],
       [62.  ],
       [30.  ],
       [52.  ],
       [40.  ],
       [58.  ],
       [35.  ],
       [37.  ],
       [63.  ],
       [19.  ],
       [36.  ],
       [ 2.  ],
       [50.  ],
       [ 0.92],
       [17.  ],
       [30.  ],
       [24.  ],
       [18.  ],
       [31.  ],
       [40.  ],
       [

In [21]:
# Keep the Age column as a 2-D array (one column) so it can be passed to each
# scaler's fit_transform below.
age = data[['Age']].values

In [22]:
# Inspect the array that is fed to the scalers.
age

array([[38.  ],
       [35.  ],
       [54.  ],
       [ 4.  ],
       [58.  ],
       [34.  ],
       [28.  ],
       [19.  ],
       [49.  ],
       [65.  ],
       [45.  ],
       [29.  ],
       [25.  ],
       [23.  ],
       [46.  ],
       [71.  ],
       [23.  ],
       [21.  ],
       [47.  ],
       [24.  ],
       [32.5 ],
       [54.  ],
       [19.  ],
       [37.  ],
       [24.  ],
       [36.5 ],
       [22.  ],
       [61.  ],
       [56.  ],
       [50.  ],
       [ 1.  ],
       [ 3.  ],
       [44.  ],
       [58.  ],
       [ 2.  ],
       [40.  ],
       [31.  ],
       [32.  ],
       [38.  ],
       [35.  ],
       [44.  ],
       [37.  ],
       [29.  ],
       [62.  ],
       [30.  ],
       [52.  ],
       [40.  ],
       [58.  ],
       [35.  ],
       [37.  ],
       [63.  ],
       [19.  ],
       [36.  ],
       [ 2.  ],
       [50.  ],
       [ 0.92],
       [17.  ],
       [30.  ],
       [24.  ],
       [18.  ],
       [31.  ],
       [40.  ],
       [

In [23]:
# Fit the min-max scaler on the ages and store the scaled values in a new AGE
# column; the original Age column is kept for comparison.
data['AGE'] = minmax.fit_transform(age)

In [24]:
# Compare the raw Age values with the min-max scaled AGE column.
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SEX,EMBARKED,AGE
1,1,1,female,38.0,1,0,71.2833,C,0,0,0.468892
3,1,1,female,35.0,1,0,53.1,S,0,2,0.430956
6,0,1,male,54.0,0,0,51.8625,S,1,2,0.671219
10,1,3,female,4.0,1,1,16.7,S,0,2,0.038948
11,1,1,female,58.0,0,0,26.55,S,0,2,0.721801


In [25]:
# Smallest scaled value: the lower end of the min-max range.
data.AGE.min()

0.0

In [26]:
# Largest scaled value: the upper end of the min-max range.
data.AGE.max()

1.0

### Standard scaler

In [27]:
# Scaler with default settings; per the scikit-learn docs it standardises a
# feature by removing the mean and scaling to unit variance.
sc = StandardScaler()

In [28]:
# Fit the standard scaler on the same age array and store the result in AGE_SC.
data['AGE_SC'] = sc.fit_transform(age)

In [29]:
# Display the frame with the standardised AGE_SC column added.
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SEX,EMBARKED,AGE,AGE_SC
1,1,1,female,38.0,1,0,71.2833,C,0,0,0.468892,0.149065
3,1,1,female,35.0,1,0,53.1000,S,0,2,0.430956,-0.043230
6,0,1,male,54.0,0,0,51.8625,S,1,2,0.671219,1.174636
10,1,3,female,4.0,1,1,16.7000,S,0,2,0.038948,-2.030273
11,1,1,female,58.0,0,0,26.5500,S,0,2,0.721801,1.431029
...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,0,2,0.582701,0.725949
872,0,1,male,33.0,0,0,5.0000,S,1,2,0.405665,-0.171426
879,1,1,female,56.0,0,1,83.1583,C,0,0,0.696510,1.302832
887,1,1,female,19.0,0,0,30.0000,S,0,2,0.228629,-1.068801


In [30]:
# Smallest standardised age; unlike min-max scaling the range is not fixed to [0, 1].
data.AGE_SC.min()

-2.2276958075858446

In [31]:
# Largest standardised age.
data.AGE_SC.max()

2.841189038923718

### Robust scaler

In [32]:
# Scaler with default settings; per the scikit-learn docs it centres on the
# median and scales by the interquartile range.
rc = RobustScaler()

In [33]:
# Fit the robust scaler on the same age array and store the result in AGE_RC.
data['AGE_RC'] = rc.fit_transform(age)

In [34]:
# Display the frame with all three scaled versions of Age side by side.
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SEX,EMBARKED,AGE,AGE_SC,AGE_RC
1,1,1,female,38.0,1,0,71.2833,C,0,0,0.468892,0.149065,0.085106
3,1,1,female,35.0,1,0,53.1000,S,0,2,0.430956,-0.043230,-0.042553
6,0,1,male,54.0,0,0,51.8625,S,1,2,0.671219,1.174636,0.765957
10,1,3,female,4.0,1,1,16.7000,S,0,2,0.038948,-2.030273,-1.361702
11,1,1,female,58.0,0,0,26.5500,S,0,2,0.721801,1.431029,0.936170
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,0,2,0.582701,0.725949,0.468085
872,0,1,male,33.0,0,0,5.0000,S,1,2,0.405665,-0.171426,-0.127660
879,1,1,female,56.0,0,1,83.1583,C,0,0,0.696510,1.302832,0.851064
887,1,1,female,19.0,0,0,30.0000,S,0,2,0.228629,-1.068801,-0.723404


In [35]:
# Smallest robust-scaled age.
data.AGE_RC.min()

-1.4927659574468084

In [36]:
# Largest robust-scaled age.
data.AGE_RC.max()

1.872340425531915