In [2]:
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 18 07:37:58 2020

@author: hwicaksono
"""


import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler


# Load customer dataset

In [3]:

# Read the customer table from the sibling "datasets" folder (resolved against
# the current working directory). Forward slashes are accepted by Windows as
# well as Linux/macOS, whereas a backslash-separated path only resolves on Windows.
dataset = pd.read_csv("../datasets/Preprocessing.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,US,43.0,71000.0,No
1,UK,28.0,48000.0,Yes
2,Japan,30.0,54000.0,No
3,UK,38.0,60000.0,No
4,Japan,40.0,,Yes
5,US,34.0,58000.0,Yes
6,UK,,53000.0,No
7,US,46.0,79000.0,Yes
8,Japan,50.0,83000.0,No
9,US,37.0,66000.0,Yes


# Determine the independent variables

In [4]:
# Feature matrix: all rows of the three leading columns (Country, Age, Salary),
# pulled out of the frame as a NumPy array.
X = dataset.iloc[:, :3].to_numpy()
display(pd.DataFrame(X))

Unnamed: 0,0,1,2
0,US,43.0,71000.0
1,UK,28.0,48000.0
2,Japan,30.0,54000.0
3,UK,38.0,60000.0
4,Japan,40.0,
5,US,34.0,58000.0
6,UK,,53000.0
7,US,46.0,79000.0
8,Japan,50.0,83000.0
9,US,37.0,66000.0


# Determine the dependent variable

In [5]:
# Target vector: all rows of the final column (Purchased) as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()
display(pd.DataFrame(y))

Unnamed: 0,0
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


# Handling missing data

In [6]:
# Fill the gaps in the two numeric columns (Age, Salary) with each column's median.
# SimpleImputer is already imported in the notebook's import cell, so it is not
# re-imported here; fitting and transforming happen in a single call.
# NOTE(review): the medians are learned from every row, before any train/test
# split -- fine for this walkthrough, confirm before reusing for modelling.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
display(pd.DataFrame(X))

Unnamed: 0,0,1,2
0,US,43,71000
1,UK,28,48000
2,Japan,30,54000
3,UK,38,60000
4,Japan,40,60000
5,US,34,58000
6,UK,38,53000
7,US,46,79000
8,Japan,50,83000
9,US,37,66000


# Encoding nominal data using label encoder

In [7]:
# Replace the country names in the first column of X with integer codes
# (with this data: Japan -> 0, UK -> 1, US -> 2).
labelencoder_X = LabelEncoder()
country_codes = labelencoder_X.fit_transform(X[:, 0])
X[:, 0] = country_codes
display(pd.DataFrame(X))

Unnamed: 0,0,1,2
0,2,43,71000
1,1,28,48000
2,0,30,54000
3,1,38,60000
4,0,40,60000
5,2,34,58000
6,1,38,53000
7,2,46,79000
8,0,50,83000
9,2,37,66000


# Encoding categorical data with one-hot encoding

In [8]:
# One-hot encode Country: the indicator columns are placed in front of the
# remaining columns and the original text column is removed.
# `dataset` is rebound by this cell, so the work is guarded: running the cell
# again after Country has already been replaced leaves the frame untouched
# instead of raising a KeyError.
if 'Country' in dataset.columns:
    onehotencoder = LabelBinarizer()
    transformed = onehotencoder.fit_transform(dataset['Country'])
    # Reuse the frame's own index so the column-wise concat lines rows up by
    # label even when `dataset` is not indexed 0..n-1.
    ohe_df = pd.DataFrame(transformed, index=dataset.index)
    dataset = pd.concat([ohe_df, dataset], axis=1).drop(columns=['Country'])
dataset

Unnamed: 0,0,1,2,Age,Salary,Purchased
0,0,0,1,43.0,71000.0,No
1,0,1,0,28.0,48000.0,Yes
2,1,0,0,30.0,54000.0,No
3,0,1,0,38.0,60000.0,No
4,1,0,0,40.0,,Yes
5,0,0,1,34.0,58000.0,Yes
6,0,1,0,,53000.0,No
7,0,0,1,46.0,79000.0,Yes
8,1,0,0,50.0,83000.0,No
9,0,0,1,37.0,66000.0,Yes


# Define matrix of independent variables

In [9]:
# Feature matrix rebuilt from the encoded frame: the three country indicator
# columns followed by Age and Salary (five columns in total).
X = dataset.iloc[:, :5].to_numpy()
display(pd.DataFrame(X))

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,43.0,71000.0
1,0.0,1.0,0.0,28.0,48000.0
2,1.0,0.0,0.0,30.0,54000.0
3,0.0,1.0,0.0,38.0,60000.0
4,1.0,0.0,0.0,40.0,
5,0.0,0.0,1.0,34.0,58000.0
6,0.0,1.0,0.0,,53000.0
7,0.0,0.0,1.0,46.0,79000.0
8,1.0,0.0,0.0,50.0,83000.0
9,0.0,0.0,1.0,37.0,66000.0


# Define the vector of the dependent variable

In [10]:
# Target vector: the final column of the frame (Purchased) as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()
# Show the feature matrix first, then the target vector.
display(pd.DataFrame(X), pd.DataFrame(y))

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,43.0,71000.0
1,0.0,1.0,0.0,28.0,48000.0
2,1.0,0.0,0.0,30.0,54000.0
3,0.0,1.0,0.0,38.0,60000.0
4,1.0,0.0,0.0,40.0,
5,0.0,0.0,1.0,34.0,58000.0
6,0.0,1.0,0.0,,53000.0
7,0.0,0.0,1.0,46.0,79000.0
8,1.0,0.0,0.0,50.0,83000.0
9,0.0,0.0,1.0,37.0,66000.0


Unnamed: 0,0
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


# Handling missing data in the independent variable matrix

In [11]:
# Median-impute the whole feature matrix in place. Only the Age and Salary
# columns contain gaps; the indicator columns have nothing to fill.
# NOTE(review): the medians are learned from every row, before the train/test
# split further down -- confirm that is intended.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
imputer.fit(X)
X[:] = imputer.transform(X)
display(pd.DataFrame(X))

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,43.0,71000.0
1,0.0,1.0,0.0,28.0,48000.0
2,1.0,0.0,0.0,30.0,54000.0
3,0.0,1.0,0.0,38.0,60000.0
4,1.0,0.0,0.0,40.0,60000.0
5,0.0,0.0,1.0,34.0,58000.0
6,0.0,1.0,0.0,38.0,53000.0
7,0.0,0.0,1.0,46.0,79000.0
8,1.0,0.0,0.0,50.0,83000.0
9,0.0,0.0,1.0,37.0,66000.0


# Encoding the dependent variable vector

In [12]:
# Convert the No/Yes purchase labels to integers (No -> 0, Yes -> 1).
labelencoder_y = LabelEncoder()
encoded_target = labelencoder_y.fit_transform(y)
y = encoded_target
display(pd.DataFrame(y))

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


# Splitting the dataset into training and test set

In [13]:
# Hold out 20% of the rows (2 of the 10) as the test set; the fixed
# random_state keeps the split the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts
display(pd.DataFrame(X_train), pd.DataFrame(y_train))

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,40.0,60000.0
1,0.0,0.0,1.0,37.0,66000.0
2,0.0,1.0,0.0,28.0,48000.0
3,0.0,1.0,0.0,38.0,53000.0
4,0.0,0.0,1.0,46.0,79000.0
5,0.0,1.0,0.0,38.0,60000.0
6,0.0,0.0,1.0,43.0,71000.0
7,0.0,0.0,1.0,34.0,58000.0


Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
5,0
6,0
7,1


# Feature scaling

## MinMax Scaler

In [48]:
# Min-max scaling, fitted on the training rows only: every column of
# X_train ends up in the range [0, 1].
mmsc_X = MinMaxScaler()
mmsc_X.fit(X_train)
X_train1 = mmsc_X.transform(X_train)
display(pd.DataFrame(X_train1))

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.666667,0.387097
1,0.0,0.0,1.0,0.5,0.580645
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.555556,0.16129
4,0.0,0.0,1.0,1.0,1.0
5,0.0,1.0,0.0,0.555556,0.387097
6,0.0,0.0,1.0,0.833333,0.741935
7,0.0,0.0,1.0,0.333333,0.322581


## Robust Scaler

In [49]:
# Robust scaling with scikit-learn's default settings, fitted on the
# training rows only; the result is kept separately as X_train2.
rs_X = RobustScaler()
rs_X.fit(X_train)
X_train2 = rs_X.transform(X_train)
display(pd.DataFrame(X_train2))

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,-0.5,0.444444,0.0
1,0.0,0.0,0.5,-0.222222,0.571429
2,0.0,1.0,-0.5,-2.222222,-1.142857
3,0.0,1.0,-0.5,0.0,-0.666667
4,0.0,0.0,0.5,1.777778,1.809524
5,0.0,1.0,-0.5,0.0,0.0
6,0.0,0.0,0.5,1.111111,1.047619
7,0.0,0.0,0.5,-0.888889,-0.190476


## Standard Scaler

In [50]:
# Standard scaling with scikit-learn's default settings, fitted on the
# training rows only; the result is kept separately as X_train3.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train3 = sc_X.transform(X_train)
display(pd.DataFrame(X_train3))

Unnamed: 0,0,1,2,3,4
0,2.645751,-0.774597,-1.0,0.39036,-0.202352
1,-0.377964,-0.774597,1.0,-0.19518,0.445174
2,-0.377964,1.290994,-1.0,-1.9518,-1.497404
3,-0.377964,1.290994,-1.0,0.0,-0.957799
4,-0.377964,-0.774597,1.0,1.56144,1.848148
5,-0.377964,1.290994,-1.0,0.0,-0.202352
6,-0.377964,-0.774597,1.0,0.9759,0.98478
7,-0.377964,-0.774597,1.0,-0.78072,-0.418194


In [27]:
# Distribution plot of the two numeric features before any scaling.
# Columns 3 and 4 of X hold Age and Salary (they follow the three country
# indicator columns). `ff` is plotly.figure_factory, imported with the other
# dependencies in the notebook's import cell.
cols = ['Age', 'Salary']

x_Age = X[:, 3]
x_Salary = X[:, 4]
hist_data = [x_Age, x_Salary]

# Age spans a couple of dozen years while Salary spans tens of thousands, so
# each series gets its own bin width. A single 0.2-wide bin would cut the
# salary range into well over a hundred thousand bins.
# NOTE(review): both series still share one x-axis; consider one figure per
# feature if the Age curve is too compressed to read.
fig = ff.create_distplot(hist_data, cols, bin_size=[2, 5000])
fig.show()