# Data Pre-processing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


Load Data Set 

In [2]:
# Raw string: the Windows path contains backslashes, and "\D" is not a valid
# escape sequence (newer Python versions warn about it in a normal string).
DATA_DIR = r"E:\Data science\Data preprocessing"
os.chdir(DATA_DIR)

# Read the workbook from the working directory set above and show it in full.
df1 = pd.read_excel("Data Preprocessing Data File.xlsx")
print(df1)


   Country   Age   Salary  Purchased 
0   France   44.0  72000.0         No
1    Spain   27.0  48000.0        Yes
2  Germany    NaN  54000.0         No
3    Spain   39.0      NaN         No
4  Germany    NaN  64000.0        Yes
5   France   35.0  58000.0        Yes
6    Spain   39.0  52000.0         No
7   France   48.0      NaN        Yes
8  Germany   50.0  83000.0         No
9   France   37.0  67000.0        Yes


Split the independent variables (features) and the dependent variable (target) into two separate arrays

In [3]:
# Feature matrix: all rows, every column except the last one, as a NumPy array.
x = df1.iloc[:, :-1].to_numpy()
print(x)


[['France ' 44.0 72000.0]
 ['Spain ' 27.0 48000.0]
 ['Germany ' nan 54000.0]
 ['Spain ' 39.0 nan]
 ['Germany ' nan 64000.0]
 ['France ' 35.0 58000.0]
 ['Spain ' 39.0 52000.0]
 ['France ' 48.0 nan]
 ['Germany ' 50.0 83000.0]
 ['France ' 37.0 67000.0]]


In [4]:
# Target vector: the last column of the frame. Position -1 is used instead of a
# hard-coded 3 so the selection stays on the final column if the number of
# columns in the file changes.
y = df1.iloc[:, -1].values
print(y)


['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Missing value treatment – Impute Values 

In [5]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 are the ones that contain NaN entries.
# Fit on those two columns, then overwrite them with the imputed values
# (strategy='mean').
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x)

[['France ' 44.0 72000.0]
 ['Spain ' 27.0 48000.0]
 ['Germany ' 39.875 54000.0]
 ['Spain ' 39.0 62250.0]
 ['Germany ' 39.875 64000.0]
 ['France ' 35.0 58000.0]
 ['Spain ' 39.0 52000.0]
 ['France ' 48.0 62250.0]
 ['Germany ' 50.0 83000.0]
 ['France ' 37.0 67000.0]]


Label Encoding – models cannot work with raw text, so the categorical text values are converted to integer labels

In [6]:
from sklearn.preprocessing import LabelEncoder

# Column 0 holds the country names as text; replace them in place with the
# integer codes produced by the encoder. label_x keeps the fitted mapping.
label_x = LabelEncoder().fit(x[:, 0])
x[:, 0] = label_x.transform(x[:, 0])
print(x)


[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 39.875 54000.0]
 [2 39.0 62250.0]
 [1 39.875 64000.0]
 [0 35.0 58000.0]
 [2 39.0 52000.0]
 [0 48.0 62250.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


One hot encoding /Column Transformation 

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every other column through unchanged.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix when the stacked output is
# mostly zeros, and np.array() on a sparse matrix gives a 0-d object array
# rather than the expected 2-D table.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))
print(x)


[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 39.875 54000.0]
 [0.0 0.0 1.0 39.0 62250.0]
 [0.0 1.0 0.0 39.875 64000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 39.0 52000.0]
 [1.0 0.0 0.0 48.0 62250.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


Label Encoding Y

In [8]:
# Encode the target labels as integers; label_y keeps the fitted mapping so
# the original labels can be looked up again.
label_y = LabelEncoder().fit(y)
y = label_y.transform(y)
print(y)


[0 1 0 0 1 1 0 1 0 1]


Standardization – rescale each feature to zero mean and unit variance so that all columns are on a comparable scale

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on x, then apply it; the standardized result is stored
# separately in x_STD and shown as a DataFrame for readability.
std_sca = StandardScaler().fit(x)
x_STD = std_sca.transform(x)
print(pd.DataFrame(x_STD))


          0         1         2         3         4
0  1.224745 -0.654654 -0.654654  0.664912  1.002707
1 -0.816497 -0.654654  1.527525 -2.075331 -1.465494
2 -0.816497  1.527525 -0.654654  0.000000 -0.848444
3 -0.816497 -0.654654  1.527525 -0.141042  0.000000
4 -0.816497  1.527525 -0.654654  0.000000  0.179973
5  1.224745 -0.654654 -0.654654 -0.785805 -0.437077
6 -0.816497 -0.654654  1.527525 -0.141042 -1.054127
7  1.224745 -0.654654 -0.654654  1.309675  0.000000
8 -0.816497  1.527525 -0.654654  1.632056  2.133965
9  1.224745 -0.654654 -0.654654 -0.463423  0.488498


Normalization - MinMaxScaler

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on x, then apply it; the rescaled result is stored in
# x_NOR and shown as a DataFrame for readability.
Nm_x = MinMaxScaler().fit(x)
x_NOR = Nm_x.transform(x)
print(pd.DataFrame(x_NOR))


     0    1    2         3         4
0  1.0  0.0  0.0  0.739130  0.685714
1  0.0  0.0  1.0  0.000000  0.000000
2  0.0  1.0  0.0  0.559783  0.171429
3  0.0  0.0  1.0  0.521739  0.407143
4  0.0  1.0  0.0  0.559783  0.457143
5  1.0  0.0  0.0  0.347826  0.285714
6  0.0  0.0  1.0  0.521739  0.114286
7  1.0  0.0  0.0  0.913043  0.407143
8  0.0  1.0  0.0  1.000000  1.000000
9  1.0  0.0  0.0  0.434783  0.542857


Normalization - Normalizer (scales each row to unit norm)

In [11]:
from sklearn.preprocessing import Normalizer

# Normalizer works per row, not per column: each sample is rescaled to unit
# norm, so the column with the largest magnitude dominates every row.
# NOTE(review): this rebinds Nm_x and x_NOR, replacing the MinMaxScaler results
# that held those names; confirm that is intended before x_NOR is used further.
Nm_x = Normalizer().fit(x)
x_NOR = Nm_x.transform(x)
print(pd.DataFrame(x_NOR))


          0         1         2         3    4
0  0.000014  0.000000  0.000000  0.000611  1.0
1  0.000000  0.000000  0.000021  0.000562  1.0
2  0.000000  0.000019  0.000000  0.000738  1.0
3  0.000000  0.000000  0.000016  0.000627  1.0
4  0.000000  0.000016  0.000000  0.000623  1.0
5  0.000017  0.000000  0.000000  0.000603  1.0
6  0.000000  0.000000  0.000019  0.000750  1.0
7  0.000016  0.000000  0.000000  0.000771  1.0
8  0.000000  0.000012  0.000000  0.000602  1.0
9  0.000015  0.000000  0.000000  0.000552  1.0


Model Creation - Train/Test Split

In [12]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffle, and therefore which rows land in the training and
# test sets, is the same on every run; without it each execution of this cell
# produces a different split.
SPLIT_SEED = 42

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x_NOR, y, test_size=0.2, random_state=SPLIT_SEED
)
print(pd.DataFrame(x_train), y_train)


          0         1         2         3    4
0  0.000000  0.000000  0.000021  0.000562  1.0
1  0.000000  0.000012  0.000000  0.000602  1.0
2  0.000000  0.000000  0.000019  0.000750  1.0
3  0.000000  0.000016  0.000000  0.000623  1.0
4  0.000000  0.000019  0.000000  0.000738  1.0
5  0.000016  0.000000  0.000000  0.000771  1.0
6  0.000014  0.000000  0.000000  0.000611  1.0
7  0.000015  0.000000  0.000000  0.000552  1.0 [1 0 0 1 0 1 0 1]
