# Data Preprocessing

In [7]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
data=pd.read_csv('./Data.csv')
df=pd.DataFrame(data)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
x=df.iloc[:,:-1].values #Input Values
y=df.iloc[:,-1].values  # Output Values
print(x)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


- to find all the missing values

In [5]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

-to drop the rows of missing values

In [9]:
df1=df.copy()
df1.dropna(inplace=True)
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [18]:
import warnings
warnings.filterwarnings('ignore')

- to fill in the missing values (imputation)

In [53]:
df2=df.copy()
df2.fillna("Nothing",inplace=True)
# df2.fillna(df2["Age"].mean(),inplace=True)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,Nothing,Yes
5,France,35.0,58000.0,Yes
6,Spain,Nothing,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


-get_dummies

In [45]:
pd.get_dummies(df1)

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,True,False,False,True,False
1,27.0,48000.0,False,False,True,False,True
2,30.0,54000.0,False,True,False,True,False
3,38.0,61000.0,False,False,True,True,False
5,35.0,58000.0,True,False,False,False,True
7,48.0,79000.0,True,False,False,False,True
8,50.0,83000.0,False,True,False,True,False
9,37.0,67000.0,True,False,False,False,True


- Label Encoder

In [47]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [49]:
data=df.apply(LabelEncoder().fit_transform)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,0,6,6,0
1,2,0,0,1
2,1,1,2,0
3,2,4,4,0
4,1,5,9,1
5,0,2,3,1
6,2,9,1,0
7,0,7,7,1
8,1,8,8,0
9,0,3,5,1


- Simple Imputaion

In [63]:
from sklearn.impute import SimpleImputer
imp=SimpleImputer(missing_values=np.nan,strategy='mean')
imp.fit(x[:,1:3])
(x[:,1:3])=imp.transform((x[:,1:3]))
print(x)
df

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


- Column Transformer

In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], 
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))
print(x)


[[0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]


- Splitting the dataset: [train_test_split]

In [76]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
print(x_train)


[[0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 35.0 58000.0]]


In [79]:
print(x_test)

[[1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]]


In [77]:
print(y_train)

[1 0 0 1 0 1 0 1]


In [78]:
print(y_test)

[1 0]


# SCALING
    - minmax:{xnew=x-xmin/xmax-xmin}
    - Standard Scaler
    - Normalizer
    - Robust Scaler

In [82]:
df=pd.read_csv('./pima_indians_diabetes.csv')
df

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
762,10,101,76,48,180,32.9,0.171,63,0
763,2,122,70,27,0,36.8,0.340,27,0
764,5,121,72,23,112,26.2,0.245,30,0
765,1,126,60,0,0,30.1,0.349,47,1


- Adding column names

In [84]:
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
df=pd.read_csv('./pima_indians_diabetes.csv',names=names)
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [89]:
data=df.values
data

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [93]:
data.shape

(768, 9)

In [95]:
x=data[:,0:8] # - slicing
y=data[:,8]

In [96]:
print(x)
print(y)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]
[1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0.
 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1.
 1. 0. 

<h2>Scaling Data (Formula) <br></h2>
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))<br><br>X_scaled = X_std *(max - min)+min</p>

- MinMaxScaler

In [99]:
from sklearn.preprocessing import MinMaxScaler
mmScaler=MinMaxScaler(feature_range=(0,1))

In [100]:
rescalex=mmScaler.fit_transform(x)
rescalex

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])

 - Standard Scaler

In [103]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
rescalex=ss.fit_transform(x)
rescalex

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

- Normalizer

In [105]:
from sklearn.preprocessing import Normalizer
norm=Normalizer().fit_transform(x)
norm

array([[0.03355237, 0.82762513, 0.40262844, ..., 0.18789327, 0.00350622,
        0.27960308],
       [0.008424  , 0.71604034, 0.55598426, ..., 0.22407851, 0.00295683,
        0.26114412],
       [0.04039768, 0.92409698, 0.32318146, ..., 0.11765825, 0.00339341,
        0.16159073],
       ...,
       [0.02691539, 0.65135243, 0.38758161, ..., 0.14103664, 0.00131885,
        0.16149234],
       [0.00665306, 0.83828547, 0.39918356, ..., 0.20025708, 0.00232192,
        0.31269379],
       [0.00791454, 0.73605211, 0.55401772, ..., 0.24060198, 0.00249308,
        0.18203439]])

- Binarizer

In [107]:
from sklearn.preprocessing import Binarizer
bin=Binarizer().fit_transform(x)
bin

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [108]:
from sklearn.preprocessing import RobustScaler
rb=RobustScaler().fit_transform(x)
rb

array([[ 0.6       ,  0.75151515,  0.        , ...,  0.17204301,
         0.66535948,  1.23529412],
       [-0.4       , -0.77575758, -0.33333333, ..., -0.58064516,
        -0.05620915,  0.11764706],
       [ 1.        ,  1.6       , -0.44444444, ..., -0.93548387,
         0.78300654,  0.17647059],
       ...,
       [ 0.4       ,  0.0969697 ,  0.        , ..., -0.62365591,
        -0.33333333,  0.05882353],
       [-0.4       ,  0.21818182, -0.66666667, ..., -0.20430108,
        -0.06143791,  1.05882353],
       [-0.4       , -0.58181818, -0.11111111, ..., -0.17204301,
        -0.1503268 , -0.35294118]])