# Data Preprocessing

In [55]:
# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Loading the dataset onto a dataframe
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [45]:
# Sorting and storing the 2nd column of the dataframe
column_1 = X.iloc[:, 1].sort_values()
column_1

60     2.0
62     2.2
119    2.2
68     2.2
41     2.3
      ... 
16     3.9
14     4.0
32     4.1
33     4.2
15     4.4
Name: sepal width (cm), Length: 150, dtype: float64

## Data Smoothing by Binning

In [46]:
# Creating empty bins

bin_1 = np.zeros((30, 5))
bin_2 = np.zeros((30, 5))
bin_3 = np.zeros((30, 5))

In [47]:
# Smoothing by bin mean
def bin_mean(data, bins):
    for i in range(0, 150, 5):
        k = i//5
        mean = data[i:i+5].mean()
        for j in range(5):
            bins[k, j] = mean
    return bins

In [48]:
bin_1 = bin_mean(column_1, bin_1)
bin_1

array([[2.18, 2.18, 2.18, 2.18, 2.18],
       [2.34, 2.34, 2.34, 2.34, 2.34],
       [2.48, 2.48, 2.48, 2.48, 2.48],
       [2.52, 2.52, 2.52, 2.52, 2.52],
       [2.62, 2.62, 2.62, 2.62, 2.62],
       [2.7 , 2.7 , 2.7 , 2.7 , 2.7 ],
       [2.74, 2.74, 2.74, 2.74, 2.74],
       [2.8 , 2.8 , 2.8 , 2.8 , 2.8 ],
       [2.8 , 2.8 , 2.8 , 2.8 , 2.8 ],
       [2.86, 2.86, 2.86, 2.86, 2.86],
       [2.9 , 2.9 , 2.9 , 2.9 , 2.9 ],
       [2.96, 2.96, 2.96, 2.96, 2.96],
       [3.  , 3.  , 3.  , 3.  , 3.  ],
       [3.  , 3.  , 3.  , 3.  , 3.  ],
       [3.  , 3.  , 3.  , 3.  , 3.  ],
       [3.  , 3.  , 3.  , 3.  , 3.  ],
       [3.04, 3.04, 3.04, 3.04, 3.04],
       [3.1 , 3.1 , 3.1 , 3.1 , 3.1 ],
       [3.12, 3.12, 3.12, 3.12, 3.12],
       [3.2 , 3.2 , 3.2 , 3.2 , 3.2 ],
       [3.2 , 3.2 , 3.2 , 3.2 , 3.2 ],
       [3.26, 3.26, 3.26, 3.26, 3.26],
       [3.34, 3.34, 3.34, 3.34, 3.34],
       [3.4 , 3.4 , 3.4 , 3.4 , 3.4 ],
       [3.4 , 3.4 , 3.4 , 3.4 , 3.4 ],
       [3.5 , 3.5 , 3.5 ,

In [49]:
# Smoothing by bin median
def bin_median(data, bins):
    for i in range(0, 150, 5):
        k = i//5
        median = data[i:i+5].median()
        for j in range(5):
            bins[k, j] = median
    return bins

In [50]:
bin_2 = bin_median(column_1, bin_2)
bin_2

array([[2.2, 2.2, 2.2, 2.2, 2.2],
       [2.3, 2.3, 2.3, 2.3, 2.3],
       [2.5, 2.5, 2.5, 2.5, 2.5],
       [2.5, 2.5, 2.5, 2.5, 2.5],
       [2.6, 2.6, 2.6, 2.6, 2.6],
       [2.7, 2.7, 2.7, 2.7, 2.7],
       [2.7, 2.7, 2.7, 2.7, 2.7],
       [2.8, 2.8, 2.8, 2.8, 2.8],
       [2.8, 2.8, 2.8, 2.8, 2.8],
       [2.9, 2.9, 2.9, 2.9, 2.9],
       [2.9, 2.9, 2.9, 2.9, 2.9],
       [3. , 3. , 3. , 3. , 3. ],
       [3. , 3. , 3. , 3. , 3. ],
       [3. , 3. , 3. , 3. , 3. ],
       [3. , 3. , 3. , 3. , 3. ],
       [3. , 3. , 3. , 3. , 3. ],
       [3. , 3. , 3. , 3. , 3. ],
       [3.1, 3.1, 3.1, 3.1, 3.1],
       [3.1, 3.1, 3.1, 3.1, 3.1],
       [3.2, 3.2, 3.2, 3.2, 3.2],
       [3.2, 3.2, 3.2, 3.2, 3.2],
       [3.3, 3.3, 3.3, 3.3, 3.3],
       [3.3, 3.3, 3.3, 3.3, 3.3],
       [3.4, 3.4, 3.4, 3.4, 3.4],
       [3.4, 3.4, 3.4, 3.4, 3.4],
       [3.5, 3.5, 3.5, 3.5, 3.5],
       [3.6, 3.6, 3.6, 3.6, 3.6],
       [3.7, 3.7, 3.7, 3.7, 3.7],
       [3.8, 3.8, 3.8, 3.8, 3.8],
       [4.1, 4

In [53]:
# Smoothing by bin boundaries
def bin_boundaries(data, bins):
    for i in range(0, 150, 5):
        k = i//5
        for j in range(5):
            if (data[i+j]-data[i]) < (data[i+4]-data[i+j]):
                bins[k, j] = data[i]
            else:
                bins[k, j] = data[i+4]
    return bins

In [54]:
bin_boundaries(column_1, bin_3)

array([[3.5, 3.5, 3.5, 3.5, 3.6],
       [3.1, 3.9, 3.9, 3.9, 3.9],
       [3.7, 3.7, 3.7, 3.7, 4. ],
       [3.8, 4.4, 4.4, 4.4, 4.4],
       [3.4, 3.4, 3.4, 3.4, 3.4],
       [3. , 3.2, 3.2, 3.2, 3.2],
       [3.1, 3.1, 3.1, 3.1, 3.1],
       [3.2, 3.4, 3.4, 3.2, 3.4],
       [3.5, 3.5, 3.5, 3.5, 3.8],
       [3. , 3.3, 3.3, 3.3, 3.3],
       [2.8, 2.8, 2.8, 3.2, 3.2],
       [2.7, 2.7, 2.8, 2.7, 2.8],
       [2. , 2.9, 2. , 2.9, 2.9],
       [2.5, 2.5, 3.1, 3.1, 3.1],
       [2.9, 3.2, 3.2, 3.2, 3.2],
       [2.6, 3. , 2.6, 2.6, 3. ],
       [2.4, 2.4, 3. , 3. , 3. ],
       [2.5, 2.5, 3.4, 2.5, 3.4],
       [2.6, 2.7, 2.6, 2.6, 2.7],
       [2.8, 2.8, 2.8, 3. , 3. ],
       [3. , 3.3, 3.3, 3.3, 3.3],
       [3. , 3. , 3. , 3. , 3.6],
       [2.8, 3.2, 2.8, 3.2, 3.2],
       [2.2, 2.2, 2.2, 3.2, 3.2],
       [3.2, 3.2, 3.2, 3.2, 3.3],
       [3. , 3.2, 3.2, 3.2, 3.2],
       [2.6, 2.6, 2.6, 2.6, 2.8],
       [3. , 3.1, 3.1, 3. , 3.1],
       [3.1, 3.1, 3.1, 3.3, 3.3],
       [3. , 3

## Data Normalization

In [58]:
X = data.data
print(X.shape)
X[:5]

(150, 4)


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [60]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [61]:
print(X_scaled.min(axis=0), X_scaled.max(axis=0))

[0. 0. 0. 0.] [1. 1. 1. 1.]


In [67]:
def normalize(data):
    X_scaled = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
    return X_scaled

In [68]:
X_scaled_manual = normalize(X)

In [71]:
print(X_scaled_manual.min(axis=0), X_scaled_manual.max(axis=0))

[0. 0. 0. 0.] [1. 1. 1. 1.]


In [70]:
# Verifying Manual vs Sk-Learn
print(np.allclose(X_scaled, X_scaled_manual))

True
