# Data Pre-Processing Technique

### Importing basic libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_theme(style='whitegrid')

## Feature Extraction

### Dictvectorizer
          > Converting list of dictionaries into matrix

In [None]:
# Toy dataset: one dictionary per sample, keyed by feature name
data = [{'age':18,'weight':80},
        {'age':23,'weight':85},
        {'age':28,'weight':88},
        {'age':30,'weight':75},
        {'age':35,'weight':66}]

data    # 5 samples and 2 features

[{'age': 18, 'weight': 80},
 {'age': 23, 'weight': 85},
 {'age': 28, 'weight': 88},
 {'age': 30, 'weight': 75},
 {'age': 35, 'weight': 66}]

In [None]:
from sklearn.feature_extraction import DictVectorizer
# sparse=False asks for a dense NumPy array instead of a sparse matrix
dv = DictVectorizer(sparse=False)
data_transformed = dv.fit_transform(data)

data_transformed      # data matrix: one row per dictionary, one column per key

array([[18., 80.],
       [23., 85.],
       [28., 88.],
       [30., 75.],
       [35., 66.]])

## Data Imputation
         > Data imputation identifies missing values and replaces them with appropriate values.

In [None]:
from sklearn.impute import SimpleImputer

### data imputation on real dataset

In [None]:
# Column labels for the Cleveland heart-disease file
cols = ['age', 'sex', 'cp', 'trestbps', 'chol',  'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope','ca', 'thal', 'num']
# header=None: the first line of the file is data, so the labels come from names=cols
heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None, names=cols)

In [None]:
# Check missing values
heart_data.info()           # Non-null counts only reveal real NaN values; placeholder strings are still counted as non-null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [None]:
# Count NaN entries per column
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

Two columns, `ca` and `thal`, have a non-numeric (object) dtype.

In [None]:
# List the unique values of the two object-typed columns to see why they are not numeric.
# Each label names the column actually being printed.
print('Unique values in ca :', heart_data.ca.unique())
print('Unique values in thal :', heart_data.thal.unique())

Unique values in ca : ['0.0' '3.0' '2.0' '1.0' '?']
Unique values in ca : ['6.0' '3.0' '7.0' '?']


Both of them contain the value '?', which marks a missing value.

In [None]:
# Replace the '?' placeholder with NaN so that it is recognised as a missing value.
# Reassign rather than mutate with inplace=True.
heart_data = heart_data.replace('?', np.nan)
# 'ca' and 'thal' were parsed as strings because of the '?' entries; convert them
# to float explicitly instead of relying on implicit string-to-number coercion later.
heart_data[['ca', 'thal']] = heart_data[['ca', 'thal']].astype(float)

In [None]:
# Fill every NaN with the mean of its column.
# NOTE(review): 'ca' and 'thal' look like categorical codes, so a mean may not be a valid category - confirm the strategy.
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
heart_data_imputed = imputer.fit_transform(heart_data)

# Feature Scaling

Feature scaling transforms feature values so that all features are on the same scale.

In [None]:
# Column labels for the abalone file (rebinds `cols`, which earlier held the heart-disease labels)
cols = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
# header=None: the file has no header row, so the labels come from names=cols
abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', header=None, names=cols)

Feature scaling is performed only on numeric attributes, so we must first check the data type of each column.

In [None]:
# Inspect the dtype of every column to find the non-numeric ones
abalone_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


Only `Sex` is a non-numeric attribute; all the others are numeric.

In [None]:
# List the distinct category values stored in the Sex column
abalone_data.Sex.unique()

array(['M', 'F', 'I'], dtype=object)

In [None]:
# Map each Sex category to an integer code so the column becomes numeric.
sex_codes = {"M": 1, "F": 2, "I": 3}
abalone_data = abalone_data.replace({"Sex": sex_codes})

We do not perform feature scaling on the label (output), so we separate the label from the features first.

In [None]:
# pop() returns the 'Rings' column and removes it from abalone_data in place,
# so this cell cannot be re-run without first reloading the data.
y = abalone_data.pop('Rings')
abalone_data.info()       # dataframe after removing the label column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   int64  
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 261.2 KB


In [None]:
 # Examine the scale of each feature (compare the min/max and mean/std rows)
 abalone_data.describe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,1.95547,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831
std,0.827815,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203
min,1.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015
25%,1.0,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13
50%,2.0,0.545,0.425,0.14,0.7995,0.336,0.171,0.234
75%,3.0,0.615,0.48,0.165,1.153,0.502,0.253,0.329
max,3.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005


### Scaling

In [None]:
# Small example with one large-magnitude value; reshape(-1,1) makes it a single-feature column
x = np.array([4,2,5,-2,-100]).reshape(-1,1)
x

array([[   4],
       [   2],
       [   5],
       [  -2],
       [-100]])

In [None]:
# Max Absolute Value scaling: divide each feature by its largest absolute value,
# so the results fall in [-1, 1].
from sklearn.preprocessing import MaxAbsScaler

# Named max_abs_scaler (not `abs`) so Python's builtin abs() is not shadowed
# for the rest of the notebook. Creation and use live in one cell so the
# rename cannot get out of sync.
max_abs_scaler = MaxAbsScaler()
x_new = max_abs_scaler.fit_transform(x)
x_new

array([[ 0.04],
       [ 0.02],
       [ 0.05],
       [-0.02],
       [-1.  ]])

In [None]:
# Min-Max Scaler
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()

In [None]:
# X refers to the same DataFrame object as abalone_data (no copy is made)
X = abalone_data

In [None]:
# Fit the scaler on X and transform it in one step; show only the first five rows
x_new2 = min_max.fit_transform(X)
x_new2[:5]

array([[0.        , 0.51351351, 0.5210084 , 0.0840708 , 0.18133522,
        0.15030262, 0.1323239 , 0.14798206],
       [0.        , 0.37162162, 0.35294118, 0.07964602, 0.07915707,
        0.06624075, 0.06319947, 0.06826109],
       [0.5       , 0.61486486, 0.61344538, 0.11946903, 0.23906499,
        0.17182246, 0.18564845, 0.2077728 ],
       [0.        , 0.49324324, 0.5210084 , 0.11061947, 0.18204356,
        0.14425017, 0.14944042, 0.15296462],
       [1.        , 0.34459459, 0.33613445, 0.07079646, 0.07189658,
        0.0595158 , 0.05134957, 0.0533134 ]])

In [None]:
# Standardization: rescale every feature to zero mean and unit standard deviation
from sklearn.preprocessing import StandardScaler
Std_scaler = StandardScaler()
X_standard = Std_scaler.fit_transform(X)

In [None]:
X_standard[:5]      # After scaling, every feature has mean 0 and standard deviation 1

array([[-1.15434629, -0.57455813, -0.43214879, -1.06442415, -0.64189823,
        -0.60768536, -0.72621157, -0.63821689],
       [-1.15434629, -1.44898585, -1.439929  , -1.18397831, -1.23027711,
        -1.17090984, -1.20522124, -1.21298732],
       [ 0.05379815,  0.05003309,  0.12213032, -0.10799087, -0.30946926,
        -0.4634999 , -0.35668983, -0.20713907],
       [-1.15434629, -0.69947638, -0.43214879, -0.34709919, -0.63781934,
        -0.64823753, -0.60759966, -0.60229374],
       [ 1.26194258, -1.61554351, -1.54070702, -1.42308663, -1.27208566,
        -1.2159678 , -1.28733718, -1.32075677]])

# Add Dummy Features

In [None]:
# 4 samples x 2 features (rebinds x, which held the scaling example above)
x = np.array([[7,1],[4,6],[8,2],[4,9]])

In [None]:
from sklearn.preprocessing import add_dummy_feature
# Prepends a constant column of ones to x (see the first column of the output)
x_new3 = add_dummy_feature(x)
x_new3

array([[1., 7., 1.],
       [1., 4., 6.],
       [1., 8., 2.],
       [1., 4., 9.]])

# Function Transformer

In [None]:
# The red-wine quality file is semicolon-delimited, hence sep=';'
wine_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',sep=';')

In [None]:
# Summary statistics of every column before any transformation
wine_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Wrap np.log1p (log(1 + x)) as a transformer and apply it element-wise.
# The whole frame is passed, so the 'quality' column is transformed as well.
transformer = FunctionTransformer(np.log1p,validate=True)
transformed_wine_data = transformer.transform(np.array(wine_data))
# Rebuild a DataFrame with the original column names and show the statistics transposed (one row per column)
pd.DataFrame(transformed_wine_data,columns=wine_data.columns).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,2.215842,0.1781,1.722767,2.091864,2.186051,2.322388,2.827314
volatile acidity,1599.0,0.417173,0.114926,0.113329,0.329304,0.41871,0.494696,0.947789
citric acid,1599.0,0.228147,0.152423,0.0,0.086178,0.231112,0.350657,0.693147
residual sugar,1599.0,1.218131,0.269969,0.641854,1.064711,1.163151,1.280934,2.80336
chlorides,1599.0,0.083038,0.038991,0.011929,0.067659,0.076035,0.086178,0.476855
free sulfur dioxide,1599.0,2.639013,0.62379,0.693147,2.079442,2.70805,3.091042,4.290459
total sulfur dioxide,1599.0,3.63475,0.682575,1.94591,3.135494,3.663562,4.143135,5.669881
density,1599.0,0.691519,0.000945,0.68817,0.690945,0.691521,0.692064,0.69499
pH,1599.0,1.460557,0.03576,1.319086,1.437463,1.460938,1.481605,1.611436
sulphates,1599.0,0.501073,0.093731,0.285179,0.438255,0.482426,0.548121,1.098612


# Polynomial Feature

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Work on the predictors only: drop() returns a new frame without the
# 'quality' target and leaves wine_data itself untouched.
wine_data_copy = wine_data.drop(['quality'], axis=1)
n_features_before = wine_data_copy.shape[1]
print('Number of feature before transformation :', n_features_before)

# Expand the predictors with all polynomial terms up to degree 2
poly = PolynomialFeatures(degree=2)
poly_wine_data = poly.fit_transform(wine_data_copy)
n_features_after = poly_wine_data.shape[1]
print('Number of feature after transformation :', n_features_after)


Number of feature before transformation : 11
Number of feature after transformation : 78


# Discretization

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Discretize the 'chlorides' feature into two bins, one-hot encoded.
# (The former `wine_data = wine_data.copy()` self-copy was a no-op and is removed;
# wine_data is only read here, never modified.)
enc = KBinsDiscretizer(n_bins=2,encode="onehot")
# The discretizer expects a 2-D input, so the single column is reshaped to (n_samples, 1)
X = np.array(wine_data['chlorides']).reshape(-1,1)
X_bined = enc.fit_transform(X)

In [None]:
# Convert the sparse one-hot result to a dense array and show the first five rows
X_bined.toarray()[:5]

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

# Categorical Handling

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Column labels for the iris file, which has no header row
cols = ['sepal length', 'sepal width', 'petal length', 'petal width','label']
iris_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',header=None, names=cols)
iris_data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# One-hot encode the class label: one output column per distinct label value
onehotencoder = OneHotEncoder()
print('Shape of y before encoding :',iris_data.label.shape)
# The encoder works on 2-D input, so the label Series is reshaped to (n_samples, 1)
iris_labels = onehotencoder.fit_transform(iris_data.label.values.reshape(-1,1))
print('Shape of y after encoding :',iris_labels.shape)

Shape of y before encoding : (150,)
Shape of y after encoding : (150, 3)


In [None]:
# The encoded labels are sparse; densify them to display the first five rows
iris_labels.toarray()[:5]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [None]:
# Ordinal encoding: each distinct label is replaced by a single numeric code
ordinalencoder = OrdinalEncoder()
# Rebinds iris_labels (previously the one-hot matrix) to the raw labels as a (n_samples, 1) column
iris_labels = np.array(iris_data['label']).reshape(-1,1)

iris_labels_transformed = ordinalencoder.fit_transform(iris_labels)
print('Unique Labels :',np.unique(iris_labels_transformed))

iris_labels_transformed[:5]

Unique Labels : [0. 1. 2.]


array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [None]:
# LabelEncoder is meant for target labels and expects a 1-D array.
# Passing a (n_samples, 1) column vector makes scikit-learn emit a
# DataConversionWarning, so the labels are kept one-dimensional here.
labelencoder = LabelEncoder()
iris_labels = np.array(iris_data['label'])

# Each distinct label is mapped to an integer code 0..n_classes-1
iris_labels_labelencode = labelencoder.fit_transform(iris_labels)
iris_labels_labelencode

  y = column_or_1d(y, warn=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# Multi-label example: each sample (movie) carries a set of genres
movie_genre = [{'action','comedy'},
               {'comedy'},
               {'action','thriller'},
               {'science-fiction','action','thriller'}]

In [None]:
# Binarize the genre sets: one indicator column per genre, 1 where the movie has that genre
multilabel = MultiLabelBinarizer()
movie_genre_multi = multilabel.fit_transform(movie_genre)
movie_genre_multi

array([[1, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 1],
       [1, 0, 1, 1]])

### Composite Transformer

In [None]:
# Mixed-type sample data: one numeric column and one categorical column.
# A DataFrame keeps a separate dtype per column. Building np.array(...) from
# mixed float/str rows would coerce every value to a string ('<U32'), which
# then turns the whole ColumnTransformer output into strings.
x = pd.DataFrame(
    [
        [20.0, 'male'],
        [11.2, 'female'],
        [15.6, 'female'],
        [13.0, 'male'],
        [18.6, 'male'],
        [16.4, 'female'],
    ],
    columns=['value', 'sex'],
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler , OneHotEncoder

# Each entry is (name, transformer, columns). Column 0 is used twice on purpose:
# once min-max scaled and once passed through unchanged, so both appear in the output.
# Column 1 is one-hot encoded.
ct = ColumnTransformer([('Scaler',MinMaxScaler(),[0]),
                           ('Pass Through','passthrough',[0]),
                           ('encoder',OneHotEncoder(),[1])])
ct.fit_transform(x)

array([['0.9999999999999998', '20.0', '0.0', '1.0'],
       ['0.0', '11.2', '1.0', '0.0'],
       ['0.5', '15.6', '1.0', '0.0'],
       ['0.20454545454545459', '13.0', '0.0', '1.0'],
       ['0.840909090909091', '18.6', '0.0', '1.0'],
       ['0.5909090909090906', '16.4', '1.0', '0.0']], dtype='<U32')

# Feature Selection

## Filter Based Method

### Variance Threshold

In [None]:
# Same toy dataset as in the DictVectorizer example: one dictionary per sample
data = [{'age':18,'weight':80},
        {'age':23,'weight':85},
        {'age':28,'weight':88},
        {'age':30,'weight':75},
        {'age':35,'weight':66}]

data    # 5 samples and 2 features

[{'age': 18, 'weight': 80},
 {'age': 23, 'weight': 85},
 {'age': 28, 'weight': 88},
 {'age': 30, 'weight': 75},
 {'age': 35, 'weight': 66}]

In [None]:
# Turn the list of dictionaries into a dense matrix (DictVectorizer was imported in the Feature Extraction section)
dv = DictVectorizer(sparse=False)
transformed_data = dv.fit_transform(data)
print(transformed_data)
# Variance of each column (axis=0), to compare against the threshold used next
np.var(transformed_data,axis=0)

[[18. 80.]
 [23. 85.]
 [28. 88.]
 [30. 75.]
 [35. 66.]]


array([34.16, 60.56])

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Features whose variance does not exceed the threshold are dropped
vt = VarianceThreshold(threshold=40)
data_new = vt.fit_transform(transformed_data)
data_new        # only the features (columns) whose variance is greater than 40 are kept

array([[80.],
       [85.],
       [88.],
       [75.],
       [66.]])

### SelectKBest

In [None]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,mutual_info_regression

In [None]:
# return_X_y=True returns the feature matrix and target as a tuple
x_california , y_california = fetch_california_housing(return_X_y = True)

# Use only the first 2000 samples; this rebinds X and y used by earlier sections
X , y = x_california[:2000] , y_california[:2000]
print('Shape of feature matrix before feature selection :',X.shape)

Shape of feature matrix before feature selection : (2000, 8)


In [None]:
from functools import partial

# mutual_info_regression uses random noise internally; fixing random_state
# makes the scores, and therefore the selected features, reproducible.
seeded_mutual_info = partial(mutual_info_regression, random_state=42)

# Keep the k=3 features with the highest mutual information with the target
skb = SelectKBest(seeded_mutual_info,k=3)
X_new = skb.fit_transform(X,y)

# The label says "after" because this is the shape once selection has been applied
print('Shape of feature matrix after feature selection :',X_new.shape)

Shape of feature matrix before feature selection : (2000, 3)


In [None]:
# Names of the features selected as the top features.
# Names have the form 'x<i>', where i is the 0-based column index in X.
skb.get_feature_names_out()     # Feature 1 , Feature 7 , Feature 8 (1-based)

array(['x0', 'x6', 'x7'], dtype=object)

### SelectPercentile

In [None]:
from functools import partial

from sklearn.feature_selection import SelectPercentile

# mutual_info_regression uses random noise internally; fixing random_state
# makes the scores, and therefore the selected features, reproducible.
seeded_mutual_info = partial(mutual_info_regression, random_state=42)

# Keep the top 30% of the features ranked by mutual information with the target
sp = SelectPercentile(seeded_mutual_info,percentile=30)
X_new = sp.fit_transform(X,y)
# The label says "after" because this is the shape once selection has been applied
print('Shape of feature matrix after feature selection :',X_new.shape)

Shape of feature matrix before feature selection : (2000, 3)


In [None]:
# Names have the form 'x<i>', where i is the 0-based column index in X
sp.get_feature_names_out()      # Feature 1 , Feature 7 , Feature 8 (1-based)

array(['x0', 'x6', 'x7'], dtype=object)

## Wrapper Based Selection

# PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project X onto its first two principal components.
# NOTE(review): X is not scaled here, so large-valued features may dominate - confirm this is intended.
pca = PCA(n_components=2)
pca.fit_transform(X)

array([[-921.39140547,   -4.64409386],
       [1157.68501435,    4.36053189],
       [-747.45259583,  -16.45779391],
       ...,
       [-422.42092376,  -11.56404156],
       [ 142.6029406 ,   -8.58194808],
       [-152.29308051,   11.90303535]])

In [None]:
pca.components_         # one row per principal component, one column (loading) per original feature

array([[ 3.58746278e-04, -5.26626273e-03, -3.55739438e-04,
        -1.04636644e-04,  9.99985993e-01,  1.07189337e-04,
        -3.72926795e-05, -3.18638902e-05],
       [ 2.83042309e-02, -9.96787172e-01,  7.04618553e-02,
         1.11176863e-02, -5.23215008e-03, -1.02817045e-03,
         1.45200590e-02,  1.69252465e-02]])

# Chaining Transformation
        > The sklearn.pipeline module provides utilities to build a composite estimator as a chain of transformers and estimators.

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

$1^{st}$ Method

In [None]:
# Each step is a (name, transformer) pair; steps run in the order listed:
# impute missing values -> standardize -> reduce to 4 principal components
estimators = [('simple_imputer',SimpleImputer()),
              ('Standard_scaler',StandardScaler()),
              ('pca',PCA(n_components=4))]
pipe = Pipeline(steps = estimators)

In [None]:
# Fit every step on X and return the output of the final step
data_pipe = pipe.fit_transform(X)

In [None]:
# First five rows of the pipeline output (4 columns, one per principal component)
data_pipe[:5]

array([[-0.26053874,  0.06579383,  1.68055807, -0.77069282],
       [-0.10709658,  1.99767063,  0.99434582, -1.64408012],
       [-0.30232474, -0.46858865,  1.85865104, -0.1842106 ],
       [-0.62216234, -0.89000134,  1.2813819 , -0.26584164],
       [-0.57294251, -1.46474484,  0.80753691, -0.46005408]])

$2^{nd}$ Method

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Same three steps built with make_pipeline, which needs no explicit step names; rebinds `pipe`
pipe = make_pipeline(SimpleImputer(missing_values=np.nan, strategy='mean'),StandardScaler(),PCA(n_components=4))

In [None]:
# Output of the make_pipeline version, stored separately from data_pipe
data_pipe2 = pipe.fit_transform(X)

In [None]:
# Show the result of the second (make_pipeline) method. Displaying data_pipe here
# would only repeat the first method's output and verify nothing about this one.
data_pipe2[:5]

array([[-0.26053874,  0.06579383,  1.68055807, -0.77069282],
       [-0.10709658,  1.99767063,  0.99434582, -1.64408012],
       [-0.30232474, -0.46858865,  1.85865104, -0.1842106 ],
       [-0.62216234, -0.89000134,  1.2813819 , -0.26584164],
       [-0.57294251, -1.46474484,  0.80753691, -0.46005408]])