In [1]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the sample dataset. Set the SAMPLE_DATASET_PATH environment variable
# to point at your own copy of the CSV; the fallback is the original machine-specific
# path, so existing runs of this notebook keep working unchanged.
DATASET_PATH = os.environ.get(
    "SAMPLE_DATASET_PATH",
    "C:\\Users\\Nitro\\Desktop\\Inteligencia_Artificial\\PreProcessamento_Udemy\\sample_dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [3]:
# Positional split: every column except the last one is a feature,
# and the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

Transformation of numerical and categorical columns

- Numerical variables:
  * Fill missing values with the median
  * Standardization

- Categorical variables:
  * Fill missing values with the most frequent value
  * One-hot encoding

In [4]:
# Numerical branch: median imputation followed by standardization.
numerical_steps = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Categorical branch: most-frequent imputation followed by dense one-hot encoding.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

# Route columns by dtype: 'object' columns go to the categorical branch,
# every other dtype goes to the numerical branch.
transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_steps, make_column_selector(dtype_exclude='object')),
        ('categorical', categorical_steps, make_column_selector(dtype_include='object')),
    ]
)

PCA

In [7]:
# Project the preprocessed features onto the first 10 principal components.
# random_state is fixed so the result is reproducible whenever svd_solver='auto'
# resolves to a randomized solver; it has no effect with the exact solvers.
pca = PCA(n_components=10, random_state=42)

Feature selection

In [5]:
# Keep the 5 features that score highest under f_classif against the target.
selector = SelectKBest(score_func=f_classif, k=5)

Pipeline

In [8]:
# Chain preprocessing, dimensionality reduction and feature selection.
# The step names are the prefixes used with set_params (e.g. pca__n_components).
pipeline = Pipeline(
    steps=[
        ('transformation', transformer),
        ('pca', pca),
        ('feature_selection', selector),
    ]
)

In [9]:
# Fit every step of the pipeline on (X, y) and display the transformed matrix.
pipeline.fit_transform(X, y)

array([[ 8.52256696,  2.64397044, -1.57565677, -3.52420989, -2.60947195],
       [ 2.79623958, -3.89824767,  0.10426929, -1.61476221, -0.15804423],
       [ 4.56985298, -1.18416154, -0.23154002, -0.95078422,  0.12680809],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438,  2.06978788,  1.97780945],
       [10.21620878,  0.39525002, -2.47257669,  1.09076934, -0.72400782],
       [-5.32259512, -0.24553988,  1.22278786,  1.40541489,  0.49452765]],
      shape=(569, 5))

In [10]:
# Manual step 1: run the preprocessing on its own, using the same `transformer`
# object that was passed to the pipeline.
X1 = transformer.fit_transform(X)

In [11]:
# Manual step 2: PCA on the preprocessed matrix.
X2 = pca.fit_transform(X1)

In [12]:
# Manual step 3: feature selection on the PCA output; displayed so it can be
# compared with the pipeline result shown earlier.
X3 = selector.fit_transform(X2, y)
X3

array([[ 8.52256696,  2.64397044, -1.57565677, -3.52420989, -2.60947195],
       [ 2.79623958, -3.89824767,  0.10426929, -1.61476221, -0.15804423],
       [ 4.56985298, -1.18416154, -0.23154002, -0.95078422,  0.12680809],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438,  2.06978788,  1.97780945],
       [10.21620878,  0.39525002, -2.47257669,  1.09076934, -0.72400782],
       [-5.32259512, -0.24553988,  1.22278786,  1.40541489,  0.49452765]],
      shape=(569, 5))

Change the number of PCA components to 15 and the number of selected features to 3

In [13]:
# Nested parameters are addressed as <step name>__<parameter name>.
pipeline.set_params(
    pca__n_components=15,
    feature_selection__k=3,
)

0,1,2
,steps,"[('transformation', ...), ('pca', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_components,15
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,score_func,<function f_c...0022D6BFAAFC0>
,k,3


In [14]:
# Refit the pipeline so the parameters changed via set_params take effect,
# and display the new transformed matrix.
pipeline.fit_transform(X, y)

array([[ 8.52256696,  2.64397044, -1.57565677],
       [ 2.79623958, -3.89824767,  0.10426929],
       [ 4.56985298, -1.18416154, -0.23154002],
       ...,
       [ 1.05053099, -2.22225232,  1.11958438],
       [10.21620878,  0.39525002, -2.47257669],
       [-5.32259512, -0.24553988,  1.22278786]], shape=(569, 3))

Change the numerical imputation strategy to 'mean'

In [None]:
# The 'numerical' branch is itself a pipeline created with make_pipeline, which names
# each step after its lowercased class name, so the imputer must be addressed through
# the 'simpleimputer' step. Targeting 'transformation__numerical__strategy' directly
# raises "Invalid parameter 'strategy'" because the inner Pipeline has no such parameter.
pipeline.set_params(transformation__numerical__simpleimputer__strategy='mean')
pipeline.fit_transform(X, y)