## Transformações Usando Pipeline - ColumnTransformer

Para validação desse estudo foi extraída base de dados do Kaggle:
https://www.kaggle.com/datasets/kabure/german-credit-data-with-risk

Pipeline serve para organizar as etapas de normalizar e tratar os dados.

In [1]:
## importando as bibliotecas

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## importando dataset

df = pd.read_csv('german_credit_data.csv')

## lendo o arquivo

df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [2]:
df.shape

(1000, 11)

## 01 - Alterando A Coluna Risk Para Uma Dummy

In [3]:
df['Risk'] = np.where(df['Risk']=='bad', 1, 0)

df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,0
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,2,49,male,1,own,little,,2096,12,education,0
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,4,53,male,2,free,little,little,4870,24,car,1


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Risk
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,35.546,1.904,3271.258,20.903,0.3
std,288.819436,11.375469,0.653614,2822.736876,12.058814,0.458487
min,0.0,19.0,0.0,250.0,4.0,0.0
25%,249.75,27.0,2.0,1365.5,12.0,0.0
50%,499.5,33.0,2.0,2319.5,18.0,0.0
75%,749.25,42.0,2.0,3972.25,24.0,1.0
max,999.0,75.0,3.0,18424.0,72.0,1.0


In [5]:
df.nunique()

Unnamed: 0          1000
Age                   53
Sex                    2
Job                    4
Housing                3
Saving accounts        4
Checking account       3
Credit amount        921
Duration              33
Purpose                8
Risk                   2
dtype: int64

## 02 - Filtrando O Dataset Para Trabalhar Apenas Com As Colunas De Predição.

In [6]:
## filtrando o dataset para trabalhar apenas com colunas numéricas que importam na predição

df = df[['Age', 'Credit amount', 'Duration', 'Risk']].copy()

df.head()

Unnamed: 0,Age,Credit amount,Duration,Risk
0,67,1169,6,0
1,22,5951,48,1
2,49,2096,12,0
3,45,7882,42,0
4,53,4870,24,1


In [7]:
## separando o target e as features

features = df[['Age', 'Credit amount', 'Duration']]
labels = df[['Risk']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state=0)

## 03 - Pipeline

In [9]:
from sklearn.preprocessing import StandardScaler ## normalização dos dados
from sklearn.impute import SimpleImputer ## impute para os missings
from sklearn.tree import DecisionTreeClassifier

## sem pipeline, seguir os passos abaixo

pipe = Pipeline([('scaler', StandardScaler()),
                ('imputer', SimpleImputer(strategy='median')),
                ('clf', DecisionTreeClassifier())])

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.61

## 04 - Make Pipeline

In [10]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


pipe2 = make_pipeline(MinMaxScaler(),
                SimpleImputer(strategy='mean'),
                LogisticRegression())

pipe2.fit(X_train, y_train)
pipe2.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.7166666666666667

## 05 -  ColumnTransformer

In [11]:
df = pd.read_csv('german_credit_data.csv')

In [12]:
df['Risk'] = np.where(df['Risk']=='bad', 1, 0)

df = df[['Age', 'Credit amount', 'Duration', 'Purpose', 'Risk']].copy()

df.head()

Unnamed: 0,Age,Credit amount,Duration,Purpose,Risk
0,67,1169,6,radio/TV,0
1,22,5951,48,radio/TV,1
2,49,2096,12,education,0
3,45,7882,42,furniture/equipment,0
4,53,4870,24,car,1


In [13]:
## separando o target e as features

features = df[['Age', 'Credit amount', 'Duration', 'Purpose']]
labels = df[['Risk']]

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size = 0.3, random_state=0
)

In [14]:
numericas = ['Age', 'Credit amount', 'Duration']
categoricas = ['Purpose']

In [15]:
## OneHotEncoder - transforma em dummy

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

t = [('num', StandardScaler(), numericas),
    ('cat', OneHotEncoder(), categoricas)]

preprocessor = ColumnTransformer(transformers=t)

In [16]:
pipe_transformer = Pipeline(steps=[('preprocessor', preprocessor), ('clf', DecisionTreeClassifier())])

In [17]:
pipe_transformer.fit(X_train, y_train)

In [18]:
pipe_transformer.score(X_test, y_test)

0.62

## 06 - Loop para Testar vários Modelos

In [19]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, 
    AdaBoostClassifier,
    GradientBoostingClassifier,)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

In [20]:
classifiers_list = [
    KNeighborsClassifier(3),
    SVC(kernel='rbf', C=0.025, probability=True),
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

In [21]:
for classifier in classifiers_list:
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    print('Model score: %.3f' % pipe.score(X_test, y_test))

  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


KNeighborsClassifier(n_neighbors=3)
Model score: 0.690
SVC(C=0.025, probability=True)
Model score: 0.713
SVC()
Model score: 0.720
LogisticRegression()
Model score: 0.697
DecisionTreeClassifier()
Model score: 0.607


  return fit_method(estimator, *args, **kwargs)


RandomForestClassifier()
Model score: 0.690
AdaBoostClassifier()
Model score: 0.720


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GradientBoostingClassifier()
Model score: 0.710
