<a href="https://colab.research.google.com/github/MpRonald/Machine-Learning/blob/main/Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get Data

In [122]:
#!pip install category_encoders

## Imports

In [212]:
from sklearn.pipeline import Pipeline
from sklearn import datasets, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
import pandas as pd

In [124]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [125]:
data = pd.read_csv('https://github.com/MpRonald/datasets/blob/main/adult_census.csv?raw=true')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


# Clean, Prepare and Manipulate Data

In [126]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [127]:
data.workclass.value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [128]:
data['workclass'] = data['workclass'].replace(['?'], 'undefined')
data.workclass.value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
undefined            1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [129]:
data.occupation.value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [130]:
data['occupation'] = data['occupation'].replace(['?'], 'undefined')
data.workclass.value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
undefined            1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [131]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,undefined,77053,HS-grad,9,Widowed,undefined,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,undefined,186061,Some-college,10,Widowed,undefined,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [132]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [133]:
data.duplicated().sum()

24

In [134]:
data.drop_duplicates(inplace=True)
data.duplicated().sum()

0

In [135]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32537 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   workclass       32537 non-null  object
 2   fnlwgt          32537 non-null  int64 
 3   education       32537 non-null  object
 4   education.num   32537 non-null  int64 
 5   marital.status  32537 non-null  object
 6   occupation      32537 non-null  object
 7   relationship    32537 non-null  object
 8   race            32537 non-null  object
 9   sex             32537 non-null  object
 10  capital.gain    32537 non-null  int64 
 11  capital.loss    32537 non-null  int64 
 12  hours.per.week  32537 non-null  int64 
 13  native.country  32537 non-null  object
 14  income          32537 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


In [136]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32537.0,38.585549,13.637984,17.0,28.0,37.0,48.0,90.0
fnlwgt,32537.0,189780.848511,105556.471009,12285.0,117827.0,178356.0,236993.0,1484705.0
education.num,32537.0,10.081815,2.571633,1.0,9.0,10.0,12.0,16.0
capital.gain,32537.0,1078.443741,7387.957424,0.0,0.0,0.0,0.0,99999.0
capital.loss,32537.0,87.368227,403.101833,0.0,0.0,0.0,0.0,4356.0
hours.per.week,32537.0,40.440329,12.346889,1.0,40.0,40.0,45.0,99.0


In [137]:
data['capital.gain'].value_counts()

0        29825
15024      347
7688       284
7298       246
99999      159
         ...  
1111         1
4931         1
7978         1
5060         1
2538         1
Name: capital.gain, Length: 119, dtype: int64

# Train Model

In [138]:
X = data.iloc[:,:13]
y = data.income

In [139]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week
0,90,undefined,77053,HS-grad,9,Widowed,undefined,Not-in-family,White,Female,0,4356,40
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18
2,66,undefined,186061,Some-college,10,Widowed,undefined,Unmarried,Black,Female,0,4356,40
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40


In [140]:
y

0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
32556    <=50K
32557    <=50K
32558     >50K
32559    <=50K
32560    <=50K
Name: income, Length: 32537, dtype: object

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)

In [151]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((27656, 13), (4881, 13), (27656,), (4881,))

In [152]:
# selecionando colunas não numéricas
data.select_dtypes(include='object')

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
0,undefined,HS-grad,Widowed,undefined,Not-in-family,White,Female,United-States,<=50K
1,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,United-States,<=50K
2,undefined,Some-college,Widowed,undefined,Unmarried,Black,Female,United-States,<=50K
3,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,United-States,<=50K
4,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,United-States,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,United-States,<=50K
32557,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32558,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32559,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K


In [160]:
# one hot encoder
one = OneHotEncoder(use_cat_names=True)
X_train = one.fit_transform(X_train)
X_test = one.transform(X_test)
X_train.shape, X_test.shape



((27656, 66), (4881, 66))

In [166]:
scaler_train = StandardScaler().fit(X_train)
scaler_test = StandardScaler().fit(X_test)
X_train = scaler_train.transform(X_train)
X_test = scaler_train.transform(X_test)

In [167]:
clf_tree = tree.DecisionTreeClassifier().fit(X_train, y_train)

# Test Data

In [168]:
clf_tree.predict(X_test)

array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '<=50K', '<=50K'],
      dtype=object)

In [169]:
accuracy = clf_tree.score(X_test, y_test)
accuracy

0.8172505634091375

# Creating Pipelines

In [170]:
pip_1 = Pipeline([
    ('one', OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('clf', tree.DecisionTreeClassifier())])

In [171]:
pip_1.steps

[('one', OneHotEncoder()),
 ('scaler', StandardScaler()),
 ('clf', DecisionTreeClassifier())]

In [172]:
X = data.iloc[:,:13]
y = data.income
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)

In [173]:
pip_1.fit(X_train, y_train)

Pipeline(steps=[('one',
                 OneHotEncoder(cols=['workclass', 'education', 'marital.status',
                                     'occupation', 'relationship', 'race',
                                     'sex'])),
                ('scaler', StandardScaler()),
                ('clf', DecisionTreeClassifier())])

In [174]:
accuracy_pip = pip_1.score(X_test, y_test)
accuracy, accuracy_pip

(0.8172505634091375, 0.8170456873591477)

# Creating Another Pipelines

In [189]:
pip_min_max = Pipeline([
    ('one', OneHotEncoder()),
    ('scaler', MinMaxScaler()),
    ('clf', tree.DecisionTreeClassifier())])

pip_max_depth = Pipeline([
    ('one', OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('clf', tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf=4))])

clf_rf = RandomForestClassifier(n_estimators=200, random_state=42)
pip_random_forest = Pipeline([
    ('one', OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('clf', clf_rf)])

In [190]:
pip_min_max.fit(X_train, y_train)
pip_max_depth.fit(X_train, y_train)
pip_random_forest.fit(X_train, y_train)

Pipeline(steps=[('one',
                 OneHotEncoder(cols=['workclass', 'education', 'marital.status',
                                     'occupation', 'relationship', 'race',
                                     'sex'])),
                ('scaler', StandardScaler()),
                ('clf',
                 RandomForestClassifier(n_estimators=200, random_state=42))])

In [191]:
print(f'Accuracy Random Forest: {pip_random_forest.score(X_test, y_test)}')
print(f'Accuracy Min-Max: {pip_min_max.score(X_test, y_test)}')
print(f'Accuracy Max Depth: {pip_max_depth.score(X_test, y_test)}')

Accuracy Random Forest: 0.8553575087072322
Accuracy Min-Max: 0.8154066789592297
Accuracy Max Depth: 0.8645769309567711


## Processing Distincts Columns

In [203]:
# using median
pip_median = Pipeline(steps=[
    ('median_', SimpleImputer(strategy='median'))])

# using frequence
frequence = Pipeline(steps=[(
    'frequence', SimpleImputer(strategy='most_frequent'))])

In [204]:
data_cleaning = ColumnTransformer(
    transformers=[
        ('median_', pip_median, ['education.num']),
        ('frequence', frequence, ['race'])
    ]
)

## Final Pipeline

In [205]:
final_pipeline = Pipeline([
    ('data_cleaning', data_cleaning),
    ('one', OneHotEncoder()),
    ('std', StandardScaler()),
    ('clf', tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf=4))])

In [206]:
final_pipeline.fit(X_train, y_train)

Pipeline(steps=[('data_cleaning',
                 ColumnTransformer(transformers=[('median_',
                                                  Pipeline(steps=[('median_',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['education.num']),
                                                 ('frequence',
                                                  Pipeline(steps=[('frequence',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['race'])])),
                ('one', OneHotEncoder(cols=[0, 1])), ('std', StandardScaler()),
                ('clf',
                 DecisionTreeClassifier(max_depth=10, min_samples_leaf=4))])

In [208]:
final_pipeline.predict(X_test)

array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '<=50K', '<=50K'],
      dtype=object)

In [211]:
final_pipeline.score(X_test, y_test)

0.7826265109608687

## Grid Search Pipelines

In [237]:
param_grid = dict(clf__max_depth = [5,10,15,20,25,30,35],
                  clf__criterion = ['gini', 'entropy', 'log_loss'],
                  clf__splitter = ['best', 'random'])


grid = GridSearchCV(final_pipeline, param_grid=param_grid, cv=5, scoring='accuracy').fit(X, y)
grid.cv_results_

70 fits failed out of a total of 210.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py", line 937, in fit
    super().fit(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py", line 352, in fit
    criterion = CRITERIA_CLF[self.criterion](
KeyError: 'log_loss'

 0.77

{'mean_fit_time': array([0.16282816, 0.15719633, 0.16749773, 0.16381431, 0.17002759,
        0.16440792, 0.16499567, 0.16148357, 0.16744194, 0.16724277,
        0.16730289, 0.16620965, 0.16980457, 0.16323781, 0.1603611 ,
        0.15783195, 0.16576176, 0.16183295, 0.16562076, 0.16296754,
        0.16968536, 0.16493454, 0.17582231, 0.17234836, 0.1746551 ,
        0.16631999, 0.17127891, 0.16461391, 0.14867411, 0.21683331,
        0.15932841, 0.18388224, 0.41112318, 0.25681968, 0.2329412 ,
        0.16303577, 0.14912038, 0.15146704, 0.15152011, 0.15315981,
        0.15110321, 0.15175099]),
 'std_fit_time': array([0.00529781, 0.00487013, 0.00627481, 0.00310272, 0.00592231,
        0.00295746, 0.00356304, 0.0035215 , 0.00854464, 0.00550241,
        0.00562133, 0.00408486, 0.01095079, 0.00573969, 0.00254364,
        0.00465124, 0.00682195, 0.0050988 , 0.00655538, 0.00295857,
        0.00398752, 0.00422418, 0.00808549, 0.00531668, 0.01244755,
        0.00510918, 0.00791447, 0.0039217 , 0.007

In [238]:
grid.best_params_

{'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__splitter': 'best'}

In [239]:
grid.best_score_

0.7792048778446624