# Machine Learning Pipeline

## Imports

In [78]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder

# feature engine
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

## Data

In [79]:
# Carga de datos
data = pd.read_csv('../data/raw/diabetes_data_upload.csv')
data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [80]:
# Split de datos
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['class'], axis=1), # predictive variables
    data['class'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=0, # ramdom seed
)

X_train.shape, X_test.shape

((416, 16), (104, 16))

## Config

In [81]:
# Variables categoricas
CATEGORICAL_VARS = [
    'Gender ', 'Polyuria ', 'Polydipsia ', 'sudden weight loss ','weakness ', 'Polyphagia ',
    'Genital thrush ', 'visual blurring ', 'Itching ', 'Irritability ', 'delayed healing ',
    'partial paresis ', 'muscle stiffness ', 'Alopecia ', 'Obesity '
    ]

FEATURES = [
    'Age ', 
    'Gender ', 
    'Polyuria ', 
    'Polydipsia ', 
    'partial paresis '
    ]

TARGET = ['class']

In [82]:
# Aplicacion de las features
X_train = X_train[FEATURES]
X_test = X_test[FEATURES]

In [83]:
# Creacion del PipeLine
diabetes_class_pipe = Pipeline([
    # ===== Estandarizacion de datos =====
    #('stip_values', lambda df: df.applymap(lambda x: x.strip() if isinstance(x, str) else x)),


    # ==== Mapeo de variables categoricas ======
    ('categorical_encoder', OrdinalEncoder()),


    # ==== Scaler ==========
    ('scaler', MinMaxScaler()),


    # ==== Model =======
    ('randomForestClass', RandomForestClassifier(
    bootstrap=True, ccp_alpha=0.0, class_weight=None,
    criterion='gini', max_depth=None, max_features='sqrt',
    max_leaf_nodes=None, max_samples=None,
    min_impurity_decrease=0.0, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    monotonic_cst=None, n_estimators=100, n_jobs=-1,
    oob_score=False, random_state=123, verbose=0,
    warm_start=False
    ))

])

In [84]:
# Ajuste del pipeLine
diabetes_class_pipe.fit(X_train, y_train)

In [85]:
# Metricas
# Train
pred = diabetes_class_pipe.predict(X_train)

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
pred_encoded = encoder.transform(pred)

print('TRAIN')
print('accuracy: {}'.format(accuracy_score(y_train_encoded, pred_encoded)))
print('recall: {}'.format(recall_score(y_train_encoded, pred_encoded)))
print('F1: {}'.format(recall_score(y_train_encoded, pred_encoded)))


TRAIN
accuracy: 0.9639423076923077
recall: 0.953125
F1: 0.953125


In [46]:
# Export
joblib.dump(diabetes_class_pipe, '../pipelines/pipeLineClassificationModel.joblib') 

['../pipelines/pipeLineClassificationModel.joblib']

# PipeLine V2

In [1]:
# Imports
import pandas as pd
import numpy as np

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


In [6]:
# Carga de datos
data = pd.read_csv('../data/raw/diabetes_data_upload.csv')

In [3]:
# Quitar espacios en blanco
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [4]:
# Mapeo del target
data['class'] = data['class'].map({'Positive': 1, 'Negative': 0})

In [None]:
# Split de datos
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['class'], axis=1), # predictive variables
    data['class'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=0, # ramdom seed
)

X_train.shape, X_test.shape

((416, 17), (104, 17))

In [6]:
# Config
FEATURES = [
    'Age ', 
    'Gender ', 
    'Polyuria ', 
    'Polydipsia ', 
    'partial paresis '
    ]

NUMERICAL_VAR = ['Age ']

CATEGORICAL_VAR = [
    'Gender ', 
    'Polyuria ', 
    'Polydipsia ', 
    'partial paresis '
    ]

TARGET = ['class']

In [7]:
# Aplicacion de las features
X_train = X_train[FEATURES]
X_test = X_test[FEATURES]

In [8]:
# Creacion del PipeLine
diabetes_class_pipe_v2 = Pipeline([
    
    # ==== Preprossesor ======
    ('preprocessing', ColumnTransformer(
        transformers=[
            ('categorical', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_VAR),
            ('numerical', MinMaxScaler(), NUMERICAL_VAR)
        ])),

    # ==== Model =======
    ('randomForestClass', RandomForestClassifier(
    bootstrap=True, ccp_alpha=0.0, class_weight=None,
    criterion='gini', max_depth=None, max_features='sqrt',
    max_leaf_nodes=None, max_samples=None,
    min_impurity_decrease=0.0, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    monotonic_cst=None, n_estimators=100, n_jobs=-1,
    oob_score=False, random_state=123, verbose=0,
    warm_start=False))
])

In [9]:
# Ajuste del pipeLine
diabetes_class_pipe_v2.fit(X_train, y_train)

In [10]:
# Metricas
# Train
pred = diabetes_class_pipe_v2.predict(X_train)

print('TRAIN')
print('accuracy: {}'.format(accuracy_score(y_train, pred)))
print('recall: {}'.format(recall_score(y_train, pred)))
print('F1: {}'.format(f1_score(y_train, pred)))

# Test
pred = diabetes_class_pipe_v2.predict(X_test)

print('\nTEST')
print('accuracy: {}'.format(accuracy_score(y_test, pred)))
print('recall: {}'.format(recall_score(y_test, pred)))
print('F1: {}'.format(f1_score(y_test, pred)))


TRAIN
accuracy: 0.9639423076923077
recall: 0.953125
F1: 0.9701789264413518

TEST
accuracy: 0.9519230769230769
recall: 0.984375
F1: 0.9618320610687023


In [105]:
pred

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1], dtype=int64)

In [11]:
# Desmapeo
resultado_pred = np.where(pred == 1, "POSITIVE", "NEGATIVE")
resultado_pred

array(['POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE',
       'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE',
       'NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE',
       'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATI

In [103]:
# Export
joblib.dump(diabetes_class_pipe_v2, '../pipelines/pipeLineClassificationModel_v2.joblib')

['../pipelines/pipeLineClassificationModel_v2.joblib']