In [1]:
import os

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV

import MLOps.DataGoverner as DG
from MLOps import DBManager
from MLOps import Model

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Environment flag forwarded to the MLOps helpers below; true only when the
# IN_DOCKER environment variable is exactly 'Yes'.
IN_DOCKER = os.getenv('IN_DOCKER') == 'Yes'

dbm = DBManager(dev_db=True, in_docker=IN_DOCKER)
df = DG.load_tabular_dataset(dbm, dataset_name='Iris', in_docker=IN_DOCKER)

# Role of each column used by the model: 'input' columns are the features,
# the 'target' column is the label to predict.
dataroles = {
    'petal_length' : 'input',
    'petal_width'  : 'input',
    'sepal_length' : 'input',
    'sepal_width'  : 'input',
    'class'        : 'target'
}
# Resolve the column lists once; dict insertion order fixes the feature order.
input_columns = [col for col, role in dataroles.items() if role == 'input']
target_columns = [col for col, role in dataroles.items() if role == 'target']

X = df[input_columns]
# Selecting with a list keeps y as a single-column DataFrame, exactly as before.
y = df[target_columns]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values with the column mean, standardise, then classify.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Keys use the '<step>__<parameter>' form so they reach the 'classifier' step.
# NOTE(review): recent scikit-learn releases appear to deprecate 'liblinear'
# for problems with more than two classes -- confirm against the installed version.
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs', 'liblinear']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='balanced_accuracy', verbose=1)
# Pass the labels as a flat 1-D array: a column-vector y makes scikit-learn
# emit a DataConversionWarning and flatten it internally anyway. The values
# seen by the estimator are the same, so the search result does not change.
grid_search.fit(X_train, y_train.to_numpy().ravel())

# Pipeline refit on the full training split with the best parameter combination.
best = grid_search.best_estimator_

print("Best parameters found: ", grid_search.best_params_)
# The reported score is the mean cross-validated *balanced* accuracy, because
# that is the `scoring` metric given to GridSearchCV above.
print("Best cross-validation accuracy: ", grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters found:  {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best cross-validation accuracy:  0.9571428571428573


In [5]:
# Describe the classifier with the hyper-parameters the grid search actually
# selected instead of hard-coded values, so the stored metadata cannot
# disagree with the tuned pipeline passed as `estimator`.
# best_params_ keys look like 'classifier__C'; dropping the step prefix
# yields plain LogisticRegression argument names ('C', 'penalty', 'solver').
best_classifier_parameters = {
    key.split('__', 1)[-1]: value
    for key, value in grid_search.best_params_.items()
}

# Bundle the tuned pipeline with its descriptive metadata in the project's
# Model wrapper.
model = Model(
    name                ='Iris LR pipeline',
    estimator_class     =LogisticRegression,
    dataset_name        ='Iris',
    estimator_parameters=best_classifier_parameters,
    estimator           =best,
    features_names      =[col for col in dataroles if dataroles[col] == 'input'],
    dataroles           =dataroles
)