In [1]:
# load necessary modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce
import mlflow

from sklearn import metrics
from sklearn import datasets
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# load data
df = pd.read_csv('../data/penguins.csv')

# data preprocessing: remove duplicated rows, then rows with any missing value
# (chained instead of mutating in place, so re-running the cell is idempotent)
df = df.drop_duplicates().dropna(axis=0)

# one hot encoding - nominal data
encoder = ce.OneHotEncoder(cols=['species', 'island'], handle_unknown='return_nan',
                           return_df=True, use_cat_names=True)
df_enc = encoder.fit_transform(df)

# add a new column with labels: male -> 1.0, female -> 0.0 (any other value of
# `sex` becomes NaN). The cast is assigned back; the original called
# .astype('float') and discarded the result.
df_enc['label'] = df_enc['sex'].map({'male': 1, 'female': 0}).astype('float')

# check number of males/females to inspect if dataset is imbalanced
# (printed explicitly: a bare expression in the middle of a cell shows nothing)
print(df_enc['label'].value_counts())

# get labels
labels = df_enc[['label']]
y = labels.to_numpy().reshape(-1,)

# drop useless columns
df_enc = df_enc.drop(columns=['rowid', 'sex'])

# get features: every remaining column except the target, selected by name so
# the label cannot leak into X if the number of columns changes.
# NOTE(review): the original used the positional slice iloc[:, :11]; confirm
# that 11 was simply the count of non-label columns for this CSV.
feature_columns = [col for col in df_enc.columns if col != 'label']
X = df_enc[feature_columns].values

# split dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [2]:
# Show where MLflow records runs and which experiments already exist.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# `mlflow.list_experiments` was removed in MLflow 2.0 in favour of
# `mlflow.search_experiments`; prefer the new call and fall back to the old
# one so the cell also runs on older MLflow releases.
if hasattr(mlflow, 'search_experiments'):
    experiments = mlflow.search_experiments()
else:
    experiments = mlflow.list_experiments()
experiments

tracking URI: 'file:///Users/mikhailkuklin/repos/mlfow_tracking_example/notebooks/mlruns'


[<Experiment: artifact_location='file:///Users/mikhailkuklin/repos/mlfow_tracking_example/notebooks/mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>]

In [3]:
def train_log_reg(X_train, y_train):
    """Fit a scaling + logistic-regression pipeline on the training data.

    The pipeline has two steps: ``'scaler'`` (a ``StandardScaler``) followed by
    ``'log_reg'`` (an L2-penalised ``LogisticRegression`` with ``C=0.01``,
    the ``lbfgs`` solver and the one-vs-rest scheme).

    Parameters
    ----------
    X_train : feature matrix passed to ``Pipeline.fit``.
    y_train : target values passed to ``Pipeline.fit``.

    Returns
    -------
    The fitted ``Pipeline``.
    """
    classifier = LogisticRegression(
        penalty='l2',
        C=0.01,
        solver='lbfgs',
        multi_class='ovr',
        max_iter=100,
        tol=0.0001,
        fit_intercept=True,
        intercept_scaling=1,
    )
    steps = [('scaler', StandardScaler()), ('log_reg', classifier)]
    # Pipeline.fit returns the pipeline itself, so it can be returned directly.
    return Pipeline(steps).fit(X_train, y_train)

In [4]:
def predict(model, X_val):
    """Return ``model.predict(X_val)``, i.e. the model's predictions for ``X_val``.

    Parameters
    ----------
    model : any object exposing a ``predict`` method.
    X_val : the samples forwarded unchanged to ``model.predict``.
    """
    return model.predict(X_val)

In [5]:
def predict_prob(model, X_val):
    """Return ``model.predict_proba(X_val)`` for the given samples.

    Parameters
    ----------
    model : any object exposing a ``predict_proba`` method.
    X_val : the samples forwarded unchanged to ``model.predict_proba``.
    """
    return model.predict_proba(X_val)

In [14]:
def get_metrics(y_true, y_pred, y_pred_proba):
    """Compute classification metrics for a set of predictions.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels, compared against ``y_true`` for accuracy,
        precision and recall.
    y_pred_proba : predicted probabilities, used for the log loss.

    Returns
    -------
    dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
    ``'entropy'`` (the latter holds the log loss).
    """
    from sklearn import metrics as skm

    return {
        'accuracy': skm.accuracy_score(y_true, y_pred),
        'precision': skm.precision_score(y_true, y_pred),
        'recall': skm.recall_score(y_true, y_pred),
        'entropy': skm.log_loss(y_true, y_pred_proba),
    }

In [7]:
def get_confusion_matrix(clf, X_val, y_val, path='confusion_matrix.png'):
    """Plot the confusion matrix of ``clf`` on the given data and save it.

    The original body called ``plot.confusion_matrix``, but no name ``plot``
    exists in this notebook, so the function raised ``NameError``. The plot is
    now built with ``ConfusionMatrixDisplay.from_estimator`` from the
    ``sklearn.metrics`` module imported at the top of the notebook.

    Parameters
    ----------
    clf : fitted classifier (or pipeline ending in one).
    X_val : input samples the classifier predicts on.
    y_val : true labels for ``X_val``.
    path : file the figure is written to; defaults to the previously
        hard-coded ``'confusion_matrix.png'``.

    Returns
    -------
    None. The figure is saved to ``path`` as a side effect.
    """
    cm_display = metrics.ConfusionMatrixDisplay.from_estimator(clf, X_val, y_val)
    # Save through the display's own figure rather than pyplot's "current"
    # figure, so the right figure is written even if others are open.
    cm_display.figure_.savefig(path)
In [15]:
# Fit the scaling + logistic-regression pipeline on the training split.
model = train_log_reg(X_train, y_train)

# Predicted labels and predicted probabilities for the validation split.
y_pred = predict(model, X_val)
y_pred_proba = predict_prob(model, X_val)

# Validation metrics. The bare name on the last line makes the notebook render
# the dict; an assignment alone produces no cell output.
metrics_run = get_metrics(y_val, y_pred, y_pred_proba)
metrics_run

{'accuracy': 0.8805970149253731,
 'precision': 0.8421052631578947,
 'recall': 0.9411764705882353,
 'entropy': 0.5198353321439548}

In [5]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('penguins_log_reg_v2')

# standardization and fitting: reuse train_log_reg instead of repeating its
# body; it builds the same ('scaler', 'log_reg') pipeline with the same
# hyperparameters and fits it on the training split.
pipe_def_model = train_log_reg(X_train, y_train)
# Keep the individual steps available under their previous names.
scaler = pipe_def_model.named_steps['scaler']
log_reg = pipe_def_model.named_steps['log_reg']

# For a classifier, `score` returns the mean accuracy, not an error rate.
# The `err_*` variable names are kept so later cells that refer to them still
# work, but the values they hold are accuracies.
err_train_def_model = pipe_def_model.score(X_train, y_train)
err_val_def_model = pipe_def_model.score(X_val, y_val)
# Predictions and accuracy over the full dataset (training + validation rows).
y_pred_def_model = pipe_def_model.predict(X)
accuracy_def_model = pipe_def_model.score(X, y)

# printing (labels corrected: the reported numbers are accuracies)
print(f"Accuracy of classification: {round(100*accuracy_def_model, 2)}%")
print(f"Training accuracy: {err_train_def_model}")
print(f"Validation accuracy: {err_val_def_model}")

Accuracy of classification: 90.69%
Training error: 0.9135338345864662
Validation error: 0.8805970149253731


: 