**Install and load necessary modules/libraries**

In [341]:
# load necessary modules

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce
import mlflow

from sklearn import metrics
from sklearn import datasets
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report

In [412]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('penguin_logistic_regression_v5')

<Experiment: artifact_location='./mlruns/6', experiment_id='6', lifecycle_stage='active', name='penguin_logistic_regression_v5', tags={}>

**Data preprocessing**

In [413]:
# load csv file

df = pd.read_csv('../data/penguins.csv')

Unnamed: 0.1,Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,3,4,Adelie,Torgersen,,,,,,2007
4,4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [415]:
# drop duplicated if exist

df=df.drop_duplicates()

In [416]:
# check null values

df.isnull().sum()

Unnamed: 0            0
rowid                 0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [419]:
# drop columns with missing values

df.dropna(axis=0,inplace=True)

(333, 10)

In [422]:
# one hot encoding - nominal data

encoder=ce.OneHotEncoder(cols=['species','island'],handle_unknown='return_nan', return_df=True,use_cat_names=True)

df_enc=encoder.fit_transform(df)

Unnamed: 0.1,Unnamed: 0,rowid,species_Adelie,species_Gentoo,species_Chinstrap,island_Torgersen,island_Biscoe,island_Dream,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,1,1.0,0.0,0.0,1.0,0.0,0.0,39.1,18.7,181.0,3750.0,male,2007
1,1,2,1.0,0.0,0.0,1.0,0.0,0.0,39.5,17.4,186.0,3800.0,female,2007
2,2,3,1.0,0.0,0.0,1.0,0.0,0.0,40.3,18.0,195.0,3250.0,female,2007
4,4,5,1.0,0.0,0.0,1.0,0.0,0.0,36.7,19.3,193.0,3450.0,female,2007
5,5,6,1.0,0.0,0.0,1.0,0.0,0.0,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,339,340,0.0,0.0,1.0,0.0,0.0,1.0,55.8,19.8,207.0,4000.0,male,2009
340,340,341,0.0,0.0,1.0,0.0,0.0,1.0,43.5,18.1,202.0,3400.0,female,2009
341,341,342,0.0,0.0,1.0,0.0,0.0,1.0,49.6,18.2,193.0,3775.0,male,2009
342,342,343,0.0,0.0,1.0,0.0,0.0,1.0,50.8,19.0,210.0,4100.0,male,2009


In [423]:
# add a new column with labels

df_enc.loc[df_enc.sex == 'male', 'label'] = int(1)
df_enc.loc[df_enc.sex == 'female', 'label'] = int(0)
df_enc['label'].astype('float')

Unnamed: 0.1,Unnamed: 0,rowid,species_Adelie,species_Gentoo,species_Chinstrap,island_Torgersen,island_Biscoe,island_Dream,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,label
0,0,1,1.0,0.0,0.0,1.0,0.0,0.0,39.1,18.7,181.0,3750.0,male,2007,1.0
1,1,2,1.0,0.0,0.0,1.0,0.0,0.0,39.5,17.4,186.0,3800.0,female,2007,0.0
2,2,3,1.0,0.0,0.0,1.0,0.0,0.0,40.3,18.0,195.0,3250.0,female,2007,0.0
4,4,5,1.0,0.0,0.0,1.0,0.0,0.0,36.7,19.3,193.0,3450.0,female,2007,0.0
5,5,6,1.0,0.0,0.0,1.0,0.0,0.0,39.3,20.6,190.0,3650.0,male,2007,1.0


In [424]:
# check number of males/females to inspect if dataset is imbalanced

df_enc['label'].value_counts()

1.0    168
0.0    165
Name: label, dtype: int64

In [425]:
# get labels

labels = df_enc[['label']]
y = labels.to_numpy().reshape(-1,)

In [427]:
# drop useless columns

df_enc.drop(columns=['rowid','sex'],axis=1,inplace=True)

In [428]:
# get features

X = df_enc.iloc[:,:11].values



**ML: Logistic regression**

In [429]:
# split dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

In [448]:
# build a model

#mlflow.sklearn.autolog()

with mlflow.start_run():
    
    #mlflow.set_tag("author", "Mikhail")
    #mlflow.set_tag("logistic-regression","model")
    mlflow.log_param("data", "../data/penguins.csv")
    
    #regularization = 0.01
    #mlflow.log_param("regulatization", regularization)

    # standartization, fitting and estimation of the errors
    scaler = StandardScaler()
    log_reg = LogisticRegression(C=regularization, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='ovr', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False)
    pipe_def_model = Pipeline([('scaler', scaler), ('log_reg', log_reg)])
    pipe_def_model.fit(X_train, y_train)
    #err_train_def_model = pipe_def_model.score(X_train, y_train)
    #err_val_def_model = pipe_def_model.score(X_val, y_val)
    #y_pred_def_model = pipe_def_model.predict(X)
    #accuracy_def_model = pipe_def_model.score(X, y)

    #mlflow.log_metric("training-error", err_train_def_model)
    #mlflow.log_metric("validation-error", err_val_def_model)
    #mlflow.log_metric("accuracy", accuracy_def_model)
    
    # printing
    print(f"Accuracy of classification: {round(100*accuracy_def_model, 2)}%")
    print(f"Training error: {err_train_def_model}")
    print(f"Validation error: {err_val_def_model}")
    
    #model_name = "model_lr.bin"
    #with open("/models/" + model_name, 'wb') as fout:
    #    pickle.dump((cv, log_reg), fout)
        
    #mlflow.log_artifact(local_path="/models" + model_name, artifact_path="/")

Accuracy of classification: 90.69%
Training error: 0.9135338345864662
Validation error: 0.8805970149253731
