**Import bibliotek:**

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, f1_score, roc_auc_score

**Tworzymy funkcję calculate_metrics i generate_model_LR:**

In [2]:
metrics_dataframe = pd.DataFrame(columns = ['Model', 'F1_score', 'AUC'])
metrics_dataframe
models = []
models_names = []
predictions_proba_list = []

def calculate_metrics(model, name, X_checked, y_checked):
    models.append(model)
    models_names.append(name)
    global metrics_dataframe
    predictions = model.predict(X_checked)
    predictions_proba = model.predict_proba(X_checked)
    predictions_proba_list.append(predictions_proba[:,1])

    f1_metric = f1_score(y_checked, predictions)
    auc_metric = roc_auc_score(y_checked, predictions_proba[:,1])
    new_row = pd.DataFrame([{'Model': name, 'F1_score': f1_metric, 'AUC': auc_metric}])
    metrics_dataframe = pd.concat([metrics_dataframe, new_row], ignore_index=True)
    

def generate_model_LR(penalty,C): 
    if penalty=="l1":
        model = LogisticRegression(penalty=penalty,C=C,solver='liblinear')
    elif penalty=="l2":
        model = LogisticRegression(penalty=penalty,C=C)
    elif penalty=="elasticnet":
        model = LogisticRegression(penalty=penalty,C=C,solver='saga', l1_ratio=0.1)
    else:
        raise ValueError("Nieprawidłowa wartość 'penalty'")
    
    name = f"LogisticRegression_penalty:{penalty},C:{C}"
    
    model.fit(X_train_standardized,y_train)
    metrics_dataframe = calculate_metrics(model, name, X_test_standardized, y_test)
    return metrics_dataframe

**Import bazy danych:**

In [3]:
diabetes = pd.read_csv('diabetes.csv')

X = diabetes.drop(['Diabetic'], axis=1).copy()
y = diabetes['Diabetic'].copy()

num_features = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']

**Podział danych na zmienne objaśniające (X) i objaśniane (y):**

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
scaler = StandardScaler()

X_train_standardized = scaler.fit_transform(X_train)
X_test_standardized = scaler.transform(X_test)

**Tworzymy zbiory parametrów: penalties oraz parametr C oraz urchamiamy w pętli funkcją generate_model_LR:**

In [6]:
penalties = ['l1','l2','elasticnet']
parametrs_C = [0.01,0.1,1,10,100]

for penalty in penalties:
    for C in parametrs_C:
      generate_model_LR(penalty,C)

  metrics_dataframe = pd.concat([metrics_dataframe, new_row], ignore_index=True)


**Tworzenie pipeline'ów i jego trenowanie:**

In [7]:
#Tworzenie pipeline zawierającego różne 'preprocessory' danych:
num_preparation = Pipeline(steps=[
                                ('fill_missing', SimpleImputer(strategy='median')),
                                ('polynomial_features', PolynomialFeatures(degree=3)),
                                ('scaler', StandardScaler())])


#Tworzenie transformatora danych w kolumnach - data_preparation:
data_preparation = ColumnTransformer(transformers=[('numeric_preprocessing', num_preparation, num_features)])


#Tworzymy pipeline zawierający - transormator danych w kolumnach i model ML - LogisticRegression
model_pipeline = Pipeline(steps=[
                                ('preprocessor',data_preparation),
                                ('model', LogisticRegression(max_iter=10000))
                                ])

model_pipeline.fit(X_train,y_train)

In [8]:
calculate_metrics(model_pipeline, "Pipeline", X_test, y_test)

In [9]:
display(metrics_dataframe)

Unnamed: 0,Model,F1_score,AUC
0,"LogisticRegression_penalty:l1,C:0.01",0.640573,0.85629
1,"LogisticRegression_penalty:l1,C:0.1",0.649863,0.855614
2,"LogisticRegression_penalty:l1,C:1",0.650576,0.85556
3,"LogisticRegression_penalty:l1,C:10",0.650576,0.855543
4,"LogisticRegression_penalty:l1,C:100",0.650576,0.855543
5,"LogisticRegression_penalty:l2,C:0.01",0.648199,0.855638
6,"LogisticRegression_penalty:l2,C:0.1",0.649835,0.855567
7,"LogisticRegression_penalty:l2,C:1",0.650576,0.855552
8,"LogisticRegression_penalty:l2,C:10",0.651316,0.855552
9,"LogisticRegression_penalty:l2,C:100",0.651316,0.855552


**Wnioski:**

Pipeline osiągną lepsze wyniki w porównaniu z modelami regresji logistycznej z różnymi parametrami regularyzacji - prawdopodobnie jest to efekt lepszego przygotowania danych na etapie preprocessingu. W przypadku modeli LR - model uczył się na danych które zostały tylko zeskalowane, natomiast pipeline uczył się na danych które były kolejno: uzupełniane w przypadku braku wartością mediany, następnie dane zostały przekształcowe w wielomiany, a następnie zostały zeskalowane.