In [1]:
import openml
import numpy as np

# Import danych

In [2]:
datasets = {}

### Diabetes

In [3]:
diabetes_dataset = openml.datasets.get_dataset(37)

In [4]:
X, y, _, columns = diabetes_dataset.get_data(target=diabetes_dataset.default_target_attribute)

In [5]:
datasets['diabetes'] = [X, y, columns]

### Credit-g

In [6]:
creditg_dataset = openml.datasets.get_dataset(31)

In [7]:
X, y, _, columns = creditg_dataset.get_data(target=creditg_dataset.default_target_attribute)

In [8]:
datasets['creditg'] = [X, y, columns]

### Spambase

In [9]:
spambase_dataset = openml.datasets.get_dataset(44)

In [10]:
X, y, _, columns = spambase_dataset.get_data(target=spambase_dataset.default_target_attribute)

In [11]:
datasets['spambase'] = [X, y, columns]

### Yeast

In [12]:
yeast_dataset = openml.datasets.get_dataset(40597)

In [13]:
#X, y, _, columns = yeast_dataset.get_data()

In [14]:
#datasets['yeast'] = [X, y, columns]

# Preprocessing

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector

In [16]:
# Setting preprocessing and model

num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

col_trans = ColumnTransformer(transformers=[
        ('num_pipeline', num_pipeline, make_column_selector(dtype_include=np.number)),
        ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include='category'))
    ],
    remainder='passthrough',
    n_jobs=-1
)

def label_encode(y):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(y)

target_transformer = FunctionTransformer(label_encode, validate=False)

LR = LogisticRegression(penalty='elasticnet', solver='saga', class_weight='balanced', l1_ratio=0.5)
model_pipe = Pipeline([('preprocessing', col_trans), ('model', LR)])
model_pipe.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(n_jobs=-1, remainder='passthrough',
                     transformers=[('num_pipeline',
                                    Pipeline(steps=[('impute', SimpleImputer()),
                                                    ('scale', MinMaxScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000002177FF759D0>),
                                   ('cat_pipeline',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('one-hot',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse_output=False))]),
                                    <sklearn.compose._column_transformer.make_column_selector objec

# Szukanie hiperparametrów domyślnych

In [None]:
import pandas as pd
from time import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical

In [52]:
# Definicja modelu i parametrów do przeszukiwania
rf = RandomForestClassifier(random_state=42)

param_distributions = {
    'n_estimators': [50, 100, 200, 300, 1000, 2000],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [53]:
# Przeprowadzenie Random Search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='roc_auc',
    cv=5,
    random_state=42,
    n_jobs=-1
)

In [57]:
# Przechowujemy wyniki dla każdego zbioru danych
all_results = []

# Przeprowadzamy Random Search dla każdego zbioru danych
for dataset_name in datasets:
    start_time_iter = time()
    print(f"Dataset: {dataset_name}")
    
    # Ekstrakcja danych
    dataset = datasets[dataset_name]
    X = dataset[0]
    y = target_transformer.fit_transform(dataset[1])  # Target transformation
    
    # Transformacja cech
    X = col_trans.fit_transform(X)
    
    # Podział na dane treningowe i testowe
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
    
    # Dopasowanie modelu
    random_search.fit(X_train, y_train)
    
    # Zbieranie wyników każdej iteracji z Random Search dla bieżącego zbioru danych
    dataset_results = pd.DataFrame(random_search.cv_results_)
    dataset_results['dataset'] = dataset_name
    
    # Zamiana słownika z parametrami na tekstowy format JSON
    dataset_results['params'] = dataset_results['params'].apply(lambda x: str(x))
    
    # Zapis wyników dla bieżącego zbioru danych do ogólnej listy
    all_results.append(dataset_results[['params', 'mean_test_score', 'dataset']])

    # Odliczanie czasu całego algorytmu
        elapsed_time_iter = time() - start_time_iter
        print(f"Czas wykonaniu dla datasetu {dataset_name}: {elapsed_time_iter}")

Processing dataset: diabetes
Processing dataset: creditg
Processing dataset: spambase


In [61]:
# Konwersja wyników do DataFrame
all_results_df = pd.concat(all_results, ignore_index=True)

# Grupowanie po parametrach i obliczenie średniej AUC
mean_results_df = (
    all_results_df.groupby('params')['mean_test_score']
    .mean()
    .reset_index()
    .rename(columns={'mean_test_score': 'mean_auc'})
)

# Znalezienie najlepszego zestawu hiperparametrów
default_params = mean_results_df.loc[mean_results_df['mean_auc'].idxmax()]

# Wyświetlanie najlepszego zestawu hiperparametrów i jego średniej wartości AUC
print("Best Parameters:")
print(default_params)

Best Parameters:
params      {'n_estimators': 300, 'min_samples_split': 10,...
mean_auc                                             0.866087
Name: 11, dtype: object


# Random Search Optymalizacja

In [40]:
import warnings

# Wyłączenie FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.exceptions import ConvergenceWarning

# Ignorowanie wszystkich UserWarnings (np. z skopt) oraz ConvergenceWarnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [51]:
# Obliczanie dla każdego datasetu
for i, dataset_name in enumerate(datasets):
    start_time_iter = time()
    
    print(f"Dataset: {dataset_name}")
    # Eksportowanie datasetu
    dataset = datasets[dataset_name]
    X = dataset[0]
    y = dataset[1]
    columns = dataset[2]
    
    y = target_transformer.fit_transform(y)

    X = col_trans.fit_transform(X)

    # Konwersja do DataFrame, jeśli X nie jest DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
        
    # Podział na dane treningowe i testowe
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
    
    # Unoptimized
    start_time = time()
    model_pipe.fit(X_train,y_train)
    elapsed_time = time() - start_time
    
    y_pred = model_pipe.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    r2_scr = r2_score(y_test, y_pred)
    
    new_row = pd.DataFrame({
      'method': "Unoptimized",
      'elapsed_time': [elapsed_time],
      'best_score': None,
      'test_score': None,
      'best_params': f"[\'model__C\': {LR.get_params()['C']}, \'model__l1_ratio\':{LR.get_params()['l1_ratio']}]",
      'auc_score': [auc_scr],
      'r2_score': [r2_scr]
    })
    main_results_df = pd.DataFrame(columns=['method','elapsed_time', 'best_score', 'test_score', 'best_params', 'auc_score', 'r2_score'])
    main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)

    # Random Search
    start_time = time()
    random_search.fit(X_train, y_train)
    elapsed_time = time() - start_time
    
    best_score = random_search.best_score_
    test_score = random_search.score(X_test, y_test)
    best_params = str(random_search.best_params_)
    
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    r2_scr = r2_score(y_test, y_pred)
    
    random_results = pd.DataFrame(random_search.cv_results_)
    random_results['search_type'] = 'RandomizedSearchCV'
    
    new_row = pd.DataFrame({
        'method': "RandomSearchCV",
        'elapsed_time': [elapsed_time],
        'best_score': [best_score],
        'test_score': [test_score],
        'best_params': [best_params],
        'auc_score': [auc_scr],
        'r2_score': [r2_scr]
    })
    main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)

    name = "random_search"
    main_results_df.to_csv(f"output/{dataset_name}_{name}_main_res.csv", index=False)
    random_results.to_csv(f"output/{dataset_name}_{name}_random_iter_res.csv", index=False)

    # Odliczanie czasu całego algorytmu
    elapsed_time_iter = time() - start_time_iter
    print(f"Czas wykonaniu dla datasetu {dataset_name}: {elapsed_time_iter}")


Dataset: diabetes
Czas wykonaniu dla datasetu diabetes: 20.550113201141357
Dataset: creditg
Czas wykonaniu dla datasetu creditg: 19.454925537109375
Dataset: spambase
Czas wykonaniu dla datasetu spambase: 65.23516201972961


# Bayesian Optimization

In [38]:
# Definicja przestrzeni hiperparametrów
param_space = {
    "n_estimators": Integer(50, 300),
    #"max_depth": Integer(5, 20),
    #"min_samples_split": Integer(2, 10),
    #"min_samples_leaf": Integer(1, 4)
    #,"bootstrap": Categorical([True, False])
}

param_space = {
    'model__C': [0.1, 1, 10],                # Parametr C dla LogisticRegression
    'model__penalty': ['elasticnet'],        # Parametr penalty
    'model__l1_ratio': [0.2, 0.5, 0.8]       # Wartość l1_ratio dla elasticnet
}

bayes_search = BayesSearchCV(
    model_pipe,
    param_space,
    n_iter=60,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=8,
    return_train_score=True
)

In [41]:
# Obliczanie dla każdego datasetu
for i, dataset_name in enumerate(datasets):
    start_time_iter = time()
    
    print(f"Dataset: {dataset_name}")
    # Eksportowanie datasetu
    dataset = datasets[dataset_name]
    X = dataset[0]
    y = dataset[1]
    columns = dataset[2]
    
    y = target_transformer.fit_transform(y)

    X = col_trans.fit_transform(X)

    # Konwersja do DataFrame, jeśli X nie jest DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
        
    # Podział na dane treningowe i testowe
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
    
    # Unoptimized
    start_time = time()
    model_pipe.fit(X_train,y_train)
    elapsed_time = time() - start_time
    
    y_pred = model_pipe.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    r2_scr = r2_score(y_test, y_pred)
    
    new_row = pd.DataFrame({
      'method': "Unoptimized",
      'elapsed_time': [elapsed_time],
      'best_score': None,
      'test_score': None,
      'best_params': f"[\'model__C\': {LR.get_params()['C']}, \'model__l1_ratio\':{LR.get_params()['l1_ratio']}]",
      'auc_score': [auc_scr],
      'r2_score': [r2_scr]
    })
    main_results_df = pd.DataFrame(columns=['method','elapsed_time', 'best_score', 'test_score', 'best_params', 'auc_score', 'r2_score'])
    main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)

    # Bayes Search    
    start_time = time()
    bayes_search.fit(X_train, y_train)
    elapsed_time = time() - start_time
    
    best_score = bayes_search.best_score_
    test_score = bayes_search.score(X_test, y_test)
    best_params = str(bayes_search.best_params_)
    
    best_model = bayes_search.best_estimator_
    y_pred = best_model.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    r2_scr = r2_score(y_test, y_pred)
    
    bayes_results = pd.DataFrame(bayes_search.cv_results_)
    bayes_results['search_type'] = 'BayesSearchCV'
    
    new_row = pd.DataFrame({
        'method': "BayesSearchCV",
        'elapsed_time': [elapsed_time],
        'best_score': [best_score],
        'test_score': [test_score],
        'best_params': [best_params],
        'auc_score': [auc_scr],
        'r2_score': [r2_scr]
    })
    main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)

    name = "bayes_search"
    main_results_df.to_csv(f"output/{dataset_name}_{name}_main_res_{i}.csv", index=False)
    bayes_results.to_csv(f"output/{dataset_name}_{name}_bayes_iter_res_{i}.csv", index=False)

Dataset: diabetes
Dataset: creditg
Dataset: spambase
