In [1]:
import openml
import numpy

# Import danych

In [2]:
datasets = {}

### Diabetes

In [3]:
diabetes_dataset = openml.datasets.get_dataset(37)

In [4]:
X, y, _, columns = diabetes_dataset.get_data(target=diabetes_dataset.default_target_attribute)

In [5]:
datasets['diabetes'] = [X, y, columns]

### Credit-g

In [6]:
creditg_dataset = openml.datasets.get_dataset(31)

In [7]:
X, y, _, columns = creditg_dataset.get_data(target=creditg_dataset.default_target_attribute)

In [8]:
datasets['creditg'] = [X, y, columns]

### Spambase

In [9]:
spambase_dataset = openml.datasets.get_dataset(44)

In [10]:
X, y, _, columns = spambase_dataset.get_data(target=spambase_dataset.default_target_attribute)

In [11]:
datasets['spambase'] = [X, y, columns]

### Yeast

In [14]:
yeast_dataset = openml.datasets.get_dataset(40597)

In [18]:
X, y, _, columns = yeast_dataset.get_data()

In [21]:
datasets['yeast'] = [X, y, columns]

# Preprocessing

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X = datasets['diabetes'][0]
y = datasets['diabetes'][1]

In [39]:
# Podział na dane treningowe i testowe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Search

In [28]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from optuna import create_study
from optuna.samplers import TPESampler

In [41]:
# Definicja modelu i parametrów do przeszukiwania
rf = RandomForestClassifier(random_state=42)
param_distributions = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [42]:
# Przeprowadzenie Random Search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1
)

In [43]:
random_search.fit(X_train, y_train)
best_rf_random = random_search.best_estimator_

In [44]:
# Wynik
print("Najlepsze parametry z Random Search:", random_search.best_params_)
y_pred = best_rf_random.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Najlepsze parametry z Random Search: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': False}
Accuracy: 0.7337662337662337
                 precision    recall  f1-score   support

tested_negative       0.80      0.78      0.79        99
tested_positive       0.62      0.65      0.64        55

       accuracy                           0.73       154
      macro avg       0.71      0.72      0.71       154
   weighted avg       0.74      0.73      0.74       154



# Bayesian Optimization

In [45]:
def objective(trial):
    # Definicja zakresów dla hiperparametrów
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Konfiguracja modelu z parametrami z próbki
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1
    )

    # Trening modelu i walidacja krzyżowa
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [46]:
# Tworzenie i uruchamianie optymalizacji
study = create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)

[I 2024-11-12 22:05:10,090] A new study created in memory with name: no-name-bc169c5f-5c24-4f52-a1d2-31f0d692e9a1
[I 2024-11-12 22:05:10,336] Trial 0 finished with value: 0.7597402597402597 and parameters: {'n_estimators': 144, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3, 'bootstrap': True}. Best is trial 0 with value: 0.7597402597402597.
[I 2024-11-12 22:05:10,430] Trial 1 finished with value: 0.7272727272727273 and parameters: {'n_estimators': 64, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 3, 'bootstrap': False}. Best is trial 0 with value: 0.7597402597402597.
[I 2024-11-12 22:05:10,693] Trial 2 finished with value: 0.7467532467532467 and parameters: {'n_estimators': 258, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': False}. Best is trial 0 with value: 0.7597402597402597.
[I 2024-11-12 22:05:10,874] Trial 3 finished with value: 0.7337662337662337 and parameters: {'n_estimators': 158, 'max_depth': 9, 'min_samples_split

In [47]:
print("Najlepsze parametry z Bayesian Optimization:", study.best_params)
print("Najlepsza dokładność:", study.best_value)


Najlepsze parametry z Bayesian Optimization: {'n_estimators': 199, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 3, 'bootstrap': True}
Najlepsza dokładność: 0.7662337662337663


In [48]:
# Test z najlepszym modelem
best_rf_bayes = RandomForestClassifier(**study.best_params, random_state=42)
best_rf_bayes.fit(X_train, y_train)
y_pred_bayes = best_rf_bayes.predict(X_test)


In [49]:
print("Accuracy (Bayesian Optimization):", accuracy_score(y_test, y_pred_bayes))
print(classification_report(y_test, y_pred_bayes))

Accuracy (Bayesian Optimization): 0.7662337662337663
                 precision    recall  f1-score   support

tested_negative       0.81      0.84      0.82        99
tested_positive       0.69      0.64      0.66        55

       accuracy                           0.77       154
      macro avg       0.75      0.74      0.74       154
   weighted avg       0.76      0.77      0.76       154

