In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import hdbscan

In [None]:
# Read the training table from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

In [None]:
# Target vector: the last column, kept exactly as stored in the table.
y = data.iloc[:, -1].values
# Feature matrix: every column except the first and the last, copied as floats.
x = data.iloc[:, 1:-1].values.astype(float)

In [None]:
# Show the distinct values present in the target vector.
print({*y})

In [None]:
#converting strings to numeric labels
# Sort the class names before numbering them: iterating a bare set of strings
# follows hash order, which can change between interpreter sessions and would
# give the same class a different code on every kernel restart.
labels = {}          # class name -> numeric code
inverse_labels = {}  # numeric code -> class name
for label, class_name in enumerate(sorted(set(y))):
    labels[class_name] = label
    inverse_labels[label] = class_name

# Kept for the summary below: after the loop `label` is the number of classes.
label = len(labels)

# Build a new float array instead of assigning into `y` element by element.
# `y` comes from `DataFrame.values`, which may share memory with `data` or be
# read-only depending on the pandas version, so in-place writes are unsafe.
y = np.array([labels[value] for value in y], dtype=float)

print(f"Number of classes: {label}")
pd.DataFrame(y).head()

In [None]:
# Hyperparameter search space. The order of the dimensions defines the order of
# the values in every point the optimizer proposes, so it must not be changed.
search_space = [
    # dimensionality reduction
    Integer(low=1, high=20, name='pca_n_components'),
    # gradient boosting classifier
    Integer(low=50, high=500, name='gradient_boosting_n_estimators'),
    Integer(low=2, high=10, name='gradient_boosting_max_depth'),
    Real(low=10**-4, high=10**-1, name='gradient_boosting_learning_rate'),
    # isolation forest (outlier removal)
    Integer(low=50, high=500, name='isolation_forest_n_estimators'),
    Real(low=0.1, high=0.5, name='isolation_forest_contamination'),
    # HDBSCAN (cluster-label feature)
    Integer(low=5, high=100, name='hdbscan_min_samples'),
    Integer(low=50, high=500, name='hdbscan_min_cluster_size'),
]

In [None]:
def objective(pca_n_components, gradient_boosting_learning_rate, gradient_boosting_n_estimators, gradient_boosting_max_depth, isolation_forest_n_estimators, isolation_forest_contamination, hdbscan_min_samples, hdbscan_min_cluster_size):
    """Score one hyperparameter configuration of the whole pipeline.

    Reads the notebook-level ``x`` (feature matrix) and ``y`` (numeric labels)
    and runs: standard scaling -> PCA -> IsolationForest outlier removal ->
    HDBSCAN cluster label appended as an extra feature -> gradient boosting
    evaluated with 5-fold cross-validation.

    The parameter names match the dimension names in ``search_space``, so the
    function is meant to be called with keyword arguments (for example through
    ``skopt.utils.use_named_args``); the positional order of this signature is
    not the order of the search space.

    Returns:
        The negated mean cross-validated accuracy, so that a minimizer
        maximizes accuracy.
    """
    #pre-processing
    scaler = StandardScaler()
    pca = PCA(n_components=pca_n_components)
    x_processed = pca.fit_transform(scaler.fit_transform(x))

    #outlier-detection
    # Seeded so that the same hyperparameters always produce the same score.
    isolation_forest = IsolationForest(n_estimators=isolation_forest_n_estimators, contamination=isolation_forest_contamination, random_state=42)
    isolation_forest.fit(x_processed)
    # predict() marks outliers with -1. The same mask must be applied to the
    # labels, otherwise features and labels no longer have matching row counts.
    inlier_mask = isolation_forest.predict(x_processed) != -1
    x_transformed = x_processed[inlier_mask]
    y_transformed = y[inlier_mask]

    #clustering
    clusterer = hdbscan.HDBSCAN(min_samples=hdbscan_min_samples, min_cluster_size=hdbscan_min_cluster_size)
    cluster_labels = clusterer.fit_predict(x_transformed)
    x_clustered = np.concatenate((x_transformed, cluster_labels.reshape(-1, 1)), axis=1)

    #classification
    # No explicit fit here: cross_val_score fits its own clone of the estimator
    # on every fold, so a separate fit on the full data would be wasted work.
    gbm = GradientBoostingClassifier(n_estimators=gradient_boosting_n_estimators, learning_rate=gradient_boosting_learning_rate, max_depth=gradient_boosting_max_depth, random_state=42)

    #accuracy
    return -np.mean(cross_val_score(gbm, x_clustered, y_transformed, cv=5, n_jobs=1, scoring='accuracy'))

In [None]:
# gp_minimize calls the objective with a single list of values (one per
# dimension, in search_space order), while objective() takes eight separately
# named parameters. use_named_args bridges the two by passing each value under
# the name of its dimension.
result = gp_minimize(use_named_args(search_space)(objective), search_space, n_calls=50, random_state=42)

In [None]:
# result.x holds one value per dimension in search_space order, so take the
# names from the search space itself instead of a hand-written (and shorter)
# list that would mislabel the values and drop most of them.
print('Best hyperparameters:', dict(zip([dimension.name for dimension in search_space], result.x)))
# The objective returns negated accuracy, so flip the sign back.
print('Best score:', -result.fun)

In [None]:
# Final model: refit the whole pipeline on the training data with the best
# hyperparameters, then apply it to the test features.
# NOTE(review): x_test is not defined in any visible cell. It must be created
# before this cell runs, with the same feature columns as x -- confirm source.

# Look the tuned values up by name; result.x follows search_space order, which
# is not the order in which this cell consumes the values.
best_params = dict(zip([dimension.name for dimension in search_space], result.x))

#pre-processing
# Scaler and PCA are fitted on the training data only and then applied to the
# test data, so both sets end up in the same feature space.
scaler = StandardScaler()
pca = PCA(n_components=best_params['pca_n_components'])
x_train_processed = pca.fit_transform(scaler.fit_transform(x))
x_test_processed = pca.transform(scaler.transform(x_test))

#outlier detection
# Outliers are removed from the training rows (features and labels together).
# Test rows are all kept so that every one of them receives a prediction.
isolation_forest = IsolationForest(n_estimators=best_params['isolation_forest_n_estimators'], contamination=best_params['isolation_forest_contamination'], random_state=42)
isolation_forest.fit(x_train_processed)
train_inlier_mask = isolation_forest.predict(x_train_processed) != -1
x_train_transformed = x_train_processed[train_inlier_mask]
y_train_transformed = y[train_inlier_mask]
x_test_transformed = x_test_processed

#clustering
# prediction_data=True lets approximate_predict assign test points to the
# clusters found on the training data, keeping the cluster ids consistent.
clusterer = hdbscan.HDBSCAN(min_samples=best_params['hdbscan_min_samples'], min_cluster_size=best_params['hdbscan_min_cluster_size'], prediction_data=True)
train_cluster_labels = clusterer.fit_predict(x_train_transformed)
cluster_labels, _ = hdbscan.approximate_predict(clusterer, x_test_transformed)
x_train_clustered = np.concatenate((x_train_transformed, train_cluster_labels.reshape(-1, 1)), axis=1)
x_test_clustered = np.concatenate((x_test_transformed, cluster_labels.reshape(-1, 1)), axis=1)

#classification
gbm = GradientBoostingClassifier(n_estimators=best_params['gradient_boosting_n_estimators'], learning_rate=best_params['gradient_boosting_learning_rate'], max_depth=best_params['gradient_boosting_max_depth'], random_state=42)
gbm.fit(x_train_clustered, y_train_transformed)

#predict labels
# Predictions are numeric class codes, one per row of x_test.
y_pred = gbm.predict(x_test_clustered)