In [14]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import hdbscan
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.decomposition import KernelPCA
from sklearn.manifold import MDS
from sklearn.linear_model import LogisticRegression as LR
from sklearn import metrics
from sklearn.cluster import KMeans


In [5]:
data = pd.read_csv("train.csv")
x = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

x = np.array(x, dtype='float')
y = np.array(y)

In [17]:
def objective(params):
    max_depth, max_features, min_samples_split, hdbscan_min_samples, hdbscan_min_cluster_size = params[0], params[1], params[2], params[3], params[4]
 
    
    #performing kfold cross validation
    num_folds = 5
    kfold = KFold(n_splits=num_folds, shuffle=True)

    cv_scores = []
    model = RandomForestClassifier(n_estimators= 1500, criterion = "entropy",max_depth=max_depth, max_features = max_features, min_samples_split = min_samples_split)

    for train_idx, test_idx in kfold.split(x):
        x_train, y_train = x[train_idx], y[train_idx]
        x_val, y_val = x[test_idx], y[test_idx]

        #performing dimensionality reduction
        pca = PCA(n_components=900)
        # pca = KernelPCA(n_components=300, kernel='rbf', gamma=15, random_state=42)

        p = pca.fit(x_train)

        x_train = p.transform(x_train)
        x_val = p.transform(x_val)
        pca = PCA(n_components= 250)
        p = pca.fit(x_train)

        x_train = p.transform(x_train)
        x_val = p.transform(x_val)
        #performing lda
        lda = LDA(n_components=19)
        l = lda.fit(x_train, y_train)

        x_train = l.transform(x_train)
        x_val = l.transform(x_val)

        #clustering
        clusterer = hdbscan.HDBSCAN(min_samples=hdbscan_min_samples, min_cluster_size=hdbscan_min_cluster_size)
        train_labels = clusterer.fit_predict(x_train)
        num_labels = len(set(train_labels))

        test_labels = []
        
        cluster_centroids = []
        for i in list(set(train_labels)):
            cluster_centroids.append(np.mean(x_train[train_labels == i], axis=0))
        cluster_centroids = np.array(cluster_centroids, dtype='float')
        
        for i in range(len(x_val)):
            test_labels.append(np.argmin(np.sum((x_val[i]-cluster_centroids)**2, axis=1)))
        test_labels = np.array(test_labels)
        test_labels[test_labels == num_labels-1] = -1

        x_train = np.concatenate((x_train, train_labels.reshape(-1, 1)), axis=1)
        x_val = np.concatenate((x_val, test_labels.reshape(-1, 1)), axis=1)

        #fitting the model to the training data for this fold
        model.fit(x_train, y_train)
        y_pred = model.predict(x_val)
        a = accuracy_score(y_val, y_pred)
        cv_scores.append(a)
        print(a)
    # print(max_depth, max_features, min_samples_split, hdbscan_min_samples, hdbscan_min_cluster_size)
    print('Cross-validation accuracy scores:', cv_scores)
    print('Mean cross-validation accuracy:', np.mean(cv_scores))
    print('Standard deviation of cross-validation accuracy:', np.std(cv_scores))
    return (-np.mean(cv_scores))

In [18]:
objective()



0.7786885245901639




0.7695473251028807




0.7736625514403292




0.7983539094650206




0.7818930041152263
Cross-validation accuracy scores: [0.7786885245901639, 0.7695473251028807, 0.7736625514403292, 0.7983539094650206, 0.7818930041152263]
Mean cross-validation accuracy: 0.7804290629427241
Standard deviation of cross-validation accuracy: 0.009906266871917863


-0.7804290629427241