In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from keras.utils import to_categorical
import hdbscan
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from skopt import gp_minimize
from skopt.space import Real, Integer

In [22]:
# Record the TensorFlow release this notebook was executed with.
print(tf.version.VERSION)

2.10.1


In [23]:
# Load the training set. The first column is skipped, the last column is the
# target, and every column in between is used as a numeric feature.
data = pd.read_csv("train.csv")

# Explicit copies: `y` is rewritten element by element further down, and that
# must not write through into `data`.
x = data.iloc[:, 1:-1].to_numpy(dtype=float, copy=True)
y = data.iloc[:, -1].to_numpy(copy=True)

# Number of distinct target values; used as the width of the softmax output.
num_classes = len(set(y))

In [24]:
#converting strings to numeric labels
# The class names are sorted before numbering so that the mapping is the same
# on every run. Iterating a bare set of strings follows hash order, which
# Python randomises per process, so an unsorted mapping would change between
# kernel restarts and `inverse_labels` could decode predictions to the wrong
# class.
labels = {name: index for index, name in enumerate(sorted(set(y)))}
inverse_labels = {index: name for name, index in labels.items()}

# Replace every class name with its integer id (stored as float, as before).
y = np.array([labels[name] for name in y], dtype='float')

In [25]:
# Load the test set; the first column is skipped and the remaining columns are
# converted to a float feature matrix.
test_frame = pd.read_csv("test.csv")
x_test = test_frame.iloc[:, 1:].to_numpy(dtype=float, copy=True)

In [26]:
#standardizing the data
# x = (x - np.mean(x, axis=0))/np.std(x)
# x_test = (x_test - np.mean(x_test, axis=0))/np.std(x_test)

In [27]:
# Clear the previous TensorFlow/Keras session before any model is built, so
# that re-running the notebook does not build on state left over from an
# earlier run.
tf.keras.backend.clear_session()

In [28]:
# Hyper-parameter search space. The order of the dimensions is significant:
# the optimiser passes values to `objective` positionally in this order.
search_space = [
    # network architecture
    Integer(low=128, high=1024, name='layer1_neurons'),
    Integer(low=64, high=512, name='layer2_neurons'),
    Integer(low=32, high=256, name='layer3_neurons'),
    Real(low=0, high=0.9, name='dropout_value'),
    # dimensionality reduction
    Integer(low=20, high=1000, name='pca_n_components'),
    Integer(low=1, high=19, name='lda_n_components'),
    # clustering
    Integer(low=1, high=60, name='hdbscan_min_samples'),
    Integer(low=5, high=80, name='hdbscan_min_cluster_size'),
    # training schedule
    Integer(low=10, high=1000, name='nn_epochs'),
    Integer(low=1, high=64, name='nn_batch_size'),
]

# Names in the same order as the dimensions above, read back from the
# dimensions so the two can never drift apart.
parameter_names = [dimension.name for dimension in search_space]

In [29]:
def _build_model(input_dim, layer1_neurons, layer2_neurons, layer3_neurons, dropout_value):
    """Return a freshly initialised, compiled classifier for a single fold.

    The network is three ReLU Dense layers, each followed by Dropout with the
    same rate, and a softmax output with `num_classes` (module-level) units.
    """
    model = tf.keras.models.Sequential(
        [
        tf.keras.layers.Flatten(input_shape=(input_dim,)),
        tf.keras.layers.Dense(layer1_neurons, activation=tf.nn.relu),
        tf.keras.layers.Dropout(dropout_value),
        tf.keras.layers.Dense(layer2_neurons, activation=tf.nn.relu),
        tf.keras.layers.Dropout(dropout_value),
        tf.keras.layers.Dense(layer3_neurons, activation=tf.nn.relu),
        tf.keras.layers.Dropout(dropout_value),
        tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax),
        ]
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def _nearest_centroid_labels(x_train, train_labels, x_val):
    """Label each validation row with the cluster label of its nearest training centroid.

    One centroid is computed per distinct value in `train_labels` (including
    the HDBSCAN noise label -1 when it occurs). The label that is returned is
    the actual label value of the winning centroid, not its position in the
    centroid list.
    """
    unique_labels = np.unique(train_labels)  # sorted distinct labels
    centroids = np.array(
        [x_train[train_labels == cluster].mean(axis=0) for cluster in unique_labels],
        dtype='float',
    )
    # Squared Euclidean distance from every validation row to every centroid.
    squared_distances = ((x_val[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
    return unique_labels[np.argmin(squared_distances, axis=1)]


def objective(params):
    """Score one hyper-parameter set with 5-fold cross-validation.

    For every fold the pipeline is fitted on the training part only:
    PCA -> LDA -> HDBSCAN clustering (the cluster label is appended as an
    extra feature) -> dense neural network. Reads the module-level `x`, `y`
    and `num_classes`.

    Parameters
    ----------
    params : sequence of 10 values, in this order:
        layer1_neurons, layer2_neurons, layer3_neurons, dropout_value,
        pca_n_components, lda_n_components, hdbscan_min_samples,
        hdbscan_min_cluster_size, nn_epochs, nn_batch_size.

    Returns
    -------
    The negated mean validation accuracy over the folds (negated because the
    optimiser minimises).
    """
    (layer1_neurons, layer2_neurons, layer3_neurons, dropout_value,
     pca_n_components, lda_n_components, hdbscan_min_samples,
     hdbscan_min_cluster_size, nn_epochs, nn_batch_size) = params

    #performing kfold cross validation
    num_folds = 5
    # Fixed seed: every hyper-parameter set is scored on the same splits, so
    # differences in score are not caused by differences in the folds.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    cv_scores = []

    for train_idx, test_idx in kfold.split(x):
        x_train, y_train = x[train_idx], y[train_idx]
        x_val, y_val = x[test_idx], y[test_idx]

        #performing dimensionality reduction
        # PCA cannot keep more components than min(n_samples, n_features);
        # clamp so an over-large suggestion does not abort the whole search.
        n_pca = min(pca_n_components, x_train.shape[0], x_train.shape[1])
        pca = PCA(n_components=n_pca).fit(x_train)
        x_train = pca.transform(x_train)
        x_val = pca.transform(x_val)

        #performing lda
        lda = LinearDiscriminantAnalysis(n_components=lda_n_components).fit(x_train, y_train)
        x_train = lda.transform(x_train)
        x_val = lda.transform(x_val)

        #clustering
        clusterer = hdbscan.HDBSCAN(min_samples=hdbscan_min_samples, min_cluster_size=hdbscan_min_cluster_size)
        train_labels = clusterer.fit_predict(x_train)
        val_labels = _nearest_centroid_labels(x_train, train_labels, x_val)

        # Append the cluster label as one extra feature column.
        x_train = np.concatenate((x_train, train_labels.reshape(-1, 1)), axis=1)
        x_val = np.concatenate((x_val, val_labels.reshape(-1, 1)), axis=1)

        # A new model per fold: reusing one model across folds would carry
        # weights trained on rows that are validation rows in a later fold,
        # which leaks information and inflates the score.
        tf.keras.backend.clear_session()
        model = _build_model(x_train.shape[1], layer1_neurons, layer2_neurons, layer3_neurons, dropout_value)

        #fitting the model to the training data for this fold
        model.fit(x_train, y_train, epochs=nn_epochs, batch_size=nn_batch_size, verbose=0)

        _, accuracy = model.evaluate(x_val, y_val, verbose=0)
        cv_scores.append(accuracy)

    return (-np.mean(cv_scores))

In [30]:
# Run the search: 100 evaluations of `objective` over `search_space`, seeded
# and with progress output switched on.
result = gp_minimize(
    func=objective,
    dimensions=search_space,
    n_calls=100,
    random_state=42,
    verbose=10,
)

Iteration No: 1 started. Evaluating function at random point.


In [None]:
# The objective returns negated accuracy, so flip the sign back for display.
best_score = -result.fun
print("Best score: ", best_score)

In [None]:
# Pair each best value with its parameter name (both are in search-space order).
best_hyper_parameters = {name: value for name, value in zip(parameter_names, result.x)}
print("Best hyper-parameters: ", best_hyper_parameters)

In [None]:
# model.fit(x, y, epochs=50, batch_size=4)

In [None]:
# predicted_probabilities = model.predict(x_test)

In [None]:
# y_pred = np.argmax(predicted_probabilities, axis=1)

# predicted_categories = []

# for i in range(len(y_pred)):
#     predicted_categories.append(inverse_labels[y_pred[i]])

# ids = range(415)

# previous_submission = pd.read_csv("submission.csv")
# previous_submission.to_csv('prev.csv', index=False)
# results = pd.DataFrame({'ID': ids, 'Category': predicted_categories})
# results.to_csv('submission.csv', index=False)