In [45]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.metrics import accuracy_score
import hdbscan
from sklearn.model_selection import train_test_split

In [46]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,ID,n0,n1,n2,n3,n4,n5,n6,n7,n8,...,n4087,n4088,n4089,n4090,n4091,n4092,n4093,n4094,n4095,category
0,0,0.0,0.0,1.272801,0.290501,0.581446,0.0,0.0,0,0.0,...,1.645888,0.86964,0.302432,0.953719,0.022545,0.498048,0.0,0.034988,0.692382,Orange_Ripe
1,1,0.0,0.0,1.542096,0.0,0.896557,0.049978,0.0,0,0.117847,...,1.50422,0.622686,0.588427,0.524415,0.305426,0.386204,0.0,0.0,0.668196,Banana_Ripe
2,2,0.0,0.0,1.098595,0.571866,0.500355,0.0,0.0,0,0.493137,...,1.169341,0.913239,0.064404,0.53127,0.0,0.471604,0.0,0.0,0.65825,Mango_Raw
3,3,0.0,0.101666,1.159194,0.599216,0.893206,0.0,0.200139,0,0.645675,...,0.560686,1.243676,0.432523,0.701881,0.0,0.589985,0.0,0.0,0.591165,Leeche_Raw
4,4,0.0,0.0,1.178603,0.362568,0.577602,0.0,0.0,0,0.079862,...,1.206032,0.736831,0.345906,0.878515,0.119,0.261441,0.0,0.0,0.458905,Mango_Ripe


In [47]:
x = data.iloc[:, 1:-1].values
x = np.array(x, dtype=float)
y = data.iloc[:, -1].values

In [48]:
print(set(y))

{'Guava_Ripe', 'Papaya_Raw', 'Pomengranate_Ripe', 'Strawberry_Raw', 'Banana_Ripe', 'Strawberry_Ripe', 'Papaya_Ripe', 'Leeche_Ripe', 'Orange_Raw', 'Guava_Raw', 'Pomengranate_Raw', 'Coconut_Ripe', 'Orange_Ripe', 'Mango_Ripe', 'Banana_Raw', 'Mango_Raw', 'Leeche_Raw', 'Apple_Raw', 'Coconut_Raw', 'Apple_Ripe'}


In [49]:
#converting strings to numeric labels
# label = 0
# labels = {}
# inverse_labels = {}
# for i in set(y):
#     labels[i] = label
#     inverse_labels[label] = i
#     label += 1

# for i in range(len(y)):
#     y[i] = labels[y[i]]

# y = np.array(y, dtype=float)

# print(f"Number of classes: {label}")
# pd.DataFrame(y).head()
# print(labels)
# print(inverse_labels)

In [50]:
search_space = [
    Integer(20, 100, name='pca_n_components'),
    Integer(500, 1000, name='random_forest_n_estimators'),
    Integer(10, 20, name='random_forest_max_depth'),
    Integer(250, 500, name='isolation_forest_n_estimators'),
    Real(0.1, 0.5, name='isolation_forest_contamination'),
    Integer(1, 20, name='hdbscan_min_samples'),
    Integer(20, 80, name='hdbscan_min_cluster_size'),
]

In [51]:
def objective(params):
    pca_n_components, random_forest_n_estimators, random_forest_max_depth, isolation_forest_n_estimators, isolation_forest_contamination, hdbscan_min_samples, hdbscan_min_cluster_size = params[0], params[1], params[2], params[3], params[4], params[5], params[6]
    #pre-processing
    # scaler = StandardScaler()
    pca = PCA(n_components=pca_n_components)
    x_processed = pca.fit_transform(x)

    #outlier-detection
    isolation_forest = IsolationForest(n_estimators=isolation_forest_n_estimators, contamination=isolation_forest_contamination)
    isolation_forest.fit(x_processed)
    x_transformed = x_processed[np.where(isolation_forest.predict(x_processed) != -1)[0]]
    y_transformed = y[np.where(isolation_forest.predict(x_processed) != -1)[0]]

    #clustering
    clusterer = hdbscan.HDBSCAN(min_samples=hdbscan_min_samples, min_cluster_size=hdbscan_min_cluster_size)
    cluster_labels = clusterer.fit_predict(x_transformed)
    x_clustered = np.concatenate((x_transformed, cluster_labels.reshape(-1, 1)), axis=1)

    #classification
    rf = RandomForestClassifier(n_estimators=random_forest_n_estimators, max_depth=random_forest_max_depth)

    #accuracy
    return -np.mean(cross_val_score(rf, x_clustered, y_transformed, cv=5, n_jobs=-1, scoring='accuracy'))

In [52]:
# result = gp_minimize(objective, search_space, n_calls=50, random_state=42, verbose=10)

In [53]:
# print('Best hyperparameters:', result.x)
# print('Best score:', -result.fun)

In [54]:
data2 = pd.read_csv('test.csv')
x_test = data2.iloc[:, 1:].values
x_test = np.array(x_test, dtype='float')
print(x_test[:5, :])

[[0.         0.         0.908889   ... 0.         0.         0.65567034]
 [0.         0.         1.1910553  ... 0.         0.         0.61449343]
 [0.         0.26190305 0.99278164 ... 0.         0.         0.39215815]
 [0.         0.         1.35240054 ... 0.         0.         0.62836468]
 [0.         0.         1.11428118 ... 0.         0.         0.83567119]]


In [55]:
#pre-processing
scaler1 = StandardScaler()
scaler1.fit(x)
pca1 = PCA(n_components=100)
pca1.fit(x)
x_processed = pca1.transform(scaler1.transform(x))
x_test_processed = pca1.transform(scaler1.transform(x_test))

#outlier-detection
isolation_forest = IsolationForest(n_estimators=500, contamination=0.1)
isolation_forest.fit(x_processed)
indices = np.where(isolation_forest.predict(x_processed) != -1)[0]
x_transformed = x_processed[indices]
y_transformed = y[indices]

#clustering
clusterer1 = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=20)
cluster_labels1 = clusterer1.fit_predict(x_transformed)
x_clustered = np.concatenate((x_transformed, cluster_labels1.reshape(-1, 1)), axis=1)

clusterer2 = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=20)
cluster_labels2 = clusterer2.fit_predict(x_test_processed)
x_test_clustered = np.concatenate((x_test_processed, cluster_labels2.reshape(-1, 1)), axis=1)

# x_train, x_validate, y_train, y_validate = train_test_split(x_clustered, y_transformed, test_size=0.3, random_state=1)
# print(set(x_test_clustered[:, -1]))
#classification
rf = RandomForestClassifier(n_estimators=1000, max_depth=17)

# print(labels)
# print(inverse_labels)
rf.fit(x_clustered, y_transformed)

#predict labels
y_pred = rf.predict(x_test_clustered)

# predicted_categories = []
# print(y_pred)

# for i in range(len(y_pred)):
#     predicted_categories.append(inverse_labels[y_pred[i]])
print(y_pred)
ids = range(415)
# print(predicted_categories)
results = pd.DataFrame({'ID': ids, 'Category': y_pred})
results.to_csv('submission.csv', index=False)

['Leeche_Raw' 'Orange_Raw' 'Pomengranate_Ripe' 'Strawberry_Raw'
 'Apple_Ripe' 'Banana_Ripe' 'Coconut_Ripe' 'Apple_Ripe' 'Coconut_Ripe'
 'Coconut_Ripe' 'Strawberry_Ripe' 'Banana_Ripe' 'Mango_Raw' 'Papaya_Ripe'
 'Guava_Raw' 'Leeche_Ripe' 'Orange_Raw' 'Banana_Raw' 'Papaya_Ripe'
 'Mango_Raw' 'Guava_Raw' 'Banana_Ripe' 'Orange_Ripe' 'Apple_Raw'
 'Guava_Ripe' 'Strawberry_Ripe' 'Apple_Raw' 'Orange_Ripe' 'Coconut_Ripe'
 'Orange_Ripe' 'Strawberry_Raw' 'Apple_Ripe' 'Strawberry_Raw' 'Banana_Raw'
 'Mango_Raw' 'Coconut_Raw' 'Apple_Ripe' 'Orange_Raw' 'Orange_Raw'
 'Pomengranate_Ripe' 'Guava_Raw' 'Banana_Raw' 'Leeche_Raw' 'Papaya_Ripe'
 'Leeche_Raw' 'Apple_Ripe' 'Leeche_Raw' 'Apple_Ripe' 'Banana_Ripe'
 'Mango_Raw' 'Coconut_Ripe' 'Mango_Ripe' 'Mango_Raw' 'Mango_Ripe'
 'Apple_Ripe' 'Leeche_Ripe' 'Papaya_Ripe' 'Mango_Raw' 'Coconut_Raw'
 'Strawberry_Ripe' 'Banana_Ripe' 'Guava_Ripe' 'Apple_Raw'
 'Pomengranate_Raw' 'Pomengranate_Raw' 'Apple_Raw' 'Coconut_Raw'
 'Guava_Raw' 'Guava_Raw' 'Pomengranate_Raw' 'App