In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training table and preview its first rows.
data = pd.read_csv("train.csv")
data.head(n=5)

In [None]:
# Feature matrix: every column except the first and the last, copied into a
# float array.
x = np.array(data.iloc[:, 1:-1].values, dtype=float)

# Target vector: the last column, still in its raw (un-encoded) form.
y = data.iloc[:, -1].values

In [None]:
# Show the distinct values present in the target column.
distinct_categories = set(y)
print(distinct_categories)

In [None]:
# Converting string categories to numeric labels.
#
# * Codes are assigned in sorted order so the category <-> code mapping is the
#   same on every run (iterating a bare set of strings depends on hash order).
# * The raw column is re-read from `data` and a new array is built, instead of
#   overwriting the array returned by `.values` element by element. That keeps
#   `data` untouched and makes this cell safe to re-run.
# NOTE(review): assumes the categories are mutually comparable (e.g. all
# strings, no NaN) so that they can be sorted - confirm against train.csv.
categories = data.iloc[:, -1].to_numpy()
class_names = sorted(set(categories))

labels = {name: code for code, name in enumerate(class_names)}    # category -> code
inverse_labels = {code: name for name, code in labels.items()}    # code -> category
label = len(labels)  # number of classes

# Encoded target, kept as float to match the dtype used so far.
y = np.array([labels[name] for name in categories], dtype=float)

print(f"Number of classes: {label}")
pd.DataFrame(y).head()

In [None]:
class Clustering(BaseEstimator, TransformerMixin):
    """Append a DBSCAN cluster id to every sample as one extra feature column.

    DBSCAN has no ``predict`` method, so the clustering is learned once in
    ``fit`` and new samples are labelled in ``transform`` by looking up the
    nearest core sample found during fitting: a sample receives that core
    sample's cluster id when it lies within ``eps`` of it, and ``-1`` (noise)
    otherwise. This keeps cluster ids consistent between the data seen at fit
    time and the data seen at predict time, instead of re-clustering every
    array passed to ``transform`` independently.

    For the training data itself, core and noise samples get exactly the
    label DBSCAN gave them; a border sample that is within ``eps`` of core
    samples from two different clusters is assigned to the nearer one, which
    may differ from DBSCAN's order-dependent choice.

    Parameters
    ----------
    eps : float, default=0.5
        Neighbourhood radius passed to ``DBSCAN`` and used as the assignment
        radius in ``transform``.
    min_samples : int, default=5
        ``min_samples`` passed to ``DBSCAN``.

    Attributes
    ----------
    dbscan_ : DBSCAN
        The clustering fitted on the training data.
    core_labels_ : ndarray of shape (n_core_samples,)
        Cluster id of every core sample.
    core_neighbors_ : NearestNeighbors or None
        Nearest-neighbour index over the core samples; ``None`` when DBSCAN
        found no core sample (everything is noise).
    """

    def __init__(self, eps=0.5, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples

    def fit(self, X, y=None):
        """Cluster ``X`` with DBSCAN and index its core samples.

        ``y`` is ignored. Returns ``self``.
        """
        X = np.asarray(X)
        self.dbscan_ = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(X)

        core_indices = self.dbscan_.core_sample_indices_
        self.core_labels_ = self.dbscan_.labels_[core_indices]

        # Without core samples there are no clusters to assign to.
        self.core_neighbors_ = None
        if core_indices.size > 0:
            self.core_neighbors_ = NearestNeighbors(n_neighbors=1).fit(
                self.dbscan_.components_
            )
        return self

    def transform(self, X, y=None):
        """Return ``X`` with one extra trailing column holding the cluster id.

        The row count is unchanged; ``-1`` marks samples that are farther
        than ``eps`` from every core sample.
        """
        X = np.asarray(X)
        labels = np.full(X.shape[0], -1, dtype=int)

        if self.core_neighbors_ is not None and X.shape[0] > 0:
            distances, nearest = self.core_neighbors_.kneighbors(X, n_neighbors=1)
            reachable = distances[:, 0] <= self.eps
            labels[reachable] = self.core_labels_[nearest[reachable, 0]]

        return np.concatenate((X, labels.reshape(-1, 1)), axis=1)

In [None]:
class OutlierRemoval(BaseEstimator, TransformerMixin):
    """Drop the rows that Local Outlier Factor flags as outliers.

    Parameters
    ----------
    n_neighbors : int, default=20
        ``n_neighbors`` passed to ``LocalOutlierFactor``.
    contamination : float or 'auto', default='auto'
        ``contamination`` passed to ``LocalOutlierFactor``.

    Notes
    -----
    NOTE(review): ``transform`` returns fewer rows than it receives and has no
    way to drop the matching entries of ``y``. Inside a ``Pipeline`` the next
    step is therefore fitted with an ``X`` and a ``y`` of different lengths,
    and at predict time fewer predictions than input rows come back. Outlier
    removal should be applied to ``(X, y)`` together before the pipeline, or
    through a resampler-aware pipeline. The output shape is left unchanged
    here so that existing callers keep working.
    """

    def __init__(self, n_neighbors=20, contamination="auto"):
        self.n_neighbors = n_neighbors
        self.contamination = contamination

    def fit(self, X, y=None):
        """Nothing is learned here; the detector is fitted in ``transform``."""
        return self

    def transform(self, X, y=None):
        """Return only the rows of ``X`` that LOF labels as inliers (+1)."""
        self.lof = LocalOutlierFactor(
            n_neighbors=self.n_neighbors, contamination=self.contamination
        )
        # With the default novelty=False, `predict` is not available on
        # LocalOutlierFactor; `fit_predict` is the supported way to label the
        # samples the detector was fitted on.
        is_inlier = self.lof.fit_predict(X) == 1

        return X[is_inlier]

In [None]:
# Processing chain, applied in the order listed. Each step name is the prefix
# used to address that step's hyper-parameters (e.g. 'knn__n_neighbors').
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),                               # standardise features
    ('pca', PCA()),                                            # unsupervised projection
    ('lda', LinearDiscriminantAnalysis()),                     # supervised projection
    ('lof', OutlierRemoval(n_neighbors=5, contamination=0.1)), # LOF-based row filter
    ('dbscan', Clustering(eps=5, min_samples=5)),              # adds a cluster-id column
    ('knn', KNeighborsClassifier()),                           # final classifier
])

In [None]:
# Hyper-parameter search space, keyed as '<pipeline step>__<parameter>'.
parameter_grid = dict(
    # dimensionality reduction
    pca__n_components=np.arange(15, 20),
    lda__n_components=np.arange(15, 20),
    # density clustering
    dbscan__eps=np.arange(1, 20),
    dbscan__min_samples=np.arange(1, 10),
    # outlier filtering
    lof__n_neighbors=np.arange(5, 100, 10),
    lof__contamination=[0.1, 0.2, 0.3],
    # classifier
    knn__n_neighbors=np.arange(5, 100, 10),
    knn__weights=['uniform', 'distance'],
)

In [None]:
# The full grid has 5 * 5 * 19 * 9 * 10 * 3 * 10 * 2 = 2,565,000 combinations;
# with 5-fold CV an exhaustive search would need ~12.8 million pipeline fits
# on a single job. Sample a fixed, seeded number of candidates from the same
# space instead. The result object exposes the same best_params_ /
# best_score_ / best_estimator_ attributes used further down.
N_SEARCH_ITER = 200   # number of sampled parameter settings; raise for a finer search
SEARCH_SEED = 42      # fixes which settings are sampled, for reproducibility

grid = RandomizedSearchCV(
    pipe,
    param_distributions=parameter_grid,
    n_iter=N_SEARCH_ITER,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    random_state=SEARCH_SEED,
    verbose=10,
)
grid.fit(x, y)

In [None]:
# Report the selected hyper-parameters and the score they achieved.
best_params = grid.best_params_
print(f"Best parameters: {best_params}")

score_line = "Best score: {:.2f}".format(grid.best_score_)
print(score_line)

In [None]:
# Generating the submission file.
x_test = pd.read_csv('test.csv')

# The model was trained on the columns after the first one, so the first
# column (used as 'ID' below) must not be passed to predict().
# NOTE(review): assumes test.csv is "ID column + feature columns" with no
# target column - confirm against the file.
test_ids = x_test.iloc[:, 0].to_numpy()   # taken per column so the ID dtype is preserved
x_test_features = np.array(x_test.iloc[:, 1:].values, dtype=float)

y_pred = grid.best_estimator_.predict(x_test_features)

# Every test row needs exactly one prediction; fail with a clear message if a
# pipeline step dropped rows during prediction.
if len(y_pred) != len(test_ids):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(test_ids)} test rows; "
        "a pipeline step changed the number of samples."
    )

# Map the numeric codes back to the original category names.
predicted_categories = [inverse_labels[int(code)] for code in y_pred]

results = pd.DataFrame({'ID': test_ids, 'Category': predicted_categories})
results.to_csv('submission.csv', index=False)