In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
import hdbscan
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.decomposition import KernelPCA
from sklearn.manifold import MDS
from sklearn.linear_model import LogisticRegression as LR
from sklearn import metrics
from sklearn.cluster import KMeans


In [6]:
# Read the training table, then split it positionally: the first column is
# skipped, the last column becomes the target, and everything in between is
# the feature matrix (forced to float).
data = pd.read_csv("train.csv")
x = np.array(data.iloc[:, 1:-1].values, dtype='float')
y = np.array(data.iloc[:, -1].values)

In [19]:
def objective(x_data=None, y_data=None, num_folds=5, random_state=42,
              n_pca_components=300, n_lda_components=19, n_clusters=20,
              contamination=0.1):
    """Cross-validate the PCA -> LDA -> outlier filter -> KMeans -> logistic regression pipeline.

    Every preprocessing step is fitted on the training split of a fold only
    and then applied unchanged to that fold's validation split.

    Parameters
    ----------
    x_data, y_data : array-like, optional
        Feature matrix and labels. When omitted, the notebook-level ``x`` and
        ``y`` arrays are used, which matches the original zero-argument call.
    num_folds : int
        Number of shuffled K-fold splits.
    random_state : int or None
        Seed shared by the fold shuffling, PCA, IsolationForest and KMeans so
        that repeated calls report the same scores. Pass ``None`` to get the
        previous unseeded behaviour.
    n_pca_components : int
        Number of principal components kept before LDA.
    n_lda_components : int
        Number of LDA discriminant axes kept.
    n_clusters : int
        Number of KMeans clusters whose id is appended as an extra feature.
    contamination : float
        Fraction of training rows IsolationForest is told to treat as outliers.

    Returns
    -------
    float
        The negated mean validation accuracy over all folds (a smaller value
        means a higher accuracy).
    """
    features = x if x_data is None else np.asarray(x_data, dtype=float)
    labels = y if y_data is None else np.asarray(y_data)

    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    cv_scores = []

    for train_idx, test_idx in kfold.split(features):
        x_train, y_train = features[train_idx], labels[train_idx]
        x_val, y_val = features[test_idx], labels[test_idx]

        # Dimensionality reduction. fit() and transform() stay separate so the
        # training and validation splits go through the exact same projection.
        pca = PCA(n_components=n_pca_components, random_state=random_state).fit(x_train)
        x_train = pca.transform(x_train)
        x_val = pca.transform(x_val)

        # Supervised projection onto the discriminant axes.
        lda = LDA(n_components=n_lda_components).fit(x_train, y_train)
        x_train = lda.transform(x_train)
        x_val = lda.transform(x_val)

        # Drop training rows flagged as outliers (predict() == -1); the
        # validation split is never filtered.
        isolation_forest = IsolationForest(
            n_estimators=500,
            contamination=contamination,
            random_state=random_state,
        )
        inlier_mask = isolation_forest.fit(x_train).predict(x_train) != -1
        x_train = x_train[inlier_mask]
        y_train = y_train[inlier_mask]

        # Append the KMeans cluster id as one extra numeric column.
        # NOTE(review): cluster ids are arbitrary integers, so a linear model
        # sees a meaningless ordering here; one-hot encoding may be a better fit.
        kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state).fit(x_train)
        x_train = np.column_stack((x_train, kmeans.predict(x_train)))
        x_val = np.column_stack((x_val, kmeans.predict(x_val)))

        # A fresh classifier per fold; fitting never carries state across folds.
        model = LR(multi_class='ovr', solver='liblinear')
        model.fit(x_train, y_train)
        fold_accuracy = accuracy_score(y_val, model.predict(x_val))
        cv_scores.append(fold_accuracy)
        print(fold_accuracy)

    print('Cross-validation accuracy scores:', cv_scores)
    print('Mean cross-validation accuracy:', np.mean(cv_scores))
    print('Standard deviation of cross-validation accuracy:', np.std(cv_scores))
    return -np.mean(cv_scores)

In [20]:
# Run the cross-validation pipeline once. objective() prints the per-fold
# accuracies and the cell displays its return value, the negated mean accuracy.
objective()

0.7827868852459017
0.823045267489712
0.7695473251028807
0.7942386831275721
0.7983539094650206
Cross-validation accuracy scores: [0.7827868852459017, 0.823045267489712, 0.7695473251028807, 0.7942386831275721, 0.7983539094650206]
Mean cross-validation accuracy: 0.7935944140862173
Standard deviation of cross-validation accuracy: 0.01780722220482124


-0.7935944140862173

In [22]:
# Read the training and test tables from CSV.
data = pd.read_csv("train.csv")
testData = pd.read_csv("test.csv")

# Only the last expression of a cell is rendered, so a `data.head()` placed
# before this point was evaluated and silently discarded. Preview the test
# table as the cell's single displayed result.
testData.head()

Unnamed: 0,ID,n0,n1,n2,n3,n4,n5,n6,n7,n8,...,n4086,n4087,n4088,n4089,n4090,n4091,n4092,n4093,n4094,n4095
0,0,0.0,0.0,0.908889,0.251257,0.662262,0.042495,0.0,0,0.964784,...,0.694072,1.146161,1.483842,0.717836,0.472616,0.0,0.488022,0.0,0.0,0.65567
1,1,0.0,0.0,1.191055,0.40735,0.441898,0.0,0.334858,0,0.295357,...,0.273436,1.466932,0.94085,0.470344,1.032085,0.0,0.65407,0.0,0.0,0.614493
2,2,0.0,0.261903,0.992782,0.301102,0.636006,0.009558,0.009448,0,0.974949,...,0.0,0.769983,0.83436,0.369656,1.000858,0.431571,0.361993,0.0,0.0,0.392158
3,3,0.0,0.0,1.352401,0.346003,0.401412,0.0,0.0,0,0.450667,...,0.339935,1.325595,0.981124,0.486731,0.747392,0.0,0.300671,0.0,0.0,0.628365
4,4,0.0,0.0,1.114281,0.69614,0.121505,0.0,0.0,0,0.591384,...,0.093661,0.875113,0.360689,0.65923,0.546044,0.0,0.427255,0.0,0.0,0.835671


In [24]:
# Training side: skip the first column, keep the last column as the label
# vector and everything in between as float features.
x = data.iloc[:, 1:-1].values
xFeatures = np.array(x, dtype=float)
xLabels = np.array(data.iloc[:, -1].values)
y_train = xLabels

# Test side: the table is sliced from the second column to the end, so no
# trailing label column is removed.
xTest = np.array(testData.iloc[:, 1:].values, dtype=float)

In [26]:
# Start from the unfiltered labels every time this cell runs. x_train is
# rebuilt at full length below, so a y_train left filtered by a later cell
# would no longer match it and lda.fit would fail.
y_train = xLabels

# Unsupervised reduction to 300 principal components, fitted on the training
# features only. With svd_solver left at its default, scikit-learn may pick
# the randomized solver, so the seed keeps the projection identical across runs.
pca = PCA(n_components=300, random_state=42)

p = pca.fit(xFeatures)
x_train = p.transform(xFeatures)
x_val = p.transform(xTest)  # x_val holds the projected test set from here on

# Supervised projection onto 19 discriminant axes, again fitted on training data.
lda = LDA(n_components=19)
l = lda.fit(x_train, y_train)

x_train = l.transform(x_train)
x_val = l.transform(x_val)

In [27]:
# Removing Outliers
isolation_forest = IsolationForest(n_estimators=500, contamination=0.1)
isolation_forest.fit(x_train)
indices = np.where(isolation_forest.predict(x_train) != -1)[0]
x_train = x_train[indices]
y_train = y_train[indices]

In [28]:
# Cluster the filtered training points and append each row's cluster id as an
# extra feature column, for both the training and the test matrix.
# The seed fixes the centroid initialisation; unseeded, the cluster ids (and
# therefore the appended feature) differ between runs.
# NOTE: this cell overwrites x_train / x_val, so running it twice appends a
# second cluster column.
kmeans = KMeans(n_clusters = 20, n_init= "auto", random_state=42)
kmeans1 = kmeans.fit(x_train)
cluster_labels1 = kmeans1.predict(x_train)
x_train = np.concatenate((x_train, cluster_labels1.reshape(-1, 1)), axis=1)
cluster_labels2 = kmeans1.predict(x_val)
x_val = np.concatenate((x_val, cluster_labels2.reshape(-1, 1)), axis=1)

In [None]:
# Final classifier: one-vs-rest logistic regression (liblinear solver),
# trained on the full transformed training set.
model = LR(solver='liblinear', multi_class='ovr')
model.fit(X=x_train, y=y_train)


In [31]:

# Predict on the transformed test matrix (stored in x_val by the cells above).
y_pred = model.predict(x_val)

# Take the IDs from the test table itself rather than assuming exactly 415
# rows numbered 0..414, so each prediction stays paired with its own row.
ids = testData["ID"].to_numpy()
if len(ids) != len(y_pred):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(ids)} test rows"
    )

results = pd.DataFrame({'ID': ids, 'Category': y_pred})
results.to_csv('submission.csv', index=False)