First, load the libraries and the data.

In [1]:
%load_ext nb_black

import pandas as pd

# Load the training features; the "Index" column of the CSV becomes the row index.
X_train = pd.read_csv("x_train.csv", index_col="Index")
# Last expression of the cell, so the notebook displays the (rows, columns) shape.
X_train.shape

(199523, 398)

<IPython.core.display.Javascript object>

In [2]:
# Load the training labels as a Series indexed by the "Index" column.
# The `squeeze=True` keyword of `read_csv` is deprecated (pandas 1.4) and
# removed in pandas 2.0, so the single-column frame is squeezed explicitly.
y_train = pd.read_csv("y_train.csv", index_col="Index").squeeze("columns")
# Last expression of the cell, so the notebook displays the shape.
y_train.shape

(199523,)

<IPython.core.display.Javascript object>

In [3]:
# Load the test features; the "Index" column of the CSV becomes the row index.
X_test = pd.read_csv("x_test.csv", index_col="Index")
# Last expression of the cell, so the notebook displays the (rows, columns) shape.
X_test.shape

(99762, 398)

<IPython.core.display.Javascript object>

Now use a randomized parameter search to find the best settings.
For each classifier, list the parameter values to try.
I also reduce the dimensionality with PCA to speed up processing.

Processing takes a very long time, so I have included the results in the markdown cell below.

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Candidate models for the parameter search.
# Maps a display name to a pair: (unfitted estimator, parameter values to
# sample from). "n_jobs" is listed with a single value, so it is fixed for
# every sampled candidate rather than tuned.
classifiers = {
    # NOTE(review): the key reads "SDG" (presumably "SGD" was meant); it is
    # printed verbatim by the search loop, so it is kept unchanged here.
    "SDG": (
        SGDClassifier(),
        {
            # NOTE(review): recent scikit-learn releases spell the "log" loss
            # "log_loss" -- confirm against the installed version.
            "loss": ["hinge", "log", "modified_huber", "perceptron"],
            "penalty": ["l2", "l1", "elasticnet"],
            "n_jobs": [-1],
        },
    ),
    "Ridge": (
        RidgeClassifier(),
        {
            "alpha": [0.0001, 0.5, 1.0],
            "solver": [
                "auto",
                "svd",
                "cholesky",
                "lsqr",
                "sparse_cg",
                "sag",
                "saga",
            ],
        },
    ),
    # Random Forest and Extra Trees are searched over the same value lists.
    "Random Forest": (
        RandomForestClassifier(),
        {
            "n_estimators": [30, 100, 300],
            "min_samples_split": [2, 4, 8],
            "max_features": ["sqrt", "log2"],
            # None means no depth limit; the rest are powers of two up to 64.
            "max_depth": [None, 2, 4, 8, 16, 32, 64],
            "n_jobs": [-1],
        },
    ),
    "Extra Trees": (
        ExtraTreesClassifier(),
        {
            "n_estimators": [30, 100, 300],
            "min_samples_split": [2, 4, 8],
            "max_features": ["sqrt", "log2"],
            "max_depth": [None, 2, 4, 8, 16, 32, 64],
            "n_jobs": [-1],
        },
    ),
    "Nearest Neighbours": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"],
            # Only the tree-based indexes are searched; "auto" and "brute"
            # are deliberately left out.
            "algorithm": ["ball_tree", "kd_tree"],
            "leaf_size": [10, 30, 50],
            "p": [1, 2],
            "n_jobs": [-1],
        },
    ),
}

# Project the training features onto 2, 4, ..., 128 principal components.
# Every projection is kept in memory, keyed by its number of components.
X_train_pca = {}
for n_components in (2, 4, 8, 16, 32, 64, 128):
    # A fresh PCA is fitted on the full training set for each dimensionality.
    pca = PCA(n_components=n_components)
    X_train_pca[n_components] = pca.fit_transform(X_train)

# Run a randomized parameter search for every (PCA dimensionality,
# classifier) pair and print the best cross-validated configuration.
# NOTE(review): each projection in X_train_pca was fitted on the whole
# training set before the search splits it into folds, so the validation
# folds have already influenced the PCA. Wrapping PCA and the classifier in
# a Pipeline would avoid this, at the cost of refitting PCA per fold.
for n_components, X in X_train_pca.items():
    for name, (classifier, param_grid) in classifiers.items():
        # n_iter=10: up to ten parameter settings are sampled per search.
        search = RandomizedSearchCV(
            estimator=classifier,
            param_distributions=param_grid,
            n_iter=10,
        ).fit(X, y_train)
        print(
            f"{name}:",
            f"Dimensions={n_components}",
            f"Best parameter (CV score={search.best_score_:.4f}):",
            search.best_params_,
            search.best_estimator_,
            sep="\n",
        )


Results:
```
SDG:
Dimensions=2
Best parameter (CV score=0.9342):
{'penalty': 'l1', 'n_jobs': -1, 'loss': 'hinge'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l1',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
Ridge:
Dimensions=2
Best parameter (CV score=0.9401):
{'solver': 'cholesky', 'alpha': 0.0001}
RidgeClassifier(alpha=0.0001, class_weight=None, copy_X=True,
                fit_intercept=True, max_iter=None, normalize=False,
                random_state=None, solver='cholesky', tol=0.001)
Random Forest:
Dimensions=2
Best parameter (CV score=0.9468):
{'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 8}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=2
Best parameter (CV score=0.9474):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 4, 'max_features': 'log2', 'max_depth': 32}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=32, max_features='log2',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=4,
                     min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=2
Best parameter (CV score=0.9448):
{'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 10, 'algorithm': 'ball_tree'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='uniform')
SDG:
Dimensions=4
Best parameter (CV score=0.9292):
{'penalty': 'elasticnet', 'n_jobs': -1, 'loss': 'perceptron'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='perceptron',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1,
              penalty='elasticnet', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)
Ridge:
Dimensions=4
Best parameter (CV score=0.9400):
{'solver': 'saga', 'alpha': 1.0}
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='saga', tol=0.001)
Random Forest:
Dimensions=4
Best parameter (CV score=0.9485):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 8}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=4
Best parameter (CV score=0.9491):
{'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': 4, 'max_features': 'log2', 'max_depth': 32}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=32, max_features='log2',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=4,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=4
Best parameter (CV score=0.9460):
{'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 30, 'algorithm': 'kd_tree'}
KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='uniform')
SDG:
Dimensions=8
Best parameter (CV score=0.9253):
{'penalty': 'l1', 'n_jobs': -1, 'loss': 'perceptron'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='perceptron',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l1',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
Ridge:
Dimensions=8
Best parameter (CV score=0.9405):
{'solver': 'auto', 'alpha': 0.5}
RidgeClassifier(alpha=0.5, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)
Random Forest:
Dimensions=8
Best parameter (CV score=0.9514):
{'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': 4, 'max_features': 'log2', 'max_depth': 16}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=16, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=8
Best parameter (CV score=0.9508):
{'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': 8, 'max_features': 'sqrt', 'max_depth': 32}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=32, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=8,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=8
Best parameter (CV score=0.9474):
{'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 30, 'algorithm': 'ball_tree'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='uniform')
SDG:
Dimensions=16
Best parameter (CV score=0.9241):
{'penalty': 'l2', 'n_jobs': -1, 'loss': 'modified_huber'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

Ridge:
Dimensions=16
Best parameter (CV score=0.9408):
{'solver': 'svd', 'alpha': 1.0}
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None, solver='svd',
                tol=0.001)
Random Forest:
Dimensions=16
Best parameter (CV score=0.9535):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': 16}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=16, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=16
Best parameter (CV score=0.9531):
{'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': 8, 'max_features': 'sqrt', 'max_depth': 32}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=32, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=8,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=16
Best parameter (CV score=0.9484):
{'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 50, 'algorithm': 'kd_tree'}
KNeighborsClassifier(algorithm='kd_tree', leaf_size=50, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='uniform')
SDG:
Dimensions=32
Best parameter (CV score=0.9292):
{'penalty': 'l1', 'n_jobs': -1, 'loss': 'log'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l1', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
Ridge:
Dimensions=32
Best parameter (CV score=0.9418):
{'solver': 'auto', 'alpha': 0.5}
RidgeClassifier(alpha=0.5, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)
Random Forest:
Dimensions=32
Best parameter (CV score=0.9536):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 4, 'max_features': 'log2', 'max_depth': 64}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=64, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=32
Best parameter (CV score=0.9524):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 4, 'max_features': 'sqrt', 'max_depth': 32}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=32, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=4,
                     min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=32
Best parameter (CV score=0.9487):
{'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 10, 'algorithm': 'ball_tree'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='uniform')
SDG:
Dimensions=64
Best parameter (CV score=0.9276):
{'penalty': 'elasticnet', 'n_jobs': -1, 'loss': 'modified_huber'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1,
              penalty='elasticnet', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)
Ridge:
Dimensions=64
Best parameter (CV score=0.9424):
{'solver': 'auto', 'alpha': 0.0001}
RidgeClassifier(alpha=0.0001, class_weight=None, copy_X=True,
                fit_intercept=True, max_iter=None, normalize=False,
                random_state=None, solver='auto', tol=0.001)
Random Forest:
Dimensions=64
Best parameter (CV score=0.9539):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 4, 'max_features': 'sqrt', 'max_depth': None}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=64
Best parameter (CV score=0.9516):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 8, 'max_features': 'sqrt', 'max_depth': 32}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=32, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=8,
                     min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=64
Best parameter (CV score=0.9486):
{'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 10, 'algorithm': 'kd_tree'}
KNeighborsClassifier(algorithm='kd_tree', leaf_size=10, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='distance')
SDG:
Dimensions=128
Best parameter (CV score=0.9239):
{'penalty': 'elasticnet', 'n_jobs': -1, 'loss': 'perceptron'}
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='perceptron',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1,
              penalty='elasticnet', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)
Ridge:
Dimensions=128
Best parameter (CV score=0.9444):
{'solver': 'auto', 'alpha': 0.0001}
RidgeClassifier(alpha=0.0001, class_weight=None, copy_X=True,
                fit_intercept=True, max_iter=None, normalize=False,
                random_state=None, solver='auto', tol=0.001)
Random Forest:
Dimensions=128
Best parameter (CV score=0.9532):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 4, 'max_features': 'sqrt', 'max_depth': 64}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=64, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
Extra Trees:
Dimensions=128
Best parameter (CV score=0.9493):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 4, 'max_features': 'sqrt', 'max_depth': 64}
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=64, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=4,
                     min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Nearest Neighbours:
Dimensions=128
Best parameter (CV score=0.9491):
{'weights': 'distance', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'leaf_size': 10, 'algorithm': 'ball_tree'}
KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=7, p=1,
                     weights='distance')
```

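Selected classifier (Random Forest scores marginally higher at 64 and 32 dimensions, 0.9539 and 0.9536, but this configuration is within 0.0004 of the best score while using far fewer dimensions):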
```
Random Forest:
Dimensions=16
Best parameter (CV score=0.9535):
{'n_jobs': -1, 'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': 16}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=16, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
```

So let us train that classifier and use it to predict y_test.