In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the input array and the label array from the classification data folder.
X, y = (
    np.load(npy_path)
    for npy_path in ("src/data/classification/inputs.npy",
                     "src/data/classification/labels.npy")
)

In [3]:
# Show the dimensions of the loaded input array as a quick sanity check.
print(f"Shape of X: {X.shape}")

Shape of X: (1000, 20)


In [11]:
class FitModelClassification:
    """Tune, fit and report classifiers on one shared, standardised train/test split.

    The constructor splits the data once with a fixed seed, fits a
    ``StandardScaler`` on the training portion only and keeps the scaled
    train/test arrays, so every model tuned through the same instance is
    trained and evaluated on exactly the same data.

    Parameters
    ----------
    X : array-like
        Input samples handed to ``train_test_split``.
    y : array-like
        Labels aligned with ``X``. They may be 1-D or a column vector; they are
        flattened to 1-D whenever they are passed to an estimator or a metric.
    scoring : sequence
        ``scoring[0]`` is forwarded as the ``scoring`` argument of the search
        objects and used as the label in the printed report; ``scoring[1]`` is
        called as ``metric(y_true, y_pred)`` to compute the reported scores.

    NOTE(review): the scaler is fitted on the whole training split before
    cross-validation runs, so the validation part of every CV fold has already
    contributed to the scaling statistics. Moving the scaler inside the searched
    estimator would remove that, but it would also change the parameter-grid key
    names callers have to pass, so it is left as is here.
    """

    def __init__(self, X, y, scoring):
        # Fixed seed so the split is identical on every run.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, random_state=20)

        # Fit the preprocessing on the training data only, then apply it to both splits.
        self.pipeline = Pipeline([('scaler', StandardScaler())])
        self.pipeline.fit(self.X_train, self.y_train)

        self.X_train = self.pipeline.transform(self.X_train)
        self.X_test = self.pipeline.transform(self.X_test)

        self.scoring = scoring

    def find_best_model(self, model, params, n_iter=0):
        """Run a 5-fold hyper-parameter search for ``model`` on the training split.

        Parameters
        ----------
        model : estimator
            Unfitted estimator to tune.
        params : dict or list of dict
            Parameter grid (exhaustive search) or parameter distributions
            (randomized search).
        n_iter : int, default 0
            ``0`` (or any falsy value) runs an exhaustive ``GridSearchCV``;
            a positive value runs a ``RandomizedSearchCV`` that samples that
            many candidates with a fixed seed.

        Returns
        -------
        GridSearchCV or RandomizedSearchCV
            The fitted search object.
        """
        if not n_iter:
            search = GridSearchCV(model, params, cv=5, scoring=self.scoring[0],
                                  return_train_score=True, n_jobs=-1)
        else:
            # Same parallelism and train-score bookkeeping as the grid search, so both
            # branches behave alike and large n_iter values do not run serially.
            search = RandomizedSearchCV(model, param_distributions=params, n_iter=n_iter, cv=5,
                                        scoring=self.scoring[0], random_state=20,
                                        return_train_score=True, n_jobs=-1)

        # np.ravel accepts lists and pandas objects as well as (n, 1) arrays.
        search.fit(self.X_train, np.ravel(self.y_train))
        return search

    def fit_new_model(self, model, params, name, n_iter=0):
        """Tune ``model``, print its best parameters and train/test scores, and package it.

        Parameters
        ----------
        model : estimator
            Unfitted estimator to tune.
        params : dict or list of dict
            Search space forwarded to ``find_best_model``.
        name : str
            Label used in the printed report and as the step name of the
            estimator in the returned pipeline.
        n_iter : int, default 0
            Forwarded to ``find_best_model``.

        Returns
        -------
        Pipeline
            Two-step pipeline: the scaler pipeline fitted in ``__init__``
            followed by the best estimator found by the search.
        """
        best_model = self.find_best_model(model, params, n_iter)
        print(f"{name} best model: {best_model.best_params_}")

        metric_name, metric = self.scoring[0], self.scoring[1]

        # Score against 1-D labels, matching the shape the estimator was fitted with.
        train_score = metric(np.ravel(self.y_train), best_model.predict(self.X_train))
        test_score = metric(np.ravel(self.y_test), best_model.predict(self.X_test))

        print(f"{name} {metric_name}: Train = {train_score} | "
              f"Test = {test_score}")

        return Pipeline([('pipeline', self.pipeline), (name, best_model.best_estimator_)])

In [12]:
# Shared helper: holds the scaled train/test split and scores every model with accuracy.
model_regression = FitModelClassification(
    X=X,
    y=y,
    scoring=("accuracy", accuracy_score),
)

In [13]:
# Search space for logistic regression: one dictionary per solver/penalty
# combination being explored, each sweeping C from 0.01 up to (not including) 3.
params_lr = [
    dict(penalty=["l1", "l2"],
         C=np.arange(0.01, 3, 0.01),
         solver=["liblinear"]),
    dict(penalty=["l2"],
         C=np.arange(0.01, 3, 0.01),
         solver=["sag"]),
    dict(penalty=["l1", "l2"],
         C=np.arange(0.01, 3, 0.01),
         solver=["saga"]),
    dict(penalty=["elasticnet"],
         C=np.arange(0.01, 3, 0.01),
         l1_ratio=np.arange(0, 1, 0.01),
         solver=["saga"]),
]

# A non-zero n_iter makes the helper run a randomized search with that many candidates.
lr_pipeline = model_regression.fit_new_model(
    model=LogisticRegression(random_state=20, max_iter=2000),
    params=params_lr,
    name="Logistic Regression",
    n_iter=10,
)

Logistic Regression best model: {'solver': 'saga', 'penalty': 'elasticnet', 'l1_ratio': 0.72, 'C': 0.29000000000000004}
Logistic Regression ('accuracy', <function accuracy_score at 0x7fce26b6fc10>): Train = 0.9053333333333333 | Test = 0.884


In [14]:
# Search space for the support vector classifier: every kernel crossed with a sweep of C.
params_svc = [
    dict(C=np.arange(0.01, 3, 0.01),
         kernel=["linear", "poly", "rbf", "sigmoid"]),
]

# n_iter is left at its default, so the helper runs an exhaustive grid search.
svc_pipeline = model_regression.fit_new_model(
    model=SVC(random_state=20),
    params=params_svc,
    name="SVC",
)

SVC best model: {'C': 0.04, 'kernel': 'linear'}
SVC ('accuracy', <function accuracy_score at 0x7fce26b6fc10>): Train = 0.9053333333333333 | Test = 0.884


In [15]:
# Search space for the perceptron: elastic-net penalty only, sweeping the
# regularisation strength, the L1/L2 mix and the initial learning rate.
params_perceptron = [
    dict(penalty=["elasticnet"],
         alpha=np.arange(0.001, 0.1, 0.001),
         l1_ratio=np.arange(0.01, 1, 0.01),
         eta0=np.arange(0.01, 2.5, 0.01)),
]

# The full grid is large, so sample 6000 candidates with a randomized search instead.
perceptron_pipeline = model_regression.fit_new_model(
    model=Perceptron(random_state=20, n_jobs=-1),
    params=params_perceptron,
    name="Perceptron",
    n_iter=6000,
)

Perceptron best model: {'penalty': 'elasticnet', 'l1_ratio': 0.22, 'eta0': 0.34, 'alpha': 0.002}
Perceptron ('accuracy', <function accuracy_score at 0x7fce26b6fc10>): Train = 0.872 | Test = 0.86


In [16]:
# Search space for the ridge classifier: each listed solver crossed with a sweep of alpha.
params_ridge = [
    dict(alpha=np.arange(0.01, 4, 0.01),
         solver=["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
]

# n_iter is left at its default, so the helper runs an exhaustive grid search.
rfc_pipeline = model_regression.fit_new_model(
    model=RidgeClassifier(random_state=20),
    params=params_ridge,
    name="Ridge Classifier",
)

Ridge Classifier best model: {'alpha': 2.78, 'solver': 'lsqr'}
Ridge Classifier ('accuracy', <function accuracy_score at 0x7fce26b6fc10>): Train = 0.904 | Test = 0.884


In [17]:
# Search space for k-nearest neighbours: neighbour count, neighbour-search
# algorithm and leaf size.
params_knn = [
    dict(n_neighbors=np.arange(4, 20, 1),
         algorithm=["ball_tree", "kd_tree", "brute"],
         leaf_size=np.arange(1, 15, 2)),
]

# n_iter is left at its default, so the helper runs an exhaustive grid search.
knn_pipeline = model_regression.fit_new_model(
    model=KNeighborsClassifier(n_jobs=-1),
    params=params_knn,
    name="KNN",
)

KNN best model: {'algorithm': 'ball_tree', 'leaf_size': 1, 'n_neighbors': 18}
KNN ('accuracy', <function accuracy_score at 0x7fce26b6fc10>): Train = 0.8906666666666667 | Test = 0.86
