## Nearest Centroid

Train: Calculate the centroids for the samples of each label
Run: Find which centroid is the closest

## k-NN classifier

Train: Nothing
Run: Look at the nearest neighbors from the training set

## Naive Bayes classifier

Train: For each category and for the whole set of samples, we build
distributions for each component (we assume the components are independent of
each other) so that we can later calculate the probability of a component
having a certain value given the category it may belong to.
Run: For each possible category, we calculate the probability of a sample
with components $x_i$ being a particular category $y$ by calculating:

$P(y | x_1, \dots, x_n) = \cfrac {P(y) \times \prod_i P(x_i | y)}{P(x_1, \dots, x_n)}$

## Decision tree classifier

Train: Given the training set, we build a decision tree. At each node, find the
component that best separates the samples into two groups through brute force,
using the Gini index. Use binning for continuous data.
Run: Walk through the tree.

## Random forest classifier

Train: Generate multiple decision trees. For each of them, select a subset of the
features and generate a different training set (allowing duplicates of samples).
Run: Walk through the trees and use voting between them.

## SVM

Train: Create a maximum margin separation between the samples. Basic case is
linear, but can be changed using kernels.
Run: See on what side of the hyperplane the new sample is


# Iris dataset

In [None]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` and print its predictions, the true labels and its score.

    ``clf`` must provide ``fit``, ``predict`` and ``score``. The classifier
    is fitted on ``(x_train, y_train)`` and evaluated on ``(x_test, y_test)``.
    Nothing is returned; the report is written to standard output and is
    followed by an empty line.
    """
    clf.fit(x_train, y_train)

    predicted = clf.predict(x_test)
    print(" predictions  : ", predicted)
    print(" actual labels: ", y_test)

    accuracy = clf.score(x_test, y_test)
    print(" score        : %0.4f" % accuracy)
    print()

def main():
    """Evaluate several classifiers on the Iris data and print the results.

    The first 120 rows of the plain feature/label arrays are used for
    training and the remaining rows for testing. The random forest and the
    first three SVM configurations are run on the separately stored
    augmented train/test arrays; every other model uses the plain split.
    """
    features = np.load("iris/iris_features.npy")
    labels = np.load("iris/iris_labels.npy")
    n_train = 120
    # Each dataset is stored in the argument order expected by run().
    plain = (
        features[:n_train],
        labels[:n_train],
        features[n_train:],
        labels[n_train:],
    )
    augmented = (
        np.load("iris/iris_train_features_augmented.npy"),
        np.load("iris/iris_train_labels_augmented.npy"),
        np.load("iris/iris_test_features_augmented.npy"),
        np.load("iris/iris_test_labels_augmented.npy"),
    )

    def evaluate(title, data, make, **params):
        # Print the heading first, then build and evaluate the classifier.
        print(title)
        run(*data, make(**params))

    evaluate("Nearest centroid:", plain, NearestCentroid)
    evaluate("k-NN classifier (k=3):", plain, KNeighborsClassifier,
             n_neighbors=3)
    evaluate("Naive Bayes classifier (Gaussian):", plain, GaussianNB)
    evaluate("Naive Bayes classifier (Multinomial):", plain, MultinomialNB)
    evaluate("Decision Tree classifier:", plain, DecisionTreeClassifier)
    evaluate("Random Forest classifier (estimators=5):", augmented,
             RandomForestClassifier, n_estimators=5)

    evaluate("SVM (linear, C=1.0):", augmented, SVC,
             kernel="linear", C=1.0)
    evaluate("SVM (RBF, C=1.0, gamma=0.25):", augmented, SVC,
             kernel="rbf", C=1.0, gamma=0.25)
    evaluate("SVM (RBF, C=1.0, gamma=0.001, augmented)", augmented, SVC,
             kernel="rbf", C=1.0, gamma=0.001)
    evaluate("SVM (RBF, C=1.0, gamma=0.001, original)", plain, SVC,
             kernel="rbf", C=1.0, gamma=0.001)

# Run the Iris classifier comparison defined above.
main()

# Breast Cancer dataset

In [None]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training data and print its test score.

    ``clf`` must provide ``fit`` and ``score``. The score is printed with
    four decimals, followed by an empty line. Nothing is returned.
    """
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    print("    score = %0.4f" % accuracy)
    print()

def main():
    """Evaluate several classifiers on the breast cancer data.

    The first 455 rows of the loaded arrays are used for training and the
    remaining rows for testing. For each configuration a heading is printed
    and run() reports the test score.
    """
    features = np.load("breast/bc_features_standard.npy")
    labels = np.load("breast/bc_labels.npy")
    n_train = 455
    x_train, x_test = features[:n_train], features[n_train:]
    y_train, y_test = labels[:n_train], labels[n_train:]

    # (heading, classifier class, constructor keyword arguments)
    experiments = [
        ("Nearest centroid:", NearestCentroid, {}),
        ("k-NN classifier (k=3):", KNeighborsClassifier, {"n_neighbors": 3}),
        ("k-NN classifier (k=7):", KNeighborsClassifier, {"n_neighbors": 7}),
        ("Naive Bayes classifier (Gaussian):", GaussianNB, {}),
        ("Decision Tree classifier:", DecisionTreeClassifier, {}),
        ("Random Forest classifier (estimators=5):",
         RandomForestClassifier, {"n_estimators": 5}),
        ("Random Forest classifier (estimators=50):",
         RandomForestClassifier, {"n_estimators": 50}),
        ("SVM (linear, C=1.0):", SVC, {"kernel": "linear", "C": 1.0}),
        ("SVM (RBF, C=1.0, gamma=0.03333):",
         SVC, {"kernel": "rbf", "C": 1.0, "gamma": 0.03333}),
    ]

    for title, make, params in experiments:
        print(title)
        run(x_train, y_train, x_test, y_test, make(**params))

# Run the breast cancer classifier comparison defined above.
main()

Using k-Fold

In [None]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sys

def run(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` on the training data and return its score on the test data.

    ``clf`` must provide ``fit`` and ``score``; the value returned by
    ``clf.score(x_test, y_test)`` is passed back unchanged.
    """
    clf.fit(x_train, y_train)
    accuracy = clf.score(x_test, y_test)
    return accuracy

def split(x, y, k, m):
    """Return the train/test partition for fold ``k`` of an ``m``-fold split.

    The samples are cut into ``m`` consecutive folds of
    ``y.shape[0] // m`` samples each; any samples left over at the end are
    not used. Fold ``k`` becomes the test set and the other folds, in their
    original order, become the training set.

    Args:
        x: Feature array whose first axis indexes samples. Any number of
            feature columns is accepted.
        y: Label array with one entry per sample.
        k: Zero-based index of the fold to hold out, ``0 <= k < m``.
        m: Number of folds, at least 1.

    Returns:
        The list ``[x_train, y_train, x_test, y_test]``.

    Raises:
        ValueError: If ``m`` is smaller than 1.
        IndexError: If ``k`` is not a valid fold index.
    """
    if m < 1:
        raise ValueError("m must be at least 1, got %r" % (m,))
    # A negative k would otherwise select a test fold without removing it
    # from the training data, so reject it explicitly.
    if not 0 <= k < m:
        raise IndexError("fold index %r is outside [0, %r)" % (k, m))

    fold_size = y.shape[0] // m
    used = fold_size * m          # samples past this index are dropped
    lo = fold_size * k
    hi = lo + fold_size

    x_test, y_test = x[lo:hi], y[lo:hi]
    # The folds are contiguous, so the training set is simply everything
    # before and after the held-out fold; no feature count is assumed.
    x_train = np.concatenate((x[:lo], x[hi:used]), axis=0)
    y_train = np.concatenate((y[:lo], y[hi:used]), axis=0)
    # Keep the training labels one-dimensional.
    y_train = y_train.reshape(used - fold_size)
    return [x_train, y_train, x_test, y_test]

def pp(z,k,s):
    """Print one summary line for row ``k`` of the score table ``z``.

    The line holds the label ``s`` left-aligned in 19 columns, the mean of
    the row, the row's standard deviation divided by the square root of the
    number of columns, and then every individual value of the row.
    """
    row = z[k]
    n_cols = z.shape[1]
    std_err = row.std() / np.sqrt(n_cols)
    summary = "%-19s: %0.4f +/- %0.4f | " % (s, row.mean(), std_err)
    per_fold = "".join("%0.4f " % z[k, i] for i in range(n_cols))
    print(summary + per_fold)

def main():
    """Score eight classifiers on the breast cancer data with 5 folds.

    The samples are shuffled with NumPy's global random generator. For each
    fold, split() supplies the train/test partition and every classifier is
    fitted and scored through run(). One summary line per classifier is
    then printed with pp().
    """
    features = np.load("breast/bc_features_standard.npy")
    labels = np.load("breast/bc_labels.npy")

    # Shuffle features and labels with the same random ordering.
    order = np.argsort(np.random.random(labels.shape[0]))
    features = features[order]
    labels = labels[order]

    m = 5 # <- number of folds

    # (summary label, classifier class, constructor keyword arguments)
    models = [
        ("Nearest", NearestCentroid, {}),
        ("3-NN", KNeighborsClassifier, {"n_neighbors": 3}),
        ("7-NN", KNeighborsClassifier, {"n_neighbors": 7}),
        ("Naive Bayes", GaussianNB, {}),
        ("Decision Tree", DecisionTreeClassifier, {}),
        ("Random Forest (5)", RandomForestClassifier, {"n_estimators": 5}),
        ("Random Forest (50)", RandomForestClassifier, {"n_estimators": 50}),
        ("SVM (linear)", SVC, {"kernel": "linear", "C": 1.0}),
    ]
    z = np.zeros((len(models), m))

    for k in range(m):
        x_train, y_train, x_test, y_test = split(features, labels, k, m)
        for row, (_, make, params) in enumerate(models):
            z[row, k] = run(x_train, y_train, x_test, y_test, make(**params))

    for row, (label, _, _) in enumerate(models):
        pp(z, row, label)

# Run the k-fold comparison defined above.
main()

# MNIST dataset

In [None]:
import time
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import decomposition

def run(x_train, y_train, x_test, y_test, clf):
    """Fit and score ``clf``, printing the score and both elapsed times.

    ``clf`` must provide ``fit`` and ``score``. Training and scoring are
    timed separately, in seconds, and reported on a single line. Nothing is
    returned.
    """
    # perf_counter() is monotonic, so a system clock adjustment during a
    # long fit cannot distort (or make negative) the measured duration.
    start = time.perf_counter()
    clf.fit(x_train, y_train)
    e_train = time.perf_counter() - start

    start = time.perf_counter()
    score = clf.score(x_test, y_test)
    e_test = time.perf_counter() - start

    print("score = %0.4f (time, train=%8.3f, test=%8.3f)" % (score, e_train, e_test))

def train(x_train, y_train, x_test, y_test):
    """Fit and report every classifier configuration on one dataset.

    For each configuration a fixed-width label is printed without a newline
    and run() completes the line with the score and timings.
    """
    # (label, classifier class, constructor keyword arguments)
    configurations = (
        ("    Nearest centroid          : ", NearestCentroid, {}),
        ("    k-NN classifier (k=3)     : ", KNeighborsClassifier, {"n_neighbors": 3}),
        ("    k-NN classifier (k=7)     : ", KNeighborsClassifier, {"n_neighbors": 7}),
        ("    Naive Bayes (Gaussian)    : ", GaussianNB, {}),
        ("    Decision Tree             : ", DecisionTreeClassifier, {}),
        ("    Random Forest (trees=  5) : ", RandomForestClassifier, {"n_estimators": 5}),
        ("    Random Forest (trees= 50) : ", RandomForestClassifier, {"n_estimators": 50}),
        ("    Random Forest (trees=500) : ", RandomForestClassifier, {"n_estimators": 500}),
        ("    Random Forest (trees=1000): ", RandomForestClassifier, {"n_estimators": 1000}),
        ("    LinearSVM (C=0.01)        : ", LinearSVC, {"C": 0.01}),
        ("    LinearSVM (C=0.1)         : ", LinearSVC, {"C": 0.1}),
        ("    LinearSVM (C=1.0)         : ", LinearSVC, {"C": 1.0}),
        ("    LinearSVM (C=10.0)        : ", LinearSVC, {"C": 10.0}),
    )

    for label, make, params in configurations:
        print(label, end='')
        run(x_train, y_train, x_test, y_test, make(**params))

def main():
    """Compare the classifiers on four representations of the MNIST vectors.

    The representations are: the values as loaded, the values divided by
    256, the values standardized per feature with the training mean and
    standard deviation, and the first 15 PCA components of the standardized
    values. The PCA is fitted on the training data only.
    """
    x_train = np.load("mnist/mnist_train_vectors.npy").astype("float64")
    y_train = np.load("mnist/mnist_train_labels.npy")
    x_test = np.load("mnist/mnist_test_vectors.npy").astype("float64")
    y_test = np.load("mnist/mnist_test_labels.npy")

    print("Models trained on raw [0,255] images:")
    train(x_train, y_train, x_test, y_test)

    print("Models trained on raw [0,1) images:")
    train(x_train/256.0, y_train, x_test/256.0, y_test)

    # Standardize with training statistics; the small constant avoids a
    # division by zero for features whose standard deviation is zero.
    mean = x_train.mean(axis=0)
    scale = x_train.std(axis=0) + 1e-8
    x_ntrain = (x_train - mean) / scale
    x_ntest  = (x_test - mean) / scale

    print("Models trained on normalized images:")
    train(x_ntrain, y_train, x_ntest, y_test)

    pca = decomposition.PCA(n_components=15)
    pca.fit(x_ntrain)
    x_ptrain, x_ptest = pca.transform(x_ntrain), pca.transform(x_ntest)

    print("Models trained on first 15 PCA components of normalized images:")
    train(x_ptrain, y_train, x_ptest, y_test)

# Run the MNIST classifier comparison defined above.
main()

Models trained on raw [0,255] images:
    Nearest centroid          : 



score = 0.8203 (time, train=   0.511, test=   0.089)
    k-NN classifier (k=3)     : score = 0.9705 (time, train=   0.021, test=   5.535)
    k-NN classifier (k=7)     : score = 0.9694 (time, train=   0.033, test=   8.394)
    Naive Bayes (Gaussian)    : score = 0.5558 (time, train=   0.692, test=   0.407)
    Decision Tree             : score = 0.8784 (time, train=  15.551, test=   0.012)
    Random Forest (trees=  5) : score = 0.9186 (time, train=   1.485, test=   0.022)
    Random Forest (trees= 50) : score = 0.9672 (time, train=  16.359, test=   0.119)
    Random Forest (trees=500) : score = 0.9717 (time, train= 152.042, test=   1.093)
    Random Forest (trees=1000): score = 0.9714 (time, train= 323.391, test=   2.337)
    LinearSVM (C=0.01)        : score = 0.9175 (time, train=1213.297, test=   0.071)
    LinearSVM (C=0.1)         : 

In [None]:
import time
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import decomposition

def run(x_train, y_train, x_test, y_test, clf):
    """Fit and score ``clf`` and return the score with both elapsed times.

    ``clf`` must provide ``fit`` and ``score``.

    Returns:
        The list ``[score, e_train, e_test]`` where ``score`` is the value
        returned by ``clf.score`` and the two times are the seconds spent
        in ``clf.fit`` and ``clf.score`` respectively.
    """
    # perf_counter() is monotonic, so a system clock adjustment during a
    # long fit cannot distort (or make negative) the measured duration.
    start = time.perf_counter()
    clf.fit(x_train, y_train)
    e_train = time.perf_counter() - start

    start = time.perf_counter()
    score = clf.score(x_test, y_test)
    e_test = time.perf_counter() - start

    return [score, e_train, e_test]

def main():
    """Sweep the number of PCA components on MNIST and save the results.

    The vectors are standardized per feature with the training mean and
    standard deviation. For each of 78 component counts from 10 to 780, a
    PCA is fitted on the training data, and Gaussian Naive Bayes, a
    50-tree random forest and a linear SVM are fitted and scored on the
    projected data. Four arrays are saved: the summed explained variance
    ratio per component count, and one table per classifier whose rows are
    ``[components, score, train time, test time]``.
    """
    x_train = np.load("mnist/mnist_train_vectors.npy").astype("float64")
    y_train = np.load("mnist/mnist_train_labels.npy")
    x_test = np.load("mnist/mnist_test_vectors.npy").astype("float64")
    y_test = np.load("mnist/mnist_test_labels.npy")

    # The small constant avoids a division by zero for features whose
    # standard deviation is zero.
    mean = x_train.mean(axis=0)
    scale = x_train.std(axis=0) + 1e-8
    x_ntrain = (x_train - mean) / scale
    x_ntest  = (x_test - mean) / scale

    n = 78
    pcomp = np.linspace(10,780,n, dtype="int16")
    tv = np.zeros((n,2))
    nb = np.zeros((n,4))
    rf = np.zeros((n,4))
    sv = np.zeros((n,4))

    # (result table, classifier class, constructor keyword arguments)
    models = (
        (nb, GaussianNB, {}),
        (rf, RandomForestClassifier, {"n_estimators": 50}),
        (sv, LinearSVC, {"C": 1.0}),
    )

    for i, p in enumerate(pcomp):
        pca = decomposition.PCA(n_components=p)
        pca.fit(x_ntrain)
        xtrain = pca.transform(x_ntrain)
        xtest = pca.transform(x_ntest)
        tv[i,:] = [p, pca.explained_variance_ratio_.sum()]

        for table, make, params in models:
            sc, etrn, etst = run(xtrain, y_train, xtest, y_test, make(**params))
            table[i,:] = [p, sc, etrn, etst]

    np.save("mnist/mnist_pca_tv.npy", tv)
    np.save("mnist/mnist_pca_nb.npy", nb)
    np.save("mnist/mnist_pca_rf.npy", rf)
    np.save("mnist/mnist_pca_sv.npy", sv)

# Run the PCA component sweep defined above.
main()