In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from zadania import \
    Dataset, \
    CVSplitter, \
    train_on_best_hyperparams, \
    double_split_evaluate, \
    random_grid_search

In [2]:
def mnist():
    """Return a fixed 1000-sample random subset of the 'MNIST original' data.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the feature array cast to float64 and
        divided by 255, and ``y`` holds the matching targets. Both are indexed
        by the same 1000 permuted indices, so rows stay aligned.
    """
    # NOTE(review): fetch_mldata is deprecated/removed in newer scikit-learn
    # releases; switching to fetch_openml may change which samples are drawn
    # and the label dtype, so it is deliberately left untouched here - confirm
    # against the pinned scikit-learn version.
    data = fetch_mldata('MNIST original')
    # np.float was just an alias for the builtin float (deprecated in NumPy
    # 1.20, removed in 1.24); np.float64 yields the identical dtype.
    X = data.data.astype(np.float64) / 255.
    y = data.target
    # Fixed seed so every call returns the same subsample.
    rng = np.random.RandomState(seed=43)
    indices = rng.permutation(len(X))[:1000]
    return X[indices], y[indices]

In [3]:
class DTC:
    """Thin wrapper that builds and fits a DecisionTreeClassifier on construction.

    The constructor takes the training data followed by hyperparameters as
    keyword-compatible arguments, mirroring the sibling KNNC wrapper.
    """

    def __init__(self, X, y, criterion="gini", max_depth=None, max_leaf_nodes=None,
                 random_state=43):
        """Create the tree with the given hyperparameters and fit it on (X, y).

        Args:
            X: training features, forwarded unchanged to ``fit``.
            y: training targets, forwarded unchanged to ``fit``.
            criterion: forwarded to DecisionTreeClassifier (default "gini").
            max_depth: forwarded to DecisionTreeClassifier (default None).
            max_leaf_nodes: forwarded to DecisionTreeClassifier (default None).
            random_state: seed forwarded to DecisionTreeClassifier; defaults to
                43, the value that used to be hard-coded here.
        """
        self.m = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state)
        self.m.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        return self.m.predict(X)

    def __str__(self):
        """Return the wrapped estimator's string form."""
        return self.m.__str__()

class KNNC:
    """Wrapper that constructs a KNeighborsClassifier and fits it immediately."""

    def __init__(self, X, y, n_neighbors=5, weights="uniform", p=2):
        """Build the neighbours classifier from the hyperparameters and fit it on (X, y)."""
        estimator = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
        estimator.fit(X, y)
        # Keep the fitted estimator; predict/__str__ delegate to it.
        self.m = estimator

    def predict(self, X):
        """Delegate prediction to the fitted estimator."""
        fitted = self.m
        return fitted.predict(X)

    def __str__(self):
        """Describe the wrapper using the fitted estimator's own string form."""
        return str(self.m)

In [4]:
def describe_train_on_best_hyperparams_result(model, hyperparams_list, best_h, train_scores, test_scores):
    """Print a plain-text report of a hyperparameter search.

    Sections appear in this order, each preceded by a blank line and a title:
    the best hyperparameters, the trained model, every tested hyperparameter
    set (one per line), then the train and test scores rendered as DataFrames.
    """
    def emit_section(title, entries):
        # One blank separator line, the title, then one printed line per entry.
        print()
        print(title)
        for entry in entries:
            print(entry)

    emit_section("Best hyperparams:", [best_h])
    emit_section("Trained model:", [model])
    emit_section("Tested hyperparams:", hyperparams_list)
    emit_section("Train scores:", [pd.DataFrame(train_scores)])
    emit_section("Test scores:", [pd.DataFrame(test_scores)])

def describe_double_split_evaluate_results(summary, hyperparams_list):
    """Print a plain-text report of a double-split evaluation summary.

    ``summary`` is read by key: "estimated_score", "train_scores",
    "test_scores", "best_hyperparams", "inner_train_scores" and
    "inner_test_scores". The outer scores are printed as-is; the inner
    per-split scores are rendered as DataFrames. ``hyperparams_list`` is
    printed one entry per line.
    """
    def heading(text):
        # Blank separator line followed by the section title.
        print()
        print(text)

    def frame_section(title, scores):
        heading(title)
        print(pd.DataFrame(scores))

    print()
    print("Estimated score:", summary["estimated_score"])

    heading("Train scores:")
    print(summary["train_scores"])

    heading("Test scores:")
    print(summary["test_scores"])

    heading("Best hyperparams per major split:")
    for split_no, chosen in enumerate(summary["best_hyperparams"], start=1):
        print("\tmajor split " + str(split_no) + ":", chosen)

    heading("Tested hyperparams:")
    for candidate in hyperparams_list:
        print(candidate)

    heading("Inner scores per major split:")
    inner_pairs = zip(summary["inner_train_scores"], summary["inner_test_scores"])
    for split_no, (inner_train, inner_test) in enumerate(inner_pairs, start=1):
        heading("Major split " + str(split_no) + ":")
        frame_section("Train scores:", inner_train)
        frame_section("Test scores:", inner_test)

In [5]:
# Candidate hyperparameters for the KNNC wrapper: a one-parameter grid over
# n_neighbors. No n/seed is passed, so random_grid_search's defaults apply.
hyperparams_list = random_grid_search(
    {"n_neighbors": [1,2,3,5,8,13]})

# Evaluate every candidate on the MNIST subsample with a CVSplitter configured
# for 5 splits, scoring with accuracy_score.
# NOTE(review): presumably returns a model refit with the best-scoring
# hyperparameters plus per-split score tables - confirm in zadania.
model, best_h, train_scores, test_scores = train_on_best_hyperparams(
    dataset=Dataset(*mnist()),
    model_cls=KNNC,
    hyperparams_list=hyperparams_list,
    splitter=CVSplitter(n_splits=5),
    score_function=accuracy_score,
    seed=43)

# Print the winner, the trained model, the candidates and both score tables.
describe_train_on_best_hyperparams_result(model, hyperparams_list, best_h, train_scores, test_scores)

Evaluating split 1......
Evaluating split 2......
Evaluating split 3......
Evaluating split 4......
Evaluating split 5......

Best hyperparams:
{'n_neighbors': 3}

Trained model:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

Tested hyperparams:
{'n_neighbors': 13}
{'n_neighbors': 3}
{'n_neighbors': 2}
{'n_neighbors': 5}
{'n_neighbors': 1}
{'n_neighbors': 8}

Train scores:
         0        1        2        3    4        5
0  0.85250  0.92625  0.92000  0.90125  1.0  0.88875
1  0.85500  0.93125  0.91875  0.89500  1.0  0.88000
2  0.85875  0.92625  0.93250  0.90750  1.0  0.89625
3  0.86125  0.93625  0.93500  0.91250  1.0  0.88375
4  0.87000  0.93500  0.93125  0.91125  1.0  0.89375

Test scores:
       0      1      2      3      4      5
0  0.805  0.830  0.840  0.840  0.845  0.815
1  0.805  0.880  0.845  0.845  0.845  0.870
2  0.830  0.870  0.885  0.870  0.905  0.870
3  0

In [6]:
# Candidate hyperparameters for the DTC wrapper, drawn from a grid over
# criterion, max_depth and max_leaf_nodes with n=20 and seed=43.
# NOTE(review): presumably n is the number of sampled combinations and seed
# makes the draw reproducible - confirm in zadania.random_grid_search.
hyperparams_list = random_grid_search(
    {
        "criterion": ["gini", "entropy"],
        "max_depth": [3, 6, 12, 24, None],
        "max_leaf_nodes": [32, 64, 128, 256, 512, 1024, None]},
    n=20,
    seed=43)

# Same search procedure as used for KNNC, but with the decision-tree wrapper:
# 5-split CVSplitter on the MNIST subsample, scored with accuracy_score.
model, best_h, train_scores, test_scores = train_on_best_hyperparams(
    dataset=Dataset(*mnist()),
    model_cls=DTC,
    hyperparams_list=hyperparams_list,
    splitter=CVSplitter(n_splits=5),
    score_function=accuracy_score,
    seed=43)

# Print the winner, the trained model, the candidates and both score tables.
describe_train_on_best_hyperparams_result(model, hyperparams_list, best_h, train_scores, test_scores)

Evaluating split 1....................
Evaluating split 2....................
Evaluating split 3....................
Evaluating split 4....................
Evaluating split 5....................

Best hyperparams:
{'criterion': 'entropy', 'max_depth': 12, 'max_leaf_nodes': 512}

Trained model:
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
            max_features=None, max_leaf_nodes=512,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=43,
            splitter='best')

Tested hyperparams:
{'criterion': 'entropy', 'max_depth': 6, 'max_leaf_nodes': 32}
{'criterion': 'entropy', 'max_depth': 12, 'max_leaf_nodes': 512}
{'criterion': 'entropy', 'max_depth': 6, 'max_leaf_nodes': 256}
{'criterion': 'entropy', 'max_depth': 3, 'max_leaf_nodes': None}
{'criterion': 'gini', 'max_depth': 6, 'max_leaf_nodes': None}
{'criterion': 'gi

In [7]:
# Candidate hyperparameters for the KNNC wrapper: a one-parameter grid over
# n_neighbors. No n/seed is passed, so random_grid_search's defaults apply.
hyperparams_list = random_grid_search(
    {"n_neighbors": [1,2,3,5,8,13]})

# Double-split evaluation with two splitters: a 3-split major CVSplitter and a
# 5-split minor CVSplitter, scored with accuracy_score.
# NOTE(review): presumably the minor splitter selects hyperparameters inside
# each major split and the major split estimates the final score - confirm in
# zadania.double_split_evaluate.
summary = double_split_evaluate(
    dataset=Dataset(*mnist()),
    model_cls=KNNC,
    hyperparams_list=hyperparams_list,
    major_splitter=CVSplitter(n_splits=3),
    minor_splitter=CVSplitter(n_splits=5),
    score_function=accuracy_score,
    seed=43)

# Print the estimated score, outer scores, per-split winners and inner tables.
describe_double_split_evaluate_results(summary, hyperparams_list)

Evaluating major split 1
Evaluating split 1......
Evaluating split 2......
Evaluating split 3......
Evaluating split 4......
Evaluating split 5......
Evaluating major split 2
Evaluating split 1......
Evaluating split 2......
Evaluating split 3......
Evaluating split 4......
Evaluating split 5......
Evaluating major split 3
Evaluating split 1......
Evaluating split 2......
Evaluating split 3......
Evaluating split 4......
Evaluating split 5......

Estimated score: 0.855025684367

Train scores:
[ 1.          1.          0.91904048]

Test scores:
[ 0.82934132  0.86786787  0.86786787]

Best hyperparams per major split:
	major split 1: {'n_neighbors': 1}
	major split 2: {'n_neighbors': 1}
	major split 3: {'n_neighbors': 3}

Tested hyperparams:
{'n_neighbors': 13}
{'n_neighbors': 3}
{'n_neighbors': 2}
{'n_neighbors': 5}
{'n_neighbors': 1}
{'n_neighbors': 8}

Inner scores per major split:

Major split 1:

Train scores:
          0         1         2         3    4         5
0  0.832707  0.91