In [1]:
import numpy as np
import load_cupsnbottles
import pandas as pd

print(__doc__)

import argparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from matplotlib.ticker import NullFormatter
from sklearn import manifold, datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

Automatically created module for IPython interactive environment


  from numpy.core.umath_tests import inner1d


In [42]:
# load the data
num_samples = 50
X = load_cupsnbottles.load_features('')
df = load_cupsnbottles.load_properties('')
y = np.array(df.label)
labels_old = np.unique(y)
for (i, label) in enumerate(labels_old):
    y[y == label] = i
y = y.astype(int)
X = X[:num_samples]
y = y[:num_samples]

In [None]:
def t_sne(dims, perplexity=30, learning_rate=200.0):
    tsne = manifold.TSNE(n_components=dims, init='random', perplexity=perplexity,
                         learning_rate = learning_rate,
                         n_iter=1000, n_iter_without_progress=300, method='barnes_hut',
                         random_state=0)
    return tsne.fit_transform(X)

In [3]:
def dim_red(X, dims=2, init='pca'):
    """
    :param: X = dataset
    :param: dims = number of dimensions
    :param: init = either 'pca' or 'tsne'
    """
    if init == 'pca':
        pca = PCA(dims)
        X_embedded = pca.fit_transform(X)
        
    elif init == 'tsne':
        X_embedded = t_sne(dims)
    return X_embedded

In [40]:
classifier_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
          "Decision Tree", "Random Forest", "Neural Net", "Naive Bayes", "QDA"]

classifiers = [
     KNeighborsClassifier(),
     SVC(),
     SVC(),
     GaussianProcessClassifier(),
     DecisionTreeClassifier(),
     RandomForestClassifier(),
     MLPClassifier(),
     GaussianNB(),
     QuadraticDiscriminantAnalysis()]

parameters = [
    {'n_neighbors': [2, 5, 10], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'brute']}, # K Nearest Neighbors
    {'kernel':['linear'], 'C': [1, 5, 10]}, # Linear SVM
    {'kernel':['rbf'], 'C':[1, 5, 10]}, # RBF SVM
    {}, # Gaussian Process
    {'max_depth':[None, 5, 10], 'min_samples_split': [2, 5, 10]}, # Decision Tree
    {'max_depth':[None, 5, 10], 'n_estimators':[10, 50, 100], 'max_features':[1]}, # Random Forest
    {'alpha': [0.0001, 0.001], 'max_iter': [1000, 2000]}, # Neural Net
    {}, # Naive Bayes
    #{'var_smoothing': [1e-9]}, # Naive Bayes
    {'reg_param': [0.0, 0.5],'tol': [1.0e-2, 1.0e-4, 1.0e-6]}] # Quadratic Discriminant Analysis


In [39]:
gs_classifiers = []
scores = []

if args.classifier is not None:
    clf_index = classifier_names.index(args.classifier)
    clf = GridSearchCV(classifiers[clf_index], parameters[clf_index])
    print(clf.fit(X, y))
    score = clf.score(X, y)
    gs_classifiers.append(clf)
    scores.append(score)
    print(pd.DataFrame.from_dict(clf.cv_results_))
    print('>> DONE')
    print('Best found version of ' + classifier_names[i] + ' scores ' + str(score))
    print("The best parameters are: ", clf.best_params_)

else:

    for i, classifier in enumerate(classifiers):
        clf = GridSearchCV(classifier, parameters[i])
        print(clf.fit(X, y))
        score = clf.score(X, y)
        gs_classifiers.append(clf)
        scores.append(score)
        print(pd.DataFrame.from_dict(clf.cv_results_))
        print('>> DONE')
        print('Best found version of ' + classifier_names[i] + ' scores ' + str(score))
        print("The best parameters are: ", clf.best_params_)

    print(list(zip(classifier_names, scores)))

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'algorithm': ['auto', 'brute'], 'n_neighbors': [2, 5, 10], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)




    mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0        0.046417         0.339913            0.906          0.987017   
1        0.036066         0.335076            0.896          1.000000   
2        0.035417         0.344306            0.926          0.972059   
3        0.035768         0.353374            0.924          1.000000   
4        0.035607         0.358753            0.920          0.962060   
5        0.035405         0.356375            0.920          1.000000   
6        0.003388         0.008666            0.906          0.987017   
7        0.003312         0.008702            0.896          1.000000   
8        0.003403         0.008922            0.926          0.972059   
9        0.003267         0.008877            0.924          1.000000   
10       0.003244         0.008842            0.920          0.962060   
11       0.003327         0.008959            0.920          1.000000   

   param_algorithm param_n_neighbors param_weights



   mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0       0.280187         0.148710             0.93               1.0       1   
1       0.276603         0.148983             0.93               1.0       5   
2       0.276054         0.147557             0.93               1.0      10   

  param_kernel                         params  rank_test_score  \
0       linear   {'C': 1, 'kernel': 'linear'}                1   
1       linear   {'C': 5, 'kernel': 'linear'}                1   
2       linear  {'C': 10, 'kernel': 'linear'}                1   

   split0_test_score  split0_train_score  split1_test_score  \
0           0.863905                 1.0           0.939759   
1           0.863905                 1.0           0.939759   
2           0.863905                 1.0           0.939759   

   split1_train_score  split2_test_score  split2_train_score  std_fit_time  \
0                 1.0           0.987879                 1.0      0.023042   
1    



   mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0       0.431801         0.168869            0.924          0.974989       1   
1       0.421826         0.162399            0.940          0.999002       5   
2       0.419370         0.164045            0.936          1.000000      10   

  param_kernel                      params  rank_test_score  \
0          rbf   {'C': 1, 'kernel': 'rbf'}                3   
1          rbf   {'C': 5, 'kernel': 'rbf'}                1   
2          rbf  {'C': 10, 'kernel': 'rbf'}                2   

   split0_test_score  split0_train_score  split1_test_score  \
0           0.857988            0.969789           0.945783   
1           0.881657            1.000000           0.951807   
2           0.875740            1.000000           0.945783   

   split1_train_score  split2_test_score  split2_train_score  std_fit_time  \
0            0.985030           0.969697            0.970149      0.035906   
1            0.99



   mean_fit_time  mean_score_time  mean_test_score  mean_train_score params  \
0       5.685377         2.829332            0.174               1.0     {}   

   rank_test_score  split0_test_score  split0_train_score  split1_test_score  \
0                1           0.171598                 1.0           0.174699   

   split1_train_score  split2_test_score  split2_train_score  std_fit_time  \
0                 1.0           0.175758                 1.0      0.143395   

   std_score_time  std_test_score  std_train_score  
0        0.032013         0.00177              0.0  
>> DONE
Best found version of Gaussian Process scores 1.0
The best parameters are:  {}
GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fr



GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 5, 10], 'n_estimators': [10, 50, 100], 'max_features': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0       0.021438         0.002215            0.830          0.999002   
1       0.097675         0.007819            0.914          1.000000   
2       0.186922         0.012963            0.896     



GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_iter': [1000, 2000], 'alpha': [0.0001, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0       1.116555         0.005090            0.934               1.0   
1       1.806160         0.041331            0.938               1.0   
2       2.506130         0.007021            0.938               1.0  



ValueError: Invalid parameter var_smoothing for estimator GaussianNB(priors=None). Check the list of available parameters with `estimator.get_params().keys()`.

In [46]:
clf_index = classifier_names.index('Neural Net')
clf_index

6

In [6]:
# Linear SVM
classifier = SVC()
parameters = {'C': [1, 10]}
clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
print(clf.score(X, y))
print(pd.DataFrame.from_dict(clf.cv_results_))
print("The best parameters are: ", clf.best_params_)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1, param_grid={'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
1.0
   mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0       3.991501         2.766902         0.868288          0.994722       1   
1       3.605068         2.085048         0.871960          1.000000      10   

      params  rank_test_score  split0_test_score  split0_train_score  \
0   {'C': 1}                2           0.840878            0.995862   
1  {'C': 10}                1           0.827160            1.000000   

   split1_test_score  split1_train_score  split2_test_score  \
0            0.



In [None]:
# K Nearest Neighbors
classifier = KNeighborsClassifier(3)

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

In [10]:
# RBF SVM
classifier = SVC(gamma=2, C=1)

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

In [None]:
# Gaussian Process (very slow)
classifier = GaussianProcessClassifier(1.0 * RBF(1.0))

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

In [None]:
# Decision Tree
classifier = DecisionTreeClassifier(max_depth=5)

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

In [None]:
# Random Forest
classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

In [None]:
# Neural Net
classifier = MLPClassifier(alpha=1, max_iter=1000)

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

In [11]:
# Naive Bayes
classifier = GaussianNB()

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)

GaussianNB(priors=None)

In [None]:
# Quadratic Discriminant Analysis
classifier = QuadraticDiscriminantAnalysis()

clf = GridSearchCV(classifier, parameters)
print(clf.fit(X, y))
pd.DataFrame.from_dict(clf.cv_results_)