In [1]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline

In [2]:
iris = load_iris()

In [16]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)

In [17]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [18]:
X_train_transformed = scaler.transform(X_train)

In [19]:
X_train_transformed.std(axis=0)

array([1., 1., 1., 1.])

In [20]:
clf = svm.SVC(C=1).fit(X_train_transformed,y_train)

In [21]:
clf.kernel

'rbf'

In [22]:
X_test_transformed = scaler.transform(X_test)

In [23]:
clf.score(X_test_transformed,y_test)

0.9333333333333333

In [24]:
#using a pipeline
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))

In [25]:
from sklearn.model_selection import cross_val_score

In [26]:
cross_val_score(clf, iris.data, iris.target, cv=5)

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

In [28]:
#obtaining predictions by cross validation
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

In [29]:
predicted = cross_val_predict(clf, iris.data, iris.target, cv=10)

In [30]:
metrics.accuracy_score(iris.target, predicted)

0.9666666666666667

In [31]:
#different cross validation strategies
from sklearn.model_selection import KFold

In [32]:
x =["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(x):
    print("%s %s" %(train, test))

[2 3] [0 1]
[0 1] [2 3]


In [33]:
#the first set is the training and the seconfd set is the test set
#this can create training and test sets based on numpy indexing
import numpy as np

In [34]:
X = np.array([[0.,0.], [1.,1.],[-1.,1.],[2.,2.]])
y = np.array([0,1,1,1])

In [35]:
X_train,X_test, y_train,y_test = X[train],X[test],y[train],y[test]

In [36]:
X_train

array([[0., 0.],
       [1., 1.]])

In [37]:
X_test

array([[-1.,  1.],
       [ 2.,  2.]])

In [38]:
y_train

array([0, 1])

In [39]:
y_test

array([1, 1])

In [40]:
#leave one out(LOO)
from sklearn.model_selection import LeaveOneOut

In [41]:
x = [1,2,3,4]

In [43]:
loo = LeaveOneOut()
for train, test in loo.split(x):
    print("%s %s" %(train, test))


[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


In [44]:
#leave p out cv
from sklearn.model_selection import LeavePOut

In [45]:
X = np.ones(4)
lpo = LeavePOut(p=2)
for train, test in lpo.split(x):
    print("%s %s" %(train, test))

[2 3] [0 1]
[1 3] [0 2]
[1 2] [0 3]
[0 3] [1 2]
[0 2] [1 3]
[0 1] [2 3]


In [48]:
#shuffle split i.e data is shuffled first before split inot training and test data sets.good alternative to kfold
from sklearn.model_selection import ShuffleSplit

In [49]:
X = np.arange(5)
ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

In [50]:
for train, test in ss.split(x):
    print("%s %s" %(train, test))

[3 1 0] [2]
[2 1 3] [0]
[0 2 1] [3]


In [51]:
#stratified k fold
from sklearn.model_selection import StratifiedKFold

In [54]:
X = np.ones(10)
y = [0,0,0,0,1,1,1,1,1,1]
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X,y):
    print("%s %s" %(train, test))

[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]


In [55]:
#cross cvalidation of grouped data
from sklearn.model_selection import GroupKFold

In [56]:
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c","c", "d", "d","d"]
groups = [1,1,1,2,2,2,3,3,3,3]

In [57]:
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups = groups):
    print("%s %s" %(train, test))

[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]


In [58]:
from sklearn.model_selection import LeaveOneGroupOut

In [59]:
X = [1,5,10,50,60,70,80]
y = [0,1,1,2,2,2,2]
groups = [1,1,2,2,3,3,3]

In [60]:
logo = LeaveOneGroupOut()
for train, test in logo.split(X,y, groups = groups):
    print("%s %s" %(train, test))

[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3] [4 5 6]


In [62]:
from sklearn.model_selection import LeavePGroupsOut

In [63]:
X = np.arange(6)
y = [1,1,1,2,2,2]
groups = [1,1,2,2,3,3]

In [64]:
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X,y, groups = groups):
    print("%s %s" %(train, test))

[4 5] [0 1 2 3]
[2 3] [0 1 4 5]
[0 1] [2 3 4 5]


In [65]:
#group shuffle split.alternative to groupkfold
from sklearn.model_selection import GroupShuffleSplit

In [69]:
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c","c", "d", "d","d"]
groups = [1,1,1,2,2,2,3,3,3,3]

In [70]:
gss = GroupShuffleSplit(n_splits=4,random_state=0,test_size=0.5)
for train, test in gss.split(X,y, groups = groups):
    print("%s %s" %(train, test))

[0 1 2] [3 4 5 6 7 8 9]
[3 4 5] [0 1 2 6 7 8 9]
[3 4 5] [0 1 2 6 7 8 9]
[3 4 5] [0 1 2 6 7 8 9]


In [71]:
#cv for time series
from sklearn.model_selection import TimeSeriesSplit

In [72]:
X = np.array([[1,2],[3,4],[1,2],[3,4],[1,2],[3,4]])
y = np.array([1,2,3,4,5,6]) 
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)

TimeSeriesSplit(max_train_size=None, n_splits=3)


In [73]:
for train, test in tscv.split(X):
    print("%s %s" %(train, test))

[0 1 2] [3]
[0 1 2 3] [4]
[0 1 2 3 4] [5]


In [74]:
clf.get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])>

In [3]:
#comparing randomized search cv and gridsearch cv
print(__doc__)

import numpy as np
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier


Automatically created module for IPython interactive environment


In [4]:
#get some data
digits = load_digits()
X, y = digits.data, digits.target

In [5]:
#building a classifier
clf = RandomForestClassifier(n_estimators=20)

In [7]:
#utility function toreport best scores
def report(results, n_top = 3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

#specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
             "max_features": sp_randint(1, 11),
             "min_samples_split": sp_randint(2, 11),
             "min_samples_leaf": sp_randint(1,11),
             "bootstrap": [True, False],
             "criterion": ["gini", "entropy"]}
#run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                                   n_iter= n_iter_search)

start = time()
random_search.fit(X,y)
print("RandomizedSearchCV took %0.2f seconds for %d candidates parameter settings"
     %((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 5.63 seconds for 20 candidates parameter settings
model with rank: 1
Mean validation score: 0.925 (std: 0.006)
Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 5, 'criterion': 'entropy'}

model with rank: 2
Mean validation score: 0.922 (std: 0.020)
Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 10, 'criterion': 'gini'}

model with rank: 3
Mean validation score: 0.918 (std: 0.009)
Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 7, 'criterion': 'entropy'}



In [8]:
random_search.best_score_

0.9248747913188647

In [9]:
random_search.cv_results_



{'mean_fit_time': array([0.06771231, 0.10938183, 0.1041708 , 0.07291873, 0.06250254,
        0.06771088, 0.04687579, 0.07812778, 0.06771127, 0.09375437,
        0.10938001, 0.05729175, 0.06250254, 0.09896215, 0.05729405,
        0.10417255, 0.07291754, 0.05729437, 0.04687691, 0.05729461]),
 'mean_score_time': array([0.01041706, 0.01562484, 0.00520873, 0.00520905, 0.00520873,
        0.        , 0.        , 0.01041762, 0.        , 0.        ,
        0.        , 0.00520873, 0.0104177 , 0.01041818, 0.00520857,
        0.00520865, 0.00520913, 0.0104177 , 0.01562532, 0.00520873]),
 'mean_test_score': array([0.89371174, 0.92487479, 0.9115192 , 0.9048414 , 0.81803005,
        0.79577073, 0.77017251, 0.91040623, 0.82081247, 0.91597106,
        0.92209238, 0.8458542 , 0.90261547, 0.91040623, 0.82804674,
        0.91819699, 0.91374513, 0.80244853, 0.77072899, 0.80690039]),
 'mean_train_score': array([0.98274965, 1.        , 0.99388006, 0.99332357, 0.88260107,
        0.86420433, 0.82693862, 0.9

In [12]:
#for grid search cv
#specify parameters and distributions to sample from
param_grid = {"max_depth": [3, None],
             "max_features": [2,3,10],
             "min_samples_split": [2,3,10],
             "min_samples_leaf": [2,3,10],
             "bootstrap": [True, False],
             "criterion": ["gini", "entropy"]}
#run gridsearch cv

grid_search = GridSearchCV(clf,param_grid=param_grid)

start = time()
grid_search.fit(X,y)
print("GridSearchCV took %0.2f seconds for %d candidates parameter settings"
     %(time() - start, len(grid_search.cv_results_)))
report(grid_search.cv_results_)

GridSearchCV took 56.11 seconds for 22 candidates parameter settings
model with rank: 1
Mean validation score: 0.935 (std: 0.014)
Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 10, 'criterion': 'gini'}

model with rank: 2
Mean validation score: 0.931 (std: 0.004)
Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 3, 'criterion': 'gini'}

model with rank: 3
Mean validation score: 0.929 (std: 0.004)
Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 3, 'criterion': 'gini'}

