In [5]:
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.datasets import make_blobs

In [4]:
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
print(X.shape, y.shape)

(1000, 100) (1000,)


In [6]:
scores = list()

In [8]:
kfold = KFold(n_splits=10, shuffle=True)

for train_ix, test_ix in kfold.split(X):
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]
    
    model = KNeighborsClassifier()
    model.fit(train_X, train_y)
    
    yhat = model.predict(test_X)
    acc = accuracy_score(test_y, yhat)
    
    scores.append(acc)
    print('> ', acc)
    
mean_s, std_s = mean(scores), std(scores)
print('Mean: %.3f, Standard Deviation: %.3f'%(mean_s,std_s))

>  0.94
>  0.9
>  0.96
>  0.93
>  0.91
>  0.94
>  0.92
>  0.94
>  0.91
>  0.98
Mean: 0.933, Standard Deviation: 0.023


In [9]:
# evaluate model by calculating the score across all predictions
from sklearn.datasets import make_blobs
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# create the inputs and outputs
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# k-fold cross validation
data_y, data_yhat = list(), list()
kfold = KFold(n_splits=10, shuffle=True)
# enumerate splits
for train_ix, test_ix in kfold.split(X):
	# get data
	train_X, test_X = X[train_ix], X[test_ix]
	train_y, test_y = y[train_ix], y[test_ix]
	# fit model
	model = KNeighborsClassifier()
	model.fit(train_X, train_y)
	# make predictions
	yhat = model.predict(test_X)
	# store
	data_y.extend(test_y)
	data_yhat.extend(yhat)
# evaluate the model
acc = accuracy_score(data_y, data_yhat)
print('Accuracy: %.3f' % (acc))

Accuracy: 0.874


In [11]:
from sklearn.model_selection import train_test_split
X, X_val, y, y_val = train_test_split(X, y, test_size= 0.33)

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
# collect out of sample predictions
data_x, data_y, knn_yhat, cart_yhat = list(), list(), list(), list()
kfold = KFold(n_splits=10, shuffle=True)
for train_ix, test_ix in kfold.split(X):
	# get data
	train_X, test_X = X[train_ix], X[test_ix]
	train_y, test_y = y[train_ix], y[test_ix]
	data_x.extend(test_X)
	data_y.extend(test_y)
	# fit and make predictions with cart
	model1 = DecisionTreeClassifier()
	model1.fit(train_X, train_y)
	yhat1 = model1.predict_proba(test_X)[:, 0]
	cart_yhat.extend(yhat1)
	# fit and make predictions with cart
	model2 = KNeighborsClassifier()
	model2.fit(train_X, train_y)
	yhat2 = model2.predict_proba(test_X)[:, 0]
	knn_yhat.extend(yhat2)

In [22]:
def create_meta_dataset(data_x, yhat1, yhat2):
    yhat1 =np.array(yhat1).reshape((len(yhat1), 1))
    yhat2 =np.array(yhat2).reshape((len(yhat2), 1))
    
    meta_X = np.hstack((data_x,yhat1, yhat2))
    return meta_X

In [23]:
meta_X = create_meta_dataset(data_x, knn_yhat, cart_yhat)

In [24]:
# fit final submodels
model1 = DecisionTreeClassifier()
model1.fit(X, y)
model2 = KNeighborsClassifier()
model2.fit(X, y)

KNeighborsClassifier()

In [26]:
from sklearn.linear_model import LogisticRegression
# construct meta classifier
meta_model = LogisticRegression(solver='liblinear')
meta_model.fit(meta_X, data_y)

LogisticRegression(solver='liblinear')

In [27]:
# make predictions with stacked model
def stack_prediction(model1, model2, meta_model, X):
	# make predictions
	yhat1 = model1.predict_proba(X)[:, 0]
	yhat2 = model2.predict_proba(X)[:, 0]
	# create input dataset
	meta_X = create_meta_dataset(X, yhat1, yhat2)
	# predict
	return meta_model.predict(meta_X)

In [28]:
# evaluate sub models on hold out dataset
acc1 = accuracy_score(y_val, model1.predict(X_val))
acc2 = accuracy_score(y_val, model2.predict(X_val))
print('Model1 Accuracy: %.3f, Model2 Accuracy: %.3f' % (acc1, acc2))
# evaluate meta model on hold out dataset
yhat = stack_prediction(model1, model2, meta_model, X_val)
acc = accuracy_score(y_val, yhat)
print('Meta Model Accuracy: %.3f' % (acc))

Model1 Accuracy: 0.633, Model2 Accuracy: 0.861
Meta Model Accuracy: 0.909
