In [33]:
import pandas as pd
import os
import numpy as np
from numpy import array
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


# Echo the working directory; the CSV further down is opened by a relative path.
print(os.getcwd())

# One seed reused by the stochastic steps below so reruns are repeatable.
random_state = 127_653

c:\Users\thomas.henno\Desktop\GitRepos\course-machine-learning\week-five\AS15


In [34]:
def define_models(models=None):
    """Register the base estimators of the ensemble in a name -> estimator mapping.

    Parameters
    ----------
    models : dict, optional
        Mapping to add the estimators to. It is updated in place and returned.
        When omitted (or ``None``), a fresh dict is created for this call.

    Returns
    -------
    dict
        ``models`` with the keys ``"knn"``, ``"tree"`` and ``"svm"`` set to
        newly constructed, unfitted estimators.
    """
    # A ``dict()`` default is evaluated once at definition time, so every
    # argument-less call would mutate and return the same shared dict.
    if models is None:
        models = dict()
    models["knn"] = KNeighborsClassifier(n_neighbors=3)
    models["tree"] = DecisionTreeClassifier()
    models["svm"] = SVC(C=1, kernel="rbf", gamma="scale")
    return models

def fit_model(X_train, y_train, model):
    """Fit ``model`` on the given data and hand the same object back.

    Parameters
    ----------
    X_train : training features, passed straight to ``model.fit``.
    y_train : training targets, passed straight to ``model.fit``.
    model : any object exposing ``fit(X, y)``.

    Returns
    -------
    The ``model`` object that was passed in (fitted in place); whatever
    ``fit`` itself returns is ignored.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def based_models(X_train, y_train, models):
    """Fit every estimator in ``models`` on the same training data.

    Parameters
    ----------
    X_train : training features forwarded to ``fit_model``.
    y_train : training targets forwarded to ``fit_model``.
    models : dict mapping a name to an estimator.

    Returns
    -------
    dict
        A new dict with the same keys (in the same order) whose values are
        what ``fit_model`` returned for each estimator.
    """
    return {
        label: fit_model(X_train, y_train, estimator)
        for label, estimator in models.items()
    }

In [35]:
def stacked_dataset(Based_models, X_input):
    """Build the meta-learner's feature matrix from the base models' predictions.

    Parameters
    ----------
    Based_models : dict mapping a name to an object exposing ``predict(X)``.
    X_input : samples handed unchanged to each model's ``predict``.

    Returns
    -------
    numpy.ndarray
        The predictions stacked one model per row and then transposed, so
        that each column holds one model's predictions (columns follow the
        dict's iteration order).
    """
    per_model_predictions = [
        estimator.predict(X_input) for estimator in Based_models.values()
    ]
    # Rows are models at this point; transpose so each model becomes a column.
    return array(per_model_predictions).T

def fit_stacked_model(Based_models, X_input, y_input):
    """Train the meta-learner on the base models' predictions for ``X_input``.

    Parameters
    ----------
    Based_models : dict of fitted base models, forwarded to ``stacked_dataset``.
    X_input : samples the base models predict on to produce meta-features.
    y_input : targets the meta-learner is fitted against.

    Returns
    -------
    LogisticRegression
        The fitted meta-learner (``C=0.5``, ``liblinear`` solver).
    """
    # The base models' predictions are the only features the meta-learner sees.
    meta_features = stacked_dataset(Based_models, X_input)
    meta_learner = LogisticRegression(solver="liblinear", C=0.5)
    meta_learner.fit(meta_features, y_input)
    return meta_learner

def stacked_prediction(Based_models, Meta_model, X_input):
    """Predict with the full stack: base models first, then the meta-learner.

    Parameters
    ----------
    Based_models : dict of fitted base models, forwarded to ``stacked_dataset``.
    Meta_model : fitted object exposing ``predict`` over the stacked features.
    X_input : samples to predict.

    Returns
    -------
    Whatever ``Meta_model.predict`` returns for the stacked features.
    """
    return Meta_model.predict(stacked_dataset(Based_models, X_input))

In [36]:
# Load data into train, validation and test sets

df = pd.read_csv("banknote_authentication.csv", sep=";")

X, y = df.drop("counterfeit", axis=1), df["counterfeit"]

# Seeded, shuffled 80/20 split: the 20% part is the final test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.8, shuffle=True)

# Plain arrays so the fold indices below can be used for positional indexing.
X_train, X_test, y_train, y_test = array(X_train), array(X_test), array(y_train), array(y_test)

# KFold does not shuffle by default, so a random_state has no effect here and
# current scikit-learn raises ValueError when one is passed with shuffle=False.
# The rows were already shuffled by train_test_split, so contiguous folds are used.
kf = KFold(n_splits=5)

# Only the first fold is needed: it carves a validation set out of the training data.
train, test = next(kf.split(X_train, y_train))
X_train_train, X_train_validate, y_train_train, y_train_validate = X_train[train], X_train[test], y_train[train], y_train[test]

print(X_train_train.shape)
print(X_train_validate.shape)
print(y_train_train.shape)
print(y_train_validate.shape)
print("")
print(X_test.shape)
print(y_test.shape)

# Drop the unsplit data so later cells cannot use it by accident.
del X
del y
del X_train
del y_train

(877, 4)
(220, 4)
(877,)
(220,)

(275, 4)
(275,)


In [38]:
# Build the base estimators and fit each one on the training fold.
models = define_models(models=dict())
print(models)
Based_model = based_models(X_train_train, y_train_train, models)
print(Based_model)

results = dict()
for name, model in Based_model.items():
    y_pred = model.predict(X_train_validate)
    # Evaluate predictions on validation set
    accuracy = accuracy_score(y_train_validate, y_pred)
    results[name] = accuracy * 100
    print(">%s: %.3f" % (name, results[name]))

# Fit stacked model using the ensemble.
# The meta-model is trained on the base models' predictions for the held-out
# validation fold: predictions on the rows the base models were fitted on are
# in-sample and would leak the training labels into the meta-features.
Meta_model = fit_stacked_model(Based_model, X_train_validate, y_train_validate)

# Evaluate model on test set
y_pred = stacked_prediction(Based_model, Meta_model, X_test)
acc = accuracy_score(y_test, y_pred)
print("Stacked Test accuracy: %.3f" % acc)

{'knn': KNeighborsClassifier(n_neighbors=3), 'tree': DecisionTreeClassifier(), 'svm': SVC(C=1)}
{'knn': KNeighborsClassifier(n_neighbors=3), 'tree': DecisionTreeClassifier(), 'svm': SVC(C=1)}
>knn: 100.000
>tree: 97.727
>svm: 99.545
Stacked Test accuracy: 1.000
