In [60]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# bit.ly/2JDO3i1

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score



In [16]:
names = ['price', 'maint', 'doors', 'persons', 'hoot', 'safety', 'class']
df = pd.read_csv("./data/car.data", header=None, names=names)

X = df.drop("class", axis=1)
y = df["class"]

nominal_features = ['price', 'maint', 'doors', 'persons', 'hoot', 'safety']
for nf in nominal_features:
    X[nf] = X[nf].astype('category').cat.codes
    
onehot = OneHotEncoder()
X = onehot.fit_transform(X).toarray()

y = y.astype("category").cat.codes

train_X, test_X, train_y, test_y = train_test_split(X, y)

In [38]:
train_X.shape
train_y.shape
test_X.shape
test_y.shape

(1296, 21)

(1296,)

(432, 21)

(432,)

# Question 1

How does bagging work? Select a random proportion of features and a random proportion of instances with replacement. Construct n new training sets, train classifier on each of the new training sets than stack.

In sklearn, replacement is on by default, so even with max_samples=1.0 and max_features=1.0, the classfiers will be different.

In [36]:
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.6, max_features=0.6)
bagging.fit(train_X, train_y)
bagging.score(test_X, test_y)

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
         bootstrap=True, bootstrap_features=False, max_features=0.6,
         max_samples=0.6, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

0.8263888888888888

In [37]:
# for comparison
knn = KNeighborsClassifier()
knn.fit(train_X, train_y)
knn.score(test_X, test_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

0.8888888888888888

# Question 2

In [1]:
class StackEnsemble():
    
    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier
        
    def fit(self, train_X, train_y):
        # train each of the base classifiers
        for clf in self.classifiers:
            clf.fit(train_X, train_y)
            
        # grab the matrix of predictions from the base classifiers
        base_ys = self.base_predict(train_X)
        
        # then train the metaclassifier
        self.metaclassifier.fit(base_ys, train_y)
            
    def base_predict(self, X):
        # call .predict(X) on each of the base classifiers 
        base_ys = []
        for clf in self.classifiers:
            base_y = clf.predict_proba(X) # could also just use .predict as well
            base_ys.append(base_y)
        
        # concatenate each of these prediction vectors into a matrix
        return np.concatenate(base_ys, axis=1)
    
    def predict(self, X):
        base_ys = self.base_predict(X)
        return self.metaclassifier.predict(base_ys)
    
    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

In [77]:
base_classifiers = [GaussianNB(), LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier()]
metaclassifier = DecisionTreeClassifier()
stack = StackEnsemble(base_classifiers, metaclassifier)

In [78]:
stack.fit(train_X, train_y)
stack.score(test_X, test_y)

0.9560185185185185

# Question 3

In [61]:
# why the car dataset is artificial? it's been fabricated so that every feature value combination occurs at least once

In [65]:
rf = RandomForestClassifier()
rf.fit(X, y)
rf.score(X, y)
np.mean(cross_val_score(rf, X, y, cv=10))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

0.9988425925925926

0.83313851495965

In [66]:
dt = DecisionTreeClassifier()
dt.fit(X, y)
dt.score(X, y)
np.mean(cross_val_score(dt, X, y, cv=10))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

1.0

0.853374564831267

Observations? Rf and Dt are both overfitting thet training data (both have accuracy ~ 1.0). Also Dt outperforms Rf based on CV accuracy (0.85 vs 0.83)

What kind of data do we expect RF to perform worse than DT?

* Data in which classes are assigned using DecisionTree, that is totally noiseless
* Very few features or training instances
* High dependence between base classifiers, that is when features have extremely high correlation