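This notebook runs experiments on a handwritten-digits dataset (scikit-learn's 8x8 `load_digits` images, a small MNIST-like dataset rather than MNIST itself) by creating multiple views of the dataset.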

As the paper deals with binary classification, we assign digits 0 to 4 to class 0 and digits 5 to 9 to class 1.

In [13]:
import pandas as pd
import numpy as np

from scipy.signal import convolve2d
from scipy.fft import ifftn

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier 

from models.boostSH import BoostSH
from models.rboostSH import RBoostSH

# Extract features on dataset

The paper computes features on only a subset of the data, extracting 100 datapoints for each set; here we sample 100 datapoints per class.

In [2]:
from sklearn.datasets import load_digits

In [3]:
# Load the digits as a (features, labels) pair and wrap them in pandas
# containers so that rows can later be selected by index label.
raw_features, raw_labels = load_digits(return_X_y = True)
data = pd.DataFrame(raw_features)
target = pd.Series(raw_labels)

### Labels

In [4]:
# Binarise the labels: values above 4 become True, the rest False.
# NOTE: this overwrites `target`, so running the cell a second time would
# compare booleans against 4 and yield all False.
target = target.gt(4)

### Subset selection

In [5]:
# Draw a class-balanced subset: 100 row labels per class, then shuffle them.
# A single seeded generator drives both the per-class sampling and the final
# shuffle, so the selected rows (and therefore every score computed from them
# in the cells below) are identical on each fresh run of the notebook.
SUBSET_SEED = 0
subset_rng = np.random.RandomState(SUBSET_SEED)

keep = []
for c in target.unique():
    # Index labels (not positions) of 100 randomly chosen rows of class `c`
    keep += target[target == c].sample(100, random_state = subset_rng).index.tolist()
# Mix the two classes so the rows are not ordered class by class
subset_rng.shuffle(keep)

In [6]:
# Restrict features and labels to the sampled rows, in the shuffled order of `keep`
data = data.loc[keep]
target = target.loc[keep]

### Computation views

6 views are computed in the paper
- Fourier coefficient
- Correlations
- Average 2 x 3 window
- Zernike moments
- Morphological features
- Karhunen coefficient

We focus on only the first three, as we did not find standard implementations of the remaining three (Zernike moments, morphological features, Karhunen coefficients).

In [7]:
# Each row of `data` is reshaped into an 8x8 array; these images are the
# input from which the additional views are derived.
images = data.to_numpy().reshape(-1, 8, 8)
# The raw feature table is kept as a view of its own
views = {'original': data}

In [8]:
# One row per image: real part of the n-dimensional inverse FFT, flattened.
# NOTE(review): for real-valued input the real part of the inverse transform
# equals the real part of the forward transform divided by the number of
# pixels (default normalisation), so these are rescaled Fourier coefficients.
fourier_rows = [np.real(ifftn(img)).ravel() for img in images]
views['Fourier'] = pd.DataFrame(fourier_rows, index = data.index).fillna(1)

In [9]:
# Strictly-upper-triangular positions of an 8x8 matrix (diagonal excluded)
upper_triangle = np.triu_indices(8, 1)

correlation_rows = []
for img in images:
    # Pairwise correlations between the rows of the image, then between its columns
    between_rows = np.corrcoef(img)[upper_triangle]
    between_columns = np.corrcoef(img.T)[upper_triangle]
    correlation_rows.append(np.concatenate([between_rows, between_columns]))

# Missing (NaN) correlations are replaced by 1
views['Correlations'] = pd.DataFrame(correlation_rows, index = data.index).fillna(1)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [10]:
# An all-ones 2x3 kernel: each output value is the sum (not the mean) of the
# pixels under one 2x3 window; 'valid' keeps only fully-covered windows.
window = np.ones((2, 3))
window_rows = [convolve2d(img, window, 'valid').flatten() for img in images]
views['Convolution'] = pd.DataFrame(window_rows, index = data.index).fillna(1)

# Experiment

In [11]:
# Number of cross-validation folds passed to every cross_val_score call below;
# the same value is used as the sample size in the confidence-interval formula.
cv = 30

### Evaluation each view

In [12]:
# Cross-validated AUC of AdaBoost (100 decision-tree estimators) on each view alone
for v, view_features in views.items():
    booster = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators = 100)
    score = cross_val_score(booster, view_features, target, cv = cv, scoring = 'roc_auc')
    mean = np.mean(score)
    # Half-width of a normal-approximation 95% interval over the fold scores
    ci = 1.96 * np.std(score) / np.sqrt(cv)
    print("View {} achieves {:.2f} ({:.2f} - {:.2f}) AUC".format(v, mean, mean - ci, mean + ci))

View original achieves 0.76 (0.70 - 0.83) AUC
View Fourier achieves 0.74 (0.68 - 0.80) AUC
View Correlations achieves 0.81 (0.77 - 0.86) AUC
View Convolution achieves 0.79 (0.74 - 0.84) AUC


### Early fusion

In [13]:
# Early fusion: place the columns of every view side by side and train a
# single AdaBoost model on the combined feature table.
fused_features = pd.concat(views, axis = 'columns')
fusion_booster = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators = 100)
score = cross_val_score(fusion_booster, fused_features, target, cv = cv, scoring = 'roc_auc')
mean = np.mean(score)
# Half-width of a normal-approximation 95% interval over the fold scores
ci = 1.96 * np.std(score) / np.sqrt(cv)
print("Early fusion achieves {:.2f} ({:.2f} - {:.2f}) AUC".format(mean, mean - ci, mean + ci))

Early fusion achieves 0.90 (0.86 - 0.93) AUC


### Algorithms

#### Boost.SH

In [14]:
%%time
score = cross_val_score(BoostSH(DecisionTreeClassifier(), views, 100), views['original'], target, cv = cv, scoring = 'roc_auc', fit_params = {'edge_estimation_cv': 5})
mean, ci = np.mean(score), 1.96 * np.std(score) / np.sqrt(cv) 
print("Boost.SH achieves {:.2f} ({:.2f} - {:.2f}) AUC".format(mean, mean - ci, mean + ci))

Boost.SH achieves 0.94 (0.90 - 0.97) AUC
CPU times: user 7min 12s, sys: 78.6 ms, total: 7min 12s
Wall time: 7min 12s


#### rBoost.SH

In [37]:
%%time
score = cross_val_score(RBoostSH(DecisionTreeClassifier(), views, 100), views['original'], target, cv = cv, scoring = 'roc_auc', fit_params = {'edge_estimation_cv': 5}, error_score='raise')
mean, ci = np.mean(score), 1.96 * np.std(score) / np.sqrt(cv) 
print("rBoost.SH achieves {:.2f} ({:.2f} - {:.2f}) AUC".format(mean, mean - ci, mean + ci))

rBoost.SH achieves 0.96 (0.93 - 0.99) AUC
CPU times: user 1min 57s, sys: 19.9 ms, total: 1min 57s
Wall time: 1min 57s
