# Feature Creation, Classification and Scoring

> "Going the whole way", *Churchill, 2048*


In [None]:
import sys
path_bnd = '../'
sys.path.insert(1, path_bnd)
import analysis_tools.data_loader as dl
from gsprep.visual_tools.visual import display
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from gtda.homology import CubicalPersistence
from pgtda.diagrams import  PersistenceEntropy, Amplitude, Filtering, Scaler
from pgtda.plotting import plot_diagram
from pgtda.images import Inverter, Padder

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union
from pgtda.images import RollingSubImageTransformer, make_image_union

## Import data 

In [None]:
# Location of the local dataset copy; adapt this path to your machine.
data_dir = '/Users/julian/stroke_research/brain_and_donuts/full_datasets'
(clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT,
 brain_masks, ids, params) = dl.load_structured_data(data_dir, 'withAngio_all_2016_2017.npz')

# Reshape ct_inputs as it has 1 channel: drop the trailing axis.
# The shape tuple is passed directly (a parenthesised `*shape` without a
# trailing comma is a syntax error); reshape raises if the dropped axis
# does not have length 1.
ct_inputs = ct_inputs.reshape(ct_inputs.shape[:-1])

## Setting up data exploration set

In [None]:
# Data subset: the first n_images subjects, multiplied by their brain mask
# and subsampled by taking every second voxel along axes 1-3.
n_images = 1
subject_idx = range(n_images)
subset_masks = brain_masks[:n_images]

masked_ct = ct_inputs[:n_images] * subset_masks
masked_lesion = ct_lesion_GT[:n_images] * subset_masks

X = masked_ct[subject_idx, ::2, ::2, ::2]
y = masked_lesion[subject_idx, ::2, ::2, ::2]

print(X.shape)
print(y.shape)

In [None]:
# Show the first subject's masked CT input and masked lesion ground truth
# with the project's `display` helper.
display(X[0])
display(y[0])

## Feature Creation
 
Example pipeline: compute the persistence entropies and Wasserstein amplitudes per subwindow, for subwindows of different widths.

In [None]:
# Sub-window widths for which the features are computed
width_list = [[5, 5, 5], [7, 7, 7]]

# Per-window pipeline: cubical persistence diagrams, followed by the
# persistence entropy and the Wasserstein amplitude computed side by side.
persistence = CubicalPersistence(homology_dimensions=(0, 1, 2), n_jobs=-1)
diagram_features = make_union(
    PersistenceEntropy(n_jobs=-1),
    Amplitude(metric='wasserstein', metric_params={'p': 2}, order=None, n_jobs=-1),
)
transformer = make_pipeline(persistence, diagram_features)

# Note that padding should be same so that output images always have the same size
rolling_transformers = [
    RollingSubImageTransformer(transformer=transformer, width=width, padding='same')
    for width in width_list
]
rsis = make_image_union(*rolling_transformers, n_jobs=-1)

X_subfeatures_union = rsis.fit_transform(X)
print(X_subfeatures_union.shape)

In [None]:
# Show feature 0 (index 0 on the last axis) of the first subject's feature volume
display(X_subfeatures_union[0, :, :, :, 0])

In [None]:
# Bind the feature array under the name used by the classification cells (same object, no copy)
X_features = X_subfeatures_union

In [None]:
# Just cheating here to mimic more subjects: features and labels are
# concatenated with themselves along axis 0, so the added subjects are exact copies.
X_features = np.concatenate([X_features, X_features], axis = 0)
y = np.concatenate([y, y], axis=0)
X_features.shape 

## Feature Classification

#### Create classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 10000 trees; n_jobs=-1 asks scikit-learn to use all available cores
classifier = RandomForestClassifier(n_estimators=10000, n_jobs=-1)

#### Prepare dataset 

It is important to split the data subject-wise; otherwise, information from test voxels leaks into training voxels via the rolling window.

In [None]:
from sklearn.model_selection import train_test_split

n_images, n_x, n_y, n_z, n_features = X_features.shape

# Keep one row per subject so that the split below is made between subjects,
# not between voxels of the same subject.
X_flat = X_features.reshape(n_images, -1, n_features)
y_flat = y.reshape(n_images, -1)

# test_size is 0.5 so that the pipeline can be used with only 2 subjects
X_train, X_test, y_train, y_test = train_test_split(
    X_flat,
    y_flat,
    test_size=0.5,
    random_state=42,
)

# Collapse the subject axis: one row per voxel, one column per feature
X_train = X_train.reshape(-1, n_features)
y_train = y_train.reshape(-1)
X_test = X_test.reshape(-1, n_features)
y_test = y_test.reshape(-1)
X_train.shape, y_train.shape

#### Train classifier 

In [None]:
# Fit on the flattened training voxels (one row per voxel, one column per feature)
classifier.fit(X_train, y_train)

#### Apply classifier 

In [None]:
# Per-class probabilities and hard class labels for every test voxel
probas = classifier.predict_proba(X_test)
predicted = classifier.predict(X_test)

#### Reconstruct output 

In [None]:
# Fold the flat per-voxel outputs back into (n_x, n_y, n_z) volumes.
# NOTE: the trailing 2 assumes predict_proba returned exactly two columns,
# i.e. that the training labels contained two classes.
probas_3D = probas.reshape(-1, n_x, n_y, n_z, 2)
predicted_3D = predicted.reshape(-1, n_x, n_y, n_z)
probas_3D.shape

In [None]:
# Show the probability volume of column 1 of predict_proba and the hard-label prediction volume
display(probas_3D[...,1])
display(predicted_3D)

## Model (Features + Classifier) Evaluation 

In [None]:
from sklearn.metrics import roc_curve, auc
from analysis_tools.metrics.plot_ROC import plot_roc

def roc_auc(label_gt, label_pred):
    """Compute the area under the ROC curve together with the curve itself.

    Parameters
    ----------
    label_gt : array-like
        Ground-truth labels; flattened before use.
    label_pred : array-like
        Predicted scores (or labels); flattened before use.

    Returns
    -------
    tuple
        ``(area, (fpr, tpr, thresholds))`` where ``area`` is the value of
        ``auc(fpr, tpr)`` and the inner tuple is the output of ``roc_curve``.
    """
    truth = np.asarray(label_gt).reshape(-1)
    scores = np.asarray(label_pred).reshape(-1)

    false_pos_rate, true_pos_rate, thresholds = roc_curve(truth, scores)
    area = auc(false_pos_rate, true_pos_rate)
    return area, (false_pos_rate, true_pos_rate, thresholds)

In [None]:
def dice(im1, im2, empty_score=1.0):
    """
    Computes the Dice coefficient, a measure of set similarity.

    Parameters
    ----------
    im1 : array-like, bool
        Any array of arbitrary size. If not boolean, will be converted.
    im2 : array-like, bool
        Any other array of identical size. If not boolean, will be converted.
    empty_score : float, optional
        Value returned when both inputs are empty (contain no True element).
        Defaults to 1.0.

    Returns
    -------
    dice : float
        Dice coefficient as a float on range [0,1].
        Maximum similarity = 1
        No similarity = 0
        Both are empty (sum eq to zero) = empty_score

    Raises
    ------
    ValueError
        If `im1` and `im2` do not have the same shape.

    Notes
    -----
    The order of inputs for `dice` is irrelevant. The result will be
    identical if `im1` and `im2` are switched.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    im1 = np.asarray(im1).astype(bool)
    im2 = np.asarray(im2).astype(bool)

    if im1.shape != im2.shape:
        raise ValueError("Shape mismatch: im1 and im2 must have the same shape.")

    im_sum = im1.sum() + im2.sum()
    if im_sum == 0:
        # Both masks are empty: the ratio below would be 0/0
        return empty_score

    # Compute Dice coefficient: 2 * |A ∩ B| / (|A| + |B|)
    intersection = np.logical_and(im1, im2)

    return 2. * intersection.sum() / im_sum

In [None]:
# Dice compares the hard label predictions against the ground truth.
dice_score = dice(predicted.flatten(), y_test.flatten())
# ROC analysis needs continuous scores: use column 1 of predict_proba
# (the probability of the second class) rather than the thresholded labels,
# which would reduce the curve to a single operating point.
roc_auc_score, roc_curve_details = roc_auc(y_test, probas[:, 1])

print('Dice:', dice_score)
print('ROC AUC:', roc_auc_score)

In [None]:
# Unpack the curve returned by roc_auc and plot it with the project helper;
# plot_roc is called with a list of TPR arrays followed by a list of FPR arrays.
fpr, tpr, roc_thresholds = roc_curve_details
plot_roc([tpr], [fpr])

## Model feature analysis

#### Model confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test labels (rows: true class, columns: predicted class), shown as an image
confusion = confusion_matrix(y_test, predicted)
plt.imshow(confusion)
plt.show()

#### Feature correlation

In [None]:
import numpy as np 

# Absolute Pearson correlation between features: X_train is transposed so that
# each row passed to np.corrcoef is one feature across all training voxels.
correlation = np.abs(np.corrcoef(X_train.T))
plt.imshow(correlation)
plt.show()