#LIBRARIES IMPORT, FUNCTION DEFINITION AND DATA LOADING

In [10]:
# Mount Google Drive at /content/gdrive; the paths used by the rest of the
# notebook (BASE_DIR and its sub-folders) live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
"""
LIBRARIES IMPORT AND CONSTANTS DEFINITION
"""

import os

import tensorflow as tf 
from tensorflow.keras import models
from sklearn import metrics

import numpy as np

# Largest value representable by an unsigned 16-bit integer; used as the
# divisor when normalizing the test images.
MAX_UINT16 = 65535

# Root folder of the project on the mounted drive, and the sub-folders that
# hold the numpy arrays and the saved models.
BASE_DIR = '/content/gdrive/My Drive/Quintavalla/'
DATA_FOLDER = os.path.join(BASE_DIR, "numpy data")
MODELS_FOLDER = os.path.join(BASE_DIR, "models")

In [12]:
"""
FUNCTION DEFINITION: DATA LOADING
The following function loads test data from the file. A 
dimension is added for compatibility with Keras models (the only grayscale channel
is replicated three times). Since in this task baselines are not required, the 
corresponding images and labels are discarded. 
"""

def load_test_data(data_folder=None):
  """Load the public test set, drop baseline samples and replicate the channel.

  Reads 'public_test_tensor.npy' (images) and 'public_test_labels.npy'
  (labels) from `data_folder`. Samples whose label is 0 (baselines) are
  discarded, then a trailing channel axis is added to the remaining images
  and repeated three times.

  Args:
    data_folder: directory containing the two .npy files. When None (the
      default) the notebook-level DATA_FOLDER constant is used, so existing
      calls without arguments behave as before.

  Returns:
    A tuple (images, labels): images has the loaded 3-D array's shape with
    an extra last axis of size 3 and only the non-baseline samples; labels
    is the matching array of non-zero labels.

  Raises:
    ValueError: if the number of images and the number of labels differ.
  """
  if data_folder is None:
    data_folder = DATA_FOLDER

  images = np.load(os.path.join(data_folder, 'public_test_tensor.npy'))
  labels = np.load(os.path.join(data_folder, 'public_test_labels.npy'))

  if images.shape[0] != labels.shape[0]:
    raise ValueError(
        'images and labels have a different number of samples: '
        '%d vs %d' % (images.shape[0], labels.shape[0]))

  # Filter BEFORE replicating the channel: baseline images are thrown away
  # anyway, so there is no point in tripling their memory footprint first.
  not_baseline = labels != 0
  images = images[not_baseline]
  labels = labels[not_baseline]

  # Add the channel axis and copy the single grayscale channel three times.
  images = np.repeat(images[:, :, :, np.newaxis], 3, axis=3)
  return images, labels

In [13]:
"""
DATA LOADING
Data are loaded from file using function defined above. 
"""

# Load the non-baseline test samples once; every subtask derives its own
# normalized images and binary labels from these two arrays.
test_images, test_labels = load_test_data()

# Show the array shapes followed by the raw labels, one item per line.
print(test_images.shape, test_labels.shape, test_labels, sep="\n")

# The image tensor is laid out as (samples, height, width, channels).
TEST_SET_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS = test_images.shape

(336, 150, 150, 3)
(336,)
[2 2 2 2 1 1 2 2 1 2 2 1 1 2 2 2 2 2 2 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1
 1 1 2 2 2 2 1 1 1 1 2 2 1 1 2 2 2 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 2 2 1
 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2
 2 2 1 1 2 2 2 2 2 1 1 2 2 2 2 1 1 2 2 2 1 1 1 1 1 1 1 1 1 2 2 1 2 2 1 2 1
 1 1 1 2 2 1 1 1 1 1 2 2 2 1 1 2 2 1 2 2 1 1 1 1 2 2 1 1 1 1 1 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 4 4 3 3 3 3 3 3 3 3 3 3 3 4 3 3 4 3 3 3 3 4 4 3 3 3 3
 3 3 4 3 3 3 4 4 4 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 3 3 4 4 3 3 4 4 4 4 4 4
 4 3 3 3 3 4 4 4 3 3 3 3 3 3 4 3 3 3 3 3 3 4 4 4 4 3 3 3 3 4 3 3 4 4 4 3 3
 3 3 3 3 3 3 3 3 3 3 3 4 4 3 3 4 4 4 4 4 4 3 3 4 4 4 4 4 4 4 3 3 3 3 3 3 3
 3 3 3]


#SUBTASK 1: CALCIFICATION-MASS CLASSIFICATION
This subsection focuses on the Calcification-Mass problem. The only preprocessing steps are label redefinition (0 for Calcification cases, 1 for Mass cases) and normalization of the test images. 

In [14]:
"""
PREPROCESSING
"""

# Labels greater than 2 become class 0, labels 1 and 2 become class 1.
cm_test_labels = np.where(test_labels > 2, 0, 1)
# Scale pixel values by the maximum 16-bit unsigned value.
cm_test_images = test_images / MAX_UINT16

In [15]:
"""
MODELS LOADING
"""

# Load the three saved Calcification-Mass base classifiers, in the same order
# as the names they are bound to.
(calcification_mass_scratch,
 calcification_mass_pretrained_vgg1,
 calcification_mass_pretrained_vgg2) = (
    models.load_model(os.path.join(MODELS_FOLDER, model_file))
    for model_file in (
        "calcification_mass_scratch.h5",
        "calcification_mass_pretrained_vgg1.h5",
        "calcification_mass_pretrained_vgg2.h5",
    )
)

In [16]:
"""
MODELS PREDICTIONS
"""

# The scratch model is fed only the first channel of the images (presumably it
# was trained on single-channel input -- confirm against the training notebook).
# The channel is taken with a plain slice on the last axis: the previous
# tf.slice call passed size=[N, IMAGE_WIDTH, IMAGE_HEIGHT, 1] on an
# (N, height, width, channels) tensor, which only worked because the images
# are square.
calcification_mass_scratch_predictions = np.around(calcification_mass_scratch.predict(cm_test_images[:, :, :, :1]),2) 
# The pretrained models receive the full three-channel images.
# Raw scores are rounded to two decimals before being combined by the ensembles.
calcification_mass_pretrained_vgg1_predictions = np.around(calcification_mass_pretrained_vgg1.predict(cm_test_images),2) 
calcification_mass_pretrained_vgg2_predictions = np.around(calcification_mass_pretrained_vgg2.predict(cm_test_images),2) 

##EXPERIMENT 1_1: MAJORITY VOTING 
In majority voting, each classifier makes its own prediction and the final prediction is obtained by counting the votes collected by each class.

In [None]:
"""
GETTING MAJORITY VOTING PREDICTIONS
Since the number of classifier is equal to 3 and the classes are only 2, majority 
voting can be performed. In this case it is sufficient to retrieve the predicted 
labels (0 or 1) for each classifier and sum them. If the sum is greater or equal 
to 2, it means that at least two classifiers (the majority) voted for label 1. In 
case the sum is equal to 0 or 1, it means that 0 or at maximum 1 classifier voted
for label 1 (the minority).
"""

# Turn every raw score into a hard 0/1 vote. The conversion is vectorized:
# calling int() on the one-element arrays obtained by iterating over a 2-D
# prediction array is deprecated since NumPy 1.25. ravel() flattens the votes
# to one value per sample.
mv_cm_scratch_predictions = np.around(calcification_mass_scratch_predictions, 0).astype(int).ravel()
mv_cm_pretrained_vgg1_predictions = np.around(calcification_mass_pretrained_vgg1_predictions, 0).astype(int).ravel()
mv_cm_pretrained_vgg2_predictions = np.around(calcification_mass_pretrained_vgg2_predictions, 0).astype(int).ravel()

# Count the votes for label 1 per sample; with three voters, two or more
# votes are a majority.
mv_cm_predictions = np.array([mv_cm_scratch_predictions, mv_cm_pretrained_vgg1_predictions, mv_cm_pretrained_vgg2_predictions]).sum(axis=0)
mv_cm_predictions = (mv_cm_predictions >= 2).astype(int).tolist()

In [None]:
"""
PRINT METRICS
"""

# Per-class precision/recall/F1 report followed by the confusion matrix.
print(
    metrics.classification_report(cm_test_labels, mv_cm_predictions),
    metrics.confusion_matrix(cm_test_labels, mv_cm_predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       157
           1       0.86      0.89      0.87       179

    accuracy                           0.86       336
   macro avg       0.86      0.86      0.86       336
weighted avg       0.86      0.86      0.86       336

[[130  27]
 [ 19 160]]


##EXPERIMENT 1_2: AVERAGE VOTING 
In average voting, each raw prediction from the base classifiers is aggregated in order to obtain an average raw prediction. Each classifier has equal weight. Finally, the classification label is derived from the average. 

In [None]:
"""
GETTING AVERAGE VOTING PREDICTIONS
"""

# Average the raw scores of the three base classifiers with equal weight.
av_cm_predictions = np.average(np.array([calcification_mass_scratch_predictions, 
    calcification_mass_pretrained_vgg1_predictions, calcification_mass_pretrained_vgg2_predictions]), axis=0)

# Round the averaged score to the nearest label and flatten to one int per
# sample. Done on the whole array at once: calling int() on the one-element
# rows of a 2-D array is deprecated since NumPy 1.25.
av_cm_predictions = np.around(av_cm_predictions, 0).astype(int).ravel().tolist()

In [None]:
"""
PRINT METRICS
"""

# Per-class precision/recall/F1 report followed by the confusion matrix.
print(
    metrics.classification_report(cm_test_labels, av_cm_predictions),
    metrics.confusion_matrix(cm_test_labels, av_cm_predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.89      0.83      0.86       157
           1       0.86      0.91      0.88       179

    accuracy                           0.87       336
   macro avg       0.87      0.87      0.87       336
weighted avg       0.87      0.87      0.87       336

[[130  27]
 [ 16 163]]


##EXPERIMENT 1_3: WEIGHTED AVERAGE VOTING
In Weighted Average Voting, a weight is assigned to each classifier according to the accuracy and loss reached. In particular:

$weight = \frac{accuracy*100}{loss}$

Then the weighted contributions are summed and divided by the sum of the weights.

In [17]:
"""
GETTING WEIGHTED AVERAGE PREDICTIONS
"""

# Evaluate each base classifier to obtain the loss and accuracy its weight is
# built from.
# NOTE(review): the weights are derived from the same test set the ensemble is
# scored on, so the reported metrics are optimistic; a validation split would
# be the proper source -- confirm whether one is available.
# Assumes every model was compiled with exactly one metric, so evaluate()
# returns [loss, accuracy] -- TODO confirm.
# The scratch model gets only the first channel, taken with a plain slice on
# the last axis (the previous tf.slice call swapped IMAGE_WIDTH and
# IMAGE_HEIGHT, which only worked because the images are square).
scratch_loss, scratch_accuracy = calcification_mass_scratch.evaluate(cm_test_images[:, :, :, :1], cm_test_labels, verbose=0) 
vgg1_loss, vgg1_accuracy = calcification_mass_pretrained_vgg1.evaluate(cm_test_images, cm_test_labels, verbose=0)
vgg2_loss, vgg2_accuracy = calcification_mass_pretrained_vgg2.evaluate(cm_test_images, cm_test_labels, verbose=0)

# weight = accuracy * 100 / loss, rounded to two decimals.
weight_scratch = np.around(scratch_accuracy*100/scratch_loss,2)
weight_pretrained_vgg1 = np.around(vgg1_accuracy*100/vgg1_loss,2)
weight_pretrained_vgg2 = np.around(vgg2_accuracy*100/vgg2_loss,2)

print(weight_scratch)
print(weight_pretrained_vgg1)
print(weight_pretrained_vgg2)

# Scale every classifier's raw scores by its weight.
wav_cm_scratch_predictions = weight_scratch*calcification_mass_scratch_predictions 
wav_cm_pretrained_vgg1_predictions = weight_pretrained_vgg1*calcification_mass_pretrained_vgg1_predictions
wav_cm_pretrained_vgg2_predictions = weight_pretrained_vgg2*calcification_mass_pretrained_vgg2_predictions

# Sum the weighted contributions and normalize by the total weight.
wav_cm_predictions = np.array([wav_cm_scratch_predictions, wav_cm_pretrained_vgg1_predictions, wav_cm_pretrained_vgg2_predictions]).sum(axis=0)

wav_cm_predictions = wav_cm_predictions / (weight_scratch+weight_pretrained_vgg1+weight_pretrained_vgg2)

# Round to the nearest label and flatten to one int per sample. Vectorized
# because int() on one-element arrays is deprecated since NumPy 1.25.
wav_cm_predictions = np.around(wav_cm_predictions, 0).astype(int).ravel().tolist()

255.26
245.17
274.03


In [20]:
"""
PRINT METRICS
"""

# Per-class precision/recall/F1 report followed by the confusion matrix.
print(
    metrics.classification_report(cm_test_labels, wav_cm_predictions),
    metrics.confusion_matrix(cm_test_labels, wav_cm_predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.89      0.83      0.86       157
           1       0.86      0.91      0.88       179

    accuracy                           0.87       336
   macro avg       0.87      0.87      0.87       336
weighted avg       0.87      0.87      0.87       336

[[130  27]
 [ 16 163]]


##CONCLUSIONS
For the Calcification-Mass problem, the development of a composite classifier led to unsatisfactory results. All the test-set accuracies obtained (86–87%) are slightly lower than the accuracy of the best base classifier (88%). A possible reason is that all the considered base classifiers make errors on the same subset of cases, so combining them does not help. Another possibility is that the wrong predictions are too confident (too close to 0 or 1) to be corrected by averaging.

#SUBTASK 2: BENIGN-MALIGNANT CLASSIFICATION
This subsection focuses on the Benign-Malignant problem. The only preprocessing steps are label redefinition (0 for Benign cases, 1 for Malignant cases) and normalization of the test images. 

In [5]:
"""
PREPROCESSING
"""

# Odd labels become class 0, even labels become class 1.
bm_test_labels = np.where(test_labels % 2 == 1, 0, 1)
# Scale pixel values by the maximum 16-bit unsigned value.
bm_test_images = test_images / MAX_UINT16

In [6]:
"""
MODELS LOADING
"""

# Load the three saved Benign-Malignant base classifiers, in the same order
# as the names they are bound to.
(benign_malignant_scratch,
 benign_malignant_pretrained_vgg2,
 benign_malignant_pretrained_res1) = (
    models.load_model(os.path.join(MODELS_FOLDER, model_file))
    for model_file in (
        "benign_malignant_scratch.h5",
        "benign_malignant_pretrained_vgg2.h5",
        "benign_malignant_pretrained_res1.h5",
    )
)

In [7]:
"""
MODELS PREDICTIONS
"""

# The scratch model is fed only the first channel of the images (presumably it
# was trained on single-channel input -- confirm against the training notebook).
# The channel is taken with a plain slice on the last axis: the previous
# tf.slice call passed size=[N, IMAGE_WIDTH, IMAGE_HEIGHT, 1] on an
# (N, height, width, channels) tensor, which only worked because the images
# are square.
benign_malignant_scratch_predictions = np.around(benign_malignant_scratch.predict(bm_test_images[:, :, :, :1]),2) 
# The pretrained models receive the full three-channel images.
# Raw scores are rounded to two decimals before being combined by the ensembles.
benign_malignant_pretrained_vgg2_predictions = np.around(benign_malignant_pretrained_vgg2.predict(bm_test_images),2) 
benign_malignant_pretrained_res1_predictions = np.around(benign_malignant_pretrained_res1.predict(bm_test_images),2) 

##EXPERIMENT 2_1: MAJORITY VOTING 
In majority voting, each classifier makes its own prediction and the final prediction is obtained by counting the votes collected by each class.

In [None]:
"""
GETTING MAJORITY VOTING PREDICTIONS
Since the number of classifier is equal to 3 and the classes are only 2, majority 
voting can be performed. In this case it is sufficient to retrieve the predicted 
labels (0 or 1) for each classifier and sum them. If the sum is greater or equal 
to 2, it means that at least two classifiers (the majority) voted for label 1. In 
case the sum is equal to 0 or 1, it means that 0 or at maximum 1 classifier voted
for label 1 (the minority).
"""

# Turn every raw score into a hard 0/1 vote. The conversion is vectorized:
# calling int() on the one-element arrays obtained by iterating over a 2-D
# prediction array is deprecated since NumPy 1.25. ravel() flattens the votes
# to one value per sample.
mv_bm_scratch_predictions = np.around(benign_malignant_scratch_predictions, 0).astype(int).ravel()
mv_bm_pretrained_vgg2_predictions = np.around(benign_malignant_pretrained_vgg2_predictions, 0).astype(int).ravel()
mv_bm_pretrained_res1_predictions = np.around(benign_malignant_pretrained_res1_predictions, 0).astype(int).ravel()

# Count the votes for label 1 per sample; with three voters, two or more
# votes are a majority.
mv_bm_predictions = np.array([mv_bm_scratch_predictions, mv_bm_pretrained_vgg2_predictions, mv_bm_pretrained_res1_predictions]).sum(axis=0)
mv_bm_predictions = (mv_bm_predictions >= 2).astype(int).tolist()

In [None]:
"""
PRINT METRICS
"""

# Per-class precision/recall/F1 report followed by the confusion matrix.
print(
    metrics.classification_report(bm_test_labels, mv_bm_predictions),
    metrics.confusion_matrix(bm_test_labels, mv_bm_predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       219
           1       0.61      0.42      0.50       117

    accuracy                           0.71       336
   macro avg       0.67      0.64      0.64       336
weighted avg       0.69      0.71      0.69       336

[[188  31]
 [ 68  49]]


##EXPERIMENT 2_2: AVERAGE VOTING 
In average voting, each raw prediction from the base classifiers is aggregated in order to obtain an average raw prediction. Each classifier has equal weight. Finally, the classification label is derived from the average.

In [None]:
"""
GETTING AVERAGE VOTING PREDICTIONS
"""

# Average the raw scores of the three base classifiers with equal weight.
av_bm_predictions = np.average(np.array([benign_malignant_scratch_predictions, 
    benign_malignant_pretrained_vgg2_predictions, benign_malignant_pretrained_res1_predictions]), axis=0)

# Round the averaged score to the nearest label and flatten to one int per
# sample. Done on the whole array at once: calling int() on the one-element
# rows of a 2-D array is deprecated since NumPy 1.25.
av_bm_predictions = np.around(av_bm_predictions, 0).astype(int).ravel().tolist()

In [None]:
"""
PRINT METRICS
"""

# Per-class precision/recall/F1 report followed by the confusion matrix.
print(
    metrics.classification_report(bm_test_labels, av_bm_predictions),
    metrics.confusion_matrix(bm_test_labels, av_bm_predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       219
           1       0.63      0.47      0.54       117

    accuracy                           0.72       336
   macro avg       0.69      0.66      0.67       336
weighted avg       0.71      0.72      0.71       336

[[187  32]
 [ 62  55]]


##EXPERIMENT 2_3: WEIGHTED AVERAGE VOTING
In Weighted Average Voting, a weight is assigned to each classifier according to the accuracy and loss reached. In particular:

$weight = \frac{accuracy*100}{loss}$

Then the weighted contributions are summed and divided by the sum of the weights.

In [8]:
"""
GETTING WEIGHTED AVERAGE PREDICTIONS
"""

# Evaluate each base classifier to obtain the loss and accuracy its weight is
# built from.
# NOTE(review): the weights are derived from the same test set the ensemble is
# scored on, so the reported metrics are optimistic; a validation split would
# be the proper source -- confirm whether one is available.
# Assumes every model was compiled with exactly one metric, so evaluate()
# returns [loss, accuracy] -- TODO confirm.
# The scratch model gets only the first channel, taken with a plain slice on
# the last axis (the previous tf.slice call swapped IMAGE_WIDTH and
# IMAGE_HEIGHT, which only worked because the images are square).
scratch_loss, scratch_accuracy = benign_malignant_scratch.evaluate(bm_test_images[:, :, :, :1], bm_test_labels, verbose=0) 
vgg2_loss, vgg2_accuracy = benign_malignant_pretrained_vgg2.evaluate(bm_test_images, bm_test_labels, verbose=0)
res1_loss, res1_accuracy = benign_malignant_pretrained_res1.evaluate(bm_test_images, bm_test_labels, verbose=0)

# weight = accuracy * 100 / loss, rounded to two decimals.
weight_scratch = np.around(scratch_accuracy*100/scratch_loss,2)
weight_pretrained_vgg2 = np.around(vgg2_accuracy*100/vgg2_loss,2)
weight_pretrained_res1 = np.around(res1_accuracy*100/res1_loss,2)

print(weight_scratch)
print(weight_pretrained_vgg2)
print(weight_pretrained_res1)

# Scale every classifier's raw scores by its weight.
wav_bm_scratch_predictions = weight_scratch*benign_malignant_scratch_predictions 
wav_bm_pretrained_vgg2_predictions = weight_pretrained_vgg2*benign_malignant_pretrained_vgg2_predictions
wav_bm_pretrained_res1_predictions = weight_pretrained_res1*benign_malignant_pretrained_res1_predictions

# Sum the weighted contributions and normalize by the total weight.
wav_bm_predictions = np.array([wav_bm_scratch_predictions, wav_bm_pretrained_vgg2_predictions, wav_bm_pretrained_res1_predictions]).sum(axis=0)

wav_bm_predictions = wav_bm_predictions / (weight_scratch+weight_pretrained_vgg2+weight_pretrained_res1)

# Round to the nearest label and flatten to one int per sample. Vectorized
# because int() on one-element arrays is deprecated since NumPy 1.25.
wav_bm_predictions = np.around(wav_bm_predictions, 0).astype(int).ravel().tolist()

112.26
123.02
124.08


In [9]:
"""
PRINT METRICS
"""

# Per-class precision/recall/F1 report followed by the confusion matrix.
print(
    metrics.classification_report(bm_test_labels, wav_bm_predictions),
    metrics.confusion_matrix(bm_test_labels, wav_bm_predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.76      0.85      0.80       219
           1       0.64      0.50      0.56       117

    accuracy                           0.73       336
   macro avg       0.70      0.67      0.68       336
weighted avg       0.72      0.73      0.72       336

[[186  33]
 [ 59  58]]


##CONCLUSIONS
For the Benign-Malignant classification problem, the development of a composite classifier led to good results. All the experiments improved on the results of the base classifiers. The best result is obtained with Weighted Average Voting, which raised the accuracy from 70% to 73%. In general, the Average Voting classifiers provided better results than Majority Voting because of their higher robustness. 