# Cross Validation
Cross-validation is essential for this task, but implementing it raises a problem: because the samples from a single patient are highly correlated, all of a patient's data must sit either in the training set or in the test set, never in both.
For this reason we implement cross-validation without shuffling, and we also cannot use stratified CV.
Since our dataset is unbalanced, we split it into folds in which all the data of a patient is guaranteed to be in the same fold. For each split we then upsample the training folds only, test on the held-out fold, and repeat for every fold.

# In this first part we use only the control patients; we then generalize by mixing control and AMD data

In [1]:
# The star import presumably provides glob, os, np, plt, DATAPATHS, getXYdata
# and upsample, which later cells use without importing them -- TODO confirm.
from buildDataset import *
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,ConfusionMatrixDisplay,f1_score
# NOTE(review): `selem` is not used in the visible cells, and newer
# scikit-image releases appear to have renamed it to `footprints` -- confirm
# before upgrading.
from skimage.morphology import selem

# Location of the preprocessed data, as configured in buildDataset.
root_path = DATAPATHS["preprocessed"]


In [2]:
# Root folder of the balanced OCT dataset (machine-specific absolute path).
basepath = r'C:\Users\line\Desktop\Mauro\3_DataSet\OCT_balanced'
# Sub-folder and pattern are separate join components, so no backslash has to
# be embedded in a non-raw string literal ('\*' is an invalid escape sequence).
# sorted() pins the file order, which glob does not guarantee; the unshuffled
# folds used below presumably rely on rows following the path order.
paths = sorted(glob.glob(os.path.join(basepath, 'controlP', '*.pickle')))
X,y = getXYdata(paths, mode = 'raw',rootpath = basepath,normmode = 'EQ-hist')
print(X.shape)
print(y.shape)
# NOTE(review): 768*14*11 = 118272, whereas the recorded X.shape[0] is
# 129024 = 768*14*12 -- confirm which factor is intended.
print(768*14*11)

(129024, 80)
(129024,)
118272


In [3]:
from sklearn.model_selection import KFold

# shuffle=False keeps the rows in their original order, so every test fold is
# one contiguous block of row indices (see the index ranges printed below).
kf = KFold(n_splits=4, shuffle=False)

In [4]:
# Show which row indices land in the train / test part of every split.
# NOTE(review): the counter starts at 6, so the four folds are labelled 7-10
# (this matches the recorded output) -- confirm whether 1-4 was intended.
fold_number = 6
for fold_number, (tridx, tstidx) in enumerate(kf.split(X, y), start=fold_number + 1):
    # One print call, one item per line: label, train indices, test indices.
    print(f'fold number: {fold_number}', tridx, tstidx, sep='\n')

fold number: 7
[ 32256  32257  32258 ... 129021 129022 129023]
[    0     1     2 ... 32253 32254 32255]
fold number: 8
[     0      1      2 ... 129021 129022 129023]
[32256 32257 32258 ... 64509 64510 64511]
fold number: 9
[     0      1      2 ... 129021 129022 129023]
[64512 64513 64514 ... 96765 96766 96767]
fold number: 10
[    0     1     2 ... 96765 96766 96767]
[ 96768  96769  96770 ... 129021 129022 129023]


In [5]:
# Create a cv that specifies the folds of my dataset, basically to avoid shuffling
# Having 12 control patients, splits should be divisors of 12, eg: 2 fold, 3-fold, 4-fold, 6-fold, 12-fold
from sklearn.metrics import ConfusionMatrixDisplay

def score_model(model,X,y, params = None, cv=None,plotMatrix = None):
    """
    Cross-validate ``model`` fold by fold, upsampling only the training part
    of every split.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance), e.g. ``GaussianNB``. A fresh
        estimator is built for every fold.
    X, y : indexable
        Features and labels. They are indexed with the index arrays yielded
        by ``cv.split`` (presumably NumPy arrays -- TODO confirm).
    params : dict, optional
        Keyword arguments forwarded to ``model`` when it is instantiated.
    cv : splitter
        Object exposing ``split(X, y)``, e.g.
        ``KFold(n_splits=6, shuffle=False)``. Required, even though the
        signature defaults it to ``None``.
    plotMatrix : bool, optional
        When truthy, display a confusion matrix for every validation fold.

    Returns
    -------
    numpy.ndarray
        Flat array of length ``4 * n_folds``. The metrics of each fold are
        stored consecutively as recall, precision, accuracy, f1, so
        ``scores[0::4]`` are the recalls, ``scores[1::4]`` the precisions,
        ``scores[2::4]`` the accuracies and ``scores[3::4]`` the f1 scores.

    Raises
    ------
    ValueError
        If ``cv`` is ``None``.
    """
    if cv is None:
        # Fail early with a clear message instead of an AttributeError on
        # ``None.split`` further down.
        raise ValueError(
            "score_model requires a cross-validation splitter, "
            "e.g. cv=KFold(n_splits=6, shuffle=False)"
        )

    # A missing or empty ``params`` means "use the estimator defaults".
    estimator_kwargs = params or {}
    scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        # Upsample the training part only; the validation fold is left
        # untouched and is scored as-is.
        X_train_up, y_train_up = upsample(X_train, y_train)
        fitted = model(**estimator_kwargs).fit(X_train_up, y_train_up)

        if plotMatrix:
            ConfusionMatrixDisplay.from_estimator(fitted, X_val, y_val)
            plt.show()

        predictions = fitted.predict(X_val)
        # The order matters: callers slice the flat result with a stride of 4.
        scores.extend((
            recall_score(y_val, predictions),
            precision_score(y_val, predictions),
            accuracy_score(y_val, predictions),
            f1_score(y_val, predictions),
        ))
    return np.array(scores)

In [6]:
# Reload the control-only data (same call as in cell 2) so that X and y are
# freshly defined for the scoring run below.
X,y = getXYdata(paths, mode = 'raw',rootpath = basepath,normmode = 'EQ-hist')

In [7]:
# Example of the model in action: Gaussian naive Bayes scored on six
# contiguous, unshuffled folds of the control-only data.
kf = KFold(n_splits=6, shuffle=False)

scores = score_model(GaussianNB,X,y, cv=kf)
# score_model appends recall, precision, accuracy and f1 for each fold, in
# that order, so a stride-4 slice picks one metric across all folds.
print(f'Here are the recall scores: {scores[::4]}')
print(f'Here are the precisions scores: {scores[1::4]}')
print(f'Here are the accuracies scores: {scores[2::4]}')
print(f'Here are the f1 scores: {scores[3::4]}')

Here are the recall scores: [0.20111576 0.99893843 0.99199014 1.         0.90127898 0.69791667]
Here are the precisions scores: [0.25805297 0.15693796 0.34705756 0.31761387 0.44671157 0.3755004 ]
Here are the accuracies scores: [0.77041481 0.76488095 0.85853795 0.84254092 0.85863095 0.86286272]
Here are the f1 scores: [0.22605424 0.27125973 0.51421271 0.48210462 0.59735099 0.48828735]


# Here we generalize to both AMD and CONTROL

In [8]:
# Root folder of the balanced OCT dataset (machine-specific absolute path).
basepath = r'C:\Users\line\Desktop\Mauro\3_DataSet\OCT_balanced'
# Sub-folder and pattern are separate join components, so no backslash has to
# be embedded in a non-raw string literal ('\*' is an invalid escape sequence).
# sorted() pins the file order, which glob does not guarantee; the pairing of
# control and AMD files in the next cell depends on it.
paths = sorted(glob.glob(os.path.join(basepath, 'controlP', '*.pickle')))
paths2 = sorted(glob.glob(os.path.join(basepath, 'amdP', '*.pickle')))

In [10]:
# Interleave the files as [control, control, AMD] triples -- presumably so
# that every contiguous, unshuffled fold contains both classes.
if len(paths) < 2 * len(paths2):
    # Two control files are consumed per AMD file; fail with a clear message
    # instead of an IndexError from inside the pairing.
    raise ValueError(
        f'need two control files per AMD file, got {len(paths)} control '
        f'and {len(paths2)} AMD files'
    )
# Even-indexed and odd-indexed control files are paired with one AMD file.
merged = [list(triple) for triple in zip(paths[0::2], paths[1::2], paths2)]
flattenedmerged = [item for sublist in merged for item in sublist]
print(flattenedmerged)
X,y = getXYdata(flattenedmerged, mode = 'thickness',rootpath = basepath,normmode = 'EQ-hist')

['C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\0.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\10.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\amdP\\0.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\11.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\12.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\amdP\\1.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\2.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\3.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\amdP\\2.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\4.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\controlP\\5.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_balanced\\amdP\\3.pickle', 'C:\\Users\\line\\Desktop\\Mauro\\3_DataSet\\OCT_bal

In [None]:
# Equivalent shortcut according to the author: getBalancedXYData is defined
# outside this notebook, so the equivalence with the manual interleaving in
# the previous cell is not verified here.
X,y = getBalancedXYData(mode = 'thickness',normmode = 'EQ-hist')

In [11]:
# Same evaluation as for the control-only data, now on the mixed
# control + AMD data: six contiguous, unshuffled folds.
kf = KFold(n_splits=6, shuffle=False)

scores = score_model(GaussianNB,X,y, cv=kf)
# score_model appends recall, precision, accuracy and f1 for each fold, in
# that order, so a stride-4 slice picks one metric across all folds.
print(f'Here are the recall scores: {scores[::4]}')
print(f'Here are the precisions scores: {scores[1::4]}')
print(f'Here are the accuracies scores: {scores[2::4]}')
print(f'Here are the f1 scores: {scores[3::4]}')

Here are the recall scores: [0.3572546  0.71035518 0.81283644 0.65528808 0.28542688 0.45091138]
Here are the precisions scores: [0.43263083 0.15858834 0.39049135 0.34753035 0.44553484 0.44077169]
Here are the accuracies scores: [0.82381572 0.7484809  0.78199405 0.75254216 0.73623512 0.7234933 ]
Here are the f1 scores: [0.39134626 0.25928969 0.52754636 0.4541849  0.34794605 0.44578388]
