# Setting up the environment

In [None]:
# Clone the git repository that provides the project layout used below
# (test/requirements.txt and the test/data directory the downloads go into).
!git clone https://github.com/JessyD/test.git

In [None]:
# Install the Python dependencies listed in the cloned repo's requirements.txt
! pip install -r test/requirements.txt

# Download the Data

In [None]:
# Fetch nspn.fmri.main.RData from figshare into test/data/ (read later as data1)
!wget -O test/data/nspn.fmri.main.RData https://ndownloader.figshare.com/files/20958708

In [None]:
# Fetch nspn.fmri.gsr.RData from figshare into test/data/ (read later as data2)
!wget -O test/data/nspn.fmri.gsr.RData https://ndownloader.figshare.com/files/20958699

In [None]:
# Fetch nspn.fmri.lowmot.RData from figshare into test/data/ (read later as data3)
!wget -O test/data/nspn.fmri.lowmot.RData https://ndownloader.figshare.com/files/20958702

In [None]:
# Fetch nspn.fmri.general.vars.RData from figshare into test/data/ (read later as genVar)
!wget -O test/data/nspn.fmri.general.vars.RData https://ndownloader.figshare.com/files/20819796

# Define key variables

In [None]:
import pickle
import random

import pyreadr 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colorbar
import bct
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from helperfunctions import gateway_coef_sign
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Seed both random sources used by the notebook with the same value: the
# stdlib `random` module and a dedicated NumPy Generator (`rng`).
_SEED = 2
random.seed(_SEED)
rng = np.random.default_rng(_SEED)

In [None]:
# Locate the input and output directories inside the cloned `test` repo,
# relative to the current working directory.
from pathlib import Path
PROJECT_ROOT = Path.cwd()
_repo_dir = PROJECT_ROOT.joinpath('test')
data_path = _repo_dir.joinpath('data')
output_path = _repo_dir.joinpath('output')

In [None]:
# Read the downloaded RData files with pyreadr. The reads are independent of
# each other, so they are simply done in DataNames order, followed by the
# general-variables file.
def _read_rdata(file_name):
    """Load one RData file located in data_path."""
    return pyreadr.read_r(str(data_path / file_name))


DataNames = ['nspn.fmri.main.RData', 'nspn.fmri.gsr.RData', 'nspn.fmri.lowmot.RData']
data1, data2, data3 = [_read_rdata(_name) for _name in DataNames]
genVar = _read_rdata('nspn.fmri.general.vars.RData')

In [None]:
# Basic properties of the images
n_regions = 346
subject_array = 520
_cube_shape = [n_regions, n_regions, subject_array]

# Motion-regression functional connectivity: NaNs become 1.0, then the data
# is folded (column-major) into a region x region x subject array.
MainNoNan = np.nan_to_num(np.asarray(data1['fc.main']), copy=True, nan=1.0)
MainNoNanReshape = MainNoNan.reshape(_cube_shape, order='F')

# Global-signal-regression functional connectivity, treated the same way.
FC = np.asarray(data2['fc.gsr'])
GSRNoNan = np.nan_to_num(FC, copy=True, nan=1.0)
GSRNoNanReshape = GSRNoNan.reshape(_cube_shape, order='F')

# Subject IDs and ages
ages = np.asarray(data1['age.main'])
IDMain = np.asarray(data1['id.main'])

# Keep one entry per subject: np.unique reports the index of the first
# occurrence of every ID, and that index selects the FC data and the age.
IDs, IDIndexUnique = np.unique(IDMain, return_index=True)
AgesUnique = ages[IDIndexUnique]
MainNoNanReshapeUnique = MainNoNanReshape[:, :, IDIndexUnique]
GSRNoNanReshapeUnique = GSRNoNanReshape[:, :, IDIndexUnique]

# Shuffle the unique subjects once. The first SpaceDefineN of them define the
# low-dimensional space; all remaining ones are kept for the prediction step.
SpaceDefineN = 50
RandomIndexes = rng.choice(IDs.shape[0], size=IDs.shape[0], replace=False)
_space_idx = RandomIndexes[:SpaceDefineN]
_pred_idx = RandomIndexes[SpaceDefineN:]

MainNoNanModelSpace = MainNoNanReshapeUnique[:, :, _space_idx]
GSRNoNanModelSpace = GSRNoNanReshapeUnique[:, :, _space_idx]
AgesModelSpace = AgesUnique[_space_idx]
IDsModelSpace = IDs[_space_idx]

MainNoNanPrediction = MainNoNanReshapeUnique[:, :, _pred_idx]
GSRNoNanPrediction = GSRNoNanReshapeUnique[:, :, _pred_idx]
AgesPrediction = AgesUnique[_pred_idx]
IDsPrediction = IDs[_pred_idx]

In [None]:
# Brain-region metadata: look up the Yeo network label of every kept region.
# The labels are used later by graph metrics that need community labels.
YeoIDs = np.asarray(genVar['yeo.id.subc'])
KeptIDs = np.asarray(genVar['hcp.keep.id'])
_kept_rows = KeptIDs - 1  # the kept ids are shifted down by one before indexing
KeptYeoIDs = YeoIDs[_kept_rows][:, 0, 0]

In [None]:
# The 16 graph-theory measures, as (name, callable) pairs. All but the gateway
# coefficient come from the Brain Connectivity Toolbox (bct). The order
# matters: later cells enumerate this mapping to assign plot markers.
_measures = [
    ('degree', bct.degrees_und),
    ('strength', bct.strengths_und),
    ('betweennness centrality', bct.betweenness_bin),
    ('clustering (bin.)', bct.clustering_coef_bu),
    ('clustering (wei.)', bct.clustering_coef_wu),
    ('eigenvector centrality', bct.eigenvector_centrality_und),
    ('sugraph centrality', bct.subgraph_centrality),
    ('local efficiency', bct.efficiency_bin),
    ('modularity (louvain)', bct.modularity_louvain_und),
    ('modularity (probtune)', bct.modularity_probtune_und_sign),
    ('participation coefficient', bct.participation_coef),
    ('module degree z-score', bct.module_degree_zscore),
    ('pagerank centrality', bct.pagerank_centrality),
    ('diversity coefficient', bct.diversity_coef_sign),
    ('gateway degree', gateway_coef_sign),
    ('k-core centrality', bct.kcoreness_centrality_bu),
]
BCT_models = dict(_measures)

## Generating data to build low-dimensional space

In [None]:
# Exhaustive evaluation of all 544 analysis approaches
# (2 preprocessings x 17 thresholds x 16 graph measures) on the subjects that
# define the low-dimensional space.

# analysis_space is otherwise only imported by a much later cell.
from helperfunctions import analysis_space

BCT_Run = {}
Sparsities_Run = {}
Data_Run = {}
GroupSummary = {}

thresholds = [0.4,0.3,0.25,0.2,0.175,0.150,0.125,0.1,0.09,0.08,
              0.07,0.06,0.05,0.04,0.03,0.02,0.01]
preprocessing = ['MRS', 'GRS']

n_thr = len(thresholds)
n_pre = len(preprocessing)
n_BCT = len(BCT_models.keys())
n_approaches = n_thr * n_pre * n_BCT

# One cosine similarity per unordered pair of space-defining subjects
# (1225 pairs for 50 subjects).
n_space_subjects = MainNoNanModelSpace.shape[2]
n_subject_pairs = n_space_subjects * (n_space_subjects - 1) // 2

Results = np.zeros((n_approaches, n_regions))
ResultsIndVar = np.zeros((n_approaches, n_subject_pairs))

# `count` indexes the current analysis approach. It advances once per
# (preprocessing, threshold, measure) combination; the progress bar is
# updated by hand for the same reason.
count = 0
with tqdm(total=n_approaches) as pbar:
    for DataPreproc in preprocessing:  # data preprocessing
        if DataPreproc == 'MRS':
            TempData = MainNoNanModelSpace
        elif DataPreproc == 'GRS':
            TempData = GSRNoNanModelSpace
        TotalSubjects = TempData.shape[2]

        for TempThreshold in thresholds:  # FC threshold level
            for BCT_Num in BCT_models.keys():  # graph theory measure
                TempResults = np.zeros((TotalSubjects, n_regions))
                for SubNum in range(TotalSubjects):
                    # Keep only the strongest connections of this subject's
                    # FC matrix before computing the graph measure.
                    x = bct.threshold_proportional(TempData[:, :, SubNum],
                                                   TempThreshold, copy=True)
                    ss = analysis_space(BCT_Num, BCT_models, x, KeptYeoIDs)
                    # For each subject and approach keep the regional values.
                    TempResults[SubNum, :] = ss

                BCT_Run[count] = BCT_Num
                Sparsities_Run[count] = TempThreshold
                Data_Run[count] = DataPreproc
                GroupSummary[count] = 'Mean'
                # Similarities between subjects for this analysis approach;
                # only the upper triangle (without the diagonal) is stored.
                cos_sim = cosine_similarity(TempResults, TempResults)
                Results[count, :] = np.mean(TempResults, axis=0)
                ResultsIndVar[count, :] = cos_sim[np.triu_indices(TotalSubjects, k=1)].T

                count += 1
                pbar.update(1)

ModelsResults={"Results": Results,
               "ResultsIndVar": ResultsIndVar,
               "BCT": BCT_Run,
               "Sparsities": Sparsities_Run,
               "Data": Data_Run,
               "SummaryStat": GroupSummary,
               "Ages": np.array(data1['age.main']),
               "AgesPrediction": AgesPrediction,
               "MainNoNanPrediction": MainNoNanPrediction,
               "GSRNoNanPrediction": GSRNoNanPrediction,
               "keptYeoIDs": KeptYeoIDs}

# The context manager makes sure the file is flushed and closed.
with open(str(output_path / "ModelsResults.p"), "wb") as results_file:
    pickle.dump(ModelsResults, results_file)



## Building the low-dimensional space

### LLE, SE, tSNE Analysis

In [None]:
from sklearn import manifold, datasets
from sklearn.preprocessing import StandardScaler
from collections import OrderedDict
from functools import partial
from time import time
import pickle

from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
from umap.umap_ import UMAP


In [None]:
# Load the results of the exhaustive sweep written by the previous section
with open(str(output_path / "ModelsResults.p"), "rb") as results_file:
    ModelResults = pickle.load(results_file)
Results = ModelResults['ResultsIndVar']
BCT_Run = ModelResults['BCT']
Sparsities_Run = ModelResults['Sparsities']
Data_Run = ModelResults['Data']
GroupSummary = ModelResults['SummaryStat']
# Ages of the held-out prediction subjects. The 'Ages' entry holds the ages of
# every scan and would not line up with the *Prediction FC arrays.
AgesPrediction = ModelResults['AgesPrediction']
preprocessing = ['MRS', 'GRS']

In [None]:
# Scale the data prior to dimensionality reduction. The scaler works on
# columns, so the matrix is transposed before and after scaling.
scaler = StandardScaler()
X = scaler.fit_transform(Results.T)
X = X.T
n_neighbors = 20
n_components = 2 #number of components requested. In this case for a 2D space.

# Define the different dimensionality reduction techniques.
# n_neighbors / n_components are passed by keyword: scikit-learn's manifold
# estimators do not accept them positionally in current releases.
methods = OrderedDict()
LLE = partial(manifold.LocallyLinearEmbedding,
              n_neighbors=n_neighbors, n_components=n_components,
              eigen_solver='dense')
methods['LLE'] = LLE(method='standard', random_state=0)
methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,
                                           n_neighbors=n_neighbors, random_state=0)
methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca',
                                 random_state=0)
methods['UMAP'] = UMAP(random_state=40, n_components=2, n_neighbors=200,
                             min_dist=.8)
methods['MDS'] = manifold.MDS(n_components=n_components, max_iter=100, n_init=10,
                              random_state=21, metric=True)

In [None]:
markers = ["x","s","o","*","D","1","v","p","H","+","|","_","3","^","4","<","X"]
colourmaps = {"MRS":"Oranges","GRS":"Purples"}
# Per-approach labels as arrays, so they can be used to build boolean masks.
BCT = np.array(list(BCT_Run.values()))
Sparsities = np.array(list(Sparsities_Run.values()))
Data = np.array(list(Data_Run.values()))

# Reduced dimensions
data_reduced = {}

# One panel per embedding method: six grid columns per panel plus one spacer
# column. The grid is sized from `methods` so every method gets a panel.
n_methods = len(methods)
figDE = plt.figure(constrained_layout=False, figsize=(7 * n_methods, 6))
gsDE = figDE.add_gridspec(nrows=6, ncols=7 * n_methods)

# Perform each embedding and plot it. Colour map = preprocessing, colour
# intensity = threshold, marker shape = graph measure.
for panel, (label, method) in enumerate(methods.items()):
    Y = method.fit_transform(X)
    # Save the results
    data_reduced[label] = Y

    first_col = 7 * panel
    ax = figDE.add_subplot(gsDE[:, first_col:first_col + 6])
    for d in preprocessing:
        in_pipeline = Data == d
        BCTTemp = BCT[in_pipeline]
        SparsitiesTemp = Sparsities[in_pipeline]
        YTemp = Y[in_pipeline, :]

        # Iterate BCT_models (not the sorted unique labels) so that a measure
        # gets the same marker as in the legend handles built below.
        for marker, c in zip(markers, BCT_models):
            uses_measure = BCTTemp == c
            ax.scatter(YTemp[uses_measure, 0], YTemp[uses_measure, 1],
                       c=SparsitiesTemp[uses_measure]*-0.6, marker=marker,
                       cmap=colourmaps[d], s=80)

    ax.set_title("%s " % (label),fontsize=15,fontweight="bold")
    ax.axis('tight')

# NOTE(review): these legend handles are built but never attached to figDE
# (there is no legend call in this cell) — confirm whether one was intended.
OrangePatch = mpatches.Patch(color='orange', label='Motion Regression')
PurplePatch = mpatches.Patch(color='purple', label='Global Signal Regression')

Lines={}
for i, bct_model in enumerate(BCT_models):
    Lines[i] = mlines.Line2D([], [], color='black', linestyle='None',
                             marker=markers[i], markersize=10,
                             label=bct_model)

figDE.savefig(str(output_path / 'DifferentEmbeddings.png'),dpi=300)
figDE.savefig(str(output_path / 'DifferentEmbeddings.svg'),format="svg")
figDE.show()


In [None]:
# Embed the analysis approaches with UMAP and draw the resulting 2-D space
Y = methods['UMAP'].fit_transform(X)
data_reduced['UMAP'] = Y

figUMAP = plt.figure(constrained_layout=False, figsize=(21,15))
gsUMAP = figUMAP.add_gridspec(nrows=15, ncols=20)
ax = figUMAP.add_subplot(gsUMAP[:,0:15])

# Colour map = preprocessing, colour intensity = threshold,
# marker shape = graph measure.
for d in preprocessing:
    in_pipeline = Data == d
    BCTTemp = BCT[in_pipeline]
    SparsitiesTemp = Sparsities[in_pipeline]
    YTemp = Y[in_pipeline, :]

    for marker, c in zip(markers, BCT_models):
        uses_measure = BCTTemp == c
        ax.scatter(YTemp[uses_measure, 0], YTemp[uses_measure, 1],
                   c=SparsitiesTemp[uses_measure]*0.1, marker=marker,
                   cmap=colourmaps[d], s=150)

# The axes styling does not depend on the plotted subset, so apply it once.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_linewidth(1.5)
# NOTE(review): column 0 is drawn on the x axis but labelled 'Dimension 2'
# (and column 1 'Dimension 1') — confirm that this labelling is intended.
ax.set_xlabel('Dimension 2',fontsize=20,fontweight="bold")
ax.set_ylabel('Dimension 1',fontsize=20,fontweight="bold")
ax.tick_params(labelsize=15)
ax.set_title('UMAP', fontsize=25,fontweight="bold")

# Legend handles: preprocessing colours, threshold intensities, measure markers
OrangePatch = mpatches.Patch(color='orange', label='motion regression')
PurplePatch = mpatches.Patch(color=[85/255, 3/255, 152/255], label='global signal regression')

IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.4', alpha=1)
IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.1', alpha=0.4)
IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.01', alpha=0.1)

# Invisible entry used as a spacer between legend groups
BlankLine=mlines.Line2D([], [], linestyle='None')

Lines={}
for i, bct_model in enumerate(BCT_models):
    Lines[i] = mlines.Line2D([], [], color='black', linestyle='None',
                             marker=markers[i],markersize=10,
                             label=bct_model)

figUMAP.legend(handles=[OrangePatch, PurplePatch, BlankLine, IntensityPatch1,
                        IntensityPatch2, IntensityPatch3, BlankLine,
                        *Lines.values()],
               fontsize=15, frameon=False, bbox_to_anchor=(1.4, 0.8),
               bbox_transform=ax.transAxes)

# The SVG now uses the same base name as the PNG (was 'UMAPpace.svg').
figUMAP.savefig(str(output_path / 'UMAPSpace.png'), dpi=300)
figUMAP.savefig(str(output_path / 'UMAPSpace.svg'), format="svg")

In [None]:
# Embed the analysis approaches with MDS and draw the resulting 2-D space
Y = methods['MDS'].fit_transform(X)
data_reduced['MDS'] = Y

figMDS = plt.figure(constrained_layout=False, figsize=(21,15))
gsMDS = figMDS.add_gridspec(nrows=15, ncols=20)
ax = figMDS.add_subplot(gsMDS[:,0:15])

# Colour map = preprocessing, colour intensity = threshold,
# marker shape = graph measure.
for d in preprocessing:
    in_pipeline = Data == d
    BCTTemp = BCT[in_pipeline]
    SparsitiesTemp = Sparsities[in_pipeline]
    YTemp = Y[in_pipeline, :]

    for marker, c in zip(markers, BCT_models):
        uses_measure = BCTTemp == c
        ax.scatter(YTemp[uses_measure, 0], YTemp[uses_measure, 1],
                   c=SparsitiesTemp[uses_measure]*0.1, marker=marker,
                   cmap=colourmaps[d], s=150)

# The axes styling does not depend on the plotted subset, so apply it once.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_linewidth(1.5)
# NOTE(review): column 0 is drawn on the x axis but labelled 'Dimension 2'
# (and column 1 'Dimension 1') — confirm that this labelling is intended.
ax.set_xlabel('Dimension 2',fontsize=20,fontweight="bold")
ax.set_ylabel('Dimension 1',fontsize=20,fontweight="bold")
ax.tick_params(labelsize=15)
ax.set_title('Multi-dimensional Scaling', fontsize=25,fontweight="bold")

# Legend handles: preprocessing colours, threshold intensities, measure markers
OrangePatch = mpatches.Patch(color='orange', label='motion regression')
PurplePatch = mpatches.Patch(color=[85/255, 3/255, 152/255], label='global signal regression')

IntensityPatch1 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.4', alpha=1)
IntensityPatch2 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.1', alpha=0.4)
IntensityPatch3 = mpatches.Patch(color=[0.1, 0.1, 0.1], label='threshold: 0.01', alpha=0.1)

# Invisible entry used as a spacer between legend groups
BlankLine=mlines.Line2D([], [], linestyle='None')

Lines={}
for i, bct_model in enumerate(BCT_models):
    Lines[i] = mlines.Line2D([], [], color='black', linestyle='None',
                             marker=markers[i],markersize=10,
                             label=bct_model)

figMDS.legend(handles=[OrangePatch, PurplePatch, BlankLine, IntensityPatch1,
                       IntensityPatch2, IntensityPatch3, BlankLine,
                       *Lines.values()],
              fontsize=15, frameon=False, bbox_to_anchor=(1.4, 0.8),
              bbox_transform=ax.transAxes)

figMDS.savefig(str(output_path / 'MDSSpace.png'), dpi=300)
figMDS.savefig(str(output_path /'MDSSpace.svg'), format="svg")

# Save results from the embedding to be used in the remaining analysis.
# The context manager makes sure the file is flushed and closed.
with open(str(output_path / "embeddings.p"), "wb") as embeddings_file:
    pickle.dump(data_reduced, embeddings_file)

## Analyse the neighbours

In [None]:
# Helpers for the neighbourhood analysis. The module is called
# `helperfunctions` in every other import of this notebook.
from helperfunctions import (get_models_neighbours,
                             get_dissimilarity_n_neighbours,
                             get_null_distribution)

In [None]:
# Neighbours of every analysis approach in the original (scaled) space X.
# N is the number of analysis approaches; n_neighbors_step is also the step
# of the neighbourhood-size range used for the dissimilarity plot.
n_neighbors_step = 10
N = 544

_orig_space = get_models_neighbours(N, n_neighbors_step, X)
neighbours_orig, adj_array = _orig_space

In [None]:
# t-SNE: compare the neighbours found in the embedding with those of the
# original space, then drop the intermediate neighbour structure.
neighbours_tsne = get_models_neighbours(N, n_neighbors_step,
                                        data_reduced['t-SNE'])[0]
diss_tsne = get_dissimilarity_n_neighbours(neighbours_orig, neighbours_tsne)
del neighbours_tsne

In [None]:
# LLE: compare the neighbours found in the embedding with those of the
# original space, then drop the intermediate neighbour structure.
neighbours_lle = get_models_neighbours(N, n_neighbors_step,
                                       data_reduced['LLE'])[0]
diss_lle = get_dissimilarity_n_neighbours(neighbours_orig, neighbours_lle)
del neighbours_lle

In [None]:
# Spectral embedding: compare the neighbours found in the embedding with those
# of the original space, then drop the intermediate neighbour structure.
neighbours_se = get_models_neighbours(N, n_neighbors_step,
                                      data_reduced['SE'])[0]
diss_se = get_dissimilarity_n_neighbours(neighbours_orig, neighbours_se)
del neighbours_se

In [None]:
# MDS: compare the neighbours found in the embedding with those of the
# original space, then drop the intermediate neighbour structure.
neighbours_mds = get_models_neighbours(N, n_neighbors_step,
                                       data_reduced['MDS'])[0]
diss_mds = get_dissimilarity_n_neighbours(neighbours_orig, neighbours_mds)
del neighbours_mds

In [None]:
# Reference curve for the dissimilarity plot, where it is drawn with the
# label 'random'.
# NOTE(review): get_null_distribution is an opaque helper from here; how it
# builds the null from N and n_neighbors_step is not visible — confirm.
null_distribution = get_null_distribution(N, n_neighbors_step)

In [None]:
# Calculate the dissimilarity of the random distribution
# NOTE(review): `random_nn` is not assigned anywhere in the visible notebook,
# so this cell fails with NameError unless it is defined elsewhere, and
# `diss_random` is not used by any live code here. Define `random_nn` or drop
# the cell — TODO confirm.
diss_random = get_dissimilarity_n_neighbours(neighbours_orig, random_nn)

In [None]:
# Dissimilarity between the neighbourhoods of the original space and those of
# each embedding, as a function of the neighbourhood size k.
fig, ax = plt.subplots(figsize=(8, 6))
n_neighbours = range(2, N, n_neighbors_step)

embedding_curves = [
    (diss_tsne, 't-SNE', '#1DACE8'),
    (diss_lle, 'LLE', '#E5C4A1'),
    (diss_se, 'SE', '#F24D29'),
    (diss_mds, 'MDS', '#1C366B'),
]
for curve, curve_label, curve_colour in embedding_curves:
    ax.plot(n_neighbours, curve, label=curve_label, color=curve_colour)
ax.plot(n_neighbours, null_distribution, label='random', c='grey')

ax.set_ylim([0, 1])
ax.set_xlim([0, N])
ax.legend(frameon=False)
ax.set_xlabel('$k$ Nearest Neighbors')
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
ax.set_ylabel(r'Dissimilarity $\epsilon_k$')
fig.savefig(str(output_path / 'dissimilarity_all.svg'))
plt.show()

In [None]:
# Send the dissimilarity figure to the local machine (uses google.colab)
from google.colab import files
_dissimilarity_svg = output_path / 'dissimilarity_all.svg'
files.download(str(_dissimilarity_svg))

## Exhaustive Search

Exhaustive search for SVR prediction of age, so we know what "ground truth" is.

Note: This step is time-consuming and might take about 4 hours to run.

In [None]:
from bayes_opt import BayesianOptimization, UtilityFunction
from helperfunctions import objectiveFunc, bayesian_optimisation, display_gp_mean_uncertainty
from sklearn.gaussian_process.kernels import Matern, WhiteKernel
from sklearn.neighbors import NearestNeighbors
from sklearn.gaussian_process import GaussianProcessRegressor
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load embedding results. This cell is only necessary if you are running this
# part of the analysis separately.
# The context manager makes sure the file is closed after reading.
with open(str(output_path / "embeddings.p"), "rb") as embeddings_file:
    ModelEmbeddings = pickle.load(embeddings_file)
ModelEmbedding = ModelEmbeddings['MDS']

In [None]:
# Score every analysis approach with objectiveFunc, one call per approach,
# and keep the returned value in PredictedAcc.
n_models = len(Data_Run)
PredictedAcc = np.zeros(n_models)

for model_idx in tqdm(range(n_models)):
    PredictedAcc[model_idx] = objectiveFunc(
        model_idx, AgesPrediction, Sparsities_Run, Data_Run, BCT_models,
        BCT_Run, KeptYeoIDs, MainNoNanPrediction, GSRNoNanPrediction, 1)

# Show how the scores are distributed across the low-dimensional space
_scored = ModelEmbedding[:n_models]
plt.scatter(_scored[:, 0], _scored[:, 1], c=PredictedAcc)
plt.colorbar()

In [None]:
# Dump accuracies. The context manager makes sure the file is flushed and
# closed before the download cell reads it.
with open(str(output_path / 'predictedAcc.pckl'), 'wb') as acc_file:
    pickle.dump(PredictedAcc, acc_file)

In [None]:
# Send the pickled accuracies to the local machine (uses google.colab)
from google.colab import files
_accuracy_file = output_path / 'predictedAcc.pckl'
files.download(str(_accuracy_file))

## Active Learning

In [None]:
from itertools import product
import pickle

from matplotlib import cm
import bct
from mpl_toolkits.mplot3d import Axes3D
from sklearn.svm import SVR
from sklearn.model_selection import permutation_test_score



from helperfunctions import (initialize_bo, run_bo, posterior, 
                             posteriorOnlyModels, display_gp_mean_uncertainty,
                             plot_bo_estimated_space, plot_bo_evolution,
                             analysis_space)
%load_ext autoreload
%autoreload 2

In [None]:
# Project directories, resolved again so that this section can run on its own
from pathlib import Path
PROJECT_ROOT = Path.cwd()
data_path, output_path = (PROJECT_ROOT.joinpath('test', _sub)
                          for _sub in ('data', 'output'))

In [None]:
# Load embedding results. This cell is only necessary if you are running this
# part of the analysis separately.
# Every file is opened in a context manager so its handle is closed again.
with open(str(output_path / "embeddings.p"), "rb") as embeddings_file:
    ModelEmbeddings = pickle.load(embeddings_file)
ModelEmbedding = ModelEmbeddings['MDS']

with open(str(output_path / "predictedAcc.pckl"), "rb") as acc_file:
    PredictedAcc = pickle.load(acc_file)

with open(str(output_path / "ModelsResults.p"), "rb") as results_file:
    ModelResults = pickle.load(results_file)
Results = ModelResults['ResultsIndVar']
BCT_Run = ModelResults['BCT']
Sparsities_Run = ModelResults['Sparsities']
Data_Run = ModelResults['Data']
GroupSummary = ModelResults['SummaryStat']
# Left disabled, as in the original cell. Note that the Yeo labels are stored
# under the key 'keptYeoIDs' and that BCT_models is not part of
# ModelsResults.p, so it cannot be restored from the pickle.
#AgesPrediction = ModelResults['AgesPrediction']
#KeptYeoIDs = ModelResults['keptYeoIDs']
#GSRNoNanPrediction = ModelResults['GSRNoNanPrediction']
#MainNoNanPrediction = ModelResults['MainNoNanPrediction']

#Ages = np.asarray(data1['age.main'])
preprocessing = ['MRS', 'GRS']

### Exploratory analysis

Note: This step takes about 30min.

In [None]:
kappa = 10

# Settings for the analysis, as returned by initialize_bo
_bo_setup = initialize_bo(ModelEmbedding, kappa)
(kernel, optimizer, utility, init_points,
 n_iter, pbounds, nbrs, RandomSeed) = _bo_setup

# Run the optimisation. The space is continuous but the analysis approaches
# are not, so suggestions far from any actual analysis approach are
# penalised: their registered value is set to the lowest value from the burn
# in. These points (BadIter) are only used during the search and are excluded
# when the GP regression is recalculated after the search.
BadIter = run_bo(
    kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs,
    RandomSeed, ModelEmbedding, BCT_models, BCT_Run, Sparsities_Run,
    Data_Run, AgesPrediction, KeptYeoIDs, MainNoNanPrediction,
    GSRNoNanPrediction, 1, MultivariateUnivariate=True, verbose=False)


In [None]:
# Plot the space estimated by the search and keep the observed points
# (x/y/z_exploratory) plus the values plot_bo_evolution needs afterwards.
_estimated_space = plot_bo_estimated_space(kappa, BadIter, optimizer, pbounds,
                                           ModelEmbedding, PredictedAcc,
                                           kernel, output_path)
(x_exploratory, y_exploratory, z_exploratory,
 x, y, gp, vmax, vmin) = _estimated_space

In [None]:
# Show the outcome of the active search and how it evolved after
# 5, 10, 20, 30 and 50 iterations.
_observed_points = (x_exploratory, y_exploratory, z_exploratory)
_gp_surface = (x, y, gp, vmax, vmin)
plot_bo_evolution(kappa, *_observed_points, *_gp_surface,
                  ModelEmbedding, PredictedAcc, output_path)

### Exploitatory analysis

In [None]:
kappa = .1

# Settings for the analysis, as returned by initialize_bo
_bo_setup = initialize_bo(ModelEmbedding, kappa)
(kernel, optimizer, utility, init_points,
 n_iter, pbounds, nbrs, RandomSeed) = _bo_setup

# Run the optimisation. The space is continuous but the analysis approaches
# are not, so suggestions far from any actual analysis approach are
# penalised: their registered value is set to the lowest value from the burn
# in. These points (BadIter) are only used during the search and are excluded
# when the GP regression is recalculated after the search.
BadIter = run_bo(
    kernel, optimizer, utility, init_points, n_iter, pbounds, nbrs,
    RandomSeed, ModelEmbedding, BCT_models, BCT_Run, Sparsities_Run,
    Data_Run, AgesPrediction, KeptYeoIDs, MainNoNanPrediction,
    GSRNoNanPrediction, 1, MultivariateUnivariate=True, verbose=False)


In [None]:
# Plot the space estimated by the search and keep the observed points
# (x/y/z_exploratory) plus the values plot_bo_evolution needs afterwards.
_estimated_space = plot_bo_estimated_space(kappa, BadIter, optimizer, pbounds,
                                           ModelEmbedding, PredictedAcc,
                                           kernel, output_path)
(x_exploratory, y_exploratory, z_exploratory,
 x, y, gp, vmax, vmin) = _estimated_space

In [None]:
# Show the outcome of the active search and how it evolved after
# 5, 10, 20, 30 and 50 iterations.
_observed_points = (x_exploratory, y_exploratory, z_exploratory)
_gp_surface = (x, y, gp, vmax, vmin)
plot_bo_evolution(kappa, *_observed_points, *_gp_surface,
                  ModelEmbedding, PredictedAcc, output_path)

In [None]:
# Send the Bayesian-optimisation figures to the local machine (uses google.colab)
from google.colab import files
for _figure_name in ('BOptEvolutionK10.svg', 'BOptEvolutionK0.1.svg',
                     'BOptAndTrueK0.1.svg', 'BOptAndTrueK10.svg'):
    files.download(str(output_path / _figure_name))

### Repetitions

In [None]:
# Repeat the active-learning search from different initialisations. For each
# repetition record the optimum of the GP estimate and the empirical optimum,
# the rank correlation between estimated and exhaustively computed accuracies,
# and a permutation-test p-value for the approach at the GP optimum.

# spearmanr is not imported by any earlier cell.
from scipy.stats import spearmanr

kernel = 1.0 * Matern(length_scale=25, length_scale_bounds=(10,80),nu=2.5) \
    + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 0.1))

# Search bounds: the bounding box of the embedding
lb1 = np.min(ModelEmbedding[:, 0])
hb1 = np.max(ModelEmbedding[:, 0])
lb2 = np.min(ModelEmbedding[:, 1])
hb2 = np.max(ModelEmbedding[:, 1])
pbounds = {'b1': (lb1, hb1), 'b2': (lb2, hb2)}

n_repetitions = 20
BestModelGPSpace=np.zeros(n_repetitions)
BestModelGPSpaceModIndex=np.zeros(n_repetitions)
BestModelEmpirical=np.zeros(n_repetitions)
BestModelEmpiricalModIndex=np.zeros(n_repetitions)
ModelActualAccuracyCorrelation=np.zeros(n_repetitions)
CVPValBestModels=np.zeros(n_repetitions)

for DiffInit in range(n_repetitions):
    # A different random state per repetition gives a different initialisation.
    optimizer = BayesianOptimization(f=None,
                                     pbounds=pbounds,
                                     verbose=4,
                                     random_state=166+DiffInit)

    optimizer.set_gp_params(kernel=kernel,normalize_y=True,
                            n_restarts_optimizer=10)

    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(ModelEmbedding)

    utility = UtilityFunction(kind="ucb", kappa=10,xi=1e-1)

    n_iter=10
    init_points=10
    RandomSeed=111+DiffInit
    np.random.seed(RandomSeed)
    FailedIters=bayesian_optimisation(kernel, optimizer, utility, init_points,
                                      n_iter, pbounds, nbrs,RandomSeed,
                                      ModelEmbedding,BCT_models,BCT_Run,
                                      Sparsities_Run,Data_Run,AgesPrediction,
                                      KeptYeoIDs,MainNoNanPrediction,
                                      GSRNoNanPrediction,1,-1)

    gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True,
                                  n_restarts_optimizer=10)

    x_temp = np.array([[res["params"]["b1"]] for res in optimizer.res])
    y_temp = np.array([[res["params"]["b2"]] for res in optimizer.res])
    z_temp = np.array([res["target"] for res in optimizer.res])

    # Only the iterations that were not flagged as failed feed the GP.
    succeeded = FailedIters==0
    x_obs=x_temp[succeeded]
    y_obs=y_temp[succeeded]
    z_obs=z_temp[succeeded]

    muModEmb,sigmaModEmb,gpModEmb=posteriorOnlyModels(gp, x_obs, y_obs, z_obs,
                                                      ModelEmbedding)

    # Index of the analysis approach with the highest estimated value
    TempModelNum = int(muModEmb.argmax())
    BestModelGPSpace[DiffInit]=muModEmb.max()
    BestModelGPSpaceModIndex[DiffInit]=TempModelNum
    BestModelEmpirical[DiffInit]=z_obs.max()
    # Map the best observed point back to its nearest analysis approach.
    best_obs = z_obs.argmax()
    Model_coord = np.array([[x_obs[best_obs][-1], y_obs[best_obs][-1]]])
    BestModelEmpiricalModIndex[DiffInit]=nbrs.kneighbors(Model_coord)[1][0][0]
    ModelActualAccuracyCorrelation[DiffInit]=spearmanr(muModEmb,PredictedAcc)[0]

    # Recompute the features of the approach at the GP optimum on the
    # prediction subjects.
    if Data_Run[TempModelNum]=='MRS':
        # NOTE(review): the original author marked this branch 'BUG' without
        # explanation; nothing wrong is evident from this cell — confirm.
        TempData=MainNoNanPrediction
    elif Data_Run[TempModelNum]=='GRS':
        TempData=GSRNoNanPrediction
    else:
        raise ValueError(
            f"Unknown preprocessing {Data_Run[TempModelNum]!r} "
            f"for model {TempModelNum}")
    TotalSubjects=TempData.shape[2]

    TempThreshold=Sparsities_Run[TempModelNum]
    BCT_Num = BCT_Run[TempModelNum]

    TempResults=np.zeros([TotalSubjects, n_regions])
    for SubNum in range(0,TotalSubjects):
        # Threshold this subject's FC matrix at the approach's sparsity level.
        # Without this step `x` would be whatever an earlier cell left in
        # that name.
        x = bct.threshold_proportional(TempData[:, :, SubNum],
                                       TempThreshold, copy=True)
        ss = analysis_space(BCT_Num, BCT_models, x, KeptYeoIDs)
        TempResults[SubNum,:] = ss
    scaler = StandardScaler()
    TempResults=scaler.fit_transform(TempResults)

    model = SVR(C=1.0, epsilon=0.2)
    TempScore=permutation_test_score(model, TempResults, AgesPrediction.ravel(),
                                     groups=None, cv=None, n_permutations=5000,
                                     n_jobs=None, random_state=5, verbose=0,
                                     scoring="neg_mean_absolute_error")
    # The third element of the result is the permutation p-value.
    CVPValBestModels[DiffInit]=TempScore[2]
    

In [None]:
# Collect the outputs of the repetitions and pickle them.
# TempResults holds the features computed in the last repetition only.
results = {
    'ModelEmbedding': ModelEmbedding,
    'BestModelGPSpaceModIndex': BestModelGPSpaceModIndex,
    'BestModelEmpiricalModIndex': BestModelEmpiricalModIndex,
    'BestModelEmpirical': BestModelEmpirical,
    'ModelActualAccuracyCorrelation': ModelActualAccuracyCorrelation,
    'TempResults': TempResults,
}
# The context manager makes sure the file is flushed and closed.
with open(str(output_path / 'results.pckl'), 'wb') as results_file:
    pickle.dump(results, results_file)

In [None]:
# Send the pickled repetition results to the local machine (uses google.colab)
from google.colab import files
_results_file = output_path / 'results.pckl'
files.download(str(_results_file))

In [None]:
# Display the accuracies from the exhaustive search as the cell output
PredictedAcc

In [None]:
# Display the results of the 20 repetitions: where the optima fall in the
# embedding (GP estimate and empirical) and how accurate they are.
fig8 = plt.figure(constrained_layout=False,figsize=(18,6))
gs1 = fig8.add_gridspec(nrows=6, ncols=18)

# Two embedding panels that share everything except title and optima:
# all approaches as a faint background, the optima of each repetition in black.
n_shown = PredictedAcc.shape[0]
embedding_panels = [
    (gs1[:,0:6], 'Optima GP regression: 20 iterations', BestModelGPSpaceModIndex),
    (gs1[:,7:13], 'Empirical optima: 20 iterations', BestModelEmpiricalModIndex),
]
embedding_axes = []
for grid_cell, panel_title, optima_idx in embedding_panels:
    panel_ax = fig8.add_subplot(grid_cell)
    panel_ax.set_title(panel_title,fontsize=15,fontweight="bold")
    panel_ax.scatter(ModelEmbedding[0:n_shown,0],
                     ModelEmbedding[0:n_shown,1],
                     c=PredictedAcc*10,cmap='coolwarm',alpha=0.2,s=120)
    # The indices are stored as floats, so cast before indexing.
    optima = ModelEmbedding[optima_idx.astype(int)]
    panel_ax.scatter(optima[:,0], optima[:,1], s=120, c='black')
    panel_ax.set_xlim(-50, 50)
    panel_ax.set_ylim(-50, 50)
    embedding_axes.append(panel_ax)
ax1, ax2 = embedding_axes

# Distribution of all accuracies next to those of the empirical optima
ax3 = fig8.add_subplot(gs1[:,14:16])
ax3.violinplot([PredictedAcc*10,BestModelEmpirical*10])
ax3.set_xticks([1, 2])
ax3.set_xticklabels(['Accuracy \n of all points', 'Accuracy\n of optima'],fontsize=9)

# Correlation between estimated and empirical accuracies per repetition
ax4 = fig8.add_subplot(gs1[:,17:18])
ax4.violinplot([ModelActualAccuracyCorrelation])
ax4.set_xticks([1])
ax4.set_xticklabels(['Correlation: \n est vs emp '],fontsize=9)

fig8.savefig(str(output_path / 'BOpt20Repeats.png'),dpi=300)
fig8.savefig(str(output_path / 'BOpt20Repeats.svg'),format="svg")