# Installations

In [None]:
%%shell
# Python dependencies of the notebook (very quiet pip output)
pip install pyimagej spectral -qqq

# apt update # only needed when the package index is stale
# Install Maven via apt (presumably required by pyimagej — confirm); stdout is discarded
apt install -y maven > /dev/null

# Clone CubeExplore

In [None]:
# Clone the `development` branch of CubeExplore and switch into the checkout,
# presumably so that `import cubexplore` in the cells below resolves — confirm.
# NOTE(review): not idempotent — re-running this cell attempts a second clone and a nested %cd.
!git clone -b development https://github.com/chilly-nk/CubeExplore.git
%cd CubeExplore

# Imports

In [None]:
# Import cubexplore and its functions or classes

import cubexplore as cubex
from cubexplore import Cubes

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from pathlib import Path

%config InlineBackend.figure_format = 'retina' # to make plots appear nicer on some screens

In [None]:
# Print the installed scikit-learn version (used by the grid-search cells below)
print(sklearn.__version__)

In [None]:
# Mount Google Drive at /content/drive; the project and dataset paths below live under it
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project output folder on Drive; the last expression lists its current contents
project_name = 'Lichens_3'
project = Path('/content/drive/MyDrive/DATA/PROJECTS') / project_name
sorted(project.iterdir())

# Data

In [None]:
# Root folder of all HSI datasets; experiment paths are built below as
# <hsi_datasets>/<version>/<experiment>
hsi_datasets = '/content/drive/MyDrive/DATA/Datasets_HSI'

In [None]:
# Which dataset version and which experiment to read
version = 'Raw'
experiment = '250707_4DHSI_Lichens_B3_Palette_NC_NAS'

# <hsi_datasets>/<version>/<experiment>
experiment_path = Path(hsi_datasets) / version / experiment

# Mappings.csv, indexed by Sample_ID; its columns are used in the next cell
# to locate each sample's HSI folder and metadata
mappings_path = experiment_path / 'Mappings.csv'
mappings = pd.read_csv(mappings_path).set_index('Sample_ID')
mappings

In [None]:
# Sample to analyse; its row in `mappings` names the HSI folder and the metadata entry
sample = 'ManualExposure'

hsi_folder, metadata_entry = mappings.loc[sample, ['HSI_Folder', 'Metadata_HSI']]
data_path = experiment_path / hsi_folder
metadata_path = experiment_path / metadata_entry

# Show the files available for this sample
os.listdir(data_path)

In [None]:
# File names of the cubes to load: one per excitation wavelength, plus the white-light cube
excitation_wavelengths = (310, 325, 340, 365, 385, 400, 415, 430)
cubes_to_analyse = [f'{wavelength}.tif' for wavelength in excitation_wavelengths]
cubes_to_analyse.append('WhiteLight.tif')

In [None]:
# Build the Cubes object for the selected sample from the listed .tif files.
# The commented-out alternative passes data_source='nuance' instead of a file list (for .im3 cubes).
cubes = Cubes(data_path, metadata_path, sample, cubes_to_analyse) # if you have .tif files
# cubes = Cubes(data_path, metadata_path, sample, data_source='nuance') # if you want to load .im3 cubes

### Save Cubes as Tiff

In [None]:
# Folder name of the sample's HSI data; the commented-out save_tiff call below uses it as description
data_path.name

In [None]:
# cubes.save_tiff(description=data_path.name)

## ROIs | View & Keep

In [None]:
# Threshold the bands of every loaded cube for better RGB rendering;
# the views below request this data via which_data='thresholded'
cubes.threshold_bands(cubes.names)

In [None]:
# Show the thresholded white-light cube, mark the cropping ROI on it and save the figure
crop_box = (100, 870, 680, 1250)
cubes.view('WhiteLight.tif', which_data='thresholded')
cubes.roi(crop_box)

filepath = project / "Samples_Cropping"
plt.savefig(filepath, bbox_inches='tight', dpi=200)

In [None]:
# Crop the cubes (presumably to the ROI drawn in the previous cell — confirm against Cubes.crop),
# then re-threshold the bands and preview the white-light cube again.
cubes.crop()
cubes.threshold_bands(cubes.names)
cubes.view('WhiteLight.tif', which_data='thresholded')

In [None]:
# Extract colors from a palette
# NOTE(review): the ROI cell below reassigns red/green/blue to pure RGB primaries,
# so these palette picks do not reach the ROI outlines.
palette = sns.color_palette("deep")

# Get red, green, blue from the palette
red   = palette[3]  # usually red is at index 3 in "deep"
green = palette[2]  # green is often at index 2
blue  = palette[0]  # blue is typically first

In [None]:
# approach = 'SinglePiece'

# Preview the thresholded white-light cube and reset the ROI list before registering ROIs
cubename = 'WhiteLight.tif'
cubes.view(cubename, 'thresholded')
cubes.reset_rois()

lw = 2.5

# The "deep" palette stays available as `palette`; the ROI outlines use pure RGB primaries
palette = sns.color_palette("deep")
red, green, blue = (1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0)
black = (0, 0, 0)

# (label, ROI box, outline colour) in registration order.
# Disabled candidates from earlier runs: 'WhiteStandard' at (205, 350, 590, 640), 'Paper'.
roi_specs = [
    ('B', (50, 140, 425, 515), red),
    ('C', (50, 140, 225, 315), green),
    ('A', (70, 160, 10, 100), blue),
    ('Background', (205, 350, 150, 200), black),
]
for specimen, roi_box, edgecolor in roi_specs:
  cubes.roi(roi_box, keep=True, edgecolor=edgecolor, lw=lw, label=f"{specimen}")

# Save the annotated preview as ROIs_<cube name without extension>.png
filename = f"ROIs_{cubename.split('.')[0]}.png"
filepath = project / filename
plt.savefig(filepath, bbox_inches='tight', dpi=200)

In [None]:
# Index the ROI table by its label so later cells can look up class labels by ROI name.
# The guard keeps the cell re-runnable: once 'label' is the index it is no longer a column,
# and an unguarded set_index would raise KeyError on a second run.
if 'label' in cubes.rois.columns:
  cubes.rois.set_index('label', inplace=True)
cubes.rois

In [None]:
# Name fragments of non-lichen regions; the CV cells drop every ROI whose name contains one of them
non_lichens = ['Tape', 'Paper', 'WS', 'WhiteStandard', 'Background']

In [None]:
# Class label per ROI, assigned by row position in cubes.rois (values taken from Lichens_2)
class_labels = [177, 216, 100, 5]

# One display colour per ROI: the three specimen colours followed by black for the background ROI
# sample_colors = list(sns.color_palette())[:len(class_labels)-1]
sample_colors = [red, green, blue]
bgr_colors = [(0, 0, 0)]
roi_colors = sample_colors + bgr_colors

cubes.rois['class_label'] = class_labels
cubes.rois['colors'] = roi_colors
cubes.rois

In [None]:
# Lookup tables derived from the ROI table:
#   label_dict: ROI label   -> class label
#   value_dict: class label -> ROI label
#   color_dict: class label -> display colour
label_dict = cubes.rois['class_label'].to_dict()
value_dict = dict(zip(label_dict.values(), label_dict.keys()))
color_dict = dict(zip(cubes.rois['class_label'], cubes.rois['colors']))
color_dict

In [None]:
# Persist the ROI table next to the other project outputs
cubes.rois.to_csv(project / 'ROIs.csv')

# Preprocess

1. First, correct all cubes by exposure time.
2. Then run the analysis on three variants of the data:
  - Normalized cubes (to max)
  - Normalized, then concatenated spectra
  - Concatenated, then normalized spectra

In [None]:
# Correct by exposure
cubes.correct_by_exposure()
# Key of the data variant that the normalisation / combination cells below start from
starting_data = 'processed'
print(starting_data)
# Show which cubes are available in cubes.processed after the correction
print(cubes.processed.keys())

In [None]:
# Inspect which entries cubes.normalized holds.
# NOTE(review): no normalize() call is visible above this cell, so on a fresh run this may be empty — confirm.
cubes.normalized.keys()

In [None]:
# Inspect which entries cubes.combined holds.
# NOTE(review): no combine() call is visible above this cell, so on a fresh run this may be empty — confirm.
cubes.combined.keys()

# kNN - GridSearchCV

In [None]:
# Scoring metric for the grid search; it is also part of the results folder name
# scoring = 'roc_auc_ovr'
scoring = 'balanced_accuracy'

# <project>/CV_Results_kNN_<scoring>, created if missing (kept as a plain string)
folder = str(project / f'CV_Results_kNN_{scoring}')
Path(folder).mkdir(parents=True, exist_ok=True)

In [None]:
# Print the array shape of every exposure-corrected cube.
# `cubename` and `cube` stay bound to the last cube after the loop; the next cell reads `cube`.
for cubename in cubes.names:
  cube = cubes.processed[cubename]
  print(cube.shape)

In [None]:
# Product of the first two dimensions of the last cube from the loop above
# (presumably its pixel count — confirm the axis order).
cube.shape[0] * cube.shape[1]

## CV | Normalized Cubes

In [None]:
# Normalize every cube (how='snv') starting from `starting_data`, then collect the ROI spectra
cubes.normalize(which_data=starting_data, how='snv')
cubes.spectra_from_rois(which_data='normalized')

spectra = cubes.spectra.reset_index()

# Include only lichen classes in training: paper and tape are usually classified well,
# which pushes the metrics up artificially and obscures the real information about lichens.
# A ROI is dropped when its name contains any of the `non_lichens` fragments.
samples = [
    sample for sample in spectra.roi_name.unique()
    if all(substr not in sample for substr in non_lichens)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
if spectra.empty:
  raise ValueError('No ROI spectra left after removing the non-lichen ROIs.')

# Candidate k values. Every k has to fit into a CV training fold (GridSearchCV's default
# 5-fold split trains on 80 % of the samples), so the grid is capped by the smallest cube.
# Using min(sizes) also keeps n_samples defined when the cubes differ in sample count.
sizes = spectra.groupby('cubename')['roi_name'].count().tolist()
n_samples = min(sizes)
cv_train_size = int(n_samples * 0.8)

min_k = 5
max_k = min(500, cv_train_size)

range_1 = list(range(10, 100, 10))
range_2 = list(range(100, 501, 20))
n_neighbors = [n for n in [min_k] + range_1 + range_2 if n <= max_k]
if not n_neighbors:
  raise ValueError(f'Too few samples ({n_samples}) for the smallest candidate k={min_k}.')

print('ROIs to be used for training:')
print(spectra.roi_name.unique())

print(f"Number of k-neighbors to be tested: (total of {len(n_neighbors)} variants)")
print(n_neighbors)

In [None]:
# Cube names present in the filtered spectra; the grid search below runs once per name
spectra.cubename.unique()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Develop later

# The scaler is part of the pipeline, so it is re-fitted on the training part of every CV split.
# Scaling the whole matrix before the search would leak test-fold statistics into training.
estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])
param_grid = {'knn__n_neighbors': list(n_neighbors)}

# Identifier columns; everything else is treated as a spectral feature.
# assumes the first six columns of `spectra` are identifiers — TODO confirm
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)  # keep only this cube's spectra
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()
  data = spectra_cube[wvls].to_numpy()
  data = np.nan_to_num(data, nan=0)

  search = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=-1)
  search.fit(data, labels)

  # Restore the column name the model-selection cells read ('param_n_neighbors')
  cv_results = (
      pd.DataFrame(search.cv_results_)
      .rename(columns={'param_knn__n_neighbors': 'param_n_neighbors'})
      .sort_values(['rank_test_score'])
  )

  filepath = os.path.join(folder, f"CV_{cubename}.csv")
  cv_results.to_csv(filepath)
  print(f"CV results saved at {filepath}")

## CV | Norm_Concat - ToDo

In [None]:
# Preprocess: normalize each cube, then concatenate the fluorescence cubes under `label`
label = 'Norm_Concat'
cubes.normalize(which_data=starting_data, how='snv') # Normalize
# Keep the original cube order; going through a set would make the concatenation order
# vary between kernel sessions.
cubes_to_combine = [name for name in cubes.names if name != 'WhiteLight.tif'] # Select only fluorescence cubes
cubes.combine(cubes_to_combine, which_data='normalized', label=label) # Concatenate

# Spectra from ROIs
cubes.spectra_from_rois(label, which_data='combined')
spectra = cubes.spectra.reset_index()

# Include only lichen classes in training: paper and tape are usually classified well,
# which pushes the metrics up artificially and obscures the real information about lichens.
# A ROI is dropped when its name contains any of the `non_lichens` fragments.
samples = [
    sample for sample in spectra.roi_name.unique()
    if all(substr not in sample for substr in non_lichens)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
if spectra.empty:
  raise ValueError('No ROI spectra left after removing the non-lichen ROIs.')

# Candidate k values. Every k has to fit into a CV training fold (GridSearchCV's default
# 5-fold split trains on 80 % of the samples), so the grid is capped by the smallest cube.
# Using min(sizes) also keeps n_samples defined when the cubes differ in sample count.
sizes = spectra.groupby('cubename')['roi_name'].count().tolist()
n_samples = min(sizes)
cv_train_size = int(n_samples * 0.8)

min_k = 5
max_k = min(500, cv_train_size)

range_1 = list(range(10, 100, 10))
range_2 = list(range(100, 501, 20))
n_neighbors = [n for n in [min_k] + range_1 + range_2 if n <= max_k]
if not n_neighbors:
  raise ValueError(f'Too few samples ({n_samples}) for the smallest candidate k={min_k}.')

print('ROIs to be used for training:')
print(spectra.roi_name.unique())

print(f"Number of k-neighbors to be tested: (total of {len(n_neighbors)} variants)")
print(n_neighbors)

In [None]:
# Mean spectrum per ROI across the combined wavelength axis (one line per ROI)
columns = cubes.spectra_wvls
sp = spectra.groupby('roi_name')[columns].mean().T

plt.figure(figsize=(40, 5))
sns.lineplot(sp)
plt.xticks(rotation=90);

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Develop later

# The scaler is part of the pipeline, so it is re-fitted on the training part of every CV split.
# Scaling the whole matrix before the search would leak test-fold statistics into training.
estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])
param_grid = {'knn__n_neighbors': list(n_neighbors)}

# Identifier columns; everything else is treated as a spectral feature.
# assumes the first six columns of `spectra` are identifiers — TODO confirm
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)  # keep only this cube's spectra
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()
  data = spectra_cube[wvls].to_numpy()
  data = np.nan_to_num(data, nan=0)

  search = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=-1)
  search.fit(data, labels)

  # Restore the column name the model-selection cells read ('param_n_neighbors')
  cv_results = (
      pd.DataFrame(search.cv_results_)
      .rename(columns={'param_knn__n_neighbors': 'param_n_neighbors'})
      .sort_values(['rank_test_score'])
  )

  filepath = os.path.join(folder, f"CV_{cubename}.csv")
  cv_results.to_csv(filepath)
  print(f"CV results saved at {filepath}")

## CV | Concat_Norm - ToDo

In [None]:
# Preprocess: concatenate the fluorescence cubes under `label`, then normalize the combined cube
label = 'Concat_Norm'
# Keep the original cube order; going through a set would make the concatenation order
# vary between kernel sessions.
cubes_to_combine = [name for name in cubes.names if name != 'WhiteLight.tif'] # Select only fluorescence cubes
cubes.combine(cubes_to_combine, which_data=starting_data, label=label) # Concatenate
cubes.normalize(label, which_data='combined', how='snv') # Normalize

# Spectra from ROIs
cubes.spectra_from_rois(label, which_data='normalized')
spectra = cubes.spectra.reset_index()

# Include only lichen classes in training: paper and tape are usually classified well,
# which pushes the metrics up artificially and obscures the real information about lichens.
# A ROI is dropped when its name contains any of the `non_lichens` fragments.
samples = [
    sample for sample in spectra.roi_name.unique()
    if all(substr not in sample for substr in non_lichens)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
if spectra.empty:
  raise ValueError('No ROI spectra left after removing the non-lichen ROIs.')

# Candidate k values. Every k has to fit into a CV training fold (GridSearchCV's default
# 5-fold split trains on 80 % of the samples), so the grid is capped by the smallest cube.
# Using min(sizes) also keeps n_samples defined when the cubes differ in sample count.
sizes = spectra.groupby('cubename')['roi_name'].count().tolist()
n_samples = min(sizes)
cv_train_size = int(n_samples * 0.8)

min_k = 5
max_k = min(500, cv_train_size)

range_1 = list(range(10, 100, 10))
range_2 = list(range(100, 501, 20))
n_neighbors = [n for n in [min_k] + range_1 + range_2 if n <= max_k]
if not n_neighbors:
  raise ValueError(f'Too few samples ({n_samples}) for the smallest candidate k={min_k}.')

print('ROIs to be used for training:')
print(spectra.roi_name.unique())

print(f"Number of k-neighbors to be tested: (total of {len(n_neighbors)} variants)")
print(n_neighbors)

In [None]:
# Cube names present in the filtered spectra; the grid search below runs once per name
spectra.cubename.unique()

In [None]:
# Mean spectrum per ROI across the combined wavelength axis (one line per ROI)
columns = cubes.spectra_wvls
sp = spectra.groupby('roi_name')[columns].mean().T

plt.figure(figsize=(40, 5))
sns.lineplot(sp)
plt.xticks(rotation=90);

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Develop later

# The scaler is part of the pipeline, so it is re-fitted on the training part of every CV split.
# Scaling the whole matrix before the search would leak test-fold statistics into training.
estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])
param_grid = {'knn__n_neighbors': list(n_neighbors)}

# Identifier columns; everything else is treated as a spectral feature.
# assumes the first six columns of `spectra` are identifiers — TODO confirm
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)  # keep only this cube's spectra
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()
  data = spectra_cube[wvls].to_numpy()
  data = np.nan_to_num(data, nan=0)

  search = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=-1)
  search.fit(data, labels)

  # Restore the column name the model-selection cells read ('param_n_neighbors')
  cv_results = (
      pd.DataFrame(search.cv_results_)
      .rename(columns={'param_knn__n_neighbors': 'param_n_neighbors'})
      .sort_values(['rank_test_score'])
  )

  filepath = os.path.join(folder, f"CV_{cubename}.csv")
  cv_results.to_csv(filepath)
  print(f"CV results saved at: {filepath}")

## Three-Cube Subset (340, 365, 385 nm)

In [None]:
# Excitation cubes for the three-cube combinations (the two preprocess cells below set this list again)
cubes_to_combine = ['340.tif', '365.tif', '385.tif']

## CV | Norm_Concat (3 cubes) - ToDo

In [None]:
# Preprocess: normalize each cube, then concatenate the three selected cubes under `label`
label = 'Norm_Concat_340_365_385'
cubes.normalize(which_data=starting_data, how='snv') # Normalize

cubes_to_combine = ['340.tif', '365.tif', '385.tif']
cubes.combine(cubes_to_combine, which_data='normalized', label=label) # Concatenate

# Spectra from ROIs
cubes.spectra_from_rois(label, which_data='combined')
spectra = cubes.spectra.reset_index()

# Include only lichen classes in training: paper and tape are usually classified well,
# which pushes the metrics up artificially and obscures the real information about lichens.
# A ROI is dropped when its name contains any of the `non_lichens` fragments.
samples = [
    sample for sample in spectra.roi_name.unique()
    if all(substr not in sample for substr in non_lichens)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
if spectra.empty:
  raise ValueError('No ROI spectra left after removing the non-lichen ROIs.')

# Candidate k values. Every k has to fit into a CV training fold (GridSearchCV's default
# 5-fold split trains on 80 % of the samples), so the grid is capped by the smallest cube.
# Using min(sizes) also keeps n_samples defined when the cubes differ in sample count.
sizes = spectra.groupby('cubename')['roi_name'].count().tolist()
n_samples = min(sizes)
cv_train_size = int(n_samples * 0.8)

min_k = 5
max_k = min(500, cv_train_size)

range_1 = list(range(10, 100, 10))
range_2 = list(range(100, 501, 20))
n_neighbors = [n for n in [min_k] + range_1 + range_2 if n <= max_k]
if not n_neighbors:
  raise ValueError(f'Too few samples ({n_samples}) for the smallest candidate k={min_k}.')

print('ROIs to be used for training:')
print(spectra.roi_name.unique())

print(f"Number of k-neighbors to be tested: (total of {len(n_neighbors)} variants)")
print(n_neighbors)

In [None]:
# Mean spectrum per ROI across the combined wavelength axis (one line per ROI)
columns = cubes.spectra_wvls
sp = spectra.groupby('roi_name')[columns].mean().T

plt.figure(figsize=(40, 5))
sns.lineplot(sp)
plt.xticks(rotation=90);

In [None]:
# Cube names present in the filtered spectra; the grid search below runs once per name
spectra.cubename.unique()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Develop later

# The scaler is part of the pipeline, so it is re-fitted on the training part of every CV split.
# Scaling the whole matrix before the search would leak test-fold statistics into training.
estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])
param_grid = {'knn__n_neighbors': list(n_neighbors)}

# Identifier columns; everything else is treated as a spectral feature.
# assumes the first six columns of `spectra` are identifiers — TODO confirm
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)  # keep only this cube's spectra
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()
  data = spectra_cube[wvls].to_numpy()
  data = np.nan_to_num(data, nan=0)

  search = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=-1)
  search.fit(data, labels)

  # Restore the column name the model-selection cells read ('param_n_neighbors')
  cv_results = (
      pd.DataFrame(search.cv_results_)
      .rename(columns={'param_knn__n_neighbors': 'param_n_neighbors'})
      .sort_values(['rank_test_score'])
  )

  filepath = os.path.join(folder, f"CV_{cubename}.csv")
  cv_results.to_csv(filepath)
  print(f"CV results saved at {filepath}")

## CV | Concat_Norm (3 cubes) - ToDo

In [None]:
# Preprocess: concatenate the three selected cubes under `label`, then normalize the combined cube
label = 'Concat_Norm_340_365_385'

cubes_to_combine = ['340.tif', '365.tif', '385.tif']
cubes.combine(cubes_to_combine, which_data=starting_data, label=label) # Concatenate
cubes.normalize(label, which_data='combined', how='snv') # Normalize

# Spectra from ROIs
cubes.spectra_from_rois(label, which_data='normalized')
spectra = cubes.spectra.reset_index()

# Include only lichen classes in training: paper and tape are usually classified well,
# which pushes the metrics up artificially and obscures the real information about lichens.
# A ROI is dropped when its name contains any of the `non_lichens` fragments.
samples = [
    sample for sample in spectra.roi_name.unique()
    if all(substr not in sample for substr in non_lichens)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
if spectra.empty:
  raise ValueError('No ROI spectra left after removing the non-lichen ROIs.')

# Candidate k values. Every k has to fit into a CV training fold (GridSearchCV's default
# 5-fold split trains on 80 % of the samples), so the grid is capped by the smallest cube.
# Using min(sizes) also keeps n_samples defined when the cubes differ in sample count.
sizes = spectra.groupby('cubename')['roi_name'].count().tolist()
n_samples = min(sizes)
cv_train_size = int(n_samples * 0.8)

min_k = 5
max_k = min(500, cv_train_size)

range_1 = list(range(10, 100, 10))
range_2 = list(range(100, 501, 20))
n_neighbors = [n for n in [min_k] + range_1 + range_2 if n <= max_k]
if not n_neighbors:
  raise ValueError(f'Too few samples ({n_samples}) for the smallest candidate k={min_k}.')

print('ROIs to be used for training:')
print(spectra.roi_name.unique())

print(f"Number of k-neighbors to be tested: (total of {len(n_neighbors)} variants)")
print(n_neighbors)

In [None]:
# Cube names present in the filtered spectra; the grid search below runs once per name
spectra.cubename.unique()

In [None]:
# Mean spectrum per ROI across the combined wavelength axis (one line per ROI)
columns = cubes.spectra_wvls
sp = spectra.groupby('roi_name')[columns].mean().T

plt.figure(figsize=(40, 5))
sns.lineplot(sp)
plt.xticks(rotation=90);

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Develop later

# The scaler is part of the pipeline, so it is re-fitted on the training part of every CV split.
# Scaling the whole matrix before the search would leak test-fold statistics into training.
estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])
param_grid = {'knn__n_neighbors': list(n_neighbors)}

# Identifier columns; everything else is treated as a spectral feature.
# assumes the first six columns of `spectra` are identifiers — TODO confirm
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)  # keep only this cube's spectra
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()
  data = spectra_cube[wvls].to_numpy()
  data = np.nan_to_num(data, nan=0)

  search = GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=-1)
  search.fit(data, labels)

  # Restore the column name the model-selection cells read ('param_n_neighbors')
  cv_results = (
      pd.DataFrame(search.cv_results_)
      .rename(columns={'param_knn__n_neighbors': 'param_n_neighbors'})
      .sort_values(['rank_test_score'])
  )

  filepath = os.path.join(folder, f"CV_{cubename}.csv")
  cv_results.to_csv(filepath)
  print(f"CV results saved at: {filepath}")

# Model Selection

In [None]:
# Long-form display names, keyed by the cube / combination name taken from the CV result files.
# NOTE(review): the next cell assigns `cubenames` again with short labels; on a top-to-bottom
# run that later mapping is the one in effect.
scoring = 'balanced_accuracy'

cubenames = {
    '310': 'Ex = 310 nm',
    '325': 'Ex = 325 nm',
    '340': 'Ex = 340 nm',
    '365': 'Ex = 365 nm',
    '385': 'Ex = 385 nm',
    '400': 'Ex = 400 nm',
    '415': 'Ex = 415 nm',
    '430': 'Ex = 430 nm',
    'WhiteLight': 'White light',
    'Norm_Concat': 'Norm_Concat_8',
    'Concat_Norm': 'Concat_Norm_8',
    'Norm_Concat_340_365_385': 'Norm_Concat_3',
    'Concat_Norm_340_365_385': 'Concat_Norm_3',
}

In [None]:
# Short display names, keyed by the cube / combination name taken from the CV result files.
# The plotting cells below (order_single / order_multi) refer to these short labels.
scoring = 'balanced_accuracy'

cubenames = {
    '310': '310',
    '325': '325',
    '340': '340',
    '365': '365',
    '385': '385',
    '400': '400',
    '415': '415',
    '430': '430',
    'WhiteLight': 'WL',
    'Norm_Concat': 'NC_8',  # normalized then concatenated, 8 cubes
    'Concat_Norm': 'CN_8',  # concatenated then normalized, 8 cubes
    'Norm_Concat_340_365_385': 'NC_3',  # normalized then concatenated, 3 cubes
    'Concat_Norm_340_365_385': 'CN_3',  # concatenated then normalized, 3 cubes
}

In [None]:
# Read the cross-validation results: one CSV per cube / combination, named "CV_<cubename>.csv"
from pathlib import Path

# Sorted so that the row order of `metrics` does not depend on the file-system listing order
results = sorted(Path(project, f'CV_Results_kNN_{scoring}').glob('*.csv'))
if not results:
  raise FileNotFoundError(f"No CV result files found in {Path(project, f'CV_Results_kNN_{scoring}')}")

frames = []
for result in results:
  cubename = result.stem[3:]  # drop the "CV_" prefix
  # The keys of `cubenames` carry no file extension; strip a '.tif' suffix if the name still has one
  if cubename.endswith('.tif'):
    cubename = cubename[:-len('.tif')]
  metric = pd.read_csv(result, index_col=0)
  metric['cubename'] = cubename
  frames.append(metric)
metrics = pd.concat(frames)
# metrics = metrics[metrics['rank_test_score'] == 1]

# Names without a display label become NaN in the mapping below; report them instead of hiding them
unmapped = sorted(set(metrics['cubename']) - set(cubenames))
if unmapped:
  print(f"WARNING: no display name in `cubenames` for {unmapped}; their 'cubename' becomes NaN")

metrics['cubename'] = metrics['cubename'].map(cubenames)
metrics.head(3)

In [None]:
# Display names present in the loaded metrics (NaN marks a file whose name had no entry in `cubenames`)
metrics.cubename.unique().tolist()

### Accuracies (needed for the CV plot)

In [None]:
# Mean CV score per cube (rows) and n_neighbors (columns), drawn as an annotated heatmap
columns = ['cubename', 'param_n_neighbors', 'mean_test_score']
average = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='mean_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(16, 4))
heatmap_ax = sns.heatmap(average, cmap='coolwarm', annot=True, annot_kws={"size": 9})
heatmap_ax.set_title(f'Mean {scoring}')
heatmap_ax.set_xlabel('Number of Neighbors')
heatmap_ax.set_ylabel('Cube Name')
plt.xticks(rotation=45);

filename = f"Model_Performance_{scoring}.png"
plt.savefig(project / filename, bbox_inches='tight', dpi=200)

### Ranks

In [None]:
# Rank of each n_neighbors within its cube (1 = best), drawn as an annotated heatmap
columns = ['cubename', 'param_n_neighbors', 'rank_test_score']
rank = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='rank_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(16, 4))
heatmap_ax = sns.heatmap(rank, cmap='coolwarm', annot=True, annot_kws={"size": 9})
heatmap_ax.set_title('Rank Test Score')
heatmap_ax.set_xlabel('Number of Neighbors')
heatmap_ax.set_ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_Rank.png"
plt.savefig(project / filename, bbox_inches='tight', dpi=200)

### SDs

In [None]:
# Standard deviation of the CV score per cube and n_neighbors, drawn as an annotated heatmap
columns = ['cubename', 'param_n_neighbors', 'std_test_score']
std = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='std_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(20, 4))
heatmap_ax = sns.heatmap(std, cmap='coolwarm', annot=True, annot_kws={"size": 6})
heatmap_ax.set_title('SD Test Score')
heatmap_ax.set_xlabel('Number of Neighbors')
heatmap_ax.set_ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_SD.png"
plt.savefig(project / filename, bbox_inches='tight', dpi=200)

In [None]:
# Average each table over the cubes (one value per n_neighbors) and scale it to its own maximum,
# so mean score, mean rank and mean SD can share one axis
avg, rnk, sd = (
    per_k / per_k.max()
    for per_k in (average.mean(axis=0), rank.mean(axis=0), std.mean(axis=0))
)

curves = (
    (avg, 'Mean average (normalized)'),
    (rnk, 'Mean rank (normalized)'),
    (sd, 'Mean SD (normalized)'),
)
for curve, curve_label in curves:
  sns.lineplot(curve, label=curve_label)

# Mark and report the n_neighbors with the highest mean score
best_n_neighbors = avg.idxmax()
plt.axvline(best_n_neighbors)
print(best_n_neighbors)

## Metric Comparison

### Best 5 n-neighbors

In [None]:
# The five n_neighbors values with the highest mean_test_score averaged across cubes
mean_score_per_k = average.mean(axis=0)
mean_score_per_k.sort_values(ascending=False).head(5).index.tolist()

In [None]:
# Same mean-score heatmap as in the Accuracies section, shown again here for reference (not saved)
plt.figure(figsize=(16, 4))
heatmap_ax = sns.heatmap(average, annot=True, cmap='coolwarm', annot_kws={"size": 9})
heatmap_ax.set_title(f'Mean {scoring}')
heatmap_ax.set_xlabel('Number of Neighbors')
heatmap_ax.set_ylabel('Cube Name')
plt.xticks(rotation=45);

In [None]:
# Convert to long format (one row per cube, k and CV split), which is more convenient for plotting

# k = average.mean(axis=0).idxmax() # based on the average test score
k = average.mean(axis=0).sort_values(ascending=False)[:5].index.tolist()

# Per-split score columns become rows; every other column is carried along as an identifier
score_cols = [col for col in metrics.columns if 'split' in col]
id_vars = metrics.columns.difference(score_cols)
metrics_long = (
    metrics
    .melt(id_vars=id_vars, value_vars=score_cols, var_name='split', value_name='Metric')
    .loc[lambda frame: frame['param_n_neighbors'].isin(k)]
)
metrics_long.sort_values(['cubename', 'split']).head(3)

In [None]:
# Per-split scores for the five best k values: one group of boxes per cube, one box per k
order = sorted(metrics_long['cubename'].unique())

plt.figure(figsize=(15, 5))
sns.boxplot(
    metrics_long,
    x='cubename',
    y='Metric',
    hue='param_n_neighbors',
    order=order,
    hue_order=k,
)
plt.xticks(rotation=45)

filename = "Scoring_Comparison_Best5n"
filepath = project / filename
plt.savefig(filepath, bbox_inches='tight', dpi=200)

### Best 1 n-neighbors !

In [None]:
# Convert to long format (one row per cube and CV split), keeping only the single best k

k = [average.mean(axis=0).idxmax()] # based on the average test score

score_cols = [col for col in metrics.columns if 'split' in col]
id_vars = metrics.columns.difference(score_cols)
metrics_long = (
    metrics
    .melt(id_vars=id_vars, value_vars=score_cols, var_name='split', value_name='Metric')
    .loc[lambda frame: frame['param_n_neighbors'].isin(k)]
)
# metrics_long['config_type'] = metrics_long.cubename.str.contains(r"nm|WL").map({True: 'Single-excitation', False: 'Multi-excitation'})

# Summary statistics of the per-split scores, per cube
metric_by_cube = metrics_long.groupby('cubename')['Metric']
means = metric_by_cube.mean()
maxes = metric_by_cube.max()
stds = metric_by_cube.std()

# metrics_long['cube_mean'] = metrics_long.cubename.map(means)

In [None]:
# Per-row lookup of the cube's mean Metric (preview of the commented-out `cube_mean` column above)
metrics_long.cubename.map(means)

## Newer version | Boxplot Split

In [None]:
# `order_single` is otherwise first assigned in the plotting cell further down, so on a fresh
# top-to-bottom run this cell would fail with NameError. Define it here with the same labels.
order_single = ['WL', '310', '325', '340', '365', '385', '400', '415', '430']

# Mean per-split score of the first single-excitation cube
cubename = order_single[0]
means[cubename]

In [None]:
# Mean per-split Metric for every cube at the selected k
means

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

# VARIABLES
top = 1.07                      # upper y-limit, leaves room for the text above the boxes
left = 0.7                      # width share of the left (single-excitation) panel
width_ratios = [left, 1-left]
fontsize_xy = 14
fontsize_ticks = 13
fontsize_text = 11

# Step 1: Create global normalization based on full dataset
vmin = metrics_long['mean_test_score'].min()
vmax = metrics_long['mean_test_score'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = plt.get_cmap('RdYlGn')

# Step 2: Map each unique score to a color
unique_scores = metrics_long['mean_test_score'].unique()
score_to_color = {score: cmap(norm(score)) for score in unique_scores}

#PLOTTING
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True, width_ratios=width_ratios)
fig.subplots_adjust(wspace=0.04)
axes = axes.flatten()

order_single = ['WL', '310', '325', '340', '365', '385', '400', '415', '430']
order_multi = ['NC_3', 'CN_3', 'NC_8', 'CN_8']

means = metrics_long.groupby('cubename')['Metric'].mean()
maxes = metrics_long.groupby('cubename')['Metric'].max()
stds = metrics_long.groupby('cubename')['Metric'].std()

# One panel per configuration family: (axes, cube order, title, is the left panel)
panels = [
    (axes[0], order_single, 'Single-Excitation HSI', True),
    (axes[1], order_multi, 'Multi-Excitation HSI', False),
]

for ax, order, title, is_left_panel in panels:
  df = metrics_long[metrics_long['cubename'].isin(order)]
  sns.boxplot(df, x='cubename', y='Metric', hue='mean_test_score', palette=score_to_color, order=order, ax=ax)
  # Pass the same `order` as the boxplot so the points cannot end up on a different category position
  sns.swarmplot(df, x='cubename', y='Metric', order=order, ax=ax)

  bottom, _ = ax.get_ylim()
  ax.set_ylim(bottom, top)

  # "mean±sd" of the per-split scores, written just above each cube's highest point
  for cubename in order:
    avg = means[cubename]
    maximum = maxes[cubename]
    sd = stds[cubename]

    # text = f'mean={avg:.2f}\nsd={sd:.4f}'
    text = f'{avg:.2f}±{sd:.2f}'
    ax.text(x=cubename, y=maximum+0.02, s=text, ha='center', color='black', fontsize=fontsize_text)

  # Only the left panel gets y tick styling and the y label, as in the original layout
  if is_left_panel:
    ax.tick_params(axis='y', labelsize=fontsize_ticks)
    ax.set_ylabel('Balanced Accuracy', fontsize=fontsize_xy)
  ax.tick_params(axis='x', labelsize=fontsize_ticks)
  ax.set_xlabel(None, fontsize=fontsize_xy)
  ax.legend().set_visible(False)
  ax.set_title(title, fontsize=fontsize_xy+2)

plt.savefig(Path(project, f"knn_cv_3.png"), bbox_inches='tight', dpi=200)

---

In [None]:
# Display names in the order they are declared in `cubenames`
order = list(cubenames.values())
order

In [None]:
plt.figure(figsize=(15, 5))

# order = sorted(metrics_long.cubename.unique())
order = [val for key, val in cubenames.items()]

# `config_type` is not a column of metrics_long (its creation is commented out in an earlier cell),
# so it is derived here on a copy: in `cubenames` only the multi-excitation labels contain '_'.
plot_data = metrics_long.assign(
    config_type=np.where(
        metrics_long['cubename'].str.contains('_', regex=False, na=False),
        'Multi-excitation',
        'Single-excitation',
    )
)

sns.boxplot(plot_data, x='cubename', y='Metric', hue='config_type', order=order)
# sns.lineplot(metrics_long, x='cubename', y='Metric', estimator='mean', errorbar=None)
# Same `order` as the boxplot so points and boxes share category positions
sns.swarmplot(plot_data, x='cubename', y='Metric', order=order)
plt.xticks(fontsize=12, rotation=45);
plt.yticks(fontsize=12);

means = metrics_long.groupby('cubename')['Metric'].mean()
maxes = metrics_long.groupby('cubename')['Metric'].max()
stds = metrics_long.groupby('cubename')['Metric'].std()

# Annotate each cube with mean and sd of its per-split scores, just above its highest point.
# Iterating over means.index skips NaN cube names, which groupby drops.
for cubename in means.index:
  avg = means[cubename]
  maximum = maxes[cubename]
  sd = stds[cubename]

  plt.text(x=cubename, y=maximum+0.02, s=f'mean={avg:.2f}\nsd={sd:.4f}', ha='center', color='black', fontsize=10.5)

# plt.title(f"k-NN Cross Validation Scores | cv=5, scoring='{scoring}', n_neighbors={k[0]}")
plt.xlabel('Excitation / Configuration', fontsize=13)
plt.ylabel(f'Balanced Accuracy', fontsize=13)

# Raise only the upper limit to make room for the annotations
plt.ylim(top=1.08)
# plt.legend().set_visible(False)

plt.savefig(Path(project, f"knn_cv_2.png"), bbox_inches='tight', dpi=200)

## Stats

### Calculate P-Values

In [None]:
from scipy.stats import ttest_ind

In [None]:
# k with the highest mean_test_score averaged over all cubes. It is derived from `average`
# (same criterion as the "Best 1 n-neighbors" section) instead of being hard-coded, so the
# statistics always refer to the same k as the plots.
k = average.mean(axis=0).idxmax()

# Per-split scores at that k: one row per CV split, one column per cube, in plotting order
split_cols = [name for name in metrics.columns if 'split' in name]
splits = metrics[metrics['param_n_neighbors'] == k].set_index('cubename')
splits = splits[split_cols].T
splits = splits[order_single+order_multi]
splits

In [None]:
# Pairwise independent two-sample t-tests between the per-split scores of every pair of cubes.
# The matrix is symmetric, so each pair is tested once and mirrored. Only the diagonal
# (a cube against itself) is left as NaN; an off-diagonal p-value of exactly 1 is kept.
groups = list(splits.columns)
n_groups = len(groups)
p_matrix = np.full((n_groups, n_groups), np.nan)
for i, g1 in enumerate(groups):
  for j in range(i + 1, n_groups):
    g2 = groups[j]
    p_val = ttest_ind(splits[g1].tolist(), splits[g2].tolist()).pvalue
    p_matrix[i, j] = p_val
    p_matrix[j, i] = p_val

p_values = pd.DataFrame(p_matrix, index=groups, columns=groups)

filename = f"CV_P-Values.csv"
p_values.to_csv(Path(project, filename))

### Plot P-Values

In [None]:
# VARIABLES
top = 1.07
left = 0.7                      # width share of the left (single-excitation) panel
width_ratios = [left, 1-left]
fontsize_xy = 14
fontsize_ticks = 13
fontsize_text = 11

cmap = 'RdYlGn'

#PLOTTING
fig, axes = plt.subplots(1, 2, figsize=(15, 3), sharey=True, width_ratios=width_ratios)
fig.subplots_adjust(wspace=0.04)
axes = axes.flatten()

order_single = ['WL', '310', '325', '340', '365', '385', '400', '415', '430']
order_multi = ['NC_3', 'CN_3', 'NC_8', 'CN_8']

# Left panel: p-values against the single-excitation cubes; right panel: against the combinations.
# Only the left panel gets y tick styling.
panels = [
    (axes[0], order_single, True),
    (axes[1], order_multi, False),
]
for ax, order, is_left_panel in panels:
  df = p_values[order]
  sns.heatmap(df, annot=True, fmt='.3f', cmap=cmap, ax=ax, cbar=None)

  if is_left_panel:
    ax.tick_params(axis='y', labelsize=fontsize_ticks)
  ax.tick_params(axis='x', labelsize=fontsize_ticks)

plt.savefig(project / "cv_p_values_drosh.png", bbox_inches='tight', dpi=200)

In [None]:
from matplotlib.colors import LinearSegmentedColormap

fontsize_xy = 12

# Green-to-white colormap; it is built here but this heatmap is drawn with 'coolwarm'
colors = [(0.2, 0.6, 0.2), (1, 1, 1)]  # green to white
custom_cmap = LinearSegmentedColormap.from_list("pval_cmap", colors, N=256)

# Full p-value matrix; the colour scale is capped at 0.05
plt.figure(figsize=(15, 3))
heatmap_kwargs = dict(annot=True, fmt='.2f', cmap='coolwarm', annot_kws={"size": 11}, vmax=0.05, cbar=None)
sns.heatmap(p_values, **heatmap_kwargs)
# plt.title('P-Values')
plt.yticks(fontsize=fontsize_xy);
plt.xticks(fontsize=fontsize_xy);

filename = "CV_P-Values_Heatmap_4"
plt.savefig(project / filename, bbox_inches='tight', dpi=200)

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap: green at the low end of the scale fading to white at the top.
colors = [(0.2, 0.6, 0.2), (1, 1, 1)]
custom_cmap = LinearSegmentedColormap.from_list("pval_cmap", colors, N=256)

# Pairwise p-values; the colour scale is capped at vmax=0.05.
plt.figure(figsize=(15, 3))
sns.heatmap(
    p_values,
    annot=True, fmt='.2f', annot_kws={"size": 9},
    cmap=custom_cmap, vmax=0.05, cbar=None,
)
plt.title('P-Values')
plt.xticks(rotation=45);

filename = "CV_P-Values_Heatmap_3"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Pairwise p-values on a diverging colormap; the colour scale is capped at vmax=0.05.
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', annot_kws={"size": 9}, vmax=0.05, cbar=None)

plt.figure(figsize=(15, 3))
sns.heatmap(p_values, **heatmap_options)
plt.title('P-Values')
plt.xticks(rotation=45);

filename = "CV_P-Values_Heatmap_2"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
round((splits['430'] * 100).mean(), 2)

In [None]:
# Boolean mask of the p-value table: True where p < 0.05.
significant = p_values < 0.05

plt.figure(figsize=(10, 4))
sns.heatmap(significant, annot=True, fmt='.3f', cmap='coolwarm')
plt.title('P-Values')

# Saving is currently disabled; the file name is kept for when it is re-enabled.
filename = "CV_P-Values_Heatmap"
# plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

## CV + P values

In [None]:
# Two stacked panels sharing the x axis:
#   top    - heatmap of the pairwise p-values
#   bottom - distribution of the CV metric per cube (box + swarm)
fig, axes = plt.subplots(2, 1, figsize=(15, 10), sharex=True)
ax_pvalues, ax_scores = axes.flatten()

sns.heatmap(p_values, annot=True, fmt='.3f', cmap='coolwarm', ax=ax_pvalues, cbar=None)

# Category order follows the insertion order of the `cubenames` mapping.
order = list(cubenames.values())

sns.boxplot(metrics_long, x='cubename', y='Metric', hue='cubename', order=order, ax=ax_scores)
# The swarm gets the same explicit order as the boxes so the points always sit
# on their own box, regardless of the row order in `metrics_long`.
sns.swarmplot(metrics_long, x='cubename', y='Metric', order=order, ax=ax_scores)

# Style the bottom axes explicitly instead of relying on pyplot's "current axes".
ax_scores.tick_params(axis='x', labelrotation=45)
ax_scores.set_ylim(top=1.02);

# means = metrics_long.groupby('cubename')['Metric'].mean()
# maxes = metrics_long.groupby('cubename')['Metric'].max()
# stds = metrics_long.groupby('cubename')['Metric'].std()

# for cubename in metrics_long.cubename.unique():
#   k = k # check previous cell
#   average = means[cubename]
#   maximum = maxes[cubename]
#   sd = stds[cubename]

#   plt.text(x=cubename, y=maximum+0.02, s=f'mean={average:.2f}\nsd={sd:.4f}', ha='center', color='black', fontsize=10)

# # plt.title(f"k-NN Cross Validation Scores | cv=5, scoring='{scoring}', n_neighbors={k[0]}")
# plt.xlabel('Excitation / Configuration', fontsize=13)
# plt.ylabel(f'Balanced Accuracy', fontsize=13)

# bottom, top = plt.ylim()
# plt.ylim(bottom=bottom, top=1.08)
# # plt.legend().set_visible(False)

# plt.savefig(Path(project, f"knn_cv_1.png"), bbox_inches='tight', dpi=200)

# kNN - Classification

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TunedThresholdClassifierCV

from PIL import Image

In [None]:
# Number of neighbours used by the kNN prediction cells below.
n_neighbors = 5

# Output folder for the colour-coded prediction masks (created if missing).
folder = Path(project) / f"kNN_Predict_n{n_neighbors}_ColorMasks"
folder.mkdir(parents=True, exist_ok=True)

## Pred | Normalized

In [None]:
# Starting data is processed (corrected by light source)
cubes.processed.keys()

In [None]:
# kNN prediction on SNV-normalized cubes, one model per cube:
# train on the ROI spectra of the cube, classify every pixel of the same cube,
# and save the result as a colour-coded mask.
label = 'normalized'
cubes.normalize(which_data=starting_data, how='snv')
cubes.spectra_from_rois(which_data=label)
spectra = cubes.spectra.reset_index()

estimator = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)

# The first six columns are treated as metadata; everything else is a wavelength.
# This does not change between cubes, so it is computed once.
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  # Filter the spectra by cube
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()

  # TRAIN data: drop wavelength columns that are all-NaN for this cube, zero-fill the rest.
  # NOTE(review): assumes the remaining columns line up with the cube's band axis - confirm.
  train = spectra_cube[wvls].dropna(axis=1, how='all').to_numpy()
  train = np.nan_to_num(train, nan=0)

  scaler = StandardScaler().fit(train)
  train = scaler.transform(train)

  estimator.fit(train, labels)

  # TEST data: every pixel of the cube as one row, scaled with the training statistics.
  cube = cubes.normalized[f"{cubename}.tif"]
  height, width = cube.shape[0], cube.shape[1]
  test = cube.reshape(height * width, cube.shape[2])
  test = scaler.transform(test)

  # predictions = estimator.predict_proba(test)

  predictions = estimator.predict(test)
  mask = predictions.reshape(height, width).astype(np.uint8)

  # Paint each predicted class with its colour; colours are scaled by 255 before saving.
  mask_rgb = np.zeros((*mask.shape, 3))
  for cls, color in color_dict.items():
    mask_rgb[mask == cls] = color
  mask_rgb = Image.fromarray((mask_rgb * 255).astype(np.uint8))

  filename = f"kNN_Predict_n{n_neighbors}_ColorMask_{label.capitalize()}_{cubename}.png"
  # `folder` already contains the project path, so only folder + file name are joined.
  filepath = Path(folder, filename)
  mask_rgb.save(filepath)

### Threshold visualization

In [None]:
# Per-class probability maps and thresholded masks for the LAST cube processed
# by the prediction cell above (uses its `estimator`, `test` and `cube`).
#
# `predictions` from that cell holds hard class labels (1-D output of `predict`),
# so it cannot be indexed per class. The class probabilities are computed here.
probabilities = estimator.predict_proba(test)  # one column per entry of estimator.classes_
class_num = len(estimator.classes_)

# squeeze=False keeps `axes` 2-D even if there is only one class.
fig, axes = plt.subplots(3, class_num, figsize=(12, 5), squeeze=False)

thr = 0.5
titlesize = 9
height, width = cube.shape[0], cube.shape[1]

for i, clas in enumerate(estimator.classes_):
  preds = probabilities[:, i]
  preds_img = preds.reshape(height, width)
  mask = (preds_img > thr).astype(np.uint8)

  # Class name without its last underscore-separated token.
  # A dedicated name is used so the global `label` is not overwritten.
  class_label = '_'.join(label_values[clas].split('_')[:-1])

  # Row 0: histogram of the class probabilities with the threshold marked.
  ax = axes[0, i]
  sns.histplot(preds, ax=ax)
  ax.axvline(thr, color='red')
  ax.set_ylim(top=40000)
  if i != 0:
    ax.set_yticks([])  # only the first column keeps its y ticks
  ax.set_title(f"{class_label}\nProbabilities", fontsize=titlesize)
  ax.set_ylabel("Count", fontsize=titlesize)

  # Row 1: probability of this class for every pixel.
  ax = axes[1, i]
  ax.imshow(preds_img)
  ax.set_title(f"{class_label}\nProbas of Class", fontsize=titlesize)

  # Row 2: binary mask of pixels whose probability exceeds the threshold.
  ax = axes[2, i]
  ax.imshow(mask)
  ax.set_title(f"{class_label}\nPos. if thr>{thr}", fontsize=titlesize)

plt.tight_layout()

## Pred | Norm_Concat

In [None]:
# Starting data is processed (corrected by light source)
cubes.processed.keys()

In [None]:
starting_data

In [None]:
# kNN prediction on all fluorescence cubes: SNV-normalize each cube first,
# then concatenate them into one combined cube.
label = 'Norm_Concat'
cubes.normalize(which_data=starting_data, how='snv') # Normalize

# Select only fluorescence cubes. The order of cubes.names is kept so the band
# order of the combined cube is the same on every run (a set() has no stable order).
cubes_to_combine = [name for name in cubes.names if name != 'WhiteLight.tif']
cubes.combine(cubes_to_combine, which_data='normalized', label=label) # Concatenate

# Spectra from ROIs | TRAIN data
cubes.spectra_from_rois(label, which_data='combined')
spectra = cubes.spectra.reset_index()

estimator = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)

# The first six columns are treated as metadata; computed once, outside the loop.
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  # Filter the spectra by cube
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()

  # Drop wavelength columns that are entirely NaN, zero-fill remaining NaNs.
  train = spectra_cube[wvls].dropna(axis=1, how='all').to_numpy()
  train = np.nan_to_num(train, nan=0)

  scaler = StandardScaler().fit(train)
  train = scaler.transform(train)

  estimator.fit(train, labels)

  # TEST data: every pixel of the combined cube as one row.
  cube = cubes.combined[f"{cubename}"]
  height, width = cube.shape[0], cube.shape[1]
  test = cube.reshape(height * width, cube.shape[2])
  test = scaler.transform(test)

  predictions = estimator.predict(test)
  mask = predictions.reshape(height, width).astype(np.uint8)

  # Paint each predicted class with its colour; colours are scaled by 255 before saving.
  mask_rgb = np.zeros((*mask.shape, 3))
  for cls, color in color_dict.items():
    mask_rgb[mask == cls] = color
  mask_rgb = Image.fromarray((mask_rgb * 255).astype(np.uint8))

  filename = f"kNN_Predict_n{n_neighbors}_ColorMask_{label}.png"
  # `folder` already contains the project path, so only folder + file name are joined.
  filepath = Path(folder, filename)
  mask_rgb.save(filepath)

## Pred | Concat_Norm

In [None]:
# Starting data is processed (corrected by light source)
cubes.processed.keys()

In [None]:
starting_data

In [None]:
# kNN prediction on all fluorescence cubes: concatenate the cubes first,
# then SNV-normalize the combined cube.
label = 'Concat_Norm'

# Select only fluorescence cubes. The order of cubes.names is kept so the band
# order of the combined cube is the same on every run (a set() has no stable order).
cubes_to_combine = [name for name in cubes.names if name != 'WhiteLight.tif']
cubes.combine(cubes_to_combine, which_data=starting_data, label=label) # Concatenate
cubes.normalize(label, which_data='combined', how='snv') # Normalize

# Spectra from ROIs | TRAIN data
cubes.spectra_from_rois(label, which_data='normalized')
spectra = cubes.spectra.reset_index()

estimator = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)

# The first six columns are treated as metadata; computed once, outside the loop.
id_vars = spectra.columns[:6]

for cubename in spectra.cubename.unique():
  # Filter the spectra by cube
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()

  # Drop wavelength columns that are entirely NaN, zero-fill remaining NaNs.
  train = spectra_cube[wvls].dropna(axis=1, how='all').to_numpy()
  train = np.nan_to_num(train, nan=0)

  scaler = StandardScaler().fit(train)
  train = scaler.transform(train)

  estimator.fit(train, labels)

  # TEST data: every pixel of the normalized combined cube as one row.
  cube = cubes.normalized[f"{cubename}"]
  height, width = cube.shape[0], cube.shape[1]
  test = cube.reshape(height * width, cube.shape[2])
  test = scaler.transform(test)

  # predictions = estimator.predict_proba(test)

  predictions = estimator.predict(test)
  mask = predictions.reshape(height, width).astype(np.uint8)

  # Paint each predicted class with its colour; colours are scaled by 255 before saving.
  mask_rgb = np.zeros((*mask.shape, 3))
  for cls, color in color_dict.items():
    mask_rgb[mask == cls] = color
  mask_rgb = Image.fromarray((mask_rgb * 255).astype(np.uint8))

  filename = f"kNN_Predict_n{n_neighbors}_ColorMask_{label}.png"
  # `folder` already contains the project path, so only folder + file name are joined.
  filepath = Path(folder, filename)
  mask_rgb.save(filepath)

## --- 3 Cubes

## Pred | Norm_Concat

In [None]:
# Starting data is processed (corrected by light source)
cubes.processed.keys()

In [None]:
starting_data

In [None]:
# kNN prediction on three excitations (340/365/385):
# SNV-normalize every cube, then concatenate the three selected cubes.
label = 'Norm_Concat_340_365_385'
cubes.normalize(which_data=starting_data, how='snv')

cubes_to_combine = ['340.tif', '365.tif', '385.tif']
cubes.combine(cubes_to_combine, which_data='normalized', label=label)

# TRAIN data: ROI spectra taken from the combined cube
cubes.spectra_from_rois(label, which_data='combined')
spectra = cubes.spectra.reset_index()

estimator = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
id_vars = spectra.columns[:6]  # leading metadata columns

for cubename in spectra.cubename.unique():
  # Keep only the spectra of this cube
  in_cube = spectra['cubename'] == cubename
  spectra_cube = spectra[in_cube].reset_index(drop=True)
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()

  # All-NaN wavelength columns are removed, remaining NaNs become 0
  features = spectra_cube[wvls].dropna(axis=1, how='all').to_numpy()
  features = np.nan_to_num(features, nan=0)
  scaler = StandardScaler().fit(features)
  train = scaler.transform(features)
  estimator.fit(train, labels)

  # TEST data: the combined cube flattened to (pixels, bands) and scaled like the training set
  cube = cubes.combined[f"{cubename}"]
  height, width, bands = cube.shape[0], cube.shape[1], cube.shape[2]
  test = scaler.transform(cube.reshape(height * width, bands))

  predictions = estimator.predict(test)
  mask = predictions.reshape(height, width).astype(np.uint8)

  # Colour-code the class mask and convert it to an 8-bit RGB image
  mask_rgb = np.zeros((*mask.shape, 3))
  for cls, color in color_dict.items():
    mask_rgb[mask == cls] = color
  mask_rgb = Image.fromarray((mask_rgb * 255).astype(np.uint8))

  filename = f"kNN_Predict_n{n_neighbors}_ColorMask_{label}.png"
  filepath = Path(project, folder, filename)
  mask_rgb.save(filepath)

## Pred | Concat_Norm

In [None]:
# Starting data is processed (corrected by light source)
cubes.processed.keys()

In [None]:
starting_data

In [None]:
# kNN prediction on three excitations (340/365/385):
# concatenate the three selected cubes, then SNV-normalize the combined cube.
label = 'Concat_Norm_340_365_385'

cubes_to_combine = ['340.tif', '365.tif', '385.tif']
cubes.combine(cubes_to_combine, which_data=starting_data, label=label)
cubes.normalize(label, which_data='combined', how='snv')

# TRAIN data: ROI spectra taken from the normalized combined cube
cubes.spectra_from_rois(label, which_data='normalized')
spectra = cubes.spectra.reset_index()

estimator = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
id_vars = spectra.columns[:6]  # leading metadata columns

for cubename in spectra.cubename.unique():
  # Keep only the spectra of this cube
  in_cube = spectra['cubename'] == cubename
  spectra_cube = spectra[in_cube].reset_index(drop=True)
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()

  # All-NaN wavelength columns are removed, remaining NaNs become 0
  features = spectra_cube[wvls].dropna(axis=1, how='all').to_numpy()
  features = np.nan_to_num(features, nan=0)
  scaler = StandardScaler().fit(features)
  train = scaler.transform(features)
  estimator.fit(train, labels)

  # TEST data: the normalized cube flattened to (pixels, bands) and scaled like the training set
  cube = cubes.normalized[f"{cubename}"]
  height, width, bands = cube.shape[0], cube.shape[1], cube.shape[2]
  test = scaler.transform(cube.reshape(height * width, bands))

  # predictions = estimator.predict_proba(test)

  predictions = estimator.predict(test)
  mask = predictions.reshape(height, width).astype(np.uint8)

  # Colour-code the class mask and convert it to an 8-bit RGB image
  mask_rgb = np.zeros((*mask.shape, 3))
  for cls, color in color_dict.items():
    mask_rgb[mask == cls] = color
  mask_rgb = Image.fromarray((mask_rgb * 255).astype(np.uint8))

  filename = f"kNN_Predict_n{n_neighbors}_ColorMask_{label}.png"
  filepath = Path(project, folder, filename)
  mask_rgb.save(filepath)

# Image captions

In [None]:
# Folder holding the saved colour masks, and a sibling folder (created if
# missing) presumably meant for their labeled versions.
raw_masks = Path(project) / 'kNN_Predict_n5_ColorMasks'
labeled_masks = Path(project) / 'kNN_Predict_n5_ColorMasks_Labeled'
labeled_masks.mkdir(parents=True, exist_ok=True)

In [None]:
# Walk over the saved mask PNGs, derive a short label from each file name and open the image.
# The first 25 characters of the stem are cut off (the length of the
# "kNN_Predict_n5_ColorMask_" prefix), and a leading "Normalized_" tag is removed.
prefix_length = 25
for fp in raw_masks.glob('*.png'):
  label = fp.stem[prefix_length:].replace('Normalized_', '')
  img = Image.open(fp)

# EEMs

In [None]:
cubes.rois

In [None]:
starting_data

In [None]:
# Output folder for the EEM figures (created if missing).
folder = Path(project) / "EEMs"
folder.mkdir(parents=True, exist_ok=True)

In [None]:
# One EEM heatmap per ROI, saved as EEM_<roi>.png.
#
# Fluorescence cubes only, in the order of cubes.names (a set() would shuffle
# them between runs). The list is built once, and under its own name, so the
# `cubenames` mapping used by earlier plotting cells is not overwritten.
fluorescence_cubes = [name for name in cubes.names if name != 'WhiteLight.tif']

for roi in cubes.rois.index:
  plt.figure(figsize=(6, 1.5))
  cubes.get_eem(cubes_to_analyse=fluorescence_cubes, which_data=starting_data, roi_name=roi)

  sns.heatmap(cubes.eem, cmap='coolwarm')
  plt.xticks(rotation=45)
  plt.xlabel('Emission')
  plt.ylabel('Excitation')

  filename = f"EEM_{roi}.png"
  # `folder` already contains the project path, so only folder + file name are joined.
  filepath = Path(folder, filename)
  plt.savefig(filepath, bbox_inches='tight', dpi=200)
  plt.close()  # free the figure before the next ROI

# Analysis | Previous

In [None]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

import os

from pathlib import Path

In [None]:
project = Path('/content/drive/MyDrive/DATA/PROJECTS/Lichens_1')

## Read Spectra

In [None]:
# Create a folder for the figures (no error if it already exists)
figures = os.path.join(project, 'Figures')
os.makedirs(figures, exist_ok=True)

In [None]:
from pathlib import Path

spectra_files = sorted(project.glob('*Spectra_P2*'))
spectra_files

In [None]:
# Read every spectra file and stack them into a single table.
# `cubename` is forced to string in each file before concatenating.
all_dfs = [
    pd.read_csv(filepath)
      .reset_index(drop=True)
      .assign(cubename=lambda frame: frame['cubename'].astype(str))
    for filepath in spectra_files
]

spectra = pd.concat(all_dfs)  # row-wise concatenation of the per-file tables
spectra

In [None]:
# Keep only the spectra of the actual samples (drop the background spectra):
# any ROI whose name contains one of these markers is excluded.
background_markers = ['Tape', 'Paper', 'WS']

samples = [
    roi for roi in spectra.roi_name.unique()
    if not any(marker in roi for marker in background_markers)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
spectra.roi_name.unique()

In [None]:
spectra.cubename.unique()

In [None]:
spectra.groupby(['cubename']).aggregate({'roi_name': 'count'})

## kNN | GridSearchCV | ROC AUC

In [None]:
# Model optimization: grid-search the number of neighbours of a kNN classifier
# separately for every cube and store the full CV results as one CSV per cube.
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

folder = os.path.join(project, 'CV_Results_kNN_ROC_AUC')
os.makedirs(folder, exist_ok=True)

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Develop later

# Ten candidate k values between 5 and 400*0.8:
# 400 - number of spectra, 0.8 because default cv=5, i.e. train set will be max 0.8 * 400
n_neighbors = np.linspace(5, 400*0.8, num=10).astype(int).tolist()
param_grid = {'n_neighbors': n_neighbors}

estimator = KNeighborsClassifier(n_jobs=-1)

# Integer class label for every ROI name, numbered in order of appearance
rois = spectra.roi_name.unique()
label_dict = {roi: index for index, roi in enumerate(rois)}

id_vars = spectra.columns[:6]  # leading metadata columns

for cubename in spectra.cubename.unique():
  # Filter the spectra by cube
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  labels = metadata['roi_name'].map(label_dict).tolist()

  # NaNs become 0, then every wavelength is standardized.
  # NOTE(review): the scaler is fitted on all samples before cross-validation,
  # so the validation folds contribute to the scaling statistics.
  data = np.nan_to_num(spectra_cube[wvls].to_numpy(), nan=0)
  data = StandardScaler().fit_transform(data)

  search = GridSearchCV(estimator, param_grid, scoring='roc_auc_ovr', n_jobs=-1)
  search.fit(data, labels)

  # Best-ranked parameter settings first
  cv_results = pd.DataFrame(search.cv_results_).sort_values(['rank_test_score'])
  cv_results.to_csv(os.path.join(folder, f"CV_Ex_{cubename}.csv"))

In [None]:
os.listdir(folder)

### Metrics Analysis

In [None]:
# Read the data: one CV-results CSV per excitation, keeping only the
# best-ranked parameter setting of each.
from pathlib import Path

results = list(Path(project, 'CV_Results_kNN_ROC_AUC').glob('*.csv'))

metric_frames = []  # per-file tables; a separate name so `metrics` is always a DataFrame
for result in results:
  ex = result.stem[6:]  # drop the 6-character "CV_Ex_" prefix of the file name
  metric = pd.read_csv(result, index_col=0)
  metric['Excitation'] = ex
  metric_frames.append(metric)

metrics = pd.concat(metric_frames)
metrics = metrics[metrics['rank_test_score'] == 1]
# Several k values can tie for rank 1. Keep one row per excitation so the
# scalar lookups in the cells below (metrics.set_index('Excitation').loc[...])
# return single values rather than Series.
metrics = metrics.drop_duplicates('Excitation', keep='first')
metrics.head(3)

In [None]:
metrics.Excitation.unique()

In [None]:
# Convert to long format (one row per excitation and CV split), which is more convenient for plotting.
score_cols = [name for name in metrics.columns if 'split' in name]  # per-split score columns
id_vars = metrics.columns.difference(score_cols)
metrics_long = metrics.melt(
    id_vars=id_vars,
    value_vars=score_cols,
    var_name='Split',
    value_name='ROC_AUC',
)
metrics_long.sort_values(['Excitation', 'Split']).head(3)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

figures = os.path.join(project, 'Figures')
os.makedirs(figures, exist_ok=True)

# One 2x2 summary figure per cube:
#   top-left: PCA scatter        top-right: kNN CV scores
#   bottom-left: mean spectra    bottom-right: mean spectra with SD band
ratio = np.array([24, 16])
frac = 2.1
figsize = (ratio / frac)

id_vars = spectra.columns[:6]  # leading metadata columns
best_by_excitation = metrics.set_index('Excitation')  # best CV row per excitation

for cubename in spectra.cubename.unique():

  # Filter the spectra by cube, and make a long-format copy for seaborn
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)
  spectra_cube_melt = pd.melt(spectra_cube, id_vars=id_vars, var_name='wavelength', value_name='intensity')

  fig, axes = plt.subplots(2, 2, figsize=figsize)
  ax_pca, ax_cv, ax_mean, ax_sd = axes.flat

  # Mean spectrum per ROI (this panel carries the legend, placed left of the axes)
  ax = ax_mean
  sns.lineplot(spectra_cube_melt, x='wavelength', y='intensity', hue='roi_name', errorbar=None, ax=ax)
  ax.tick_params(rotation=90)
  ax.set_title(f"Average normalized spectra | Ex = {cubename} nm")
  ax.legend(title='Samples', loc='upper left', bbox_to_anchor=(-0.7, 1), fontsize=7)

  # Mean spectrum per ROI with a standard-deviation band
  ax = ax_sd
  sns.lineplot(spectra_cube_melt, x='wavelength', y='intensity', hue='roi_name', errorbar='sd', ax=ax)
  ax.tick_params(rotation=90)
  ax.set_title(f"Average normalized spectra & SD | Ex = {cubename} nm")
  ax.legend().set_visible(False)

  # PCA on the standardized spectra (NaNs replaced by 0)
  ax = ax_pca
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  data = np.nan_to_num(spectra_cube[wvls].to_numpy(), nan=0)
  data = StandardScaler().fit_transform(data)

  n_components = 3
  columns = [f"PC{comp+1}" for comp in range(n_components)]
  pca = PCA(n_components=n_components)
  pca.fit(data)
  pcs = pd.DataFrame(pca.transform(data), columns=columns)
  pcs = pd.concat([metadata, pcs], axis=1)

  sns.scatterplot(pcs, x='PC1', y='PC2', hue='roi_name', s=10, ax=ax)
  ax.set_title(f"PCA | Ex = {cubename} nm")
  ax.legend().set_visible(False)

  # kNN cross-validation scores of the best setting for this cube
  ax = ax_cv
  data = metrics_long[metrics_long['Excitation'] == cubename]

  # Values written onto the plot
  best = best_by_excitation.loc[cubename]
  k = best['param_n_neighbors']
  average = best['mean_test_score']
  sd = best['std_test_score']
  med = data['ROC_AUC'].median()

  sns.boxplot(data, y='ROC_AUC', ax=ax)
  sns.swarmplot(data, y='ROC_AUC', ax=ax)
  ax.set_title(f'k-NN CV Scores | Ex={cubename} | k = {k}')
  # The text is placed slightly below the median line of the box
  ax.text(x=0, y=med-0.0012, s=f'mean={average:.2f} | sd={sd:.2f}', ha='center', color='white')

  plt.tight_layout()

  filename = f"Spectra_normalized_PCA_ROC_AUC_{cubename}.png"
  plt.savefig(os.path.join(figures, filename))
  plt.close()

## Comparison of CV

In [None]:
metrics

In [None]:
# Compare the CV score distributions of all cubes in one figure and annotate
# every box with the mean test score and the selected k.
plt.figure(figsize=(15, 5))

sns.boxplot(metrics_long, x='Excitation', y='ROC_AUC', hue='Excitation')
sns.lineplot(metrics_long, x='Excitation', y='ROC_AUC', errorbar=None)
plt.xticks(rotation=45);
plt.ylim(top=1.02)

score_cols = [col for col in metrics.columns if 'split' in col]

# One row per excitation, indexed once outside the loop. If several k values
# tie for rank 1 the first row is used, so every lookup below is a scalar.
best = metrics.drop_duplicates('Excitation').set_index('Excitation')

for cubename in best.index:
  # Values written onto the plot
  k = best.loc[cubename, 'param_n_neighbors']
  average = best.loc[cubename, 'mean_test_score']
  maximum = best.loc[cubename, score_cols].astype(float).max()

  # Place the annotation just above the highest split score of this cube
  plt.text(x=cubename, y=maximum+0.002, s=f'avg={average:.2f}\nk={k}', ha='center', color='black', fontsize=8)

plt.title('k-NN Cross Validation Scores And The Average Performance')
plt.xlabel('HSI Cube Name')
plt.ylabel('ROC AUC')
# plt.legend().set_visible(False)

plt.savefig(Path(project, "ROC_AUC_Comparison_P2.png"), bbox_inches='tight', dpi=200)

In [None]:
metrics

# Background vs Normal

In [None]:
# Create a folder for the figures (no error if it already exists)
figures = os.path.join(project, 'Figures')
os.makedirs(figures, exist_ok=True)

In [None]:
import glob
# NOTE(review): `project` is re-pointed here and becomes a plain string instead of a Path.
# The 'Figures' folder in the previous cell was created from the earlier value of `project` - confirm this is intended.
project = '/content/drive/MyDrive/DATA/PROJECTS/Lichens'
spectra_files = glob.glob(os.path.join(project, '*.csv')) # so that only the .csv files are picked up
spectra_files

In [None]:
# Read all files and merge them into one table.

# Per-file tables are collected in this list and concatenated afterwards.
# It is named `all_dfs` (not `all`) so the builtin all(), which later cells
# call, is not shadowed.
all_dfs = []
for filepath in spectra_files:
  df = pd.read_csv(filepath).reset_index(drop=True)
  df['cubename'] = df['cubename'].astype(str)
  all_dfs.append(df)  # append to the list above

spectra = pd.concat(all_dfs)  # concatenate the collected tables row-wise
spectra

In [None]:
spectra.roi_name.unique()

In [None]:
spectra[spectra['cubename'] == 'background'].roi_name.unique()

# Drafts

### WO Concatenated

#### Only Lichen ROIs

In [None]:
# Mean test score for every cube (rows) and number of neighbours (columns).
columns = ['cubename', 'param_n_neighbors', 'mean_test_score']
average = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='mean_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(16, 4))
sns.heatmap(average, cmap='coolwarm', annot=True, annot_kws={"size": 9})
plt.title('Mean ROC AUC')
plt.xlabel('Number of Neighbors')
plt.ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_ROC_AUC.png"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Rank of every k (columns) within each cube (rows).
columns = ['cubename', 'param_n_neighbors', 'rank_test_score']
rank = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='rank_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(16, 4))
sns.heatmap(rank, cmap='coolwarm', annot=True, annot_kws={"size": 9})
plt.title('Rank Test Score')
plt.xlabel('Number of Neighbors')
plt.ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_Rank.png"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Standard deviation of the test score for every cube (rows) and k (columns).
columns = ['cubename', 'param_n_neighbors', 'std_test_score']
std = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='std_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(20, 4))
sns.heatmap(std, cmap='coolwarm', annot=True, annot_kws={"size": 6})
plt.title('SD Test Score')
plt.xlabel('Number of Neighbors')
plt.ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_SD.png"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Average each table over the cubes (one value per k), then divide every curve
# by its own maximum so the three curves share one axis.
column_means = [table.mean(axis=0) for table in (average, rank, std)]
avg, rnk, sd = [series / series.max() for series in column_means]

curve_labels = [
    (avg, 'Mean average (normalized)'),
    (rnk, 'Mean rank (normalized)'),
    (sd, 'Mean SD (normalized)'),
]
for curve, curve_label in curve_labels:
  sns.lineplot(curve, label=curve_label)

# Mark and report the k with the highest mean score
plt.axvline(avg.idxmax())
print(avg.idxmax())

#### Only Lichen ROIs | Non-scaled

In [None]:
# Mean test score per cube (rows) and k (columns); saved under the "NoScaling" file name.
columns = ['cubename', 'param_n_neighbors', 'mean_test_score']
average = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='mean_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(16, 4))
sns.heatmap(average, cmap='coolwarm', annot=True, annot_kws={"size": 9})
plt.title('Mean ROC AUC')
plt.xlabel('Number of Neighbors')
plt.ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_ROC_AUC_NoScaling.png"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Rank of every k (columns) within each cube (rows); saved under the "NoScaling" file name.
columns = ['cubename', 'param_n_neighbors', 'rank_test_score']
rank = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='rank_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(16, 4))
sns.heatmap(rank, cmap='coolwarm', annot=True, annot_kws={"size": 9})
plt.title('Rank Test Score')
plt.xlabel('Number of Neighbors')
plt.ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_Rank_NoScaling.png"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Standard deviation of the test score per cube (rows) and k (columns); saved under the "NoScaling" file name.
columns = ['cubename', 'param_n_neighbors', 'std_test_score']
std = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='std_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(20, 4))
sns.heatmap(std, cmap='coolwarm', annot=True, annot_kws={"size": 6})
plt.title('SD Test Score')
plt.xlabel('Number of Neighbors')
plt.ylabel('Cube Name')
plt.xticks(rotation=45);

filename = "Model_Performance_SD_NoScaling.png"
plt.savefig(Path(project, filename), bbox_inches='tight', dpi=200)

In [None]:
# Per-k means over all cubes, each divided by its own maximum for a common axis.
avg = average.mean(axis=0)
rnk = rank.mean(axis=0)
sd = std.mean(axis=0)
avg, rnk, sd = avg / avg.max(), rnk / rnk.max(), sd / sd.max()

curves = {
    'Mean average (normalized)': avg,
    'Mean rank (normalized)': rnk,
    'Mean SD (normalized)': sd,
}
for curve_label, curve in curves.items():
  sns.lineplot(curve, label=curve_label)

# Mark and report the k with the highest mean score
plt.axvline(avg.idxmax())
print(avg.idxmax())

#### Also Bgr/WS ROIs

In [None]:
# Mean test score per cube (rows) and k (columns).
columns = ['cubename', 'param_n_neighbors', 'mean_test_score']
heat = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='mean_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(25, 4))
sns.heatmap(heat, cmap='coolwarm', annot=True, annot_kws={"size": 9})

plt.title('Mean ROC AUC')

In [None]:
# Rank of every k (columns) within each cube (rows).
columns = ['cubename', 'param_n_neighbors', 'rank_test_score']
heat = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='rank_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(20, 4))
sns.heatmap(heat, cmap='coolwarm', annot=True, annot_kws={"size": 9})

plt.title('Rank Test Score')

In [None]:
# Mean of the current `heat` table over the cubes, one value per k;
# the k with the lowest mean is marked on the plot and printed.
avg = heat.mean(axis=0)
lowest_mean_k = avg.idxmin()

sns.lineplot(avg)
plt.axvline(lowest_mean_k)
print(lowest_mean_k)

---

In [None]:
# Standard deviation of the test score per cube (rows) and k (columns).
columns = ['cubename', 'param_n_neighbors', 'std_test_score']
heat = (
    metrics[columns]
    .pivot(index='cubename', columns='param_n_neighbors', values='std_test_score')
    .sort_index(ascending=True)
)

plt.figure(figsize=(20, 4))
sns.heatmap(heat, cmap='coolwarm', annot=True, annot_kws={"size": 9})

plt.title('Std Test Score')

### Developing

In [None]:
# Normalized
# Normalize the starting data with how='snv', extract the ROI spectra from the
# normalized data and flatten the index into regular columns.
cubes.normalize(which_data=starting_data, how='snv')
cubes.spectra_from_rois(which_data='normalized')
spectra = cubes.spectra.reset_index()
spectra.head(2)  # quick look at the first two rows

In [None]:
# Include only lichen classes into training: every ROI whose name contains one
# of the background markers is dropped.
# any() is used instead of all() because an earlier cell in this notebook
# rebinds the name `all` to a list, which makes calling all(...) fail.
background_markers = ['Tape', 'Paper', 'WS']

samples = [
    roi for roi in spectra.roi_name.unique()
    if not any(marker in roi for marker in background_markers)
]
spectra = spectra[spectra['roi_name'].isin(samples)]
spectra.roi_name.unique()

In [None]:
spectra.groupby('cubename').aggregate({'roi_name': 'count'})

In [None]:
# Build the list of k values to test, bounded by the training-fold size.

# Number of spectra per cube
sizes = spectra.groupby('cubename')['roi_name'].count().tolist()

# If the cubes differ in size, fall back to the smallest one so that every k is
# valid for every cube. Previously `n_samples` was left undefined (or stale
# from an earlier run) in that case.
if len(set(sizes)) > 1:
  print(f"Cubes have different numbers of spectra {sorted(set(sizes))}; using the smallest one.")
n_samples = min(sizes)

# The training fold holds 80 % of the samples (5-fold CV)
train_size = n_samples * 0.8

min_k = 5
max_k = int(train_size)

# 5, then 10, 20, ... up to max_k
n_neighbors = [min_k] + list(range(10, max_k+1, 10))
print("Number of k-neighbors to be tested:")
print(n_neighbors)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

figures = os.path.join(project, 'Spectra_PCA_w_Scaled')
os.makedirs(figures, exist_ok=True)

id_vars = spectra.columns[:6]  # leading metadata columns

# Draft: only the first cube is processed ([:1]).
# Left panel: mean spectrum per ROI; right panel: PCA scatter of the standardized spectra.
for cubename in spectra.cubename.unique()[:1]:

  # Filter the spectra by cube
  spectra_cube = spectra[spectra['cubename'] == cubename].reset_index(drop=True)

  ratio = np.array([24, 8])
  frac = 1.5
  figsize = (ratio / frac)
  fig, axes = plt.subplots(1, 2, figsize=figsize)
  ax_spectra, ax_pca = axes.flat

  # Spectra (converted to long format for seaborn)
  ax = ax_spectra
  spectra_cube_melt = pd.melt(spectra_cube, id_vars=id_vars, var_name='wavelength', value_name='intensity')
  sns.lineplot(spectra_cube_melt, x='wavelength', y='intensity', hue='roi_name', errorbar=None, ax=ax)
  ax.tick_params(rotation=90)
  ax.set_title(f"Average normalized spectra | Ex = {cubename} nm")
  ax.legend(title='Samples', loc='upper left', bbox_to_anchor=(-0.5, 1), fontsize=10)

  # PCA on the standardized spectra (NaNs replaced by 0)
  ax = ax_pca
  wvls = spectra_cube.columns.difference(id_vars)
  metadata = spectra_cube[id_vars]
  data = np.nan_to_num(spectra_cube[wvls].to_numpy(), nan=0)
  data = StandardScaler().fit_transform(data)

  n_components = 3
  columns = [f"PC{comp+1}" for comp in range(n_components)]
  pca = PCA(n_components=n_components)
  pca.fit(data)
  pcs = pd.DataFrame(pca.transform(data), columns=columns)
  pcs = pd.concat([metadata, pcs], axis=1)

  sns.scatterplot(pcs, x='PC1', y='PC2', hue='roi_name', s=10, ax=ax)
  ax.set_title(f"PCA | Ex = {cubename} nm")
  ax.legend().set_visible(False)

  plt.tight_layout()

  # Saving is currently disabled
  filename = f"Spectra_PCA_normalized_{cubename}.png"
  # plt.savefig(os.path.join(figures, filename))

In [None]:
# CV score distribution of a single excitation, titled with its selected k.
cubename = '300'
plt.figure(figsize=(4, 2.5))

# Per-split scores of this excitation
data = metrics_long[metrics_long['Excitation'] == cubename]

# Summary values of the best setting for this excitation
by_excitation = metrics.set_index('Excitation')
k = by_excitation.loc[cubename, 'param_n_neighbors']
average = by_excitation.loc[cubename, 'mean_test_score']
sd = by_excitation.loc[cubename, 'std_test_score']

# Box plot with the individual split scores drawn on top
sns.boxplot(data, y='ROC_AUC')
sns.swarmplot(data, y='ROC_AUC')
plt.xlabel(None)
plt.title(f'k-NN CV Scores | Ex={cubename} | k = {k}', fontsize=11)

In [None]:
# For every spectra file, print its name and the unique "<cubename>_<roi_name>" combinations it contains.
for filepath in spectra_files:
  df = pd.read_csv(filepath)
  print(os.path.basename(filepath))
  cube_part = df['cubename'].astype(str)
  roi_part = df['roi_name'].astype(str)
  df['cube_sample'] = cube_part.str.cat(roi_part, sep='_')
  print(df['cube_sample'].unique().tolist())
  print('---------------------------------------')

In [None]:
df['cubename'].astype(str) + '_' + df['roi_name']

---