#Clone the repo

In [None]:
!git clone https://github.com/Moda007/MethodicalSplit.git

In [None]:
from MethodicalSplit import Functions as Fn
from MethodicalSplit.ExpModel import ExpModel

#Define experiment details

In [None]:
# Unpack the nine experiment settings returned by Fn.expDetails for Fn.all_exp.
# Later cells read DataSet, ModelName, group_idx and exp_idx to build output paths,
# and pass Avg, Sub and Rare to Fn.splitData.
DataSet, ModelName, group_idx, exp_idx, Avg, Sub, Rare, stratify, train = Fn.expDetails(Fn.all_exp)

#Colab

##Ignore Warnings

In [None]:
import warnings
# Silence every warning category for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

##Mount Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the experiment output paths built later live under it.
drive.mount('/content/drive')

#Importing Packages and Libraries

In [None]:
!pip install hdbscan

In [None]:
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf

import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

#Dataset

##Importing Dataset

In [None]:
import tensorflow_datasets as tfds
# Construct the tf.data.Dataset splits.
# They are bound to train_ds/test_ds (not train/test) so the experiment setting
# `train` unpacked from Fn.expDetails is not overwritten by a dataset object.
train_ds, test_ds = tfds.load('smallnorb', split=["train", "test"], as_supervised=True, shuffle_files=True)
# Convert to Numpy
(x_train, y_train), (x_test, y_test) = Fn.TFDStoNP(train_ds), Fn.TFDStoNP(test_ds)

In [None]:
# Report the array shapes of the freshly converted train/test splits.
for _label, _array in (("x_train", x_train), ("x_test", x_test),
                       ("y_train", y_train), ("y_test", y_test)):
  print(f"{_label} shape:", _array.shape)

##Concatenate train and test datasets

In [None]:
# Stack the original train and test splits along the sample axis into one pool.
x_data = np.concatenate((x_train, x_test), axis=0)
y_data = np.concatenate((y_train, y_test), axis=0)
for _label, _array in (("x_data", x_data), ("y_data", y_data)):
  print(f"{_label} shape:", _array.shape)

##Split test data (holdout)

In [None]:
from sklearn.model_selection import train_test_split

# Keep 90% of the pooled samples as working data and hold out 10% as the test set.
# No random_state is passed, so the partition differs from run to run.
_holdout = train_test_split(x_data, y_data, train_size=0.9, test_size=0.1)
x_data, x_test, y_data, y_test = _holdout

for _label, _array in (("x_data", x_data), ("x_test", x_test),
                       ("y_data", y_data), ("y_test", y_test)):
  print(f"{_label} shape:", _array.shape)

##Flatten dataset

In [None]:
# Collapse every sample to a single feature vector (one row per sample).
_n_working = x_data.shape[0]
flatten_x = x_data.reshape((_n_working, -1))
print("flatten_x shape:", flatten_x.shape)
# The holdout set is flattened in place under its existing name.
_n_holdout = x_test.shape[0]
x_test = x_test.reshape((_n_holdout, -1))
print("x_test shape:", x_test.shape)

In [None]:
# Rebind `train` to the string 'avg'; this discards whatever `train` held before.
# It is later passed as the last argument of Fn.prepareDataSpecial.
train = 'avg'
# NOTE(review): is_colored is not read anywhere in the visible notebook - confirm it is still needed.
is_colored = False

#Model (Loop)

##>>> Experiment

In [None]:
# Turn the looped flag into the string prefix 'looped_' (or '' when the flag is off).
looped = True
looped = 'looped_' if looped else ''

# Dataset folder on the mounted Drive, and this experiment's folder beneath it.
ds_path = ''.join(('/content/drive/My Drive/Thesis Notebooks/', DataSet, '/'))
exp_path = ''.join((ds_path, ModelName, '/Group_', group_idx, '/Exp', exp_idx, '/'))

In [None]:
def plotResults(DataSet, ModelName, group_idx, exp_idx, history, loop_no=''):
  """Plot the training/validation accuracy and loss curves and save them as JPGs.

  Both images are written to the folder ``<exp_path>plot`` (``exp_path`` is the
  module-level experiment path), which is created if it does not exist yet.

  Args:
    DataSet, ModelName, group_idx, exp_idx: kept for call compatibility; they are
      not read here because the output location comes from ``exp_path``.
    history: object whose ``history`` dict holds per-epoch lists under
      'acc'/'val_acc' (or 'accuracy'/'val_accuracy'), 'loss' and 'val_loss'.
    loop_no: value prepended to both output file names (converted with str()).

  Raises:
    KeyError: if ``history.history`` lacks the accuracy or loss entries.
  """
  import os

  loop_no = str(loop_no)
  plot_path = exp_path + 'plot'
  # savefig does not create missing folders.
  os.makedirs(plot_path, exist_ok=True)

  metrics = history.history
  # The accuracy metric is recorded as 'acc' or 'accuracy' depending on how the
  # model was compiled; accept either spelling.
  acc_key = 'acc' if 'acc' in metrics else 'accuracy'
  acc = metrics[acc_key]
  val_acc = metrics['val_' + acc_key]
  loss = metrics['loss']
  val_loss = metrics['val_loss']
  epochs = range(1, len(acc) + 1)

  # Start from an empty figure so leftovers of an earlier plot are not drawn over.
  plt.clf()
  plt.title('Training and validation accuracy')
  plt.plot(epochs, acc, 'red', label='Training acc')
  plt.plot(epochs, val_acc, 'blue', label='Validation acc')
  plt.legend()

  plt.savefig(f'{plot_path}/{loop_no}TrainValidAcc.jpg')

  plt.figure()
  plt.title('Training and validation loss')
  plt.plot(epochs, loss, 'red', label='Training loss')
  plt.plot(epochs, val_loss, 'blue', label='Validation loss')
  plt.legend()
  # This file holds the loss curves only; the historical name is kept so
  # existing result folders stay consistent.
  plt.savefig(f'{plot_path}/{loop_no}TrainValidAcc&Loss.jpg')

  plt.show()

#Looped Experiment

##Inception

In [None]:
import os

#Define All Results array
All_results = []

# Every figure of this experiment goes into <exp_path>plot. Create the folder once,
# up front, because neither plt.savefig nor model.save creates missing directories.
plot_path = exp_path + 'plot'
os.makedirs(plot_path, exist_ok=True)

#define a list of colors for clusters
color_maps=['tab10', 'hsv', 'gist_stern', 'Accent', 'Dark2', 'Spectral', 'rainbow', 'brg', 'Pastel1', 'coolwarm']

for idx in range(5):

  print('*****************************')
  print(f'Experiment {str(idx)} starts')
  print('*****************************')

  #2D Embedding - UMAP
  # NOTE(review): the input and random_state are identical on every iteration, so this
  # presumably recomputes the same embedding each time - confirm before hoisting it.
  standard_embedding = umap.UMAP(random_state=42, n_neighbors=30, min_dist=0.0, n_components=2).fit_transform(flatten_x)
  #Plotting with Original Labels
  ##2D - Plotting
  plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=y_data, s=0.1, cmap='Spectral')
  plt.savefig(f'{plot_path}/{idx}2DOriginalScatter.jpg')
  plt.clf()
  # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept to preserve the output.
  sns.distplot(standard_embedding, kde=True, rug=True, bins=15)
  plt.savefig(f'{plot_path}/{idx}2DOriginalhistogram.jpg')
  plt.clf()

  #Clustering - HDBSCAN
  hdbscan_labels = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500).fit_predict(standard_embedding)
  # Agreement between the clusters and the true labels. The scores used to be computed
  # and thrown away; print them so they show up in the run log.
  ari = adjusted_rand_score(y_data, hdbscan_labels)
  ami = adjusted_mutual_info_score(y_data, hdbscan_labels)
  print('Adjusted Rand score:', ari, '| Adjusted mutual information score:', ami)
  no_of_clusters = Fn.clustersDet(hdbscan_labels)
  ##Plotting with Cluster Labels
  ### 2D - Plotting
  plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=hdbscan_labels, s=0.1, cmap='Spectral')
  plt.savefig(f'{plot_path}/{idx}2DClustersScatter.jpg')
  plt.clf()

  #1D  Embedding per Cluster on the original Data
  #Using UMAP =>>> random_state=42, n_neighbors=30, min_dist=0, n_components=1
  clusters_list, labels_list, original_images_list = Fn.embedding1D(no_of_clusters, flatten_x, hdbscan_labels, y_data, False)

  #Plotting Clusters Scatter Diagram
  # Select each cluster's points once; both plotting passes below reuse the selection.
  cluster_points = []
  for u in range(no_of_clusters):
    mask = hdbscan_labels == u
    # Wrap around the colormap list so more than len(color_maps) clusters cannot
    # raise an IndexError.
    cluster_points.append((standard_embedding[mask], hdbscan_labels[mask],
                           color_maps[u % len(color_maps)]))

  ##All Clusters
  #Plotting scatter diagram for all clusters together
  for points, point_labels, cmap_name in cluster_points:
    plt.scatter(points[:, 0], points[:, 1], c=point_labels, s=0.1, cmap=cmap_name)
  plt.savefig(f'{plot_path}/{idx}1DClustersScatter.jpg')
  plt.clf()
  ##One figure per cluster
  for u, (points, point_labels, cmap_name) in enumerate(cluster_points):
    plt.scatter(points[:, 0], points[:, 1], c=point_labels, s=0.1, cmap=cmap_name)
    plt.savefig(f'{plot_path}/{idx}1DCluster{str(u)}Scatter.jpg')
    plt.clf()

  #Data Splitting (per cluster) using Quantile
  Region_1, Region_2, Region_3,\
  Region_1_labels, Region_2_labels, Region_3_labels,\
  Region_1_original, Region_2_original, Region_3_original = Fn.splitData(no_of_clusters, Avg, Sub, Rare, clusters_list,\
                                                                       labels_list, original_images_list)
  for u in range(no_of_clusters):
    Fn.showSplit(Region_1, Region_2, Region_3, u)
    plt.savefig(f'{plot_path}/{idx}Cluster{str(u)}SplitHistogram.jpg')
    plt.clf()
    Fn.showSamples(Region_1_original[u], Region_2_original[u], Region_3_original[u], Region_1_labels[u], Region_2_labels[u], Region_3_labels[u])
    plt.savefig(f'{plot_path}/{idx}Samples{str(u)}.jpg')
    plt.clf()

  ##Data Preparation
  ###Splitting Data into 70% Train, 30% Validate
  # Called once per experiment: none of its arguments depend on the cluster index, so
  # it does not belong inside the per-cluster loop above (where it was repeated for
  # every cluster and left x_train_X undefined when no cluster was found).
  x_train_X, y_train_X, x_valid_X, y_valid_X, x_test_X, y_test_X = Fn.prepareDataSpecial(no_of_clusters, Region_1_original,\
                                                                                     Region_1_labels, Region_2_original,\
                                                                                     Region_2_labels, Region_3_original,\
                                                                                     Region_3_labels, x_test, y_test, train)
  print('x_train_X shape:', x_train_X.shape)
  print('x_valid_X shape:', x_valid_X.shape)
  print('x_test_X shape:', x_test_X.shape)

  ##Train
  thisModel = ExpModel(ModelName, DataSet, x_train_X, y_train_X, x_valid_X, y_valid_X, x_test_X, y_test_X)
  model, history, results = thisModel.trainModel()

  ##Store experiment results
  All_results.append(results)

  ##Model Saving
  filename = str(idx) + ModelName + '_' + DataSet + '_' + group_idx + '_'+ exp_idx
  model.save(exp_path + filename + '.h5')

  # plot the loss and accuracy
  plotResults(DataSet, ModelName, group_idx, exp_idx, history, idx)

  print('*****************************')
  print(f'Experiment {str(idx)} ends')
  print('*****************************')

  # Drop the Keras graph/session state so the next run starts from a clean backend.
  tf.keras.backend.clear_session()

##Check experiment results

In [None]:
# The loop above runs five experiments; anything else means a run went unrecorded.
print('Results Are Complete' if len(All_results) == 5 else 'Results record has issue!!!')

##Exporting Results to Excel

In [None]:
!pip install xlsxwriter

###Create Excel with columns header

In [None]:
# One workbook per model (stored in the dataset folder), one sheet per group/experiment pair.
excel_name = ''.join((ModelName, '.xlsx'))
sheet_name = '_'.join((group_idx, exp_idx))
filepath = ''.join((ds_path, excel_name))

# A single-row frame holding the column captions; it is written without a pandas header.
column_titles = [[
    'Idx', 'Train Acc', 'Valid Acc', 'Train Loss', 'Valid Loss',
    'Test Acc', 'Test Precision', 'Test Recall', 'Test F-score', 'Hamming Loss',
]]
df = pd.DataFrame(column_titles)

In [None]:
# Write the caption row (df) to sheet `sheet_name` of the workbook at `filepath`.
# The writers are used as context managers, which save and close the file on exit;
# ExcelWriter.save() and assignment to writer.book are not available in current pandas.
if not Path(filepath).exists():
  # First experiment for this model: create the workbook with xlsxwriter.
  with pd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
else:
  # Workbook already exists: open it in append mode so the other sheets are kept.
  # 'overlay' writes into the sheet in place if it already exists, otherwise the
  # sheet is added.
  with pd.ExcelWriter(filepath, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
    ##dataframe to append.
    df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)

###Parse this experiment results

In [None]:
# Append one row per run below the caption row of this experiment's sheet.
# The workbook is opened once in append mode ('overlay' writes into the existing sheet)
# and saved once when the with-block exits, instead of being reloaded and rewritten
# for every row.
with pd.ExcelWriter(filepath, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
  for idx, result in enumerate(All_results):
    # Row layout: run index, then the 'train' entries, then the 'test' entries.
    this_result = [idx] + result['train'] + result['test']
    this_result = np.array(this_result).reshape(1,-1)
    this_result = pd.DataFrame(this_result)
    # Row 0 holds the captions, so run idx lands on row idx+1.
    this_result.to_excel(writer, sheet_name=sheet_name, index=False, header=False, startrow=idx+1)