In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

# https://stackoverflow.com/questions/66828031/do-i-always-have-to-restart-my-kernel-in-jupyter-lab-when-code-in-a-local-module

In [2]:
# Restrict TensorFlow to the first detected GPU. On a machine without a GPU the
# list is empty, so indexing it would raise IndexError; in that case the
# visible-device configuration is left untouched.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
else:
    print("No GPU detected; leaving the TensorFlow device configuration unchanged.")

# Dataset Generation
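The functions in `DatasetFuncs` let us load the processed dataset, split it into train and test sets, and inspect how the samples and nanopores are distributed across that split.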

In [72]:
from DatasetFuncs import allDataset_loader,dataset_split,show_porcentages,show_partition_nanopores

# Path where the datasets are (or will be) stored.
data_folder = "../ext/QuipuData/"

# Per the original author's note: generates the processed dataframe and saves
# it, or loads it when it is already available. Passing cut=False returns the
# uncut traces (otherwise they are cut at 700 samples, as was used in QuipuNet).
allDatasets = allDataset_loader(data_folder)

# Divide into train and test so that the test set stays within a range of
# % of samples, and so that nanopores used in the train set are not reused in
# the test set for the same barcode.
# NOTE(review): the original note said "train" twice; "test" is presumably
# what was meant -- confirm against dataset_split.
trainSet, testSet = dataset_split(allDatasets)

# Verify the resulting percentages per barcode.
samples_perc = show_porcentages(trainSet, testSet)

     Train  Test
000   5769   315
001   7665  1340
010   2418   101
011  14231  2378
100    919    83
101   7551   427
110   6731   606
111   6979   665
         Train       Test
000  94.822485   5.177515
001  85.119378  14.880622
010  95.990472   4.009528
011  85.682461  14.317539
100  91.716567   8.283433
101  94.647781   5.352219
110  91.740493   8.259507
111  91.300366   8.699634


In [73]:
# Show which nanopores are used by the train and test partitions. The printed
# table lists one row per (barcode, nanopore) pair together with its size.
print(show_partition_nanopores(trainSet))
print(show_partition_nanopores(testSet))

   barcode  nanopore  size
0      000         7  2172
1      000         8  3498
2      000      1017    99
3      001         8  1737
4      001        27  3583
5      001        28   923
6      001        29  1255
7      001      1053   167
8      010         7  1021
9      010         9   629
10     010        10   768
11     011        11   362
12     011        12   577
13     011        31   899
14     011        33  2192
15     011        35   483
16     011        36  1780
17     011        37  1583
18     011        38  1229
19     011        39  1735
20     011        40  2120
21     011        41  1271
22     100         7   288
23     100        13   631
24     101         8  1240
25     101        26  1764
26     101        27   924
27     101        29  3494
28     101      1662   129
29     110         7  1161
30     110         8  2702
31     110        13  1309
32     110        14  1559
33     111         7   735
34     111         8  2866
35     111         9   677
3

# Testing reproducibility

With this function we can train Quipu the same way it is done in the original Quipu notebook (which uses n_epochs=60). The cell below runs only 2 epochs as a quick check.

In [3]:
from ModelTrainer import ModelTrainer
mt=ModelTrainer()
# Only 2 epochs here, so the accuracies shown below come from a very short run.
# The value displayed for this cell is a 3-tuple that matches the train,
# validation and test accuracies printed at the end of the run.
mt.quipu_def_train(n_epochs=2)

  super(Adam, self).__init__(name, **kwargs)


=== Epoch: 0 ===
  prep time: 3.0 sec   train time: 21.0 sec
  loss: 1.893   acc: 0.2963   val_acc: 0.4077
=== Epoch: 1 ===
  prep time: 3.0 sec   train time: 17.0 sec
  loss: 1.694   acc: 0.3553   val_acc: 0.4316
       [ loss , accuracy ]
Train: [1.465266466140747, 0.4333359897136688]
Validation  : [1.4687469005584717, 0.43164435029029846]
Test : [1.507033109664917, 0.49643704295158386]


(0.4333359897136688, 0.43164435029029846, 0.49643704295158386)

We then repeated this for 20 runs and saved the results in "../results/QuipuReproduction.csv", so that when we load the dataframe we can observe the accuracies and their variance.

In [19]:
def print_accs(all_accs):
    """Print the mean and spread of the train/validation/test accuracies.

    Parameters
    ----------
    all_accs : array-like, one row per trained model
        Columns 0, 1 and 2 are read as the train, validation and test
        accuracy. Values are multiplied by 100 and shown as percentages.

    Prints one "mean +- std" line per split, followed by the test mean with
    its standard deviation divided by sqrt(number of rows). Returns None.
    """
    means = np.mean(all_accs, axis=0)
    # np.std is called with its defaults, as in the rest of this notebook.
    stds = np.std(all_accs, axis=0)

    for label, col in (("Train", 0), ("Valid", 1), ("Test", 2)):
        print(f"{label} Acc: {means[col]*100:.2f} +- {stds[col]*100:.2f} % ")

    n_runs = np.shape(all_accs)[0]
    test_mean_err = stds[2]*100/np.sqrt(n_runs)
    print(f"Test Accuracy estimator: {means[2]*100:.2f} +- {test_mean_err:.2f} % ")

def get_df_results_folder(folder):
    """Summarise every result CSV found under ``folder`` in one DataFrame.

    The folder is walked recursively. Every file whose name ends in ".csv"
    and does not contain "checkpoint" is read; its name is printed as it is
    processed. The last column of each CSV is dropped, and columns 0, 1 and 2
    of what remains are treated as train, validation and test accuracy (one
    row per run), reported as percentages.

    Parameters
    ----------
    folder : str
        Root directory to search. A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per CSV with the columns "Name file", "Test mean",
        "Test mean std" (test std divided by sqrt(number of runs)), "Train",
        "Train std", "Valid" and "Valid std". Empty when no CSV matches.
    """
    summary_rows = []
    for dirpath, _dirnames, filenames in os.walk(folder):
        for fname in filenames:
            if not fname.endswith(".csv") or "checkpoint" in fname:
                continue
            print(fname)
            # Build the path from dirpath, not folder: os.walk also yields
            # files in sub-directories, and folder may lack a trailing slash.
            runs = pd.read_csv(os.path.join(dirpath, fname))
            all_accs = runs.iloc[:, :-1].values
            means = np.mean(all_accs, axis=0) * 100
            stds = np.std(all_accs, axis=0) * 100
            n_runs = np.shape(all_accs)[0]
            summary_rows.append([
                fname,
                means[2], stds[2] / np.sqrt(n_runs),
                means[0], stds[0],
                means[1], stds[1],
            ])
    return pd.DataFrame(
        summary_rows,
        columns=["Name file", "Test mean", "Test mean std", "Train", "Train std", "Valid", "Valid std"],
    )

In [21]:
# Summary table (one row per result CSV) for the runs stored under ../results/TrainingAsQuipu/
get_df_results_folder("../results/TrainingAsQuipu/")

Reproduction_N1_512_N2_512.csv
Reproduction_N1_1000_N2_1000.csv
WBrowAug_09_N1_512_N2_512.csv
WBrowAug_09_N1_2048_N2_1024.csv


Unnamed: 0,Name file,Test mean,Test mean std,Train,Train std,Valid,Valid std
0,Reproduction_N1_512_N2_512.csv,88.153328,0.529545,93.526185,1.879622,92.89916,1.488911
1,Reproduction_N1_1000_N2_1000.csv,92.836885,0.328662,97.652759,0.270256,96.201223,0.482746
2,WBrowAug_09_N1_512_N2_512.csv,87.007103,0.605504,93.107564,1.160031,92.77688,1.148343
3,WBrowAug_09_N1_2048_N2_1024.csv,92.466036,0.415098,96.596867,0.140547,95.753147,0.408163


In [22]:
# Summary table (one row per result CSV) for the runs stored under ../results/TrainingWithES/
get_df_results_folder("../results/TrainingWithES/")

Reproduction_N1_512_N2_512.csv
Reproduction_N1_2048_N2_1024.csv
WBrowAug_090_N1_2048_N2_1024.csv
WBrowAug_090_N1_512_N2_512.csv
WBrowAug_090_N1_2048_N2_1024_again.csv
Reproduction_N1_2048_N2_1024_again.csv


Unnamed: 0,Name file,Test mean,Test mean std,Train,Train std,Valid,Valid std
0,Reproduction_N1_512_N2_512.csv,92.530366,0.165775,98.86677,0.412448,96.285074,0.320209
1,Reproduction_N1_2048_N2_1024.csv,92.793956,0.182575,98.912258,0.466596,96.392705,0.329265
2,WBrowAug_090_N1_2048_N2_1024.csv,93.14848,0.175979,98.138564,0.337926,96.199989,0.289616
3,WBrowAug_090_N1_512_N2_512.csv,92.305498,0.172442,97.322122,0.393818,95.630905,0.374171
4,WBrowAug_090_N1_2048_N2_1024_again.csv,93.050807,0.15598,98.145452,0.367531,96.172914,0.26284
5,Reproduction_N1_2048_N2_1024_again.csv,92.489252,0.161634,98.923574,0.396808,96.339467,0.25667


In [23]:
# Pool the two Reproduction_N1_2048_N2_1024 result files (the original and the
# "_again" repeat) and print the aggregated accuracies. The last column of the
# concatenated table is dropped before it is passed to print_accs.
dfrepBig1=pd.read_csv("../results/TrainingWithES/Reproduction_N1_2048_N2_1024.csv")
dfrepBig2=pd.read_csv("../results/TrainingWithES/Reproduction_N1_2048_N2_1024_again.csv")
dfrepBigWhole=pd.concat([dfrepBig1, dfrepBig2])
print_accs(dfrepBigWhole.iloc[:,:-1].values)

# Same pooling for the two WBrowAug_090_N1_2048_N2_1024 result files.
# NOTE: the variable names are reused, so after this cell dfrepBig1, dfrepBig2
# and dfrepBigWhole refer to the WBrowAug_090 data, not the Reproduction data.
dfrepBig1=pd.read_csv("../results/TrainingWithES/WBrowAug_090_N1_2048_N2_1024.csv")
dfrepBig2=pd.read_csv("../results/TrainingWithES/WBrowAug_090_N1_2048_N2_1024_again.csv")
dfrepBigWhole=pd.concat([dfrepBig1, dfrepBig2])
print_accs(dfrepBigWhole.iloc[:,:-1].values)

Train Acc: 98.92 +- 0.43 % 
Valid Acc: 96.37 +- 0.30 % 
Test Acc: 92.64 +- 1.73 % 
Test Accuracy estimator: 92.64 +- 0.12 % 
Train Acc: 98.14 +- 0.35 % 
Valid Acc: 96.19 +- 0.28 % 
Test Acc: 93.10 +- 1.66 % 
Test Accuracy estimator: 93.10 +- 0.12 % 


In [17]:
# Display whatever table dfrepBig1 currently holds. When the notebook is run
# top to bottom this is the last file read into dfrepBig1 by the cell above.
dfrepBig1

Unnamed: 0,Name file,Test mean,Test mean std,Train,Train std,Valid,Valid std


## To check whether this can be removed (old cells used for debugging)

In [13]:
import math
def format_brow_aug(value):
    """Format an augmentation factor as '<integer part><two-digit hundredths>'.

    For example 0.9 -> '090'. The value is rounded to the nearest hundredth
    rather than truncated, so binary float noise (0.29 * 100 evaluates to
    28.999999999999996) cannot shift the tag, and the hundredths are
    zero-padded so that 0.05 -> '005' stays distinct from 0.5 -> '050'.

    Assumes a non-negative value.
    """
    # int(...) keeps the result a plain int even if round() returns a float
    # type for NumPy scalars on older NumPy versions.
    whole, hundredths = divmod(int(round(value * 100)), 100)
    return f"{whole}{hundredths:02d}"


brow_aug = 0.9
format_brow_aug(brow_aug)

'090'

In [3]:
from DataLoader import DataLoader
from DataAugmentator import DataAugmentator
# Debug scratch cell: load the numpy train/validation/test arrays and pass the
# training inputs to DataAugmentator.test_brow_aug.
dl=DataLoader();
# Alternative loader, currently disabled:
#X_train,X_valid,Y_train,Y_valid,X_test,Y_test=dl.get_datasets_numpy_quipu();
X_train,X_valid,Y_train,Y_valid,X_test,Y_test=dl.get_datasets_numpy();
da=DataAugmentator();
# NOTE(review): the output saved with this cell is a TypeError ("'tuple' object
# cannot be interpreted as an integer"); which of the calls in this cell raised
# it is not visible here. Fix or remove before relying on this cell.
out=da.test_brow_aug(X_train);

<class 'TypeError'>: 'tuple' object cannot be interpreted as an integer