In [2]:
## Save the channel and label data for the NA12878 sample with the Mills2011_nanosv labels, for chromosomes 1, 2 and 3.
## This dataset is used as a reference test set.

In [1]:
import os
import numpy as np
from keras.utils.np_utils import to_categorical
import keras
import gzip
from collections import Counter
import pandas as pd

%pylab inline
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Auxiliary functions

def transposeDataset(X):
    """Transpose every window of a dataset.

    Parameters
    ----------
    X : sequence of numpy arrays (or an ndarray whose first axis indexes windows)
        Each element ``X[i]`` is transposed independently.

    Returns
    -------
    numpy.ndarray
        Array built from ``X[i].transpose()`` for every ``i``, in the same order.
    """
    # The original looped over range(len(X - 1)): the misplaced parenthesis
    # subtracted 1 from every element just to take the length, and raised a
    # TypeError when X was a plain list of arrays.
    return np.array([window.transpose() for window in X])

def write_prec_rec(y_binary, probs, label_names=None, output_dir=None):
    """Write per-label precision, recall and F1 over a range of probability thresholds.

    For 50 thresholds evenly spaced from ``1 / n_labels`` (inclusive) to 1
    (exclusive), every class whose probability exceeds the threshold is treated
    as a prediction for that sample, a confusion matrix (rows: true,
    columns: predicted) is built, and precision / recall / F1 are computed for
    every label that occurs among the true labels of the retained samples.
    The table is written to ``<output_dir>/model_PrecRec.csv``.

    Parameters
    ----------
    y_binary : 2-D array-like
        True labels, one row per sample and one column per class
        (entries above the threshold mark the true class).
    probs : 2-D array-like
        Predicted class probabilities, same layout as ``y_binary``.
    label_names : sequence of str, optional
        Class names indexed by column position. Defaults to the notebook-level
        ``labels`` variable, as in the original implementation.
    output_dir : str, optional
        Directory receiving ``model_PrecRec.csv``. Defaults to the
        notebook-level ``datapath_now`` variable, as in the original.

    Notes
    -----
    * Precision and recall are percentages rounded to the nearest integer.
      They are NaN when their denominator is zero, and F1 is NaN when
      precision + recall is zero or either is NaN.
    * The CSV index is now a running row number (it used to be all zeros, an
      artefact of appending one-row frames).
    """
    if label_names is None:
        label_names = labels  # notebook-level global
    if output_dir is None:
        output_dir = datapath_now  # notebook-level global
    label_names = list(label_names)
    y_binary = np.asarray(y_binary)
    probs = np.asarray(probs)

    records = []
    for threshold in np.linspace(1.0 / len(label_names), 1, num=50, endpoint=False):

        # Pair truth and prediction by sample index. Matching the two
        # argwhere results by position (as before) misaligns samples as soon
        # as one row has no, or more than one, probability above the threshold.
        truth = pd.DataFrame(np.argwhere(y_binary > threshold), columns=['sample', 'true'])
        calls = pd.DataFrame(np.argwhere(probs > threshold), columns=['sample', 'predicted'])
        paired = truth.merge(calls, on='sample')
        if paired.empty:
            continue
        # NOTE(review): samples without any probability above the threshold are
        # absent from the matrix, so they do not count against recall - confirm
        # that this is the intended definition.

        # Rows: true, columns: predicted
        confusion_matrix = pd.crosstab(paired['true'], paired['predicted'])
        confusion_matrix.index = [label_names[k] for k in confusion_matrix.index]
        confusion_matrix.columns = [label_names[k] for k in confusion_matrix.columns]
        # reindex returns a new frame; the result used to be discarded, which
        # caused a KeyError for labels that were never predicted.
        confusion_matrix = confusion_matrix.reindex(columns=label_names, fill_value=0)

        for l in label_names:
            if l not in confusion_matrix.index:
                continue

            hits = confusion_matrix.loc[l, l]
            predicted_total = confusion_matrix[l].sum()
            true_total = confusion_matrix.loc[l].sum()

            label_precision = np.around(hits / predicted_total * 100) if predicted_total else np.nan
            label_recall = np.around(hits / true_total * 100) if true_total else np.nan
            if label_precision + label_recall > 0:
                label_F1 = 2 * (label_precision * label_recall) / (label_precision + label_recall)
            else:
                # also reached when either value is NaN (comparison is False)
                label_F1 = np.nan

            records.append(
                {'iteration': threshold, 'label': l,
                 'precision': label_precision, 'recall': label_recall, 'F1': label_F1})

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas >= 2.0).
    df_conf = pd.DataFrame(records, columns=['iteration', 'label', 'precision', 'recall', 'F1'])
    df_conf.to_csv(path_or_buf=os.path.join(output_dir, 'model_PrecRec.csv'))
    

In [3]:
sample_name = 'NA12878'
# Run identifier used in the input/output directory names (an earlier run used '231118')
date = '270219'
label_type = 'Mills2011_nanosv'

# Load label dictionary
dico_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/MultiLabelData/labels.pickle.gz'
# The file holds a pickled dictionary (it is indexed by label type and
# chromosome below), so np.load must be allowed to unpickle; since NumPy 1.16.3
# the default allow_pickle=False raises instead.
# SECURITY: unpickling executes arbitrary code - only load files you trust.
with gzip.GzipFile(dico_file, "rb") as f:
    dico = np.load(f, allow_pickle=True)
# No explicit f.close(): the with-block has already closed the file.

In [4]:
# Create reference test set using Chr1 to Chr3

# Only chromosomes 1-3 are loaded here; all other chromosomes are left out
chr_list = [str(c) for c in range(1, 4)]

print(chr_list)

test_data = []
test_labels = []
test_id = []

datapath = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/ChannelData/'

for chrom in chr_list:

    print('Loading data for Chr%s' % chrom)
    # chr_list already holds strings, so no str() conversion is needed
    data_file = datapath + sample_name + '_' + chrom + '.npy.gz'
    # The with-block closes the file; the extra f.close() was redundant
    with gzip.GzipFile(data_file, "rb") as f:
        data_mat = np.load(f)
    test_data.extend(data_mat)

    test_labels.extend(dico[label_type][chrom])
    test_id.extend([d['chromosome']+'_'+str(d['position']) for d in dico['id'][chrom]])

print(Counter(test_labels))
# Windows, labels and ids are indexed together below, so all three must line up
assert len(test_data) == len(test_labels) == len(test_id)

['1', '2', '3']
Loading data for Chr1
Loading data for Chr2
Loading data for Chr3
Counter({'noSV': 258257, 'UK': 328, 'DEL_start': 302, 'DEL_end': 254})


In [5]:
# Turn the three accumulated lists into arrays so they can be indexed together
test_data, test_labels, test_id = (
    np.array(seq) for seq in (test_data, test_labels, test_id)
)

# Drop every window whose label is unknown ('UK')
keep = np.nonzero(test_labels != 'UK')
test_data, test_labels, test_id = test_data[keep], test_labels[keep], test_id[keep]

## Balance dataset

In [6]:
# Size of the rarest class decides how many windows are kept per class
cnt_lab = Counter(test_labels)
min_v = min(list(cnt_lab.values()))
print(cnt_lab)
print('Minimum number of labels = ' + str(min_v))

# Positions of the first `min_v` windows of every class, classes taken in
# Counter key order.
# NOTE(review): this keeps the earliest windows of each class rather than a
# random sample - confirm that this is intended.
picked = np.concatenate(
    [np.flatnonzero(test_labels == lab)[:min_v] for lab in cnt_lab]
)

data_balanced = list(test_data[picked])
labels_balanced = list(test_labels[picked])
id_balanced = list(test_id[picked])

print(Counter(labels_balanced))

X = np.array(data_balanced)
y = np.array(labels_balanced)
z = np.array(id_balanced)

Counter({'noSV': 258257, 'DEL_start': 302, 'DEL_end': 254})
Minimum number of labels = 254
Counter({'noSV': 254, 'DEL_start': 254, 'DEL_end': 254})


In [7]:
# Remove windows with nan if present
nan_mask = np.isnan(X)  # computed once, reused for the printout and the row indices
print(X[nan_mask])
# Use the `np` alias imported at the top: the bare name `numpy` is only defined
# as a side effect of the %pylab magic.
idx = np.unique(np.nonzero(nan_mask)[0])
X = np.delete(X, idx, 0)
y = np.delete(y, idx, 0)
z = np.delete(z, idx, 0)

[]


In [8]:
# Transpose every window via transposeDataset (each X[i] is replaced by X[i].transpose()).
# NOTE: X is overwritten, so running this cell a second time transposes the windows back.
X = transposeDataset(X)

In [9]:
# Integer class index assigned to every label string
mapclasses = {'DEL_start': 1, 'DEL_end': 0,  'noSV': 2}
y_num = np.array([mapclasses[c] for c in y], dtype='int')
# Fix the one-hot width to the number of classes. Without num_classes the width
# is inferred from the largest index present in y_num, so the matrix would be
# narrower if the highest class happened to be missing from y.
y_binary = to_categorical(y_num, num_classes=len(mapclasses))

In [10]:
# Shapes of the channel array and of the label vector, one per line
print(X.shape, y.shape, sep='\n')

(762, 200, 29)
(762,)


## Save channels and labels

In [11]:
%%time

import errno

datapath_test=os.path.join(datapath,'../TestData/')

try:
    os.mkdir(datapath_test)
except OSError as exc:
    if exc.errno != errno.EEXIST:
        raise
    pass

data_output_file = datapath_test + sample_name + '_' + label_type + '_channels.npy'
np.save(data_output_file, X)
os.system('gzip '+data_output_file)

label_output_file = datapath_test + sample_name + '_' + label_type + '_labels.npy'
np.save(label_output_file, y)
os.system('gzip '+label_output_file)

label_output_file = datapath_test + sample_name + '_' + label_type + '_labels_binary.npy'
np.save(label_output_file, y_binary)
os.system('gzip '+label_output_file)

id_output_file = datapath_test + sample_name + '_' + label_type + '_ids.npy'
np.save(id_output_file, z)
os.system('gzip '+id_output_file)

CPU times: user 1.74 s, sys: 336 ms, total: 2.08 s
Wall time: 1.24 s
