In [1]:
## Save a channel and label data for the NA12878 sample with the Mills2011_nanosv labels, for Chr[1,2,3]
## This dataset is used as a reference test set

In [2]:
import os
import numpy as np
from keras.utils.np_utils import to_categorical
import keras
import gzip
from collections import Counter
import pandas as pd

%pylab inline
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


Populating the interactive namespace from numpy and matplotlib


In [3]:
# Auxiliary functions

def transposeDataset(X):
    image = []
    for i in range (0, len(X -1)):
        tr = X[i].transpose()
        image.append(tr)
    return np.array(image)

In [11]:
date = '270219'

art_training_data = []
art_training_labels = []

base_dir = os.path.join('/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test',
                        date, 'TrainingData_'+date)

svtype = 'INDEL'
sample = 'G1'

for svtype in ['INDEL', 'INDEL_HOM']:

    datapath = os.path.join(base_dir, svtype, sample)
    data_file = os.path.join(datapath, 'ChannelData', sample+'.npy.gz')
    label_file = os.path.join(datapath, 'LabelData', sample+'_17_label.npy.gz')

    with gzip.GzipFile(data_file, "rb") as f:
        data_mat = np.load(f)
        art_training_data.extend(data_mat)
    f.close()

    with gzip.GzipFile(label_file, "rb") as f:
        label_list = np.load(f)
        art_training_labels.extend(label_list)
    f.close()

art_training_data = np.array(art_training_data)
art_training_labels = np.array(art_training_labels)

assert len(art_training_data) == len(art_training_labels)

print(art_training_data.shape)
print(art_training_labels.shape)

(39450, 29, 200)
(39450,)


In [3]:
sample_name = 'NA12878'
#date = '231118'
date = '270219'
label_type = 'Mills2011_nanosv'

# Load label dictionary
dico_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/MultiLabelData/labels.pickle.gz'
with gzip.GzipFile(dico_file, "rb") as f:
    dico = np.load(f)
f.close()

In [4]:
# Create reference test set using Chr4 to ChrX

#Leaving out chromosome Y and MT for the moment
chr_list = list(map(str, np.arange(4,23)))
chr_list.append('X')

print(chr_list)

training_data = []
training_labels = []
training_id = []

datapath = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/ChannelData/'

for i in chr_list:
    
    print('Loading data for Chr%s' % i)
    data_file = datapath + sample_name + '_' + str(i) + '.npy.gz'
    with gzip.GzipFile(data_file, "rb") as f:
        data_mat = np.load(f)
        training_data.extend(data_mat)
    f.close()
    
    training_labels.extend(dico[label_type][i])
    training_id.extend([d['chromosome']+'_'+str(d['position']) for d in dico['id'][i]])
    
print(Counter(training_labels))
assert len(training_data) == len(training_labels)

['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X']
Loading data for Chr4
Loading data for Chr5
Loading data for Chr6
Loading data for Chr7
Loading data for Chr8
Loading data for Chr9
Loading data for Chr10
Loading data for Chr11
Loading data for Chr12
Loading data for Chr13
Loading data for Chr14
Loading data for Chr15
Loading data for Chr16
Loading data for Chr17
Loading data for Chr18
Loading data for Chr19
Loading data for Chr20
Loading data for Chr21
Loading data for Chr22
Loading data for ChrX
Counter({'noSV': 934333, 'UK': 1231, 'DEL_start': 904, 'DEL_end': 823})


In [5]:
training_data = np.array(training_data)
training_labels = np.array(training_labels)
training_id = np.array(training_id)

# Remove windows labelled as unknown ('UK')
keep = np.where(np.array(training_labels)!='UK')
training_data = training_data[keep]
training_labels = training_labels[keep]
training_id = training_id[keep]

Balance dataset

In [6]:
cnt_lab = Counter(training_labels)
min_v = min([v for k, v in cnt_lab.items()])
print(cnt_lab)
print('Minimum number of labels = ' + str(min_v))

data_balanced = []
labels_balanced = []
id_balanced = []

for l in cnt_lab.keys():
    #print(l)
    iw = np.where(training_labels==l)
    ii = iw[0][:min_v]
    data_balanced.extend(training_data[ii])
    labels_balanced.extend(training_labels[ii])
    id_balanced.extend(training_id[ii])

print(Counter(labels_balanced))

X = np.array(data_balanced)
y = np.array(labels_balanced)
z = np.array(id_balanced)

Counter({'noSV': 934333, 'DEL_start': 904, 'DEL_end': 823})
Minimum number of labels = 823
Counter({'noSV': 823, 'DEL_end': 823, 'DEL_start': 823})


In [7]:
# Remove windows with nan if present
# print(np.where(np.isnan(X)))
idx = np.unique(np.where(np.isnan(X))[0])
print(idx)
#print(X[np.where(np.isnan(X))])
print(z[idx])
idx = numpy.unique(np.where(np.isnan(X))[0])
X = np.delete(X, idx, 0)
y = np.delete(y, idx, 0)
z = np.delete(z, idx, 0)

[]
[]


In [8]:
# GC track seems fine at positions Chr4_1423148 and Chr4_1478645

# import pyBigWig

# bw_gc = pyBigWig.open("/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/UCSC/hg19/hg19.gc5Base.bw")

# print(bw_gc.chroms())
# for loc in z[idx]:
    
#     chr_pos = loc.split('_')
#     chrName = chr_pos[0]
#     mid_pos = int(chr_pos[1])
#     win = 200
#     start_win = mid_pos-int(win/2)
#     end_win = mid_pos+int(win/2)
    
#     gc_array = bw_gc.values('chr' + chrName, start_win, end_win)
    
#     print(mid_pos)
#     print(gc_array)
#     print(len(gc_array))

In [9]:
X = transposeDataset(X)

In [10]:
mapclasses = {'DEL_start': 1, 'DEL_end': 0,  'noSV': 2}
y_num = np.array([mapclasses[c] for c in y], dtype='int')
y_binary = to_categorical(y_num)

In [11]:
print(X.shape)
print(y.shape)

(2469, 200, 29)
(2469,)


## Save channels and labels

In [13]:
%%time

import errno

datapath_training=os.path.join(datapath,'../TrainingData/balanced_r')

try:
    os.makedirs(datapath_training, exist_ok=True)
except OSError as exc:
    if exc.errno != errno.EEXIST:
        raise
    pass

data_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_channels.npy')
np.save(data_output_file, X)
os.system('gzip '+data_output_file)

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels.npy')
np.save(label_output_file, y)
os.system('gzip '+label_output_file)

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels_binary.npy')
np.save(label_output_file, y_binary)
os.system('gzip '+label_output_file)

id_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_ids.npy')
np.save(id_output_file, z)
os.system('gzip '+id_output_file)

CPU times: user 7.64 ms, sys: 703 ms, total: 711 ms
Wall time: 2.73 s


In [15]:
data_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_channels.npy.gz')
with gzip.GzipFile(data_output_file, "rb") as f:
    X = np.load(f)

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels.npy.gz')
with gzip.GzipFile(label_output_file, "rb") as f:
    y = np.load(f)
f.close()

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels_binary.npy.gz')
with gzip.GzipFile(label_output_file, "rb") as f:
    y_binary = np.load(f)
f.close()

id_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_ids.npy.gz')
with gzip.GzipFile(id_output_file, "rb") as f:
    win_ids = np.load(f)
f.close()

print(X.shape)
print(y.shape)
print(y.shape)
print(win_ids.shape)

(2469, 200, 29)
(2469,)
(2469,)
(2469,)


In [56]:
def real_data():
    # Create reference test set using Chr4 to ChrX
    sample_name = 'NA12878'
    date = '270219'
    label_type = 'Mills2011_nanosv'
    
    def remove_label(training_data, training_labels, training_id, label):

        # Remove windows labelled as label
        keep = np.where(np.array(training_labels) != label)
        training_data = training_data[keep]
        training_labels = training_labels[keep]
        training_id = training_id[keep]

        return training_data, training_labels, training_id

    def get_label_dict():
        # Load label dictionary
        dico_file = os.path.join('/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test',
                                 date, 'TestData_' + date, sample_name, 'MultiLabelData/labels.pickle.gz')
        with gzip.GzipFile(dico_file, "rb") as f:
            dico = np.load(f)
        f.close()

        return dico

    dico = get_label_dict()

    # Leaving out chromosome Y and MT for the moment
    #chr_list = list(map(str, np.arange(4, 23)))
    #chr_list.append('X')
    chr_list = ['22']

    print(chr_list)

    training_data = []
    training_labels = []
    training_id = []

    datapath = os.path.join('/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test',
                             date, 'TestData_'+date, sample_name, 'ChannelData')

    for i in chr_list:
        print('Loading data for Chr%s' % i)
        data_file = os.path.join(datapath, sample_name + '_' + str(i) + '.npy.gz')
        with gzip.GzipFile(data_file, "rb") as f:
            data_mat = np.load(f)
            training_data.extend(data_mat)
        f.close()

        training_labels.extend(dico[label_type][i])
        training_id.extend([d['chromosome'] + '_' + str(d['position']) for d in dico['id'][i]])

    print(Counter(training_labels))

    training_data = np.array(training_data)
    training_labels = np.array(training_labels)
    training_id = np.array(training_id)
    
    training_data, training_labels, training_id = remove_label(training_data,
                                                               training_labels, training_id, label = 'UK')
    print(Counter(training_labels))

    assert len(training_data) == len(training_labels)

    return training_data, training_labels, training_id


def artificial_data():
    
    def remove_label(training_data, training_labels, label):

        # Remove windows labelled as label
        keep = np.where(np.array(training_labels) != label)
        training_data = training_data[keep]
        training_labels = training_labels[keep]

        return training_data, training_labels

    training_data = []
    training_labels = []

    base_dir = os.path.join('/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test',
                            date, 'TrainingData_'+date)
    sample = 'G1'

    for svtype in ['INDEL', 'INDEL_HOM']:

        datapath = os.path.join(base_dir, svtype, sample)
        data_file = os.path.join(datapath, 'ChannelData', sample+'.npy.gz')
        label_file = os.path.join(datapath, 'LabelData', sample+'_17_label.npy.gz')

        with gzip.GzipFile(data_file, "rb") as f:
            data_mat = np.load(f)
            training_data.extend(data_mat)
        f.close()

        with gzip.GzipFile(label_file, "rb") as f:
            label_list = np.load(f)
            training_labels.extend(label_list)
        f.close()

    training_data = np.array(training_data)
    training_labels = np.array(training_labels)
    
    training_data, training_labels = remove_label(training_data,
                                                  training_labels, label = 'INS_pos')
    print(Counter(training_labels))

    assert len(training_data) == len(training_labels)

    return training_data, training_labels


def mixed_data():

    def subsample_nosv(data, labels, pc, lab):
        
        print('subsample')

        indices_label = np.where(training_labels == lab)[0]
        print(indices_label)
        indices_to_remove = indices_label[np.arange(int(round(len(indices_label)*pc)), len(indices_label))]
        print(indices_to_remove)
        X = np.delete(data, indices_to_remove)
        y = np.delete(labels, indices_to_remove)

        print(X.shape)
        print(y.shape)
        print(Counter(y))

        return X, y

    real_training_data, real_training_labels, real_training_id = real_data()
    art_training_data, art_training_labels = artificial_data()

    # print('concatenare data')
    training_data = np.concatenate((real_training_data, art_training_data), axis=0)
    # print('concatenare labels')
    print(real_training_labels)
    print(art_training_labels)
    training_labels = np.concatenate((real_training_labels, art_training_labels), axis=0)

    for pc in np.linspace(0.1, 1, num=9):
        print(pc)
        X, y = subsample_nosv(training_data, training_labels, pc, 'noSV')

        mapclasses = {'DEL_start': 1, 'DEL_end': 0, 'noSV': 2}
        y_num = np.array([mapclasses[c] for c in y], dtype='int')
        y_binary = to_categorical(y_num)

In [57]:
mixed_data()

['22']
Loading data for Chr22
Counter({'noSV': 17527, 'UK': 32, 'DEL_start': 13, 'DEL_end': 9})
Counter({'noSV': 17527, 'DEL_start': 13, 'DEL_end': 9})
Counter({'DEL_end': 9767, 'DEL_start': 9646})
['noSV' 'noSV' 'noSV' ... 'noSV' 'noSV' 'noSV']
['DEL_start' 'DEL_start' 'DEL_start' ... 'DEL_end' 'DEL_end' 'DEL_end']
0.1
subsample
[    0     1     2 ... 17546 17547 17548]
[ 1757  1758  1759 ... 17546 17547 17548]
(214363826,)
(21188,)
Counter({'DEL_end': 9776, 'DEL_start': 9659, 'noSV': 1753})
0.21250000000000002
subsample
[    0     1     2 ... 17546 17547 17548]
[ 3730  3731  3732 ... 17546 17547 17548]
(214365797,)
(23159,)
Counter({'DEL_end': 9776, 'DEL_start': 9659, 'noSV': 3724})
0.325
subsample
[    0     1     2 ... 17546 17547 17548]
[ 5704  5705  5706 ... 17546 17547 17548]
(214367769,)
(25131,)
Counter({'DEL_end': 9776, 'DEL_start': 9659, 'noSV': 5696})
0.4375
subsample
[    0     1     2 ... 17546 17547 17548]
[ 7678  7679  7680 ... 17546 17547 17548]
(214369741,)
(27103,)
C