In [None]:
## Save a channel and label data for the NA12878 sample with the Mills2011_nanosv labels, for Chr[1,2,3]
## This dataset is used as a reference test set

In [None]:
import os
import numpy as np
from keras.utils.np_utils import to_categorical
import keras
import gzip
from collections import Counter
import pandas as pd

%pylab inline
%load_ext autoreload
%autoreload 2

In [None]:
# Auxiliary functions

def transposeDataset(X):
    """Transpose every window of a dataset.

    Parameters
    ----------
    X : sequence of array-like
        Iterated along its first axis; each element is converted with
        ``np.asarray`` and transposed (all of its axes are reversed).

    Returns
    -------
    numpy.ndarray
        Array stacking the transposed elements in their original order.
    """
    # The previous loop bound was ``len(X -1)``: it subtracted 1 from every
    # element of X (a full temporary copy) and raised TypeError for list
    # input. Iterating over X directly visits exactly the same elements.
    return np.array([np.asarray(window).transpose() for window in X])

def write_prec_rec(y_binary, probs):
    """Write per-class precision, recall and F1 for 50 thresholds to a CSV file.

    For every threshold in
    ``np.linspace(1.0 / len(labels), 1, num=50, endpoint=False)`` a confusion
    matrix (rows: true class, columns: predicted class) is built, and one
    result row is collected for each class that occurs among the true labels.
    All rows are written to ``model_PrecRec.csv`` inside ``datapath_now``.

    Parameters
    ----------
    y_binary : 2-D array
        True labels; the column index of every entry above the threshold is
        used as the true class index (presumably one-hot encoded; confirm).
    probs : 2-D array
        Predicted scores; the column index of every entry above the threshold
        is used as the predicted class index.

    Notes
    -----
    Relies on two names this function does not define: ``labels`` (a sequence
    mapping a class index to its class name) and ``datapath_now`` (the output
    directory). Both must exist at module level before the call.
    """
    per_class_frames = []

    for threshold in np.linspace(1.0 / len(labels), 1, num=50, endpoint=False):

        predicted = np.argwhere(probs > threshold)[:, 1]
        y_index = np.argwhere(y_binary > threshold)[:, 1]
        # NOTE(review): the two index vectors are paired by position. When a
        # row of `probs` has zero or several entries above the threshold the
        # vectors differ in length; confirm this pairing is intended.

        # Rows: true, columns: predicted
        confusion_matrix = pd.crosstab(pd.Series(y_index), pd.Series(predicted))
        confusion_matrix.index = [labels[k] for k in confusion_matrix.index]
        confusion_matrix.columns = [labels[k] for k in confusion_matrix.columns]
        # reindex() returns a new frame. The result used to be discarded, so a
        # class that was never predicted had no column and the lookup below
        # raised KeyError.
        confusion_matrix = confusion_matrix.reindex(columns=list(labels), fill_value=0)

        for l in labels:
            if l in confusion_matrix.index:

                true_positives = confusion_matrix.loc[l, l]
                n_predicted = confusion_matrix.loc[:, l].sum()
                n_actual = confusion_matrix.loc[l, :].sum()

                # Undefined ratios are reported as NaN explicitly rather than
                # through a divide-by-zero runtime warning.
                label_precision = (np.around(true_positives / n_predicted * 100)
                                   if n_predicted else np.nan)
                label_recall = (np.around(true_positives / n_actual * 100)
                                if n_actual else np.nan)
                pr_sum = label_precision + label_recall
                label_F1 = (2 * (label_precision * label_recall) / pr_sum
                            if pr_sum else np.nan)

                per_class_frames.append(pd.DataFrame(
                    {'iteration': [threshold], 'label': [l],
                     'precision': [label_precision], 'recall': [label_recall], 'F1': [label_F1]}))

    # DataFrame.append was removed in pandas 2.0; concatenating once at the end
    # yields the same frame (including the all-zero index) without quadratic
    # copying.
    df_conf = pd.concat(per_class_frames) if per_class_frames else pd.DataFrame()
    df_conf.to_csv(path_or_buf=os.path.join(datapath_now, 'model_PrecRec.csv'))
    

In [None]:
# Chromosome names as strings: '1'..'22' followed by 'X' and 'Y'.
chr_list = list(map(str, np.arange(1,23)))
chr_list.extend(['X','Y'])

date = '070119'
label_type = 'bpi'
base_dir = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'
# Every entry of base_dir is treated as one sample directory.
comparisons = os.listdir(base_dir)

# Collect the labels and window ids of all samples, chromosome by chromosome.
ids = []
labels = []
for sample_name in comparisons:
    print('Loading %s labels...'%sample_name)
    dico_file = base_dir+sample_name+'/MultiLabelData/labels.pickle.gz'
    with gzip.GzipFile(dico_file, "rb") as f:
        # The file is a pickle, and numpy >= 1.16.3 refuses pickled data
        # unless allow_pickle=True is passed explicitly.
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        dico = np.load(f, allow_pickle=True)
    # The `with` block has already closed the file; no explicit close() needed.
    for chrom_name in chr_list:
        if chrom_name in dico[label_type].keys():
            labels.extend(dico[label_type][chrom_name])
            ids.extend(dico['id'][chrom_name])

print(Counter(labels))

In [None]:
# Persist the complete (not subsampled) label and id lists as gzipped .npy files.
labels = np.array(labels)
ids = np.array(ids)

# `gzip -f` (as already used for the .npz archive further down) overwrites an
# existing archive. Plain `gzip` refuses to overwrite on a re-run, and because
# the exit status of os.system is not checked, a stale .gz would silently
# remain next to the fresh .npy.
label_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/all_labels.npy'
np.save(label_output_file, labels)
os.system('gzip -f '+label_output_file)

id_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/all_ids.npy'
np.save(id_output_file, ids)
os.system('gzip -f '+id_output_file)

In [None]:
# sample_name = 'NA12878'
# date = '231118'
# label_type = 'bpi'

# # Load label dictionary
# dico_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/MultiLabelData/labels.pickle.gz'
# with gzip.GzipFile(dico_file, "rb") as f:
#     dico = np.load(f)
# f.close()

In [None]:
# print(chr_list)

# Load the channel windows of every sample and keep all SV windows plus a
# random subset of the 'noSV' windows.
data = []
labels = []
ids = []

# Maximum number of 'noSV' windows kept per sample.
n_nosv_kept = 100

for sample_name in comparisons:

    print('Loading sample: %s...'%sample_name)
    datapath = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name

    partial_data = []
    partial_labels = []
    partial_id = []

    dico_file = datapath+'/MultiLabelData/labels.pickle.gz'
    with gzip.GzipFile(dico_file, "rb") as f:
        # The file is a pickle, and numpy >= 1.16.3 refuses pickled data
        # unless allow_pickle=True is passed explicitly.
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        dico = np.load(f, allow_pickle=True)

    print(dico[label_type].keys())

    for chrom in chr_list:
        if chrom not in dico[label_type].keys():
            continue
        # Chromosome 2 of sample O16_B16 is deliberately skipped
        # (reason not recorded in this notebook).
        if sample_name == 'O16_B16' and chrom == '2':
            continue

        print('Loading data for Chr%s' % chrom)

        partial_labels.extend(dico[label_type][chrom])
        partial_id.extend([d['chromosome']+'_'+str(d['position']) for d in dico['id'][chrom]])

        data_file = datapath+ '/ChannelData/'+ str(chrom) + '_channel_maker_real_germline.npy.gz'
        with gzip.GzipFile(data_file, "rb") as f:
            data_mat = np.load(f)
        partial_data.extend(data_mat)

    if len(partial_labels) == 0:
        # Nothing was loaded for this sample; there is nothing to subsample.
        print('No windows loaded for sample %s' % sample_name)
        continue

    partial_data = np.array(partial_data)
    partial_labels = np.array(partial_labels)
    partial_id = np.array(partial_id)

    i_sv = np.where(partial_labels!='noSV')[0]
    i_nosv = np.where(partial_labels=='noSV')[0]

    # Sampling without replacement fails when more items are requested than
    # exist, so the request is capped at the number of available noSV windows.
    # NOTE(review): the draw is unseeded, so the subset differs between runs.
    if i_nosv.shape[0] > 0:
        i_nosv_idx = np.random.choice(a=i_nosv,
                                      size=min(n_nosv_kept, i_nosv.shape[0]),
                                      replace=False)
    else:
        i_nosv_idx = i_nosv

    # One shared index vector keeps data, labels and ids aligned. Previously
    # the subsampled labels were assigned to `partial_data` (clobbering the
    # data and leaving the labels unfiltered), and the data were combined with
    # np.append without an axis, which flattens the windows into one 1-D array.
    keep = np.concatenate([i_sv, i_nosv_idx])

    data.extend(partial_data[keep])
    labels.extend(partial_labels[keep])
    ids.extend(partial_id[keep])

print(Counter(labels))
assert len(data) == len(labels) == len(ids)

In [None]:
# Convert the collected lists once and persist them as gzipped .npy files.
training_data = np.array(data)
training_labels = np.array(labels)
training_id = np.array(ids)

# `gzip -f` (as already used for the .npz archive further down) overwrites an
# existing archive. Plain `gzip` refuses to overwrite on a re-run, and because
# the exit status of os.system is not checked, the reload cell would then read
# a stale .gz file.
data_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/data.npy'
np.save(data_output_file, training_data)
os.system('gzip -f '+data_output_file)

label_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/labels.npy'
np.save(label_output_file, training_labels)
os.system('gzip -f '+label_output_file)

id_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/ids.npy'
np.save(id_output_file, training_id)
os.system('gzip -f '+id_output_file)

# Load and process training data from now on

In [None]:
# Loading data

# Paths of the three gzipped .npy files (windows, labels, window ids).
data_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/data.npy.gz'
label_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/labels.npy.gz'
id_output_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData/ids.npy.gz'

# Each array is read completely inside its `with` block, which also closes
# the file.
with gzip.open(data_output_file, "rb") as gz_data:
    training_data = np.load(gz_data)

with gzip.open(label_output_file, "rb") as gz_labels:
    training_labels = np.load(gz_labels)

with gzip.open(id_output_file, "rb") as gz_ids:
    training_id = np.load(gz_ids)

In [None]:
# Work on fresh array copies of the three parallel datasets.
training_data, training_labels, training_id = (
    np.array(part) for part in (training_data, training_labels, training_id))

# Drop, one label at a time, the windows labelled 'UK' (unknown) and
# 'INS_start'; the same row selection is applied to data, labels and ids.
for excluded_label in ('UK', 'INS_start'):
    kept_rows = np.flatnonzero(training_labels != excluded_label)
    training_data = training_data[kept_rows]
    training_labels = training_labels[kept_rows]
    training_id = training_id[kept_rows]

## Balance dataset: downsample every class to the size of the rarest class

In [None]:
# Class frequencies and the size of the rarest class.
cnt_lab = Counter(training_labels)
min_v = min(cnt_lab.values())
print(cnt_lab)
print('Minimum number of labels = ' + str(min_v))

data_balanced, labels_balanced, id_balanced = [], [], []

# For each class keep its first `min_v` windows (in stored order, not a
# random draw).
for class_label in cnt_lab:
    first_rows = np.where(training_labels == class_label)[0][:min_v]
    data_balanced.extend(training_data[first_rows])
    labels_balanced.extend(training_labels[first_rows])
    id_balanced.extend(training_id[first_rows])

print(Counter(labels_balanced))

X = np.array(data_balanced)
y = np.array(labels_balanced)
z = np.array(id_balanced)

## Upsample dataset: resample every class with replacement to the size of the most frequent class

In [None]:
# Class frequencies and the size of the most frequent class.
cnt_lab = Counter(training_labels)
max_v = max(cnt_lab.values())
print(cnt_lab)
print('Maximum number of labels = ' + str(max_v))

data_balanced, labels_balanced, id_balanced = [], [], []

# For each class draw `max_v` windows with replacement, so every class ends up
# with the same number of (possibly repeated) windows.
for class_label in cnt_lab:
    class_rows = np.where(training_labels == class_label)[0]
    drawn_rows = np.random.choice(a=class_rows, size=max_v, replace=True)
    data_balanced.extend(training_data[drawn_rows])
    labels_balanced.extend(training_labels[drawn_rows])
    id_balanced.extend(training_id[drawn_rows])

print(Counter(labels_balanced))

X = np.array(data_balanced)
y = np.array(labels_balanced)
z = np.array(id_balanced)

In [None]:
# Use the complete filtered dataset without class balancing. This rebinds
# X, y and z, replacing whatever an earlier cell assigned to them.
X, y, z = (np.array(part) for part in (training_data, training_labels, training_id))

In [None]:
# Remove windows with NaN if present.
# `idx` holds the first-axis indices of all windows containing at least one
# NaN. It is computed once through the `np` alias used everywhere else; the
# earlier second computation went through the bare `numpy` name, which only
# exists because `%pylab inline` injects it into the namespace.
idx = np.unique(np.where(np.isnan(X))[0])
print(idx)
print(z[idx])
X = np.delete(X, idx, 0)
y = np.delete(y, idx, 0)
z = np.delete(z, idx, 0)

In [None]:
# GC track seems fine at positions Chr4_1423148 and Chr4_1478645

# import pyBigWig

# bw_gc = pyBigWig.open("/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/UCSC/hg19/hg19.gc5Base.bw")

# for loc in z[idx]:
    
#     chr_pos = loc.split('_')
#     chrName = chr_pos[0]
#     mid_pos = int(chr_pos[1])
#     win = 200
#     start_win = mid_pos-int(win/2)
#     end_win = mid_pos+int(win/2)
    
#     gc_array = bw_gc.values('chr' + chrName, start_win, end_win)
    
#     print(mid_pos)
#     print(gc_array)
#     print(len(gc_array))

In [None]:
# Transpose every window of X (each element has all of its axes reversed).
# NOTE: this cell is not idempotent; running it a second time transposes the
# windows back to their original layout.
X = transposeDataset(X)

In [None]:
# Derive mapclasses: every distinct label gets an integer code, assigned in
# sorted label order.
classes = sorted(set(y))
mapclasses = {class_name: code for code, class_name in enumerate(classes)}
print(mapclasses)

# Integer-encode the labels, then one-hot encode them for the network.
y_num = np.array([mapclasses[class_name] for class_name in y], dtype='int')
y_binary = to_categorical(y_num)
print(y_binary)

In [None]:
# Shapes of the windows, the string labels and the one-hot labels, in that order.
for prepared in (X, y, y_binary):
    print(prepared.shape)

## Save channels and labels

In [None]:
%%time

import errno

datapath = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date
datapath_training=os.path.join(datapath,'TrainingData/')

# Create the output directory; an "already exists" error is tolerated, any
# other OSError is re-raised.
try:
    os.mkdir(datapath_training)
except OSError as mkdir_error:
    if mkdir_error.errno == errno.EEXIST:
        pass
    else:
        raise

# Store windows, labels, one-hot labels and window ids in one .npz archive,
# then gzip it in place (-f overwrites an existing .gz).
data_output_file = datapath_training + 'OC' + '_' + label_type + '_upsampled.npz'
np.savez(data_output_file, X=X, y=y, y_binary=y_binary, ids=z)
os.system('gzip -f '+data_output_file)

In [None]:
# Print the current value of data_output_file.
print(data_output_file)

In [None]:
# Read a gzipped .npz archive back and report the shape of each stored array.
# NOTE(review): the save cell writes 'OC_<label_type>_upsampled.npz' (gzipped
# to '.npz.gz'), whereas this cell reads 'OC_<label_type>.npz.gz'; confirm
# which archive is meant to be checked here.
data_output_file = datapath_training + 'OC' + '_' + label_type + '.npz.gz'
with gzip.open(data_output_file, "rb") as gz_npz:
    npzfiles = np.load(gz_npz)
    # The members are materialised while the underlying file is still open.
    X_loaded, y_loaded, y_binary_loaded, win_ids_loaded = (
        npzfiles[key] for key in ('X', 'y', 'y_binary', 'ids'))

for loaded in (X_loaded, y_loaded, y_binary_loaded, win_ids_loaded):
    print(loaded.shape)