In [1]:
## Save a channel and label data for the NA12878 sample with the Mills2011_nanosv labels, for Chr[1,2,3]
## This dataset is used as a reference test set

In [1]:
import os
import numpy as np
from keras.utils.np_utils import to_categorical
import keras
import gzip
from collections import Counter
import pandas as pd

%pylab inline
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Auxiliary functions

def transposeDataset(X):
    image = []
    for i in range (0, len(X -1)):
        tr = X[i].transpose()
        image.append(tr)
    return np.array(image)

def write_prec_rec(y_binary, probs):
    
    df_conf = pd.DataFrame()

    for i in np.linspace(1.0 / len(labels), 1, num=50, endpoint=False):

        predicted = np.argwhere(probs > i)[:, 1]
        y_index = np.argwhere(y_binary > i)[:, 1]

        # Rows: true, columns: predicted
        confusion_matrix = pd.crosstab(pd.Series(y_index), pd.Series(predicted))
        confusion_matrix.index = [labels[i] for i in confusion_matrix.index]
        confusion_matrix.columns = [labels[i] for i in confusion_matrix.columns]
        confusion_matrix.reindex(columns=[l for l in labels], fill_value=0)

        for l in labels:
            if l in confusion_matrix.index:
                
                label_precision = np.around(confusion_matrix.loc[l, l] / sum(confusion_matrix.loc[:, l]) * 100)
                label_recall = np.around(confusion_matrix.loc[l, l] / sum(confusion_matrix.loc[l, :]) * 100)
                label_F1 = 2 * (label_precision * label_recall) / (label_precision + label_recall)

                # print(f'Iter:{i} {l} -> Precision:{label_precision}%, Recall:{label_recall}%, F1:{label_F1}')

                df_intres = pd.DataFrame(
                    {'iteration': [i], 'label': [l],
                     'precision': [label_precision], 'recall': [label_recall], 'F1': [label_F1]})
                df_conf = df_conf.append(df_intres)

    # print(df_conf)
    df_conf.to_csv(path_or_buf=os.path.join(datapath_now,'model_PrecRec.csv'))
    

In [3]:
sample_name = 'NA12878'
#date = '231118'
date = '270219'
label_type = 'Mills2011_nanosv'

# Load label dictionary
dico_file = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/MultiLabelData/labels.pickle.gz'
with gzip.GzipFile(dico_file, "rb") as f:
    dico = np.load(f)
f.close()

In [4]:
# Create reference test set using Chr4 to ChrX

#Leaving out chromosome Y and MT for the moment
chr_list = list(map(str, np.arange(4,23)))
chr_list.append('X')

print(chr_list)

training_data = []
training_labels = []
training_id = []

datapath = '/hpc/cog_bioinf/ridder/users/lsantuari/Processed/Test/'+date+'/TestData_'+date+'/'+sample_name+'/ChannelData/'

for i in chr_list:
    
    print('Loading data for Chr%s' % i)
    data_file = datapath + sample_name + '_' + str(i) + '.npy.gz'
    with gzip.GzipFile(data_file, "rb") as f:
        data_mat = np.load(f)
        training_data.extend(data_mat)
    f.close()
    
    training_labels.extend(dico[label_type][i])
    training_id.extend([d['chromosome']+'_'+str(d['position']) for d in dico['id'][i]])
    
print(Counter(training_labels))
assert len(training_data) == len(training_labels)

['4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X']
Loading data for Chr4
Loading data for Chr5
Loading data for Chr6
Loading data for Chr7
Loading data for Chr8
Loading data for Chr9
Loading data for Chr10
Loading data for Chr11
Loading data for Chr12
Loading data for Chr13
Loading data for Chr14
Loading data for Chr15
Loading data for Chr16
Loading data for Chr17
Loading data for Chr18
Loading data for Chr19
Loading data for Chr20
Loading data for Chr21
Loading data for Chr22
Loading data for ChrX
Counter({'noSV': 934333, 'UK': 1231, 'DEL_start': 904, 'DEL_end': 823})


In [5]:
training_data = np.array(training_data)
training_labels = np.array(training_labels)
training_id = np.array(training_id)

# Remove windows labelled as unknown ('UK')
keep = np.where(np.array(training_labels)!='UK')
training_data = training_data[keep]
training_labels = training_labels[keep]
training_id = training_id[keep]

Balance dataset

In [6]:
cnt_lab = Counter(training_labels)
min_v = min([v for k, v in cnt_lab.items()])
max_v = max([v for k, v in cnt_lab.items()])
print(cnt_lab)
print('Minimum number of labels = ' + str(min_v))

data_balanced = []
labels_balanced = []
id_balanced = []

for l in cnt_lab.keys():
    #print(l)
    iw = np.where(training_labels==l)
    # ii = iw[0][:min_v]
    ii = np.random.choice(a=iw[0], size=max_v, replace=True)
    data_balanced.extend(training_data[ii])
    labels_balanced.extend(training_labels[ii])
    id_balanced.extend(training_id[ii])

print(Counter(labels_balanced))

X = np.array(data_balanced)
y = np.array(labels_balanced)
z = np.array(id_balanced)

Counter({'noSV': 934333, 'DEL_start': 904, 'DEL_end': 823})
Minimum number of labels = 823
Counter({'noSV': 823, 'DEL_end': 823, 'DEL_start': 823})


In [7]:
# Remove windows with nan if present
# print(np.where(np.isnan(X)))
idx = np.unique(np.where(np.isnan(X))[0])
print(idx)
#print(X[np.where(np.isnan(X))])
print(z[idx])
idx = numpy.unique(np.where(np.isnan(X))[0])
X = np.delete(X, idx, 0)
y = np.delete(y, idx, 0)
z = np.delete(z, idx, 0)

[]
[]


In [8]:
# GC track seems fine at positions Chr4_1423148 and Chr4_1478645

# import pyBigWig

# bw_gc = pyBigWig.open("/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/UCSC/hg19/hg19.gc5Base.bw")

# print(bw_gc.chroms())
# for loc in z[idx]:
    
#     chr_pos = loc.split('_')
#     chrName = chr_pos[0]
#     mid_pos = int(chr_pos[1])
#     win = 200
#     start_win = mid_pos-int(win/2)
#     end_win = mid_pos+int(win/2)
    
#     gc_array = bw_gc.values('chr' + chrName, start_win, end_win)
    
#     print(mid_pos)
#     print(gc_array)
#     print(len(gc_array))

In [9]:
X = transposeDataset(X)

In [10]:
mapclasses = {'DEL_start': 1, 'DEL_end': 0,  'noSV': 2}
y_num = np.array([mapclasses[c] for c in y], dtype='int')
y_binary = to_categorical(y_num)

In [11]:
print(X.shape)
print(y.shape)

(2469, 200, 29)
(2469,)


## Save channels and labels

In [13]:
%%time

import errno

datapath_training=os.path.join(datapath,'../TrainingData/balanced_r')

try:
    os.makedirs(datapath_training, exist_ok=True)
except OSError as exc:
    if exc.errno != errno.EEXIST:
        raise
    pass

data_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_channels.npy')
np.save(data_output_file, X)
os.system('gzip '+data_output_file)

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels.npy')
np.save(label_output_file, y)
os.system('gzip '+label_output_file)

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels_binary.npy')
np.save(label_output_file, y_binary)
os.system('gzip '+label_output_file)

id_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_ids.npy')
np.save(id_output_file, z)
os.system('gzip '+id_output_file)

CPU times: user 7.64 ms, sys: 703 ms, total: 711 ms
Wall time: 2.73 s


In [15]:
data_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_channels.npy.gz')
with gzip.GzipFile(data_output_file, "rb") as f:
    X = np.load(f)

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels.npy.gz')
with gzip.GzipFile(label_output_file, "rb") as f:
    y = np.load(f)
f.close()

label_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_labels_binary.npy.gz')
with gzip.GzipFile(label_output_file, "rb") as f:
    y_binary = np.load(f)
f.close()

id_output_file = os.path.join(datapath_training, sample_name + '_' + label_type + '_ids.npy.gz')
with gzip.GzipFile(id_output_file, "rb") as f:
    win_ids = np.load(f)
f.close()

print(X.shape)
print(y.shape)
print(y.shape)
print(win_ids.shape)

(2469, 200, 29)
(2469,)
(2469,)
(2469,)


In [4]:
HPC_MODE = False
sample_name = 'NA12878'
date = '270219'
label_type = 'Mills2011_nanosv'
balancing = 'balanced'

datapath_prefix = '/hpc/cog_bioinf/ridder/users/lsantuari' if HPC_MODE else '/Users/lsantuari/Documents'
datapath_training = datapath_prefix + '/Processed/Test/' + \
                    date + '/TestData_' + date + '/' + sample_name + '/TrainingData/'
datapath_test = datapath_prefix + '/Processed/Test/' + \
                date + '/TestData_' + date + '/' + sample_name + '/TestData/'

In [6]:
datapath = datapath_training

data_output_file = os.path.join(datapath, '_'.join([sample_name, label_type, balancing]))

with gzip.GzipFile(data_output_file + '.npz.gz', 'rb') as f:
    npzfiles = np.load(f)
    X = npzfiles['X']
    y = npzfiles['y']
    y_binary = npzfiles['y_binary']
    win_ids = npzfiles['z']

In [41]:
def get_channel_labels():
    # Fill labels for legend

    labels = list()
    labels.append("coverage")
    labels.append("mean_read_quality")
    labels.append("#left_clipped_reads")
    labels.append("#right_clipped_reads")
    labels.append("#CIGAR_D_left_reads")
    labels.append("#CIGAR_D_right_reads")
    labels.append("#CIGAR_I_right_reads")
    labels.append("INV_before")
    labels.append("INV_after")
    labels.append("DUP_before")
    labels.append("DUP_after")
    labels.append("TRA_opposite")
    labels.append("TRA_same")

    for direction in ['Forward', 'Reverse']:
        for clipped in ['Left', 'Right', 'All']:
            for value in ['median']:
                labels.append(direction + '_' + clipped + '_Clipped_' + value)

    labels.append("#left split reads")
    labels.append("#right split reads")

    for clipped in ['L', 'R']:
        for value in ['median']:
            labels.append(clipped + '_SplitRead_' + value)

    labels.append("Mappability")

    for nuc in ['A', 'T', 'C', 'G', 'N']:
        labels.append("One_hot_encoding_" + nuc)

    for k, l in enumerate(labels):
        print(str(k) + ':' + l)

    return labels

In [48]:
def get_classes(labels):
    return sorted(list(set(labels)))

def set_figure_size(plt):

    # plt.tight_layout()

    F = plt.gcf()
    # Now check everything with the defaults:
    DPI = F.get_dpi()
    print(
        "DPI:", DPI)
    DefaultSize = F.get_size_inches()
    print(
        "Default size in Inches", DefaultSize)
    print(
        "Which should result in a %i x %i Image" % (DPI * DefaultSize[0], DPI * DefaultSize[1]))

    F.set_figwidth(DefaultSize[0] * 5)
    F.set_figheight(DefaultSize[1] * 4)
    Size = F.get_size_inches()
    print(
        "Size in Inches", Size)

In [51]:
def plot_channels(X, y, ids):

    output_dir = os.path.join(datapath, 'channel_plots')
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    labels = get_channel_labels()
    number_channels = len(labels)

    classes = get_classes(y)

    for c in classes:

        output_dir_class = os.path.join(output_dir, c)
        if not os.path.isdir(output_dir_class):
            os.mkdir(output_dir_class)

        c_idx = np.where(y == c)[0]
        c_idx_random = np.random.choice(c_idx, size=10, replace=False)

        for i in c_idx_random:

            id_chr = ids[i]['chromosome'] + ':' + str(ids[i]['position'])
            fig_name = y[i] + '_' + 'Chr' + id_chr

            print(y[i], 'id:', 'Chr' + id_chr)
            plt.title('Class: ' + y[i] + ' ' + 'Position: Chr' + id_chr, fontsize=30)
            plt.ylim([0, number_channels])
            plt.yticks(np.arange(number_channels), labels, fontsize=20)
            plt.xticks(fontsize=30)

            plt.vlines(x=0, ymin=0, ymax=number_channels, color='black')

            for j in range(number_channels - 1, -1, -1):

                if sum(X[i,:,j]) != 0:
                    X_win = ((X[i,:,j] - min(X[i,:,j])) / max(X[i,:,j]))
                else:
                    X_win = X[i,:,j]

                Z = [x + j for x in X_win]

                plt.plot(np.arange(-100, 100), Z, label=y[j], linewidth=2)
                plt.fill_between(np.arange(-100, 100), Z, j, alpha=.5, interpolate=True)

            set_figure_size(plt)
            plt.savefig(os.path.join(output_dir_class, fig_name + '.png'))
            plt.clf()
            plt.close()

In [52]:
plot_channels(X,y,win_ids)

0:coverage
1:mean_read_quality
2:#left_clipped_reads
3:#right_clipped_reads
4:#CIGAR_D_left_reads
5:#CIGAR_D_right_reads
6:#CIGAR_I_right_reads
7:INV_before
8:INV_after
9:DUP_before
10:DUP_after
11:TRA_opposite
12:TRA_same
13:Forward_Left_Clipped_median
14:Forward_Right_Clipped_median
15:Forward_All_Clipped_median
16:Reverse_Left_Clipped_median
17:Reverse_Right_Clipped_median
18:Reverse_All_Clipped_median
19:#left split reads
20:#right split reads
21:L_SplitRead_median
22:R_SplitRead_median
23:Mappability
24:One_hot_encoding_A
25:One_hot_encoding_T
26:One_hot_encoding_C
27:One_hot_encoding_G
28:One_hot_encoding_N
DEL_end id: Chr1:243783763
DPI: 72.0
Default size in Inches [ 6.  4.]
Which should result in a 432 x 288 Image
Size in Inches [ 30.  16.]
DEL_end id: Chr9:93665536
DPI: 72.0
Default size in Inches [ 6.  4.]
Which should result in a 432 x 288 Image
Size in Inches [ 30.  16.]
DEL_end id: Chr4:31406966
DPI: 72.0
Default size in Inches [ 6.  4.]
Which should result in a 432 x 288 

In [25]:
print(y)

['noSV' 'noSV' 'noSV' ..., 'DEL_end' 'DEL_end' 'DEL_end']


In [22]:
print(win_ids)

[{'chromosome': '1', 'position': 10151}
 {'chromosome': '1', 'position': 10150}
 {'chromosome': '1', 'position': 10114} ...,
 {'chromosome': 'X', 'position': 143409968}
 {'chromosome': 'X', 'position': 145330210}
 {'chromosome': 'X', 'position': 150295715}]
