In [131]:
import zarr
import gzip
import json
import os
import numpy as np

In [132]:
samples = os.listdir('svchan_data')

In [133]:
samples

['HG01114',
 'HG02924',
 'NA06991',
 'HG00420',
 'HG03992',
 'HG02018',
 'HG01053',
 'HG01881']

In [134]:
sample_out = 'HG00420'
chr_out = 'chr3'

In [135]:
X = []
y = []
win_pos = []
samples_list = []

for s in samples:
    if s != sample_out:
        w = os.path.join('svchan_data',s,'channels.zarr.zip')
        l = os.path.join('svchan_data',s,'labels.json.gz')
        
        partial_X = zarr.load(w)
        with gzip.GzipFile(l, 'r') as fin:
            partial_y = json.loads(fin.read().decode('utf-8'))
            
        X.extend(partial_X)
        y.extend(partial_y.values())
        win_pos.extend(partial_y.keys())
        # add sample name
        samples_list.extend([s] * len(partial_y))

X = np.stack(X, axis=0)
y = np.array(y)

In [136]:
first_chrom = [w.split('/')[0] for w in win_pos]

In [137]:
from collections import Counter
Counter(y)

Counter({'noDEL': 72720, 'DEL': 16322})

In [138]:
first_chrom = np.array(first_chrom)

In [139]:
X_nochr1 = X[first_chrom!=chr_out]
y_nochr1 = y[first_chrom!=chr_out]

In [140]:
X_nochr1.shape

(84991, 35, 256)

In [141]:
y_nochr1.shape

(84991,)

In [142]:
Counter(y_nochr1)

Counter({'noDEL': 69746, 'DEL': 15245})

In [143]:
w = os.path.join('svchan_data',sample_out,'channels.zarr.zip')
l = os.path.join('svchan_data',sample_out,'labels.json.gz')

X_i = zarr.load(w)
with gzip.GzipFile(l, 'r') as fin:
    y_partial = json.loads(fin.read().decode('utf-8'))
    
y_i = y_partial.values()
win_pos_i = y_partial.keys()

X_i = np.array(X_i)
y_i = np.array(list(y_i))

In [144]:
first_chrom_i = [w.split('/')[0] for w in win_pos_i]
first_chrom_i = np.array(first_chrom_i)

In [145]:
X_chr1 = X_i[first_chrom_i==chr_out]
#y_chr1 = y_i[first_chrom_i=='chr1']
X_chr1.shape

(534, 35, 256)

In [146]:
y_chr1 = y_i[first_chrom_i==chr_out]
y_chr1.shape

(534,)

In [147]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import (Activation, BatchNormalization,
                                     Convolution1D, Dense, Flatten,
                                     Dropout)
from tensorflow.keras.models import Sequential

def create_model(X, outputdim, learning_rate, regularization_rate,
                 filters, layers, kernel_size, fc_nodes):
    weightinit = 'lecun_uniform'  # weight initialization

    model = Sequential()

    model.add(BatchNormalization(input_shape=(X.shape[1], X.shape[2])))

    filters_list = [filters] * layers

    for filter_number in filters_list:
        model.add(
            Convolution1D(filter_number,
                          kernel_size=(kernel_size,),
                          padding='same',
                          kernel_regularizer=l2(regularization_rate),
                          kernel_initializer=weightinit))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(rate=0.2))

    model.add(Flatten())

    model.add(
        Dense(units=fc_nodes,
              kernel_regularizer=l2(regularization_rate),
              kernel_initializer=weightinit))  # Fully connected layer
    model.add(Activation('relu'))  # Relu activation
    model.add(Dropout(rate=0.2))

    model.add(Dense(units=outputdim, kernel_initializer=weightinit))
    model.add(BatchNormalization())
    model.add(Activation("sigmoid"))  # Final classification layer

    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=learning_rate),
                  metrics=['accuracy'])

    return model

In [148]:
learning_rate = 1e-4
regularization_rate = 1e-1
filters = 4
layers = 1
kernel_size = 7 
fc_nodes = 4
outputdim = 2

In [149]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical

mapclasses = {'DEL': 0, 'noDEL': 1}
y_lab = np.array([mapclasses[i] for i in y_nochr1])
classes = np.array(np.unique(y_lab))
class_weights = compute_class_weight('balanced', classes, y_lab)
class_weights = {i: v for i, v in enumerate(class_weights)}
train_y = to_categorical(y_lab, num_classes=2)



In [150]:
model = create_model(X_nochr1, 2,
                     learning_rate,
                     regularization_rate,
                     filters,
                     layers,
                     kernel_size,
                     fc_nodes)
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_12 (Batc (None, 35, 256)           1024      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 35, 4)             7172      
_________________________________________________________________
batch_normalization_13 (Batc (None, 35, 4)             16        
_________________________________________________________________
activation_12 (Activation)   (None, 35, 4)             0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 35, 4)             0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 140)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 4)                

In [None]:
history = model.fit(x=X_nochr1, y=train_y,
                    epochs=50, batch_size=32,
                    shuffle=True,
                    validation_split=0.3,
                    class_weight=class_weights)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50

In [None]:
model.save('model_'+sample_out+'_'+chr_out'.h5')

In [None]:
y_i_lab = np.array([mapclasses[i] for i in y_chr1])
test_y = to_categorical(y_i_lab, num_classes=2)

In [None]:
import pandas as pd
class_labels = ['DEL', 'noDEL']
probs = model.predict(X_chr1, batch_size=1000, verbose=False)
# columns are predicted, rows are truth
predicted = probs.argmax(axis=1)
print(f"predicted shape: {predicted.shape}")
y_index = test_y.argmax(axis=1)
confusion_matrix = pd.crosstab(pd.Series(y_index), pd.Series(predicted))
confusion_matrix.index = ['true_'+class_labels[i] for i in confusion_matrix.index]
confusion_matrix.columns = ['predicted_'+class_labels[i]
                            for i in confusion_matrix.columns]
confusion_matrix.reindex(columns=class_labels, fill_value=0)
confusion_matrix.to_csv(sample_out+'_'+chr_out+'_confusion_matrix.csv')

In [None]:
confusion_matrix

In [None]:
Counter(y_chr1)