In [1]:
import pandas as pd
import os
import numpy as np

import pysam

import matplotlib.pyplot as plt

from denovonet.dataset import Dataset

from contextlib import redirect_stdout

In [2]:
# --- Configuration -----------------------------------------------------------
# Root of the training data; generated images are written below `images/`.
DATASET_FOLDER = '/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset'
IMAGES_FOLDER = os.path.join(DATASET_FOLDER, 'images')

# Indexed reference FASTA handle. The name is spelled "REREFERENCE" (sic);
# it is kept as-is because the image-building cells refer to it by this name.
reference_genome_path = '/ifs/data/research/projects/solve_rd/Extra/GRCh37_WithScaffoldsChr/hs37d5.fa.gz'
REREFERENCE_GENOME = pysam.FastaFile(reference_genome_path)

# --- Load the variant table --------------------------------------------------
dataset_path = os.path.join(DATASET_FOLDER, 'training_dataset.csv')

# Missing values in every column are replaced with empty strings.
dataset = pd.read_csv(dataset_path)
dataset = dataset.fillna('')

dataset.head(2)

Unnamed: 0,Chromosome,Start position,End position,Reference,Variant,Variant type,De novo assessment,Child,Father,Mother,Tag,Child_BAM,Father_BAM,Mother_BAM,Dataset
0,chr20,35827028,35827028.0,,TT,Insertion,PV MV,DNA15-09361B,DNA15-09135B,DNA15-09134B,sampled negative,/ifs/data/diagnostics/bgi/exomes/work/DNA15-09...,/ifs/data/diagnostics/bgi/exomes/work/DNA15-09...,/ifs/data/diagnostics/bgi/exomes/work/DNA15-09...,train
1,chr3,105412286,105412286.0,,TT,Insertion,PV MV,DNA15-09361B,DNA15-09135B,DNA15-09134B,sampled negative,/ifs/data/diagnostics/bgi/exomes/work/DNA15-09...,/ifs/data/diagnostics/bgi/exomes/work/DNA15-09...,/ifs/data/diagnostics/bgi/exomes/work/DNA15-09...,train


In [3]:
# Distinct split labels present in the 'Dataset' column.
dataset.loc[:, 'Dataset'].unique()

array(['train', 'val', 'test'], dtype=object)

In [None]:
! rm -rf {IMAGES_FOLDER}
! mkdir {IMAGES_FOLDER}

# Deletions

In [5]:
!rm -rf {IMAGES_FOLDER}/deletions

!mkdir {IMAGES_FOLDER}/deletions
!mkdir {IMAGES_FOLDER}/deletions/train
!mkdir {IMAGES_FOLDER}/deletions/val
!mkdir {IMAGES_FOLDER}/deletions/train/iv
!mkdir {IMAGES_FOLDER}/deletions/train/dnm
!mkdir {IMAGES_FOLDER}/deletions/val/iv
!mkdir {IMAGES_FOLDER}/deletions/val/dnm

In [4]:
!mkdir {IMAGES_FOLDER}/deletions/test
!mkdir {IMAGES_FOLDER}/deletions/test/iv
!mkdir {IMAGES_FOLDER}/deletions/test/dnm

mkdir: cannot create directory '/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/test': File exists
mkdir: cannot create directory '/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/test/iv': File exists
mkdir: cannot create directory '/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/test/dnm': File exists


In [5]:
# Sub-folder of IMAGES_FOLDER used by the image-building cells below.
DATASET_NAME = 'deletions'

# One variants file per split. Note: the files are tab-separated even though
# they carry a .csv extension.
train_variants_path = os.path.join(DATASET_FOLDER, 'del_train.csv')
val_variants_path = os.path.join(DATASET_FOLDER, 'del_val.csv')
test_variants_path = os.path.join(DATASET_FOLDER, 'del_test.csv')

# Select deletion rows once, then write each split to its own file.
is_deletion = dataset['Variant type'] == 'Deletion'
for split_name, split_path in (
    ('train', train_variants_path),
    ('val', val_variants_path),
    ('test', test_variants_path),
):
    in_split = dataset['Dataset'] == split_name
    dataset[is_deletion & in_split].to_csv(split_path, sep='\t', index=False)

In [7]:
# Build the deletions *training* split from the variants file and save its
# images. (The message previously said "validation" for this split.)
print('Building training dataset based on file {}'.format(train_variants_path))

# Anything Dataset / save_images print to stdout goes to the log file instead
# of the notebook output. Stdout is restored before the file is closed.
with open(os.path.join(DATASET_FOLDER, 'del_train_out.log'), 'w') as f, redirect_stdout(f):
    train_dataset = Dataset(train_variants_path, 'train', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/del_train.csv


In [8]:
# Build the deletions validation split and save its images.
val_log_path = os.path.join(DATASET_FOLDER, 'del_val_out.log')
print('Building validation dataset based on file {}'.format(val_variants_path))

# Stdout produced while building/saving is captured in the log file.
# NOTE: the result is stored in `train_dataset` even though it is the
# validation split; the name is kept unchanged.
with open(val_log_path, 'w') as log_file, redirect_stdout(log_file):
    train_dataset = Dataset(val_variants_path, 'val', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/del_val.csv


In [6]:
# Build the deletions *test* split from the variants file and save its
# images. (The message previously said "validation" for this split.)
print('Building test dataset based on file {}'.format(test_variants_path))

# Anything Dataset / save_images print to stdout goes to the log file instead
# of the notebook output. Stdout is restored before the file is closed.
# NOTE: the result is stored in `train_dataset` even though it is the test
# split; the name is kept unchanged.
with open(os.path.join(DATASET_FOLDER, 'del_test_out.log'), 'w') as f, redirect_stdout(f):
    train_dataset = Dataset(test_variants_path, 'test', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/del_val.csv


# Insertions

In [10]:
!rm -rf {IMAGES_FOLDER}/insertions

!mkdir {IMAGES_FOLDER}/insertions
!mkdir {IMAGES_FOLDER}/insertions/train
!mkdir {IMAGES_FOLDER}/insertions/val
!mkdir {IMAGES_FOLDER}/insertions/train/iv
!mkdir {IMAGES_FOLDER}/insertions/train/dnm
!mkdir {IMAGES_FOLDER}/insertions/val/iv
!mkdir {IMAGES_FOLDER}/insertions/val/dnm

In [7]:
!mkdir {IMAGES_FOLDER}/insertions/test
!mkdir {IMAGES_FOLDER}/insertions/test/iv
!mkdir {IMAGES_FOLDER}/insertions/test/dnm

In [8]:
# Sub-folder of IMAGES_FOLDER used by the image-building cells below.
DATASET_NAME = 'insertions'

# One variants file per split. Note: the files are tab-separated even though
# they carry a .csv extension.
train_variants_path = os.path.join(DATASET_FOLDER, 'ins_train.csv')
val_variants_path = os.path.join(DATASET_FOLDER, 'ins_val.csv')
test_variants_path = os.path.join(DATASET_FOLDER, 'ins_test.csv')

# Select insertion rows once, then write each split to its own file.
is_insertion = dataset['Variant type'] == 'Insertion'
for split_name, split_path in (
    ('train', train_variants_path),
    ('val', val_variants_path),
    ('test', test_variants_path),
):
    in_split = dataset['Dataset'] == split_name
    dataset[is_insertion & in_split].to_csv(split_path, sep='\t', index=False)

In [12]:
# Build the insertions *training* split from the variants file and save its
# images. (The message previously said "validation" for this split.)
print('Building training dataset based on file {}'.format(train_variants_path))

# Anything Dataset / save_images print to stdout goes to the log file instead
# of the notebook output. Stdout is restored before the file is closed.
with open(os.path.join(DATASET_FOLDER, 'ins_train_out.log'), 'w') as f, redirect_stdout(f):
    train_dataset = Dataset(train_variants_path, 'train', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/ins_train.csv


In [13]:
# Build the insertions validation split and save its images.
val_log_path = os.path.join(DATASET_FOLDER, 'ins_val_out.log')
print('Building validation dataset based on file {}'.format(val_variants_path))

# Stdout produced while building/saving is captured in the log file.
# NOTE: the result is stored in `train_dataset` even though it is the
# validation split; the name is kept unchanged.
with open(val_log_path, 'w') as log_file, redirect_stdout(log_file):
    train_dataset = Dataset(val_variants_path, 'val', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/ins_val.csv


In [9]:
# Build the insertions *test* split from the variants file and save its
# images. (The message previously said "validation" for this split.)
print('Building test dataset based on file {}'.format(test_variants_path))

# Anything Dataset / save_images print to stdout goes to the log file instead
# of the notebook output. Stdout is restored before the file is closed.
# NOTE: the result is stored in `train_dataset` even though it is the test
# split; the name is kept unchanged.
with open(os.path.join(DATASET_FOLDER, 'ins_test_out.log'), 'w') as f, redirect_stdout(f):
    train_dataset = Dataset(test_variants_path, 'test', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/ins_test.csv


# SNPs

In [15]:
!rm -rf {IMAGES_FOLDER}/snps

!mkdir {IMAGES_FOLDER}/snps
!mkdir {IMAGES_FOLDER}/snps/train
!mkdir {IMAGES_FOLDER}/snps/val
!mkdir {IMAGES_FOLDER}/snps/train/iv
!mkdir {IMAGES_FOLDER}/snps/train/dnm
!mkdir {IMAGES_FOLDER}/snps/val/iv
!mkdir {IMAGES_FOLDER}/snps/val/dnm

In [10]:
!mkdir {IMAGES_FOLDER}/snps/test
!mkdir {IMAGES_FOLDER}/snps/test/iv
!mkdir {IMAGES_FOLDER}/snps/test/dnm

In [11]:
# Sub-folder of IMAGES_FOLDER used by the image-building cells below.
DATASET_NAME = 'snps'

# One variants file per split. Note: the files are tab-separated even though
# they carry a .csv extension.
train_variants_path = os.path.join(DATASET_FOLDER, 'snp_train.csv')
val_variants_path = os.path.join(DATASET_FOLDER, 'snp_val.csv')
test_variants_path = os.path.join(DATASET_FOLDER, 'snp_test.csv')

# SNPs are the rows labelled 'Substitution'; select them once, then write
# each split to its own file.
is_substitution = dataset['Variant type'] == 'Substitution'
for split_name, split_path in (
    ('train', train_variants_path),
    ('val', val_variants_path),
    ('test', test_variants_path),
):
    in_split = dataset['Dataset'] == split_name
    dataset[is_substitution & in_split].to_csv(split_path, sep='\t', index=False)

In [17]:
# Build the SNPs *training* split from the variants file and save its
# images. (The message previously said "validation" for this split.)
print('Building training dataset based on file {}'.format(train_variants_path))

# Anything Dataset / save_images print to stdout goes to the log file instead
# of the notebook output. Stdout is restored before the file is closed.
with open(os.path.join(DATASET_FOLDER, 'snp_train_out.log'), 'w') as f, redirect_stdout(f):
    train_dataset = Dataset(train_variants_path, 'train', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/snp_train.csv


In [18]:
# Build the SNPs validation split and save its images.
val_log_path = os.path.join(DATASET_FOLDER, 'snp_val_out.log')
print('Building validation dataset based on file {}'.format(val_variants_path))

# Stdout produced while building/saving is captured in the log file.
# NOTE: the result is stored in `train_dataset` even though it is the
# validation split; the name is kept unchanged.
with open(val_log_path, 'w') as log_file, redirect_stdout(log_file):
    train_dataset = Dataset(val_variants_path, 'val', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/snp_val.csv


In [12]:
# Build the SNPs *test* split from the variants file and save its images.
# (The message previously said "validation" for this split.)
print('Building test dataset based on file {}'.format(test_variants_path))

# Anything Dataset / save_images print to stdout goes to the log file instead
# of the notebook output. Stdout is restored before the file is closed.
# NOTE: the result is stored in `train_dataset` even though it is the test
# split; the name is kept unchanged.
with open(os.path.join(DATASET_FOLDER, 'snp_test_out.log'), 'w') as f, redirect_stdout(f):
    train_dataset = Dataset(test_variants_path, 'test', REREFERENCE_GENOME)
    train_dataset.save_images(IMAGES_FOLDER, DATASET_NAME)

Building validation dataset based on file /ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/snp_test.csv


# Generated image statistics

In [13]:
# Report how many directory entries exist for every
# (variant type, split, class) combination under IMAGES_FOLDER.
dataset_names = ['insertions', 'deletions', 'snps']
dataset_types = ['train', 'val', 'test']
class_types = ['iv', 'dnm']

for dataset_name in dataset_names:
    for dataset_type in dataset_types:
        split_dir = os.path.join(IMAGES_FOLDER, dataset_name, dataset_type)
        for class_type in class_types:
            path = os.path.join(split_dir, class_type)
            n_entries = len(os.listdir(path))
            print(path, n_entries)

    # Blank line between variant types.
    print()

/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/insertions/train/iv 556
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/insertions/train/dnm 644
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/insertions/val/iv 76
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/insertions/val/dnm 21
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/insertions/test/iv 80
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/insertions/test/dnm 25

/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/train/iv 733
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/train/dnm 656
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/val/iv 101
/ifs/data/research/projects/gelana/denovo_cnn/data/training_dataset/images/deletions/val/dnm 32
/ifs/data/research/proj