In [1]:
%run setup.ipynb

In [2]:
##output_dir = '/Users/Utente/Documents/UROP2018/Outputfiles/vector/observatory/analysis/107-sfs-data-request-imperial'
output_dir = '/home/joshjrey/Documents/mosquito_project/outputs/build_blocks'

In [3]:
tbl_features = etl.frompickle(os.path.join(output_dir, 'tbl_features.pkl.gz'))
tbl_features

0|seqid,1|type,2|start,3|stop,4|length,5|strand,6|phase,7|parent,8|ID,9|n_children,10|transcript_length,11|is_canonical,12|is_first,13|is_last
2L,chromosome,0,49364325,49364325,.,.,,2L,0,,,,
2L,gene,157347,186936,29589,-,.,,AGAP004677,2,,,,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA,10,1467.0,True,,
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB,10,1147.0,False,,
2L,exon,157347,157623,276,-,.,AGAP004677-RA,,0,,True,False,True


In [4]:
CLS_UPSTREAM = 1
CLS_DOWNSTREAM = 2
CLS_5UTR = 3
CLS_3UTR = 4
CLS_CDS_FIRST = 5
CLS_CDS_MID = 6
CLS_CDS_LAST = 7
CLS_INTRON_FIRST = 8
CLS_INTRON_MID = 9
CLS_INTRON_LAST = 10

feature_cls_names = [
    'Unknown',
    'Upstream',
    'Downstream',
    "5' UTR",
    "3' UTR",
    "CDS (first)",
    "CDS (mid)",
    "CDS (last)",
    "Intron (first)",
    "Intron (mid)",
    "Intron (last)",
]

In [5]:
genome = phase2_ar1.genome_agamp3
genome

<pyfasta.fasta.Fasta at 0x7f2bd5369da0>

In [6]:
seq_cls = {k: np.zeros(len(genome[k]), dtype='u1') for k in genome.keys()}
seq_relpos_start = {k: np.zeros(len(genome[k]), dtype='u4') for k in genome.keys()}
seq_relpos_stop = {k: np.zeros(len(genome[k]), dtype='u4') for k in genome.keys()}
seq_flen = {k: np.zeros(len(genome[k]), dtype='u4') for k in genome.keys()}

In [7]:
seq_cls

{'2L': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 '2R': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 '3L': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 '3R': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'UNKN': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'X': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'Y_unplaced': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)}

In [8]:
# build the upstream and downstream classes
############################################

def build_upstream_downstream():
    prv_gene = None
    for gene in tbl_features.eq('type', 'gene').records():
        seqid = gene.seqid
        if seqid in genome and prv_gene is not None and (gene.start > prv_gene.stop) and gene.seqid == prv_gene.seqid:

            # midpoint between previous and current genes
            m = (prv_gene.stop + gene.start) // 2

            # deal with previous gene
            if prv_gene.strand == '+':
                prv_cls = CLS_DOWNSTREAM
                seq_relpos_start[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)
                seq_relpos_stop[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)[::-1]
            else:
                prv_cls = CLS_UPSTREAM
                seq_relpos_start[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)[::-1]
                seq_relpos_stop[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)
            seq_cls[seqid][prv_gene.stop:m] = prv_cls
            seq_flen[seqid][prv_gene.stop:m] = m - prv_gene.stop

            # deal with current gene
            if gene.strand == '+':
                cls = CLS_UPSTREAM
                seq_relpos_start[seqid][m:gene.start] = np.arange(gene.start - m)
                seq_relpos_stop[seqid][m:gene.start] = np.arange(gene.start - m)[::-1]
            else:
                cls = CLS_DOWNSTREAM
                seq_relpos_start[seqid][m:gene.start] = np.arange(gene.start - m)[::-1]
                seq_relpos_stop[seqid][m:gene.start] = np.arange(gene.start - m)  
            seq_cls[seqid][m:gene.start] = cls
            seq_flen[seqid][m:gene.start] = gene.start - m

        prv_gene = gene

build_upstream_downstream()

In [9]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 18853692
Upstream 16759432
Downstream 13751201


In [10]:
# build the UTR classes
########################

def build_utr():
    for f in tbl_features.records():
        seqid = f.seqid
        if seqid in genome:
            cls = None
            if f['type'] == 'five_prime_UTR':
                cls = CLS_5UTR
            elif f['type'] == 'three_prime_UTR':
                cls = CLS_3UTR

            if cls:
                seq_cls[seqid][f.start:f.stop] = cls
                if f.strand == '+':
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                else:
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                seq_flen[seqid][f.start:f.stop] = f.stop - f.start
            
build_utr()

In [11]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 18043830
Upstream 16745987
Downstream 13741042
5' UTR 366421
3' UTR 467045


In [12]:
# build the CDS classes
#########################

def build_cds():
    for f in tbl_features.eq('type', 'CDS').records():
        seqid = f.seqid
        if seqid in genome:
            cls = None
            if f.is_first and not f.is_last:
                cls = CLS_CDS_FIRST
            elif not f.is_first and f.is_last:
                cls = CLS_CDS_LAST
            elif not f.is_first and not f.is_last:
                cls = CLS_CDS_MID

            if cls:
                seq_cls[seqid][f.start:f.stop] = cls
                if f.strand == '+':
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                else:
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                seq_flen[seqid][f.start:f.stop] = f.stop - f.start
                
build_cds()

In [13]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 13818524
Upstream 16681539
Downstream 13675346
5' UTR 355454
3' UTR 457012
CDS (first) 744553
CDS (mid) 2413343
CDS (last) 1218554


In [14]:
def build_intron():
    for f in tbl_features.eq('type', 'intron').records():
        seqid = f.seqid
        if seqid in genome:
            cls = None
            if f.is_first and not f.is_last:
                cls = CLS_INTRON_FIRST
            elif not f.is_first and f.is_last:
                cls = CLS_INTRON_LAST
            elif not f.is_first and not f.is_last:
                cls = CLS_INTRON_MID

            if cls:
                seq_cls[seqid][f.start:f.stop] = cls
                if f.strand == '+':
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                else:
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                seq_flen[seqid][f.start:f.stop] = f.stop - f.start

build_intron()

In [15]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 888002
Upstream 15997953
Downstream 13072291
5' UTR 308767
3' UTR 432940
CDS (first) 676066
CDS (mid) 2305092
CDS (last) 1117685
Intron (first) 4750086
Intron (mid) 8355815
Intron (last) 1459628


In [16]:
import zarr

In [17]:
!rm -rv {output_dir}/seq_cls.zarr*

OSError: [Errno 12] Cannot allocate memory

In [18]:
zarr.save_group(os.path.join(output_dir, 'seq_cls.zarr.zip'), **seq_cls)

In [20]:
!ls -lh {output_dir}

OSError: [Errno 12] Cannot allocate memory

In [21]:
seq_cls_out = zarr.open_group(os.path.join(output_dir, 'seq_cls.zarr.zip'))
seq_cls_out.tree()

In [22]:
seq_cls_out['2L'][:]

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [23]:
zarr.save_group(os.path.join(output_dir, 'seq_relpos_start.zarr.zip'), **seq_relpos_start)

In [24]:
zarr.save_group(os.path.join(output_dir, 'seq_relpos_stop.zarr.zip'), **seq_relpos_stop)

In [25]:
zarr.save_group(os.path.join(output_dir, 'seq_flen.zarr.zip'), **seq_flen)

In [26]:
!ls -lh {output_dir}

OSError: [Errno 12] Cannot allocate memory

In [28]:
#output_dir = Path('/Users/Utente/Documents/UROP2018/Outputfiles/vector/observatory/analysis/107-sfs-data-request-imperial')
output_dir = Path('/home/joshjrey/Documents/mosquito_project/outputs/build_blocks')
codon_position = zarr.open_group(str(output_dir / 'codon_position.zarr.zip'))['3L'][:]
seq_cls_1 = seq_cls['3L']

unique, counts = np.unique(seq_cls_1[(codon_position!=-1)], return_counts=True)
dict(zip(unique, counts))

{0: 282086,
 3: 273,
 4: 619,
 5: 537994,
 6: 1524522,
 7: 759610,
 8: 27491,
 9: 33785,
 10: 21082}