In [1]:
%run setup.ipynb

In [2]:
# Load the gene annotation GFF3 and reshape it into the feature table used below.
tbl_features = (
    etl
    .fromgff3(phase2_ar1.geneset_agamp44_fn)
    # GFF3 coordinates are 1-based and inclusive; shifting 'start' down by one makes
    # the interval half-open, so that stop - start is the feature length.
    .convert('start', lambda pos: pos - 1)
    # Promote the 'Parent' and 'ID' attributes to columns of their own.
    .unpackdict('attributes', ['Parent', 'ID'])
    .rename({'end': 'stop', 'Parent': 'parent'})
    # index=5 puts 'length' directly after 'stop' ('source' and 'score' are still
    # present at this point and are dropped on the next step).
    .addfield('length', lambda feature: feature.stop - feature.start, index=5)
    .cutout('source', 'score')
    .cache()
)

tbl_features.display(20)

0|seqid,1|type,2|start,3|stop,4|length,5|strand,6|phase,7|parent,8|ID
2L,chromosome,0,49364325,49364325,.,.,,2L
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RA,
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RB,
2L,exon,157347,157623,276,-,.,AGAP004677-RA,
2L,exon,157347,157623,276,-,.,AGAP004677-RB,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA
2L,gene,157347,186936,29589,-,.,,AGAP004677
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB
2L,CDS,157495,157623,128,-,2,AGAP004677-RA,AGAP004677-PA
2L,CDS,157495,157623,128,-,2,AGAP004677-RB,AGAP004677-PB


In [3]:
def exons2introns(parent, exons):
    """Yield one intron row for every gap between consecutive exons of a transcript.

    Parameters
    ----------
    parent
        Identifier of the transcript the exons belong to; copied into the
        'parent' column of every intron row.
    exons
        Iterable of exon records exposing ``seqid``, ``strand``, ``start`` and
        ``stop`` attributes. They may arrive in any order.

    Yields
    ------
    tuple
        ``(seqid, 'intron', start, stop, length, strand, '.', parent, None)``,
        where the intron spans from the stop of one exon to the start of the
        next. A transcript with fewer than two exons yields nothing.
    """
    # Pairing neighbours is only meaningful in coordinate order; sorting here
    # removes the dependency on the order in which the group is delivered.
    exons = sorted(exons, key=lambda exon: (exon.start, exon.stop))
    if not exons:
        return
    # seqid and strand are taken from the first exon and applied to every intron.
    seqid = exons[0].seqid
    strand = exons[0].strand
    for upstream, downstream in zip(exons, exons[1:]):
        start = upstream.stop
        stop = downstream.start
        yield (seqid, 'intron', start, stop, stop - start, strand, '.', parent, None)
            

# Build intron rows from the exons of each transcript (grouped by 'parent'),
# using the same column layout as tbl_features, then order them by position.
tbl_introns = (
    tbl_features
    .select(lambda row: row.type == 'exon')
    .rowgroupmap(
        key='parent',
        mapper=exons2introns,
        header=['seqid', 'type', 'start', 'stop', 'length', 'strand', 'phase', 'parent', 'ID'],
    )
    .sort(key=('seqid', 'start', 'parent'))
)
tbl_introns.display(10)

0|seqid,1|type,2|start,3|stop,4|length,5|strand,6|phase,7|parent,8|ID
2L,intron,157623,157678,55,-,.,AGAP004677-RA,
2L,intron,157623,157678,55,-,.,AGAP004677-RB,
2L,intron,158297,159191,894,-,.,AGAP004677-RA,
2L,intron,158297,159191,894,-,.,AGAP004677-RB,
2L,intron,159366,180908,21542,-,.,AGAP004677-RA,
2L,intron,159366,186859,27493,-,.,AGAP004677-RB,
2L,intron,203924,203980,56,+,.,AGAP004678-RA,
2L,intron,207953,208392,439,+,.,AGAP004679-RB,
2L,intron,208581,208638,57,+,.,AGAP004679-RA,
2L,intron,208581,208638,57,+,.,AGAP004679-RB,


In [9]:
# parent ID -> list of the feature records that name it as their parent.
lkp_feature_children = etl.recordlookup(tbl_features, 'parent')
# feature length -> list of the types of all features having that length.
lkp_feature_length = etl.lookup(tbl_features, 'length', 'type')

In [10]:
# Spot-check the length lookup: types of every feature whose length is exactly 3000.
lkp_feature_length[3000]

['gene', 'mRNA', 'mRNA', 'gene', 'mRNA', 'three_prime_UTR']

In [7]:
# Spot-check the parent lookup: the child records of gene AGAP004677.
lkp_feature_children['AGAP004677']

In [11]:
def transcript_length(row):
    """Return the summed exon length of an mRNA row.

    For a row whose type is 'mRNA', the children registered under its ID in
    ``lkp_feature_children`` are scanned and the ``length`` of every child of
    type 'exon' is added up. Rows of any other type give ``None``.
    """
    if row['type'] != 'mRNA':
        return None
    children = lkp_feature_children[row.ID]
    return sum(child.length for child in children if child['type'] == 'exon')


# mRNA ID -> summed exon length of that transcript.
lkp_transcript_length = etl.lookupone(
    tbl_features
    .eq('type', 'mRNA')
    .addfield('transcript_length', transcript_length),
    'ID',
    'transcript_length',
)

In [12]:
# Spot-check: summed exon length of transcript AGAP004677-RA.
lkp_transcript_length['AGAP004677-RA']

1467

In [13]:
def is_canonical_transcript(row):
    """Return whether an mRNA is the longest transcript of its gene.

    The transcript's length (from ``lkp_transcript_length``) is compared with
    the lengths of all mRNA children of the same parent. The result is ``True``
    when it equals the maximum, so transcripts tied for the longest are all
    reported as canonical. Rows that are not of type 'mRNA' give ``None``.
    """
    if row['type'] != 'mRNA':
        return None
    length = lkp_transcript_length[row.ID]
    # lkp_transcript_length only has entries for mRNA rows, so any other kind of
    # child under the same parent must be skipped or the lookup raises KeyError.
    sibling_lengths = [
        lkp_transcript_length[f.ID]
        for f in lkp_feature_children[row.parent]
        if f['type'] == 'mRNA'
    ]
    return length == max(sibling_lengths)

In [14]:
# mRNA ID -> whether that transcript is the longest one of its gene.
lkp_transcript_is_canonical = etl.lookupone(
    tbl_features
    .eq('type', 'mRNA')
    .addfield('is_canonical', is_canonical_transcript),
    'ID',
    'is_canonical',
)


In [15]:
def is_canonical(row):
    """Return the canonical flag that applies to a feature row.

    An mRNA row gets its own entry from ``lkp_transcript_is_canonical``. Any
    other row inherits the flag of its parent when the parent is a key of that
    lookup, and gets ``None`` otherwise.
    """
    flags = lkp_transcript_is_canonical
    if row['type'] == 'mRNA':
        return flags[row.ID]
    owner = row.parent
    return flags[owner] if owner in flags else None

    
def _count_children(row):
    """Return how many features name ``row.ID`` as their parent (0 when there are none)."""
    # A missing ID is never treated as a parent, even if None happens to be a key
    # of the lookup (parent-less features may be grouped under None - TODO confirm).
    if row.ID is None or row.ID not in lkp_feature_children:
        return 0
    return len(lkp_feature_children[row.ID])


# Combine annotated features with the derived introns, restore positional order,
# then annotate every row with child count, transcript length and canonical flag.
tbl_features_aug = (
    tbl_features
    .cat(tbl_introns)
    .sort(key=('seqid', 'start', 'parent', 'type'), cache=False)
    .addfield('n_children', _count_children)
    .addfield('transcript_length', transcript_length)
    .addfield('is_canonical', is_canonical)
    .cache()
)


In [16]:
# parent ID -> list of child records, rebuilt from the augmented table so that
# the derived intron rows are included.
lkp_feature_children_aug = etl.recordlookup(tbl_features_aug, 'parent')

In [17]:
def first_last(row):
    """Flag whether a feature is the first and/or last of its type within its parent.

    Only rows of type 'exon', 'intron' or 'CDS' are ranked; every other type
    gives ``(None, None)``. Siblings are the children of ``row.parent`` (from
    ``lkp_feature_children_aug``) that share the row's type. On the '+' strand
    they are ordered by ascending ``start``; on any other strand by descending
    ``stop``.

    Returns
    -------
    tuple
        ``(is_first, is_last)`` as booleans, or ``(None, None)``.
    """
    kind = row['type']
    if kind not in {'exon', 'intron', 'CDS'}:
        return (None, None)
    siblings = [f for f in lkp_feature_children_aug[row.parent] if f['type'] == kind]
    forward = row.strand == '+'
    # Rank by the coordinate that leads in the direction of transcription.
    coord = 'start' if forward else 'stop'
    ordered = sorted((f[coord] for f in siblings), reverse=not forward)
    rank = ordered.index(row[coord])
    return rank == 0, rank == len(siblings) - 1

In [18]:
# first_last returns an (is_first, is_last) pair per row; compute it once and
# split the pair into two separate columns.
tbl_features_aug2 = etl.unpack(
    tbl_features_aug.addfield('first_last', first_last),
    'first_last',
    newfields=['is_first', 'is_last'],
)
tbl_features_aug2.display(10)

0|seqid,1|type,2|start,3|stop,4|length,5|strand,6|phase,7|parent,8|ID,9|n_children,10|transcript_length,11|is_canonical,12|is_first,13|is_last
2L,chromosome,0,49364325,49364325,.,.,,2L,0,,,,
2L,gene,157347,186936,29589,-,.,,AGAP004677,2,,,,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA,10,1467.0,True,,
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB,10,1147.0,False,,
2L,exon,157347,157623,276,-,.,AGAP004677-RA,,0,,True,False,True
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RA,,0,,True,,
2L,exon,157347,157623,276,-,.,AGAP004677-RB,,0,,False,False,True
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RB,,0,,False,,
2L,CDS,157495,157623,128,-,2,AGAP004677-RA,AGAP004677-PA,0,,True,False,True
2L,CDS,157495,157623,128,-,2,AGAP004677-RB,AGAP004677-PB,0,,False,False,True


In [19]:
def _is_canonical_ra(row):
    """Truthy only when the transcript ID ends in 'RA' and the row is flagged canonical."""
    return row.ID[-2:] == 'RA' and row.is_canonical


# Is the 'RA' transcript always canonical?
# NOTE: 'check' is computed over all mRNAs, so the False count contains every
# non-RA transcript as well as the RA transcripts that are not canonical. To read
# off the fraction of RA transcripts that are canonical, filter to RA IDs first.
(
    tbl_features_aug2
    .eq('type', 'mRNA')
    .addfield('check', _is_canonical_ra)
    .valuecounts('check')
)

0|check,1|count,2|frequency
True,12447,0.8358740178631389
False,2444,0.1641259821368612


In [20]:
# How many transcripts do genes have?
# n_children counts every feature whose parent is the gene; presumably these are
# all transcripts - confirm if other child types can hang off a gene.
(
    tbl_features_aug2
    .eq('type', 'gene')
    .valuecounts('n_children')
    .displayall()
)

0|n_children,1|count,2|frequency
1,11775,0.9040307101727448
2,910,0.0698656429942418
3,211,0.0161996161228406
4,74,0.0056813819577735
5,26,0.0019961612284069
6,10,0.000767754318618
8,5,0.000383877159309
7,5,0.000383877159309
11,3,0.0002303262955854
9,2,0.0001535508637236


In [21]:
# NOTE(review): absolute, user-specific location. Point this at a project-relative
# or otherwise configurable directory before sharing the notebook.
output_dir = '/home/joshjrey/Documents/mosquito_project/outputs/build_blocks'
# Create the directory (and any missing parents) from Python rather than via the
# shell, so the cell does not depend on a POSIX `mkdir` or on shell quoting of the
# interpolated path. exist_ok=True keeps re-runs from failing.
os.makedirs(output_dir, exist_ok=True)


In [22]:
# Persist the augmented table in two formats from a single pass: teetsv writes the
# tab-delimited copy as rows stream through, and topickle consumes the stream.
etl.topickle(
    tbl_features_aug2.teetsv(os.path.join(output_dir, 'tbl_features.txt.gz')),
    os.path.join(output_dir, 'tbl_features.pkl.gz'),
)

In [23]:
# List the output directory to confirm both files were written and check their sizes.
!ls -lh {output_dir}

total 4.4M
-rw-r--r-- 1 joshjrey joshjrey 2.3M May  3 16:12 tbl_features.pkl.gz
-rw-r--r-- 1 joshjrey joshjrey 2.2M May  3 16:12 tbl_features.txt.gz


In [24]:
# Read the pickled table back to confirm it round-trips.
# NOTE: unpickling can execute code embedded in the file; only load pickles that
# were produced by this notebook or another trusted source.
etl.frompickle(os.path.join(output_dir, 'tbl_features.pkl.gz'))

0|seqid,1|type,2|start,3|stop,4|length,5|strand,6|phase,7|parent,8|ID,9|n_children,10|transcript_length,11|is_canonical,12|is_first,13|is_last
2L,chromosome,0,49364325,49364325,.,.,,2L,0,,,,
2L,gene,157347,186936,29589,-,.,,AGAP004677,2,,,,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA,10,1467.0,True,,
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB,10,1147.0,False,,
2L,exon,157347,157623,276,-,.,AGAP004677-RA,,0,,True,False,True
