# RNAlight data prep
Prepare data files for LightGBM experiments.   
Generate sequence in the style of RNAlight inputs like 02_lncRNA_info_cyto_transcript.tsv:

    ENST00000296270	IGF2BP2-AS1	GTTATTGCGACTTTGATCTAAA...


In [1]:
import traceback
import numpy as np

From GenCode, download these two files:
* annotation = gencode.v44.long_noncoding_RNAs.gff3
* sequence = gencode.v44.lncRNA_transcripts.fa

In [2]:
ATLAS='lncATLAS_all_data_RCI.csv'
GFF_FILE='gencode.v44.long_noncoding_RNAs.gff3'
FASTA_FILE='gencode.v44.lncRNA_transcripts.fa'

In [3]:
def get_canonical_ids(gff_file):
    ids = set()
    with open(gff_file,'r') as fin:
        for line in fin:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if fields[2] != 'transcript':
                continue
            if 'Ensembl_canonical' not in fields[8]:
                continue
            tag_value_pairs = fields[8].split(';')
            for pair in tag_value_pairs:
                if pair.startswith('ID='):
                    ID_plus_ver = pair[3:]
                    just_ID = ID_plus_ver.split('.')[0]
                    ids.add(just_ID)
                    break
    return ids

In [4]:
canon_ids = get_canonical_ids(GFF_FILE)
print( len(canon_ids), 'canonical lncRNA IDs')

19922 canonical lncRNA IDs


In [5]:
def load_mean_rci(filename,exclude=None):
    cnrci_lists = dict()  # key=ENSG_ID, value=list of CN-RCI
    with open (filename, 'r') as fin:
        header = None
        for line in fin:
            try:
                fields = line.strip().split(',')
                if header is None:
                    header = fields
                    continue
                gid = fields[0]
                cell_type = fields[1]
                rci_type = fields[2]
                rci_value = fields[3]
                gene_type = fields[6]
                if exclude is None or cell_type==exclude:
                    if gene_type=='nc' and\
                        rci_type=='CNRCI' and\
                        rci_value!='NA':
                        rci_value=float(rci_value)
                        if gid not in cnrci_lists.keys():
                            cnrci_lists[gid] = []
                        cnrci_lists[gid].append(rci_value)
            except Exception as e:
                print(line)
                traceback.print_exc()
                raise(e)
    cnrci_means = dict()
    for k,v in cnrci_lists.items():
        mean = np.mean(v)
        cnrci_means[k] = mean
    return cnrci_means

In [6]:
mean_rcis = load_mean_rci(ATLAS,exclude='H1.hESC')
print( len(mean_rcis.keys()), 'genes with a mean CN-RCI value')

4923 genes with a mean CN-RCI value


In [7]:
def load_transcripts(seq_file,good_gids,good_tids):
    sequences = list()  # list of tuple
    seq = ''
    with open (seq_file,'r') as fin:
        loading_sequence = False
        for line in fin:
            line = line.strip()
            if line.startswith('>'):
                # Save the previous sequence
                if loading_sequence:
                    tup = (tid,gid,seq)
                    sequences.append(tup)
                    loading_sequence = False
                # Parse the defline like
                # >ENST00000456328.2|ENSG00000290825.1|-|OTTHUMT00000362751.1|DDX11L2-202|DDX11L2|1657|
                fields = line[1:].split('|')
                tid = fields[0].split('.')[0]
                gid = fields[1].split('.')[0]
                seq = ''
                if tid in good_tids and gid in good_gids:
                    loading_sequence = True
            elif loading_sequence:
                # Continuation of multi-line sequence
                seq += line
    if loading_sequence:
        # Save the last sequence
        tup = (tid,gid,seq)
        sequences.append(tup)    
    return sequences        

In [8]:
gene_ids = set(mean_rcis.keys())
print('Loading RNA sequence for canonical transcripts of genes with CN-RCI values...')
sequences = load_transcripts(FASTA_FILE,gene_ids,canon_ids)
print( len(sequences), 'sequences loaded')

Loading RNA sequence for canonical transcripts of genes with CN-RCI values...
4541 sequences loaded


In [9]:
DATAPATH = './'
# Middle defined as -2 to 0.
# no middle exclusion
cyt_no_me  = DATAPATH+'ForRNAlight.lncRNA_RCIgt-1.canonical.tsv'   
nuc_no_me  = DATAPATH+'ForRNAlight.lncRNA_RCIlt-1.canonical.tsv'
# yes middle exclusion
cyt_yes_me  = DATAPATH+'ForRNAlight.lncRNA_RCIgt0.canonical.tsv'    
nuc_yes_me  = DATAPATH+'ForRNAlight.lncRNA_RCIlt-2.canonical.tsv'
# just the middle
cyt_middle  = DATAPATH+'ForRNAlight.lncRNA_RCI-1to0.canonical.tsv'
nuc_middle  = DATAPATH+'ForRNAlight.lncRNA_RCI-2to-1.canonical.tsv'    
# Middle defined as -1 to +1.
cyt_zero_me = DATAPATH+'ForRNAlight.lncRNA_RCIgt1.canonical.tsv'
nuc_zero_me = DATAPATH+'ForRNAlight.lncRNA_RCIlt-1.canonical.tsv'

In [10]:
def print_fasta_files(nuc_fn, nuc_threshold, cyt_fn, cyt_threshold, seqtups, mean_rcis, just_middle=False):
    header = 'ensembl_transcript_id\tname\tcdna\n'
    nuc_handle = open(nuc_fn, 'w')
    nuc_handle.write(header)
    cyt_handle = open(cyt_fn, 'w') 
    cyt_handle.write(header)
    mid_value = (nuc_threshold+cyt_threshold)/2 # expect (-2+0)/2=-1
    for tup in seqtups:
        tid,gid,seq = tup
        cnrci = mean_rcis[gid]
        string = tid+'\t'+gid+'\t'+seq+'\n'
        if just_middle:
            if cnrci >= nuc_threshold and cnrci <= cyt_threshold:
                if cnrci > mid_value:
                    cyt_handle.write(string)
                else:
                    nuc_handle.write(string)
        else:
            if cnrci > cyt_threshold:
                cyt_handle.write(string)
            if cnrci < nuc_threshold:
                nuc_handle.write(string)
    nuc_handle.close()
    cyt_handle.close()

In [11]:
# no middle exclusion
print_fasta_files(nuc_no_me, -1, cyt_no_me, -1, sequences, mean_rcis)
print('Done')

Done


In [12]:
# yes middle exclusion
print_fasta_files(nuc_yes_me, -2, cyt_yes_me, 0, sequences, mean_rcis)
print('Done')

Done


In [13]:
# just the middle
print_fasta_files(nuc_middle, -2, cyt_middle, 0, sequences, mean_rcis, just_middle=True)
print('Done')

Done


In [14]:
# Exclude -1 to +1
print_fasta_files(nuc_zero_me, -1, cyt_zero_me, +1, sequences, mean_rcis)
print('Done')

Done


## Number of transcripts
middle exclusion

      1704 ForRNAlight.lncRNA_RCIgt0.canonical.tsv
       761 ForRNAlight.lncRNA_RCIlt-2.canonical.tsv

no middle exclusion

      2888 ForRNAlight.lncRNA_RCIgt-1.canonical.tsv
      1655 ForRNAlight.lncRNA_RCIlt-1.canonical.tsv

In [15]:
lncRNA_gt_Neg1 = 2888 
lncRNA_lt_Neg1 = 1655
grand_total = lncRNA_gt_Neg1 + lncRNA_lt_Neg1
print('Canonical transcripts in the mean-14-lines lncATLAS data:')
print('total %d genes' % grand_total)
print('Using -1 as the cyto/nuc threshold...')
print('cytoplasmic %d genes, %5.2f%% of total' % (lncRNA_gt_Neg1,100*lncRNA_gt_Neg1/grand_total))
print('nuclear %d genes, %5.2f%% of total' %     (lncRNA_lt_Neg1,100*lncRNA_lt_Neg1/grand_total))

Canonical transcripts in the mean-14-lines lncATLAS data:
total 4543 genes
Using -1 as the cyto/nuc threshold...
cytoplasmic 2888 genes, 63.57% of total
nuclear 1655 genes, 36.43% of total


In [16]:
lncRNA_gt_Zero = 1704 
lncRNA_lt_Neg2 = 761 
print('Using -2 and 0 as the cyto/nuc threshold...')
print('extreme cytoplasmic %d genes' % lncRNA_gt_Zero)
print('extreme nuclear %d genes' % lncRNA_lt_Neg2)
lncRNA_Neg1_to_Zero = lncRNA_gt_Neg1 - lncRNA_gt_Zero
lncRNA_Neg2_to_Neg1 = lncRNA_lt_Neg1 - lncRNA_lt_Neg2
print('middle cytoplasmic %d genes excluded' % lncRNA_Neg1_to_Zero)
print('middle nuclear %d genes excluded' % lncRNA_Neg2_to_Neg1)

Using -2 and 0 as the cyto/nuc threshold...
extreme cytoplasmic 1704 genes
extreme nuclear 761 genes
middle cytoplasmic 1184 genes excluded
middle nuclear 894 genes excluded


In [17]:
cyto_multiplier = (lncRNA_Neg1_to_Zero/lncRNA_gt_Zero)
print('For extreme cyto count C, add back C * %5.3f genes' % cyto_multiplier)
print('For example, %d * %5.3f = %d' % (lncRNA_gt_Zero, cyto_multiplier, int(lncRNA_gt_Zero * cyto_multiplier)))

For extreme cyto count C, add back C * 0.695 genes
For example, 1704 * 0.695 = 1184


In [18]:
nuc_multiplier = (lncRNA_Neg2_to_Neg1/lncRNA_lt_Neg2)
print('For extreme nuc count N, add back N * %5.3f genes' % nuc_multiplier)
print('For example, %d * %5.3f = %d' % (lncRNA_lt_Neg2, nuc_multiplier, int(lncRNA_lt_Neg2 * nuc_multiplier)))

For extreme nuc count N, add back N * 1.175 genes
For example, 761 * 1.175 = 894
