# TACOS
TACOS [web server](https://balalab-skku.org/TACOS/)

20 random genes per category from cell line NHEK

In [1]:
import traceback

import numpy as np
# Seed NumPy's global random generator so the gene shuffles are repeatable.
np.random.seed(seed=1234)

# Upper bound on the number of sequences written to each FASTA file.
FASTA_SIZE = 20
# Path prefix for the FASTA files this notebook writes.
PREFIX = 'TACOS_input/NHEK'
# Path prefix for the CSV result files obtained from TACOS.
RESULTS = 'TACOS_output/NHEK'
# Cell line to select when reading the lncATLAS file.
CELL_LINE = 'NHEK'

In [2]:
# Input files (or links to them) are expected in the current directory.
GENCODE = 'gencode.v44.lncRNA_transcripts.fa'  # from the GenCode downloads
LNCATLAS = 'lncATLAS_all_data_RCI.csv'  # from the lncATLAS downloads
# Canonical transcript IDs, extracted from the GenCode GFF file.
CANONICAL = 'gencode.v44.long_noncoding_RNAs.canonical_transcripts.txt'

In [3]:
def load_transcripts(seq_file,ids_file):
    """Load the sequences of selected transcripts from a FASTA file.

    Parameters
    ----------
    seq_file : str
        Path to a FASTA file whose header lines have the form
        '>transcript_id.version|gene_id.version|...'.
    ids_file : str
        Path to a text file with one transcript ID per line. IDs are
        compared against the FASTA transcript ID with its '.version'
        suffix removed. Blank lines are ignored.

    Returns
    -------
    gid_to_index : dict
        Maps each gene ID (version suffix removed) to the index of its
        sequence in `sequences`. If several selected transcripts share a
        gene ID, the last one in the file wins.
    sequences : list of str
        One sequence per selected transcript, in file order, with the
        FASTA line breaks removed.
    """
    # A set is the natural container for membership tests. Blank lines are
    # skipped so a trailing newline cannot register '' as a wanted ID.
    with open(ids_file, 'r') as fin:
        good_tids = {tid for tid in (line.strip() for line in fin) if tid}

    gid_to_index = dict()
    sequences = list()
    chunks = []               # lines of the sequence currently being read
    loading_sequence = False  # True while inside a selected transcript
    with open(seq_file, 'r') as fin:
        for line in fin:
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the record that was being collected.
                if loading_sequence:
                    sequences.append(''.join(chunks))
                    loading_sequence = False
                    chunks = []
                fields = line[1:].split('|')
                tid = fields[0].split('.')[0]
                gid = fields[1].split('.')[0]
                if tid in good_tids:
                    # The sequence will be appended at this index once
                    # the record is complete.
                    gid_to_index[gid] = len(sequences)
                    loading_sequence = True
            elif loading_sequence:
                chunks.append(line)
    # The last record has no following header to close it.
    if loading_sequence:
        sequences.append(''.join(chunks))
    return gid_to_index,sequences

In [4]:
# Load the canonical transcripts and report how many were found.
gid_to_index, sequences = load_transcripts(seq_file=GENCODE, ids_file=CANONICAL)
print('We have %d gene ids.' % len(gid_to_index))
print('We have %d transcript sequences.' % len(sequences))

We have 19922 gene ids.
We have 19922 transcript sequences.


In [5]:
def load_rci_truth(filename,cell_line):
    """Read one RCI value per gene for a single cell line from a CSV file.

    The first line is treated as a header and skipped. A row is kept when
    column 6 is 'nc', column 1 equals `cell_line`, column 2 is 'CNRCI'
    and column 3 is not 'NA'.

    Parameters
    ----------
    filename : str
        Path to the comma-separated input file.
    cell_line : str
        Value that column 1 must match for a row to be used.

    Returns
    -------
    dict
        Maps gene ID (column 0) to its RCI value (column 3) as a float.

    Raises
    ------
    Exception
        If a gene has more than one qualifying row. Any error raised
        while handling a row is re-raised unchanged after the offending
        line and the traceback have been printed.
    """
    all_rci = dict()
    with open (filename, 'r') as fin:
        header = None
        for line in fin:
            try:
                fields = line.strip().split(',')
                if header is None:
                    header = fields
                    continue
                gid = fields[0]
                cell_type = fields[1]
                rci_type = fields[2]
                rci_value = fields[3]
                gene_type = fields[6]
                if (gene_type == 'nc' and
                        cell_type == cell_line and
                        rci_type == 'CNRCI' and
                        rci_value != 'NA'):
                    rci_value = float(rci_value)
                    if gid in all_rci:
                        raise Exception('Another value for gene',gid)
                    all_rci[gid] = rci_value
            except Exception:
                # Show which input line failed before propagating. This
                # needs the traceback module in scope; without it the
                # handler itself fails and hides the real error.
                print(line)
                traceback.print_exc()
                raise
    return all_rci

In [6]:
# RCI values for the configured cell line.
all_rci = load_rci_truth(filename=LNCATLAS, cell_line=CELL_LINE)
print('We have an RCI for %d genes.' % len(all_rci))

We have an RCI for 1588 genes.


In [7]:
def split_rci(gene_to_rci, low_threshold=-1, high_threshold=1):
    """Partition genes into low, middle and high groups by RCI value.

    Parameters
    ----------
    gene_to_rci : dict
        Maps gene ID to a numeric RCI value.
    low_threshold : number, optional
        Values strictly below this go to the low group (default -1).
    high_threshold : number, optional
        Values strictly above this go to the high group (default 1).

    Returns
    -------
    (low_rci, middle_rci, high_rci) : tuple of dict
        Three dicts with the same gene -> value entries as the input.
        Values equal to either threshold fall in the middle group.
    """
    low_rci = dict()
    middle_rci = dict()
    high_rci = dict()
    for gene, value in gene_to_rci.items():
        if value < low_threshold:
            low_rci[gene] = value
        elif value > high_threshold:
            high_rci[gene] = value
        else:
            middle_rci[gene] = value
    return low_rci,middle_rci,high_rci

In [8]:
# Partition the loaded genes into the three RCI groups.
low_rci, middle_rci, high_rci = split_rci(gene_to_rci=all_rci)

In [9]:
# Report the size of each RCI group.
print('Loaded gene RCI values for cell line %s.'%CELL_LINE)
for label, group in (('Total :', all_rci),
                     ('High  :', high_rci),
                     ('Middle:', middle_rci),
                     ('Low   :', low_rci)):
    print(label, len(group))

Loaded gene RCI values for cell line NHEK.
Total : 1588
High  : 185
Middle: 620
Low   : 783


In [10]:
def randomize_genes(rci_values):
    """Return the keys of `rci_values` as a list in random order.

    The order is drawn from NumPy's global random state, so it depends on
    the current seed and on any random draws made earlier.
    """
    shuffled = [gene for gene in rci_values]
    np.random.shuffle(shuffled)  # reorders the list in place
    return shuffled

In [11]:
def write_fasta(output_filename, gene_list, max_seqs,
               gid_to_index, sequences):
    """Write up to `max_seqs` sequences to a FASTA file.

    Parameters
    ----------
    output_filename : str
        Path of the file to create or overwrite.
    gene_list : iterable of str
        Gene IDs in the order they should be considered.
    max_seqs : int
        Maximum number of records to write. Zero or a negative number
        produces an empty file.
    gid_to_index : dict
        Maps gene ID to an index into `sequences`. Genes missing from
        this mapping are skipped and do not count toward `max_seqs`.
    sequences : list of str
        Sequence strings addressed by `gid_to_index`.

    Each record is written as a '>gene' header line followed by the whole
    sequence on a single line. Fewer than `max_seqs` records are written
    if `gene_list` runs out of usable genes.
    """
    with open (output_filename,'w') as fout:
        num_seqs = 0
        for gene in gene_list:
            # Test the limit before writing, so a non-positive limit
            # never emits a record.
            if num_seqs >= max_seqs:
                break
            if gene not in gid_to_index:
                continue
            seq_ndx = gid_to_index[gene]
            print('>'+gene, file=fout)
            print(sequences[seq_ndx], file=fout)
            num_seqs += 1

In [12]:
# Low-RCI genes, shuffled, go to the 'nega' FASTA file.
fn = PREFIX+'.nega.fasta'
random_low = randomize_genes(low_rci)
write_fasta(output_filename=fn, gene_list=random_low, max_seqs=FASTA_SIZE,
            gid_to_index=gid_to_index, sequences=sequences)

In [13]:
# Middle-RCI genes, shuffled, go to the 'zero' FASTA file.
fn = PREFIX+'.zero.fasta'
random_middle = randomize_genes(middle_rci)
write_fasta(output_filename=fn, gene_list=random_middle, max_seqs=FASTA_SIZE,
            gid_to_index=gid_to_index, sequences=sequences)

In [14]:
# High-RCI genes, shuffled, go to the 'posi' FASTA file.
fn = PREFIX+'.posi.fasta'
random_high = randomize_genes(high_rci)
write_fasta(output_filename=fn, gene_list=random_high, max_seqs=FASTA_SIZE,
            gid_to_index=gid_to_index, sequences=sequences)

## Process results from TACOS web server

In [15]:
nuc_label = 'Nucleus'
cyto_label = 'Cytoplasm'
def parse_tacos_output(filename,truth,gene_to_rci=None):
    """Score the predictions in one TACOS result CSV and print a summary.

    The first non-blank line is skipped as a header. Every other
    non-blank row is read as: column 0 = gene ID, column 1 = predicted
    category, column 2 = probability.

    Parameters
    ----------
    filename : str
        Path to the comma-separated results file.
    truth : str or number
        If `gene_to_rci` is None, the label every row is expected to
        have (nuc_label or cyto_label). Otherwise an RCI threshold:
        genes whose RCI is below it are expected to be nuc_label and all
        others cyto_label.
    gene_to_rci : dict, optional
        Maps gene ID to RCI value. Required when `truth` is a threshold.

    Prints the filename, the mean of column 2, the correct/incorrect
    counts and the accuracy rounded to the nearest percent. If the file
    has no data rows, prints a short notice instead of the statistics.
    Returns None.

    Raises
    ------
    KeyError
        In threshold mode, if a gene ID in the file is missing from
        `gene_to_rci`.
    """
    # The threshold is the same for every row, so convert it once.
    threshold = None if gene_to_rci is None else float(truth)
    probabilities = []
    correct = 0
    incorrect = 0
    with open (filename, 'r') as fin:
        header = None
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = line.split(',')
            if header is None:
                header = fields
                continue
            gene_id = fields[0]
            category = fields[1]
            prob = float(fields[2])
            probabilities.append(prob)
            if gene_to_rci is not None:
                # Derive the expected label from the gene's RCI value.
                this_rci = gene_to_rci[gene_id]
                if this_rci < threshold:
                    right_label = nuc_label
                else:
                    right_label = cyto_label
            else:
                right_label = truth
            if category == right_label:
                correct += 1
            else:
                incorrect += 1
    print(filename)
    total = correct + incorrect
    if total == 0:
        # Avoid dividing by zero and averaging an empty list.
        print('No predictions found.')
        return
    print('Average score: %f' % np.mean(probabilities))
    print('Correct / Incorrect: %d/%d' % (correct,incorrect))
    print('Accuracy: %d%%' % (int(0.5+100*correct/total)))
In [16]:
# Low-RCI set: every gene is expected to be called nuclear.
print('negative')
fn = RESULTS+'.nega.csv'
parse_tacos_output(fn, nuc_label)

# Middle-RCI set: score the same predictions against several RCI thresholds.
fn = RESULTS+'.zero.csv'
for label, threshold in (('zero -1.0', -1),
                         ('zero -0.5', -0.5),
                         ('zero 0.0', 0),
                         ('zero 0.5', 0.5),
                         ('zero 1.0', 1)):
    print(label)
    parse_tacos_output(fn, threshold, all_rci)

# High-RCI set: every gene is expected to be called cytoplasmic.
print('positive')
fn = RESULTS+'.posi.csv'
parse_tacos_output(fn, cyto_label)

negative
TACOS_output/NHEK.nega.csv
Average score: 0.465750
Correct / Incorrect: 16/4
Accuracy: 80%
zero -1.0
TACOS_output/NHEK.zero.csv
Average score: 0.468300
Correct / Incorrect: 4/16
Accuracy: 20%
zero -0.5
TACOS_output/NHEK.zero.csv
Average score: 0.468300
Correct / Incorrect: 9/11
Accuracy: 45%
zero 0.0
TACOS_output/NHEK.zero.csv
Average score: 0.468300
Correct / Incorrect: 9/11
Accuracy: 45%
zero 0.5
TACOS_output/NHEK.zero.csv
Average score: 0.468300
Correct / Incorrect: 14/6
Accuracy: 70%
zero 1.0
TACOS_output/NHEK.zero.csv
Average score: 0.468300
Correct / Incorrect: 16/4
Accuracy: 80%
positive
TACOS_output/NHEK.posi.csv
Average score: 0.568100
Correct / Incorrect: 18/2
Accuracy: 90%


In [17]:
# Final cell: reaching this line means every cell above ran without raising.
print('done')

done
