In [6]:
# imports:
import circRNA as circ
from collections import defaultdict
import sys, os, re, argparse, natsort
import pandas as pd
import numpy as np
from tqdm import tqdm
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# functions:
def read_annotation(file):
    df = pd.read_table(file, sep = '\t', header=0)
    annot_df = df.replace(np.nan, '', regex=True) 
    return annot_df

def compute_nb_transcript_type(circ_rnas, df):
    nb_annot_tot = len(df)
    nb_annot, nb_annot_start_end = 0, 0
    nb_annot_start, nb_annot_end = 0, 0
    nb_codant, nb_codant_start_end, nb_mono = 0, 0, 0
    nb_no_annot = 0
    nb_intronic = 0
    nb_ccr_exonic_a = []
    nb_ccr_intronic_a = []
    
    intronics_a = []
    exonics_circ_names = []
    infra_exonics_a = []
    
    intronics_cas_1_sens = []
    intronics_cas_2_sens = []
    intronics_cas_3_sens = []
    intronics_cas_4_sens = []
    intronics_cas_5_sens = []
    intronics_cas_6_sens = []
    intronics_cas_7_sens = []
    intronics_cas_8_sens = []

    intronics_cas_1_antisens = []
    intronics_cas_2_antisens = []
    intronics_cas_3_antisens = []
    intronics_cas_4_antisens = []
    intronics_cas_5_antisens = []
    intronics_cas_6_antisens = []
    intronics_cas_7_antisens = []
    intronics_cas_8_antisens = []
    
    for index, row in df.iterrows():
                  
               
        # Exoniques:
        if row.nb_ccr >= 5:
            exons_id_start_str = str(row.exons_id_start)
            exons_id_end_str = str(row.exons_id_end)
            codant_circ_start = re.findall(r'\b(\w+_c)\b', exons_id_start_str)
            codant_circ_end = re.findall(r'\b(\w+_c)\b', exons_id_end_str)
            if (len(row.exons_id_start) or len(row.exons_id_end)) > 0:
                nb_annot += 1
                exonics_circ_names.append(row.circ_rna_name)
            if (len(row.exons_id_start) and len(row.exons_id_end)) > 0:
                nb_annot_start_end += 1
            if len(row.exons_id_start)>0 and len(row.exons_id_end)==0:
                nb_annot_start += 1
            if len(row.exons_id_end)>0 and len(row.exons_id_start)==0:
                nb_annot_end += 1
            if (len(codant_circ_start) or len(codant_circ_end)) > 0:
                nb_codant += 1
            if len(row.intron_name) > 0:
                nb_intronic += 1                
            if (row.exons_id_start and row.exons_id_end and row.transcript_id_start 
                and row.transcript_id_end and row.gene_id_start and row.gene_id_end
                and row.intron_name and row.start_i and row.end_i and row.exon_id_i
                and row.gene_id_i and row.gene_id_ife) !="":
                nb_no_annot += 1
            
            # Infra_exoniques:
            
            if len(row.gene_id_ife) > 0:
                infra_exonics_a.append(row)
            
            # Introniques:  
            
            if row.start_i and row.end_i != '':  
                
                if row.strand == "+":

                    if row.start == row.start_i:
                        intronics_cas_1_sens.append(row.circ_rna_name)

                    if (row.end_i - row.end) in range(-2,32):
                        intronics_cas_2_sens.append(row.circ_rna_name)

                    if (row.start - row.start_i) in range(-2,2):
                        intronics_cas_3_sens.append(row.circ_rna_name)

                    if ((row.start==row.start_i)
                    and ((row.end_i - row.end) in range(-2,32))):
                        intronics_cas_4_sens.append(row.circ_rna_name)

                    if (((row.start - row.start_i) in range(-2,2)) 
                    and ((row.end_i - row.end) in range(-2,32))):
                        intronics_cas_5_sens.append(row.circ_rna_name)

                    if ((((row.start - row.start_i) in range(-2,2)) 
                    or (row.start==row.start_i))
                    and ((row.end_i - row.end) in range(-2,32))):
                        intronics_cas_6_sens.append(row.circ_rna_name) ##
                        intronics_a.append(row)

                    if (row.end_i - row.end) > 32:
                        intronics_cas_7_sens.append(row.circ_rna_name)

                    if ((row.start == row.start_i) 
                    and ((row.end_i - row.end) > 32)):
                        intronics_cas_8_sens.append(row.circ_rna_name) ## 
                        intronics_a.append(row)


                elif row.strand == "-":

                    if row.end == row.end_i:
                        intronics_cas_1_antisens.append(row.circ_rna_name)

                    if (row.start - row.start_i) in range(-2,2):
                        intronics_cas_2_antisens.append(row.circ_rna_name) 

                    if (row.start - row.start_i) in range(-2,32):
                        intronics_cas_3_antisens.append(row.circ_rna_name) 

                    if ((row.end == row.end_i) 
                    and ((row.start - row.start_i) in range(-2,32))):
                        intronics_cas_4_antisens.append(row.circ_rna_name)

                    if (((row.end - row.end_i) in range(-2,2)) 
                    and ((row.start - row.start_i) in range(-2,32))):
                        intronics_cas_5_antisens.append(row.circ_rna_name)                       

                    if ((((row.end - row.end_i) in range(-2,2)) 
                    or (row.end == row.end_i))
                    and ((row.start - row.start_i) in range(-2,32))):
                        intronics_cas_6_antisens.append(row.circ_rna_name) ##
                        intronics_a.append(row)


                    if ((row.end - row.end_i) in range (-2,2)):
                        intronics_cas_7_antisens.append(row.circ_rna_name) 

                    if ((row.end == row.end_i)
                    and ((row.start - row.start_i) > 32)):
                        intronics_cas_8_antisens.append(row.circ_rna_name) ##
                        intronics_a.append(row)

    
    # Introniques:
    print("Brin + (cas 1) : ", len(intronics_cas_1_sens))
    print("Brin + (cas 2) : ", len(intronics_cas_2_sens))
    print("Brin + (cas 3) : ", len(intronics_cas_3_sens))
    print("Brin + (cas 4) : ", len(intronics_cas_4_sens))
    print("Brin + (cas 5) : ", len(intronics_cas_5_sens))
    print("Brin + (cas 6) : ", len(intronics_cas_6_sens))
    print("Brin + (cas 7) : ", len(intronics_cas_7_sens))
    print("Brin + (cas 8) : ", len(intronics_cas_8_sens))

    print("Brin - (cas 1) : ", len(intronics_cas_1_antisens))
    print("Brin - (cas 2) : ", len(intronics_cas_2_antisens))
    print("Brin - (cas 3) : ", len(intronics_cas_3_antisens))
    print("Brin - (cas 4) : ", len(intronics_cas_4_antisens))
    print("Brin - (cas 5) : ", len(intronics_cas_5_antisens))
    print("Brin - (cas 6) : ", len(intronics_cas_6_antisens))
    print("Brin - (cas 7) : ", len(intronics_cas_7_antisens))
    print("Brin - (cas 8) : ", len(intronics_cas_8_antisens))
    
    intronics_circ_names = list(set(intronics_cas_6_sens+intronics_cas_8_sens+
                                    intronics_cas_6_antisens+intronics_cas_8_antisens))
    nb_tot = len(intronics_circ_names)
    print("NB_intronics:", nb_tot)
    print("NB_not_annotated:", nb_no_annot)

#"chrom", "start", "end", "strand", "nb_ccr", 
#"circ_rna_name", "exons_id_start", "exons_id_end", 
#"transcript_id_start", "transcript_id_end", 
#"gene_id_start", "gene_id_end", 
#"intron_name", "start_i", "end_i", "exon_id_i", "gene_id_i",
#"ife_name", "ife_start", "ife_end", "gene_id_ife"
    
    
    #with open("../annotations_introns_exons_infra_e/2_2/intron_derived_circRNAs_f_1.tsv", "w") as txt_file:
        #for line in intronics_a:                            
            #txt_file.write("\t".join(map(str, s)))
            #txt_file.write("\n")
    
    # Infra_exoniques:
    with open("../annotations_introns_exons_infra_e/2_2/infra_exoniques_f_0_95.tsv", "w") as txt_file:
        for line in infra_exonics_a: 
            
            #s = [line.chrom, line.start, line.end, line.strand, line.nb_ccr, 
            #     line.circ_rna_name, line.exons_id_start, line.exons_id_end, 
            #     line.transcript_id_start, line.transcript_id_end, 
            #     line.gene_id_start, line.gene_id_end, line.intron_name, 
            #     line.start_i, line.end_i, line.exon_id_i, line.gene_id_i,
            #     line.gene_id_ife]
            
            s = [line.chrom, line.start, line.end, line.strand, line.nb_ccr, 
                 line.circ_rna_name]
            
            #s = [line.chrom, line.start, line.end, line.strand, line.nb_ccr, line.circ_rna_name, 
             #   line.exons_id_start, line.exons_id_end, line.transcript_id_start, line.transcript_id_end, 
            #    line.gene_id_start, line.gene_id_end, line.intron_name, line.start_i, line.end_i, line.intron]
                
            
            txt_file.write("\t".join(map(str, s)))
            txt_file.write("\n")
   
# main: 
circ_rnas_31 = circ.read_annotation("../results_pig_testis_31/circ_rnas.bed", "bed")

annot_31 = read_annotation("../annotations_introns_exons_infra_e/annotation_circRNAs_f_0_95.out")
#annot_31 = read_annotation("../annotations_introns_exons/annotation_circRNAs_f_0_95.out")

stats = compute_nb_transcript_type(circ_rnas_31, annot_31)
print(stats)

Brin + (cas 1) :  2
Brin + (cas 2) :  95
Brin + (cas 3) :  100
Brin + (cas 4) :  2
Brin + (cas 5) :  88
Brin + (cas 6) :  88
Brin + (cas 7) :  706
Brin + (cas 8) :  0
Brin - (cas 1) :  3
Brin - (cas 2) :  2
Brin - (cas 3) :  79
Brin - (cas 4) :  3
Brin - (cas 5) :  70
Brin - (cas 6) :  70
Brin - (cas 7) :  71
Brin - (cas 8) :  0
NB_intronics: 158
NB_not_annotated: 41
None
