In [1]:
from collections import defaultdict
import sys
import os
import pandas as pd
# Add directory above current directory to path horrible but working
import sys; sys.path.insert(0, '..')
import circRNA

In [2]:
def eprint(*args, **kwargs):
    print(*args,  file=sys.stderr, **kwargs)

    
def junctionkeys(circ):
    if circ.strand == "+":
        key_donor = "%s:%d:%s:d" % (circ.chrom, circ.end+1, circ.strand)
        key_acceptor = "%s:%d:%s:a" % (circ.chrom, circ.start-1, circ.strand)
    else:
        key_donor = "%s:%d:%s:d" % (circ.chrom, circ.start-1, circ.strand)
        key_acceptor = "%s:%d:%s:a" % (circ.chrom, circ.end+1, circ.strand)
    return key_donor, key_acceptor

    
def read_circrnas(sample, circrna_bed, nccr_cutoff=5):
    crnas = circRNA.read_annotation(circrna_bed, fmt="bed")
    selected = (circ for circ in crnas if circ.nccr > nccr_cutoff)
    circ_rnas_exp = []
    for circ in selected:
        if int(circ.nb_ccr) > nccr_cutoff:
            key_donor, key_acceptor = junctionkeys(circ)
            circ_rnas_exp.append([key_donor, circ.nccr])
            circ_rnas_exp.append([key_acceptor, circ.nccr ])
    # Je nomme sample la colonne avec le nb_ccr pour préparer la réunion de tous les individus au sein d'un même fichier
    return pd.DataFrame(circ_rnas_exp, columns=['junction',sample+"-circ"]).set_index('junction')

In [3]:
LOCALDIR = "/home/tfaraut/mnt/genologin/dynawork/CircRNA/"
result_dir = LOCALDIR + "circRNA/results"

samples_unit = pd.read_csv("samples_cow_testis_neonat.tsv", sep='\t', dtype=str)
samples = list(set(samples_unit['sample']))

circrnas = []
for sample in samples:
    eprint("reading cirrnas for %s" % sample)
    circrna_bed = os.path.join(result_dir, sample, "circrnas.bed")
    circrnas.append(read_circrnas(sample, circrna_bed, nccr_cutoff=5))

reading cirrnas for cow-testis-neonat3
reading cirrnas for cow-testis-neonat2
reading cirrnas for cow-testis-neonat1


In [4]:
# Join all the dataframes (one per individual) 
# using key : chr:start-end:strand:t where t is a or d (acceptor or donnor)
df = circrnas[0].join(circrnas[1:]).fillna(0)
# Selecting rows that have a minimum of 5 ccr for all individuals
df = df[df.min(axis=1)>5]

In [5]:
df.head()

Unnamed: 0_level_0,cow-testis-neonat3-circ,cow-testis-neonat2-circ,cow-testis-neonat1-circ
junction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10:100125352:+:a,15,16.0,11.0
10:100133931:+:a,19,14.0,36.0
10:100134061:+:d,15,16.0,11.0
10:100149043:+:d,19,14.0,36.0
10:100149043:+:d,8,14.0,36.0


In [6]:
df.to_csv("cirRNAcounts_singlejunction_cow_testis.tsv", sep="\t", index=True)