In [1]:
import os
import pandas as pd

# Root directory of the analysis. The %%bash cells below receive it as $1
# and read/write files under its RibORF/ sub-directory.
riboDir = (
    "/users/genomics/marta/TestisProject_SaraRazquin/"
    "with_TranscriptomeReconstruction/v47/human/"
    "RiboNovel_MultMap_1to1"
)

In [None]:
%%bash -s "$riboDir"

# Convert every RibORF genePred of valid ORFs into CDS-only BED files.
#
# $1 is the analysis root (riboDir); the inputs are
#   $1/RibORF/<sample>/repre.valid.ORF.genepred.txt
#
# Files written next to each input (<prefix> = input path without ".txt"):
#   <prefix>.bed                    BED rebuilt from the CDS-only genePred
#   <prefix>.slopped.bed            same, extended by 3 bases (bedtools slop)
#   <prefix>.slopped.adjusted.bed   output of adjust_bed12_after_slop.py
#   <prefix>.CDS.genePred           CDS-only genePred
#   <prefix>.CDS.genepred.txt       the original input, renamed
#
# NOTE: the input file is renamed at the end of each iteration, so running
# this cell a second time finds nothing left to convert.

module load ucsc_tools
module load Python/3.8.6-GCCcore-10.2.0
module load BEDTools

# Stop at the first failing command so that a broken conversion never reaches
# the rm/mv steps below. Enabled only after the module loads so the strict
# flags do not apply to the `module` calls themselves.
set -euo pipefail

genome_file=/projects_eg/projects/marta/chromosomes_order.NOchr.txt
adjust_script=/projects_eg/projects/chris/Pipelines/ORF_conservation/adjust_bed12_after_slop.py

for file in "$1"/RibORF/*/repre.valid.ORF.genepred.txt; do

    # An unmatched glob is left as the literal pattern; skip it instead of
    # running the tools on a file that does not exist.
    if [ ! -e "$file" ]; then
        echo "No genePred input found for: $file" >&2
        continue
    fi

    echo "$file"

    out="${file%.txt}"

    # Transform the genePred into GTF
    genePredToGtf file source="ORF" "$file" "${out}.tmp.gtf"
    # Extract only the CDS lines of the GTF
    # (grep exits non-zero when there is no CDS line, which aborts the cell)
    grep -P "\tCDS\t" "${out}.tmp.gtf" > "${out}.tmp.CDS.gtf"

    # Regenerate a GenePred file only of the CDS but this time the ORFs are correctly mapped
    gtfToGenePred "${out}.tmp.CDS.gtf" "${out}.tmp.genePred"
    genePredToBed "${out}.tmp.genePred" "${out}.tmp.bed"

    mv "${out}.tmp.bed" "${out}.bed"

    # Extend every feature by 3 bases on its strand-aware right side
    # (-s with -l 0 -r 3) -- presumably to include the stop codon; TODO confirm
    bedtools slop -i "${out}.bed" \
        -g "$genome_file" \
        -s -l 0 -r 3 > "${out}.slopped.bed"

    # Post-process the slopped BED with the project script (its behaviour is
    # not visible from this notebook)
    python "$adjust_script" \
        -i "${out}.slopped.bed" \
        -o "${out}.slopped.adjusted.bed"

    rm "${out}.tmp.CDS.gtf" "${out}.tmp.gtf"

    # Re-name the files
    mv "${out}.txt" "${out}.CDS.genepred.txt"
    mv "${out}.tmp.genePred" "${out}.CDS.genePred"

done

In [None]:
%%bash -s "$riboDir"

## one bed file per tissue
#
# $1 is the analysis root (riboDir). For each tissue, the slopped/adjusted BED
# files of every sample directory matching human*<tissue>* are concatenated
# into $1/RibORF/<tissue>.bed and then de-duplicated into
# $1/RibORF/valid.ORF.<tissue>.bed.

set -euo pipefail
# Make an unmatched glob expand to nothing so it can be detected below
shopt -s nullglob

tissues=(testis liver brain)

for t in "${tissues[@]}"; do

    echo "$t"

    inputs=( "$1"/RibORF/human*"${t}"*/repre.valid.ORF.genepred.slopped.adjusted.bed )

    # Without any input the original code silently wrote empty BED files;
    # fail loudly instead so the problem is noticed here and not downstream.
    if [ "${#inputs[@]}" -eq 0 ]; then
        echo "No slopped/adjusted BED files found for tissue '${t}' under $1/RibORF" >&2
        exit 1
    fi

    cat "${inputs[@]}" > "$1/RibORF/${t}.bed"
    sort "$1/RibORF/${t}.bed" | uniq > "$1/RibORF/valid.ORF.${t}.bed"

done

Brain Only in R

In [None]:
# Load the ORF table used for the brain-only subset; its `orfID` column is
# what the following cell matches against the BED name field.
brainOnly_csv = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/brain/brainRestricted_GTEx_translatedONLYbrain.csv"
brainOnly = pd.read_csv(brainOnly_csv)
brainOnly

Unnamed: 0,gene_id,gene_name,orfID,transcript_id,gene_type,orfType,length,geneORFtype,length_aa,start_codon,ORFpep,TranslatedLiver,TranslatedTestis
0,ENSG00000022355,GABRA1,ENST00000428797.7:5:+|11|4285:227:341|uORF|CTG,ENST00000428797,protein_coding,uORF,114,protein_coding_uORF,38,CTG,MARPDSDSQTRAGSSLSTILSPRLFPGLKRSCVQRGP*,no,no
1,ENSG00000022355,GABRA1,ENST00000428797.7:5:+|1|4285:42:96|uORF|CTG,ENST00000428797,protein_coding,uORF,54,protein_coding_uORF,18,CTG,MHLPVSELAFIHMQAVV*,no,no
2,ENSG00000022355,GABRA1,ENST00000428797.7:5:+|4|4285:109:196|uORF|CTG,ENST00000428797,protein_coding,uORF,87,protein_coding_uORF,29,CTG,MQIGYWEANLGVKSSAKEHAESMMAQTK*,no,no
3,ENSG00000101958,GLRA2,ENST00000218075.9:X:+|7|3208:338:452|uORF|ATG,ENST00000218075,protein_coding,uORF,114,protein_coding_uORF,38,ATG,MMPRTGTFSFFSANCTKPNLFLIFKETRFLPNFESGQ*,no,no
4,ENSG00000101958,GLRA2,ENST00000218075.9:X:+|15|3208:523:1882|canonic...,ENST00000218075,protein_coding,canonical,1359,protein_coding_canonical,453,ATG,MNRQLVNILTALFAFFLETNHFRTAFCKDHDSRSGKQPSQTLSPSD...,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,ENSG00000288563,ENSG00000288563,ENST00000673949.1:4:-|3|4225:62:107|noncoding|ATG,ENST00000673949,lncRNA,noncoding,45,lncRNA_noncoding,15,ATG,MKRIQLVGISREDA*,no,no
117,ENSG00000288563,ENSG00000288563,ENST00000673949.1:4:-|7|4225:99:192|noncoding|ATG,ENST00000673949,lncRNA,noncoding,93,lncRNA_noncoding,31,ATG,MHDSFMRRLMNKGLKMMMKSTECSNPACQC*,no,no
118,ENSG00000289441,ENSG00000289441,ENST00000728103.1:10:+|31|1182:529:700|noncodi...,ENST00000728103,lncRNA,noncoding,171,lncRNA_noncoding,57,ATG,MPTEPPHRKTGTRMALCKDSHQECQSQRLRDQRTQKGDLKNSCLGA...,no,no
119,ENSG00000289462,ENSG00000289462,ENST00000689020.2:5:-|18|2391:364:619|noncodin...,ENST00000689020,lncRNA,noncoding,255,lncRNA_noncoding,85,ATG,MPLQGLREGSARRLVGNAFLPLNTRLISLKHTMYLSCPKDDDRWFF...,no,no


In [None]:
# Subset the brain BED to the ORFs listed in `brainOnly` and export the result.
#
# The BED path is derived from `riboDir` (same location the bash cells write
# to) instead of repeating the absolute path. dtype=str keeps every BED field
# exactly as written in the file: no numeric inference and no mixed int/str
# chromosome column.
bedBrain = pd.read_csv(
    os.path.join(riboDir, "RibORF", "valid.ORF.brain.bed"),
    sep="\t",
    header=None,
    dtype=str,
)

# Column 3 is the BED name field, matched here against the orfID values
bedBrainOnly = bedBrain[bedBrain[3].isin(brainOnly["orfID"])]
bedBrainOnly.to_csv(
    "/projects_eg/projects/marta/250225_BrainMPs_Gabriel/brainOnly_GTEx_Wang.bed",
    sep="\t",
    index=False,
    header=False,
)

Brain & Testis

In [13]:
# Export the BED entries of the ORFs listed in the brain & testis table.
#
# Both tissue BED files are read as plain strings (dtype=str) so that the same
# ORF yields byte-identical rows in the two frames regardless of how pandas
# would otherwise infer column types.
bedBrain = pd.read_csv(
    os.path.join(riboDir, "RibORF", "valid.ORF.brain.bed"),
    sep="\t",
    header=None,
    dtype=str,
)
bedTestis = pd.read_csv(
    os.path.join(riboDir, "RibORF", "valid.ORF.testis.bed"),
    sep="\t",
    header=None,
    dtype=str,
)

bothBrainTestis = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/brain/brainTestis_Restricted_GTEx_translated.csv")

# An ORF present in both tissue files appears twice after the concat;
# drop_duplicates() keeps a single copy of each identical BED row.
# Column 3 is the BED name field, matched here against the orfID values.
bedBrainTestis = (
    pd.concat([bedBrain, bedTestis])
    .loc[lambda bed: bed[3].isin(bothBrainTestis["orfID"])]
    .drop_duplicates()
)
bedBrainTestis.to_csv(
    "/projects_eg/projects/marta/250225_BrainMPs_Gabriel/BrainTestis_GTEx_Wang.bed",
    sep="\t",
    index=False,
    header=False,
)