## Let's predict the stability of ncORFs

- pepScore

In [45]:
import os
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq  # Import Seq from Bio.Seq

pepDir = "/data/genomics/marta/tools/PepScore"
riboDir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"

1. Calculate the false discovery rate (FDR) of ORF lengths considerating transcript lengths. 

`usage: perl CalculateFDR.ORFlength.pl -g SequenceFile -t lengthsOfInterest -o outputDir -s startCodon

	-g SequenceFile: input the random sequence file in the fasta format

	-t lengthsOfInterest: Tab delimited text file with two columns. The first column shows the transcript length, and the second shows the ORF length (e.g. 1000	36)
	
	-o outputDir: output director
	
	-r repeat [optional]: the number of times generating random transcript sequences, default: 1000
	
	-s startCodon [optional]: start codon types, default: ATG
	
	-l orfLengthCutoff [optional]: cutoff of minimum candidate ORF length, default: 6`

Example command line: perl CalculateFDR.ORFlength.pl -g rand.genome.fa -t transcript.length.txt -o outputDir

In [35]:
## Create fasta of translated sequences from candidates
specie = "human"
fasta=os.path.join("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction",specie,"240708_RiboNovel/Annotation/candidateORF.fa")
fasta_df_input = pd.read_csv(fasta, sep="\t", header=None)
fasta_df = pd.DataFrame({'header':fasta_df_input[0].iloc[::2].values, 'seq':fasta_df_input[0].iloc[1::2].values})
fasta_df['header'] = fasta_df.header.str[1:]
fasta_df


Unnamed: 0,header,seq
0,ENST00000431238.7:X:+|1|259:15:192|noncoding|CTG,CTGTACCGGCTGCATCCGGAGCAGGGCATGCCGGCCGGCGTGTGCG...
1,ENST00000431238.7:X:+|2|259:24:192|noncoding|CTG,CTGCATCCGGAGCAGGGCATGCCGGCCGGCGTGTGCGTGGACGCTG...
2,ENST00000431238.7:X:+|3|259:42:192|noncoding|ATG,ATGCCGGCCGGCGTGTGCGTGGACGCTGCGGGGAAGCTCTGGGTGG...
3,ENST00000431238.7:X:+|4|259:54:192|noncoding|GTG,GTGTGCGTGGACGCTGCGGGGAAGCTCTGGGTGGCCTCTGCGTCGA...
4,ENST00000431238.7:X:+|5|259:60:192|noncoding|GTG,GTGGACGCTGCGGGGAAGCTCTGGGTGGCCTCTGCGTCGATAGAGG...
...,...,...
8233062,ENST00000612925.1:KI270750.1:+|2|176:73:88|non...,CTGGATCCATCATAG
8233063,ENST00000612925.1:KI270750.1:+|3|176:102:150|n...,CTGGAATTGCCCCAAATGTGGGAAGCTCTACTGCAAAATTTTTGGTAG
8233064,ENST00000612925.1:KI270750.1:+|4|176:108:150|n...,TTGCCCCAAATGTGGGAAGCTCTACTGCAAAATTTTTGGTAG
8233065,ENST00000612925.1:KI270750.1:+|5|176:117:150|n...,ATGTGGGAAGCTCTACTGCAAAATTTTTGGTAG


In [50]:
dict_translated = dict()
dict_translated_prot = dict()

for infolder_results in os.listdir(os.path.join(riboDir,specie,"240724_RiboNovel/RibORF")):
    print(infolder_results)
    outfile=os.path.join(riboDir,specie,"240724_RiboNovel/RibORF",infolder_results,"repre.valid.pred.pvalue.parameters.fa")
    outfile_prot=os.path.join(riboDir,specie,"240724_RiboNovel/RibORF",infolder_results,"repre.valid.pred.pvalue.parameters.PROTEIN.fa")

    seqs_df = pd.read_csv(os.path.join(riboDir,specie,"240724_RiboNovel/RibORF",infolder_results,"repre.valid.pred.pvalue.parameters.txt"), sep="\t")
    candidates_in_repre = fasta_df[fasta_df['header'].isin(seqs_df.orfID.values.tolist())]

    with open(outfile, 'w') as out:
        with open(outfile_prot, 'w') as out_prot:
            for index, row in candidates_in_repre.iterrows():
                    out.write(">%s\n%s\n" %(row['header'], row['seq']))
                    ## add to non-redundant dictionary
                    dict_translated[row['header']] = row['seq']

                    ## translate to proteins
                    out_prot.write(">%s\n%s\n" %(row['header'], str(Seq(row['seq']).translate())))
                    ## add to non-redundant dictionary
                    dict_translated_prot[row['header']] = str(Seq(row['seq']).translate())

human_brain_ribo_3_r1
human_testis_ribo_3_r1
human_liver_ribo_1_r1
human_brain_ribo_1_r1
human_testis_ribo_2_r1
human_testis_ribo_1_r1
human_liver_ribo_3_r1
human_liver_ribo_2_r1
human_brain_ribo_2_r1


In [51]:
outfile_prot = os.path.join(riboDir,specie,"240724_RiboNovel/RibORF/repre.valid.pred.pvalue.parameters.allsamples.noredundant.PROTEIN.fa")
outfile = os.path.join(riboDir,specie,"240724_RiboNovel/RibORF/repre.valid.pred.pvalue.parameters.allsamples.noredundant.fa")

with open(outfile, 'w') as out:
    with open(outfile_prot, 'w') as out_prot:
        for key, value in dict_translated.items():
            out.write(">%s\n%s\n" %(key, value))

        ## proteins
        for key, value in dict_translated_prot.items():
            out_prot.write(">%s\n%s\n" %(key, value))



In [68]:
## get transcript Length
## Tab delimited text file with two columns. The first column shows the transcript length, and the second shows the ORF length (e.g. 1000	36)
df_translated = pd.DataFrame(dict_translated.items(), columns=['header', 'seq'])
## get transcript length
df_translated['transcriptLength'] = df_translated['header'].str.split("\|", expand=True)[2]
df_translated['transcriptLength'] = df_translated['transcriptLength'].str.split(":", expand=True)[0]
## get orf length
df_translated['orfLength_info'] = df_translated['header'].str.split("\|", expand=True)[2]
df_translated['endORF'] = df_translated['orfLength_info'].str.split(":", expand=True)[2]
df_translated['startORF'] = df_translated['orfLength_info'].str.split(":", expand=True)[1]
df_translated['orfLength'] = df_translated['endORF'].astype('int64') - df_translated['startORF'].astype('int64')
df_translated

transcript_length = df_translated[['transcriptLength', 'orfLength']]
transcript_length.to_csv(os.path.join(riboDir,specie,"240724_RiboNovel/RibORF/length_transcript_orf.allsamples.noredundant.txt"), header=None, sep="\t")

In [70]:
%%bash -s "$pepDir" "$riboDir"

specie="human"

module load Perl
module load R/4.2.1-foss-2020b

mkdir -p $2/$specie/stability/PepScore

perl $1/CalculateFDR.ORFlength.pl -g $2/$specie/240724_RiboNovel/RibORF/repre.valid.pred.pvalue.parameters.allsamples.noredundant.fa -t $2/$specie/240724_RiboNovel/RibORF/length_transcript_orf.allsamples.noredundant.txt -o $2/$specie/stability/PepScore