# IDENTIFICATION OF NOVEL CLASSES OF NEOANTIGENS IN CANCER | Translatable ORFs

In [None]:
%load_ext rpy2.ipython

## 0. Data preparation

This first cell should be modified according to the data that is going to be used. It is only available for datasets with paired samples per patient: normal and tumor. 

The **PROJECT** variable should be changed according to the GEO identifier.

From the GEO website, the *SRR_Acc_List.txt* and *SraRunTable.txt* files should be manually downloaded and saved in a directory. This directory should be specified in the **SRR** variable.

The pipeline is developed with the intention of running the most computationally expensive programs in a cluster. 
In this case, a Gluster File System has been used. The code to run on a cluster may need to be adapted.

In [None]:
import os,re,shutil,glob,openpyxl
import pandas as pd
from Bio import SeqIO
from gtfparse import read_gtf
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib import pyplot as plt
from IPython.display import Image

# Project configuration. Edit PROJECT (GEO accession) and the cluster / metadata
# paths below so they match the dataset being analysed.
PROJECT="GSE193567"

DIR=os.path.join("data",PROJECT) #path where to store all the intermediate steps and outputs of the pipeline

CLUSTERDIR="/users/genomics/marta" #path where to run and store things that run in a cluster
SRR="/projects_eg/datasets/"+PROJECT # path where SRR_Acc_List.txt and SraRunTable.txt are stored. It should be inside a folder named with GEO accession
SRR_ACC=os.path.join(SRR,"SRR_Acc_List.txt")
SRA=os.path.join(SRR,"SraRunTable.txt")

FASTQDIR=os.path.join(DIR,"fastq_files") #path where to store fastq files
GENOMEDIR="genomes"

# exist_ok=True keeps the cell re-runnable without a bare `except:` that would
# also hide real failures (e.g. permission errors). Each directory is created
# on its own, so an already existing "analysis" folder no longer prevents
# "results" from being created.
for new_dir in (DIR, FASTQDIR, os.path.join(DIR,"analysis"), os.path.join(DIR,"results")):
    os.makedirs(new_dir, exist_ok=True)

# Keep a copy of the GEO metadata next to the fastq files.
shutil.copy(SRR_ACC, os.path.join(FASTQDIR,"SRR_Acc_List.txt"))
shutil.copy(SRA, os.path.join(FASTQDIR,"SraRunTable.txt"))



In [None]:
%%R

require(tidyr)
require(dplyr)
require(rtracklayer)
#library(purrr)
require(ggplot2)
require(RColorBrewer)
require(devtools)
require(stringr)
require(edgeR)

Get a three-column file with patient_id, normal_id and tumor_id for later use.

In [None]:
# Build the patient table (patient id, normal run, tumor run) from the SRA run table.
# The setup cell copies SraRunTable.txt into FASTQDIR, so it is read from there;
# the previous path pointed at DIR, where this notebook never places the file.
metadata = pd.read_csv(os.path.join(FASTQDIR,"SraRunTable.txt"))
metadata = metadata[['Run','Individual','tissue']]

# One row per run; split by tissue label and pair the runs of the same individual.
normal = metadata[metadata['tissue'] == "non-tumor"]
normal = normal[['Individual','Run']]

tumor = metadata[metadata['tissue'] == "tumor"]
tumor = tumor[['Individual','Run']].rename(columns ={'Run' : 'Run_t'})

patients = pd.merge(normal, tumor, on=['Individual'])
# Keep only the second space-separated token of 'Individual'
# NOTE(review): assumes values look like "<label> <id>" in this dataset - confirm for other datasets
patients['Individual'] = patients['Individual'].str.split(' ').str[1]

# Header-less CSV (patient,normal,tumor) consumed by the %%bash cells.
patients_summary = os.path.join(DIR,"results/patients.csv")
patients.to_csv(patients_summary,index=False, header=False)

patients_id=list(patients.iloc[:,0])
normal_id=list(patients.iloc[:,1])
tumor_id=list(patients.iloc[:,2])

patients

## 09.CIPHER

CIPHER is a program for the prediction of coding sequences in transcripts. It calculates the coding score of each open reading frame (ORF) using a metric based on hexanucleotide frequencies (see Additional Information). It is especially well-suited to discovering new small translated proteins.

http://evolutionarygenomics.imim.es/cipher

https://github.com/jorruior/CIPHER/blob/master/cipher.py

We want to predict translatable open reading frames in both non-canonical regions (lncRNA and processed pseudogenes) and novel transcripts.

**CIPHER NOCDS selected**

In [None]:
%%bash -s "$DIR" "$patients_summary"
# Run CIPHER on the tumor-specific known non-coding transcripts (NOCDS selected) of every patient.
# $1: project directory (DIR)
# $2: header-less CSV with one "patient,normal,tumor" row per patient

module load Python/2.7.11

CIPHER_HOME=/genomics/users/marta/tools/CIPHER-master
DICODON_TABLE=$CIPHER_HOME/tables/hsa_coding_to_intron_dicodon_usage.obj

while IFS=, read -r p normal tumor; do
    INDIR=$1/analysis/08_tumor_specific/${p}
    OUTPUT=$1/analysis/09_CIPHER/$p
    # -p creates the parent 09_CIPHER folder when needed and does not fail on re-runs
    mkdir -p "$OUTPUT"

    INPUT=$INDIR/${p}_known_tumor_specific_genes_1FPKM_300kb_NOCDS_selected_gene.fa
    PREFIX=$OUTPUT/${p}_known_tumor_specific_genes_1FPKM_300kb_NOCDS_selected_8aa_cipher

    #predict the longest ORF
    python "$CIPHER_HOME/cipher.py" -i "$INPUT" -o "${PREFIX}_longest" -s human -x "$DICODON_TABLE" -t 8
    #predict all ORFs
    python "$CIPHER_HOME/cipher.py" -n all -i "$INPUT" -o "${PREFIX}_all" -s human -x "$DICODON_TABLE" -t 8
done < "$2"

Generate list file with ENST of NOCDS selected

In [None]:
# For every patient, list the transcripts with a CIPHER "longest ORF" prediction as
# transcript_id,transcript_type,transcript_name (one row per transcript id).
transcript_type_re = re.compile(r'.*transcript_type=(.*);.*')
transcript_name_re = re.compile(r'.*transcript_name=(.*)_.*')

for p in patients_id:
    INDIR=DIR+"/analysis/09_CIPHER/"+p
    filename= p + "_known_tumor_specific_genes_1FPKM_300kb_NOCDS_selected_8aa_cipher_longest_orfs.fa" #each identifier only once
    fasta_path = os.path.join(INDIR, filename)
    outname= p + "_NOCDS_id_list_FPKM1.csv"
    outfile= os.path.join(INDIR,outname)

    # Parse first, write afterwards: a missing or malformed input no longer leaves a
    # half-written CSV behind.
    identifiers = dict()
    for seq_record in SeqIO.parse(fasta_path, 'fasta'):
        type_match = transcript_type_re.search(seq_record.id)
        name_match = transcript_name_re.search(seq_record.id)
        if type_match is None or name_match is None:
            raise ValueError("Unexpected FASTA header in %s: %s" %(fasta_path, seq_record.id))
        #considering ENST identifiers have 15 characters
        identifiers[seq_record.id[0:15]] = (type_match.group(1), name_match.group(1))

    with open(outfile, 'w') as out:
        out.write('transcript_id,transcript_type,transcript_name\n')
        # Loop names deliberately avoid `type`, which would overwrite the builtin
        # for every later cell of the notebook.
        for enst_id, (transcript_type, transcript_name) in identifiers.items():
            out.write('%s,%s,%s\n' %(enst_id, transcript_type, transcript_name))

For the NOCDS, compare with `nuORFdb` 

Ouspenskaia T, Law T, Clauser KR, Klaeger S, Sarkizova S, Aguet F, Li B, Christian E, Knisbacher BA, Le PM, Hartigan CR, Keshishian H, Apffel A, Oliveira G, Zhang W, Chen S, Chow YT, Ji Z, Jungreis I, Shukla SA, Justesen S, Bachireddy P, Kellis M, Getz G, Hacohen N, Keskin DB, Carr SA, Wu CJ, Regev A. Unannotated proteins expand the MHC-I-restricted immunopeptidome in cancer. Nat Biotechnol. 2022 Feb;40(2):209-217. doi: 10.1038/s41587-021-01021-3. Epub 2021 Oct 18. PMID: 34663921.


**nuORFdb**

In [None]:
%%bash -s "$GENOMEDIR"
# Download the nuORFdb v1.0 BED file into $1/nuORFdb and decompress it.
# $1: genomes directory (GENOMEDIR)

mkdir -p "$1/nuORFdb"
cd "$1/nuORFdb" || exit 1

# -O pins the file name so re-running overwrites instead of creating ".1" copies.
# The next cell reads the uncompressed .bed, hence the gunzip.
wget -O GSE143263_nuORFdb_v1.0.bed.gz https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143263/suppl/GSE143263_nuORFdb_v1.0.bed.gz \
    && gunzip -f GSE143263_nuORFdb_v1.0.bed.gz

In [None]:
# Extract the identifier prefix (first 15 characters of BED column 4) of every
# nuORFdb entry and store it as a one-column CSV for the comparison cell.
nuORFdb = os.path.join(GENOMEDIR,"nuORFdb/GSE143263_nuORFdb_v1.0.bed")
# Fall back to the compressed download when it has not been gunzipped;
# pandas infers gzip compression from the .gz extension.
if not os.path.exists(nuORFdb) and os.path.exists(nuORFdb + ".gz"):
    nuORFdb = nuORFdb + ".gz"

# Output path of the identifier list. It was referenced but never defined in the
# notebook, which raised NameError on a fresh kernel.
# NOTE(review): file name chosen here - adjust if an existing list lives elsewhere.
nuORFdb_list = os.path.join(GENOMEDIR,"nuORFdb/nuORFdb_transcript_ids.csv")

nuORFdb_bed = pd.read_csv(nuORFdb, sep="\t", header=None)
# presumably column 4 starts with the 15-character ENST identifier - TODO confirm
nuORFdb_identifiers = nuORFdb_bed.iloc[:,3].str[:15]
nuORFdb_identifiers.to_csv(nuORFdb_list, index=False, header=False)

In [None]:
#select those lncRNA and pseudogenes we considered with coidng potential and also did Ouspenskaia et al. The ORFs does not need to be the same. It's just orientative
nuorfdb_df = pd.read_csv(nuORFdb_list, header=None)
nuorfdb_list = nuorfdb_df[0].to_list()

try:
    os.mkdir(DIR+"/analysis/nuORFdb")
except:
    print("The directory is not create because it already exists")
    
for p in patients_id:
    NOCDS=DIR+"/analysis/09_CIPHER/"+p+"/"+p+"_NOCDS_id_list_FPKM1.csv" #from the transcris with coding potential predictred by cipher
    OUTNOCDS=DIR+"/analysis/nuORFdb/" + p + "_NOCDS_transcripts_in_db.csv" #save those that are also considered in nuORFdb
    
    nocds_df = pd.read_csv(NOCDS, names=['gene_id','transcript_type','transcript_name'], header=None)
    shared_nocds = nocds_df[nocds_df.gene_id.isin(nuorfdb_list)]
    print(p, " nocds FPKM1: ",len(shared_nocds))
    shared_nocds.to_csv(OUTNOCDS, index=None, header=None)



and `TransLNC`

Dezhong Lv, Zhenghong Chang, Yangyang Cai, Junyi Li, Liping Wang, Qiushuang Jiang, Kang Xu, Na Ding, Xia Li, Juan Xu, Yongsheng Li, TransLnc: a comprehensive resource for translatable lncRNAs extends immunopeptidome, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D413–D420, https://doi.org/10.1093/nar/gkab847


**TransLnc**

In [None]:
%%bash -s "$GENOMEDIR"
# Download the TransLnc peptide table into $1/translnc.
# $1: genomes directory (GENOMEDIR)

mkdir -p "$1/translnc"
cd "$1/translnc" || exit 1

# -O pins the local file name, so re-running overwrites the file instead of
# creating "lncRNA_peptide_information.txt.1".
wget -O lncRNA_peptide_information.txt http://bio-bigdata.hrbmu.edu.cn/TransLnc/download/lncRNA_peptide_information.txt

In [None]:
# Store the unique values of the 10th column of the TransLnc table as a one-column CSV.
# The download cell saves the table as "lncRNA_peptide_information.txt"; the old
# "?x=3" name is still accepted when only that legacy file is present.
translnc=os.path.join(GENOMEDIR,"translnc/lncRNA_peptide_information.txt")
if not os.path.exists(translnc) and os.path.exists(translnc + "?x=3"):
    translnc = translnc + "?x=3"

# Output path of the identifier list. It was referenced but never defined in the
# notebook, which raised NameError on a fresh kernel.
# NOTE(review): file name chosen here - adjust if an existing list lives elsewhere.
translnc_list_dir = os.path.join(GENOMEDIR,"translnc/translnc_lncRNA_names.csv")

df = pd.read_csv(translnc, sep="\t")
# drop_duplicates keeps first-seen order, so the output is reproducible
# (a set has arbitrary order). Missing values are dropped; they would only
# produce blank lines that read_csv skips when the list is loaded again.
# presumably column 10 holds the lncRNA transcript name matched later - TODO confirm
translnc_identifiers_unique = df.iloc[:,9].dropna().drop_duplicates()
translnc_identifiers = translnc_identifiers_unique.to_frame()
translnc_identifiers.to_csv(translnc_list_dir, index=False, header=False)

In [None]:
#get a list of the lncRNAs we considered as potentially coding and also did TransLnc. The ORF does not need to be the same. It's just and approximation.
translnc_df = pd.read_csv(translnc_list_dir, header=None)
translnc_list = translnc_df[0].to_list()

try:
    os.mkdir(DIR+"/analysis/translnc")
except:
    print("The directory is not create because it already exists")
    
for p in patients_id:    
    OUT=DIR+"/analysis/translnc/" + p + "_lncRNA_transcripts_in_db.csv"
    translnc_df = pd.read_csv(translnc_list_dir, header=None)
    translnc_list = translnc_df[0].tolist()
    NOCDS=DIR+"/analysis/09_CIPHER/"+p+"/"+p+"_NOCDS_id_list_FPKM1.csv"
    NOCDS_df = pd.read_csv(NOCDS, names=['gene_id', 'transcript_type', 'transcript_name'])
    NOCDS_df['transcript_name'] = NOCDS_df['transcript_name']
    shared_NOCDS = NOCDS_df[NOCDS_df.transcript_name.isin(translnc_list)]

    print(p + " lncRNA FPKM1: ", len(shared_NOCDS))
    shared_NOCDS.to_csv(OUT, sep="\t", index=None)

Check which NOCDS selected genes are shared across patients

In [None]:
%%R -i DIR,patients
#generate a file with all lncRNA and the information about whether they are private or shared between patients of the same dataset 
############PATIENTS INFO#############
colnames(patients) <- c("patient", "normal", "tumor")
patients_full <- patients %>% pivot_longer(cols = !patient, names_to = "normal_tumor", values_to = "sample")

# Empty template; the per-patient tables are appended to it below.
all <- data.frame(transcript_id = character(),
                  transcript_type = factor(), 
                  transcript_name = character(),
                  patient = factor(),
                  stringsAsFactors=FALSE)

# seq_len() runs zero iterations for an empty patient table (1:nrow() would yield 1, 0)
for (i in seq_len(nrow(patients))) {
  
  df <- read.csv(paste0(DIR, "/analysis/09_CIPHER/",patients[i,1],"/",patients[i,1],"_NOCDS_id_list_FPKM1.csv"),header=TRUE)
  # same column name as in the template above
  df$patient <- patients[i,1]
  all <- rbind(all,df)
}  

# n = number of patient lists in which each transcript appears
times <- all %>% count(transcript_id)
times_ordered <- times[order(times$n, decreasing=TRUE),]
print(times_ordered %>% head)

##add gene id
t_g_id <- read.csv("/projects_eg/projects/marta/biomart.v38.geneid_transid.csv")[,c('Gene_stable_ID', 'Transcript_stable_ID', 'Gene_name')]
names(t_g_id) <- c("gene_id","transcript_id","gene_name")
total <- merge(times_ordered,t_g_id, by="transcript_id")
# reorder to transcript_id, gene_id, gene_name, n
total <- total[,c(1,3,4,2)]
total_ordered <- total[order(total$n, decreasing=TRUE),]
total_ordered <- total_ordered %>% distinct()

write.csv(total_ordered, file.path(DIR,"analysis/09_CIPHER/common_noncoding_genes.csv"),row.names = FALSE, quote = FALSE)

shared_plot <- ggplot(times, aes(x=n)) +
  geom_bar(fill="#004D40") +
  geom_text(stat="count", aes(label=..count..), vjust=-1, size = 2.5) +
  ggtitle("Common non-coding genes across patients | lncRNA & processed_pseudogenes") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), plot.title = element_text(face="bold", size = 9), legend.position = "none") 

# results/plots is not created anywhere else in this notebook; ggsave fails without it
dir.create(file.path(DIR,"results/plots"), recursive = TRUE, showWarnings = FALSE)
ggsave(file.path(DIR,"results/plots/common_noncoding_genes_patients.png"), plot = shared_plot)


**Novel genes**

In [None]:
%%bash -s "$DIR" "$patients_summary" "$GENOMEDIR"
# Extract the transcript sequences of the novel tumor-specific genes of every patient.
# $1: project directory (DIR)
# $2: header-less CSV with one "patient,normal,tumor" row per patient
# $3: genomes directory (GENOMEDIR)

export PATH=/genomics/users/marta/tools/gffread-0.12.7.Linux_x86_64/:$PATH
GENOME_FASTA=$3/GRCh38/GRCh38.primary_assembly.genome.fa

while IFS=, read -r p normal tumor; do
    file=$1/analysis/08_tumor_specific/${p}/${p}_novel_tumor_specific_genes_1FPKM_300kb.gtf
    if [ ! -f "$file" ]; then
        echo "$p has no novel genes GTF, skipping" >&2
        continue
    fi
    #get fasta. Little and not informative header is generated
    # ${file%.gtf} strips only the extension; ${file%%.*} cut at the FIRST dot of the
    # whole path and broke as soon as a directory name contained a dot.
    gffread -w "${file%.gtf}.fa" -g "$GENOME_FASTA" "$file"
done < "$2"


In [None]:
#an informative heaer with the coodinates and chromosome is added
for p in patients_id:
    print(p)
    INDIR=DIR+"/analysis/08_tumor_specific/"+p
    try:
        df = read_gtf(INDIR+"/" + p + "_novel_tumor_specific_genes_1FPKM_300kb.gtf")
        t_df = df.loc[df['feature'] == "transcript"]
        g_id = list(t_df['gene_id'])
    except:
        print("%s patients has no novel genes" %(p))

    for file in os.listdir(DIR):
        if "novel" in file and file.endswith(".fa"):

            full_file = os.path.join(DIR,file)
            outname = file[:-3] + "_fullheader.fa"
            out = os.path.join(DIR,outname)
            with open(out, 'w') as outfile:
                for seq_record in SeqIO.parse(full_file, 'fasta'):
                    identifier = str(seq_record.id)
                    name=identifier[:-2]
                    #if identifier.startswith("STRG"):
                    #    name=re.findall(r"[^.]*.[^.]*", identifier)[0]
                    row_index = g_id.index(name)
                    info_to_add = t_df.iloc[row_index,[0,3,4]]
                    listToStr = ','.join(map(str, info_to_add))
                    outfile.write(">%s,%s\n%s\n" %(identifier,listToStr,str(seq_record.seq)))



**CIPHER on NOVEL GENES**

Both strands are being considered, since we do not have information about the strand for novel genes.

In [None]:
%%bash -s "$DIR" "$patients_summary"
# Run CIPHER on the forward-strand sequences of the novel tumor-specific genes of every patient.
# $1: project directory (DIR)
# $2: header-less CSV with one "patient,normal,tumor" row per patient
module load Python/2.7.11

CIPHER_HOME=/genomics/users/marta/tools/CIPHER-master
DICODON_TABLE=$CIPHER_HOME/tables/hsa_coding_to_intron_dicodon_usage.obj

while IFS=, read -r p normal tumor; do
    INDIR=$1/analysis/08_tumor_specific/$p
    OUTPUT=$1/analysis/09_CIPHER/$p
    # do not depend on another cell having created the output folder
    mkdir -p "$OUTPUT"

    INPUT=$INDIR/${p}_novel_tumor_specific_genes_1FPKM_300kb_fullheader.fa
    PREFIX=$OUTPUT/${p}_novel_tumor_specific_genes_1FPKM_300kb_8aa_cipher
    if [ ! -f "$INPUT" ]; then
        echo "$p has no novel genes FASTA, skipping" >&2
        continue
    fi

    #predict the longest ORF
    python "$CIPHER_HOME/cipher.py" -i "$INPUT" -o "${PREFIX}_longest" -s human -x "$DICODON_TABLE" -t 8
    #predict all ORFs
    python "$CIPHER_HOME/cipher.py" -n all -i "$INPUT" -o "${PREFIX}_all" -s human -x "$DICODON_TABLE" -t 8

done < "$2"

In [None]:
#get reverse strand sequence
for p in patients_id:
    file=DIR+"/analysis/08_tumor_specific/"+p+"/"+p+"_novel_tumor_specific_genes_1FPKM_300kb.fa"
    output=DIR+"/analysis/08_tumor_specific/"+p+"/"+p+"_novel_tumor_specific_genes_1FPKM_300kb_REVERSE.fa"
    with open(output, 'w') as out:
        for seq_record in SeqIO.parse(file, 'fasta'):
            out.write(">%s\n%s\n" %(seq_record.id, seq_record.seq.complement()))

In [None]:
#add coordinates to the header. Since it's reversed strand, the coordinates are not well added. They give an idea of the paired forward gene
for p in patients_id:
    print(p)
    INDIR=DIR+"/analysis/08_tumor_specific/"+p+"/"
    try:
        df = read_gtf(INDIR+ p + "_novel_tumor_specific_genes_1FPKM_300kb.gtf")
        t_df = df.loc[df['feature'] == "transcript"]
        g_id = list(t_df['gene_id'])
    except:
        print("%s patients has no novel genes" %(p))

    for file in os.listdir(INDIR):
        if "novel" in file and file.endswith("REVERSE.fa"):

            full_file = os.path.join(INDIR,file)
            outname = file[:-3] + "_fullheader.fa"
            out = os.path.join(INDIR,outname)
            with open(out, 'w') as outfile:
                for seq_record in SeqIO.parse(full_file, 'fasta'):
                    identifier = str(seq_record.id)
                    name=identifier[:-2]
                    #if identifier.startswith("STRG"):
                    #    name=re.findall(r"[^.]*.[^.]*", identifier)[0]
                    row_index = g_id.index(name)
                    info_to_add = t_df.iloc[row_index,[0,3,4]]
                    listToStr = ','.join(map(str, info_to_add))
                    outfile.write(">R%s,%s\n%s\n" %(identifier,listToStr,str(seq_record.seq)))



In [None]:
%%bash -s "$DIR" "$patients_summary"
# Run CIPHER on the reverse-strand sequences of the novel tumor-specific genes of every patient.
# $1: project directory (DIR)
# $2: header-less CSV with one "patient,normal,tumor" row per patient
module load Python/2.7.11

CIPHER_HOME=/genomics/users/marta/tools/CIPHER-master
DICODON_TABLE=$CIPHER_HOME/tables/hsa_coding_to_intron_dicodon_usage.obj

while IFS=, read -r p normal tumor; do
    INDIR=$1/analysis/08_tumor_specific/$p
    OUTPUT=$1/analysis/09_CIPHER/$p
    # do not depend on another cell having created the output folder
    mkdir -p "$OUTPUT"

    INPUT=$INDIR/${p}_novel_tumor_specific_genes_1FPKM_300kb_REVERSE_fullheader.fa
    PREFIX=$OUTPUT/${p}_novel_tumor_specific_genes_1FPKM_300kb_REVERSE_8aa_cipher
    if [ ! -f "$INPUT" ]; then
        echo "$p has no reverse-strand novel genes FASTA, skipping" >&2
        continue
    fi

    #predict the longest ORF
    python "$CIPHER_HOME/cipher.py" -i "$INPUT" -o "${PREFIX}_longest" -s human -x "$DICODON_TABLE" -t 8
    #predict all ORFs
    python "$CIPHER_HOME/cipher.py" -n all -i "$INPUT" -o "${PREFIX}_all" -s human -x "$DICODON_TABLE" -t 8
done < "$2"

Merge all non-canonical information (both lncRNA/processed pseudogenes(NOCDS selected) and novel genes)

In [None]:
%%bash -s "$DIR" "$patients_summary"
# Concatenate, per patient, the CIPHER "all ORFs" FASTA files (novel forward, novel reverse,
# known NOCDS selected) into <p>_noncanonical_all_orfs_FPKM1.fa.
# $1: project directory (DIR)
# $2: header-less CSV with one "patient,normal,tumor" row per patient

while IFS=, read -r p normal tumor; do
    INDIR=$1/analysis/09_CIPHER/${p}
    merged=${INDIR}/${p}_noncanonical_all_orfs_FPKM1.fa
    listfiles="${p}_novel_tumor_specific_genes_1FPKM_300kb_8aa_cipher_all_orfs.fa ${p}_novel_tumor_specific_genes_1FPKM_300kb_REVERSE_8aa_cipher_all_orfs.fa ${p}_known_tumor_specific_genes_1FPKM_300kb_NOCDS_selected_8aa_cipher_all_orfs.fa"

    # Start from an empty file for every patient. The previous cleanup ran before the
    # loop, when INDIR and p were still unset, so re-runs appended duplicated records.
    : > "$merged"

    for file in $listfiles; do
        if [ -f "$INDIR/$file" ]; then
            cat "$INDIR/$file" >> "$merged" #generate a file with all the novel open reading frames
        else
            echo "$p: $file not found, skipped" >&2
        fi
    done
done < "$2"

Considering all potential ORFs can generate some redundancies that we are not interested in.

In [None]:
#remove ORF redundancies. If an ORF is inside another one under the same identifier, the nested is removed.
for p in patients_id:
    fasta_file=DIR+"/analysis/09_CIPHER/" + p + "/" + p + "_noncanonical_all_orfs_FPKM1.fa"
    fasta_dict = dict()
    IDs= []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):
        
        if seq_record.id.startswith('ENST'):
            identifier=seq_record.id[:-2]
            #identifier=re.findall(r"[^.]*", seq_record.id)[0]
            if identifier not in IDs:
                IDs.append(identifier)
                fasta_dict[identifier] = {}
            if identifier in IDs:
                orf=re.findall(r".*transcript_name=.*_(.*)", seq_record.id)[0]
                fasta_dict[str(identifier)][orf] = str(seq_record.seq)
            #print(identifier)
        elif seq_record.id.startswith('STRG') or seq_record.id.startswith('RSTRG'):
            identifier=seq_record.id.split("_")[0]
            if identifier not in IDs:
                IDs.append(identifier)
                fasta_dict[identifier] = {}
            if identifier in IDs:
                orf=seq_record.id.split("_")[1]
                fasta_dict[str(identifier)][orf] = str(seq_record.seq)
            #print(identifier)

    clean_dict = {}
    duplis = {}
    total = 0

    length_to_delete = 0
    summary_out=DIR+"/results/predicted_ORF.txt"
    with open(summary_out, 'w') as summary:
        with open(DIR+"/analysis/09_CIPHER/" + p +"/id_num_orfs.txt",'w') as num_orfs: # file with the number of orfs per identifier
            for item in IDs:
                individual_dict = fasta_dict[item]
                total = total + len(individual_dict)
                num_orfs.write('%s\t%s\n' %(item,len(individual_dict)))

                clean_dict[item] = {}
                if len(individual_dict) == 1:
                    clean_dict[item] = individual_dict
                    next

                else:
                    count = 0
                    to_delete = []
                    for orf, initial_seq in individual_dict.items():
                        values=list(individual_dict.values())
                        for seq in values:
                            if values[count] in seq and values[count] != seq:
                                name=item + "_" + orf
                                to_delete.append(values[count])
                                duplis[name] = values[count]
                        count += 1

                    length_to_delete =length_to_delete + len(to_delete)
                    myDict = {key:val for key, val in individual_dict.items() if val not in to_delete}
                    clean_dict[item] = myDict
        summary.write(p)
        summary.write("initial: "+str(total))

        DUPLIS=DIR+"/analysis/09_CIPHER/"+p+"/" + p + "_noncanonical_all_orfs_FPKM1_duplis.fa"
        with open(DUPLIS, 'w') as d:
            summary.write("duplis: "+str(len(duplis)))
            for k, v in duplis.items():
                d.write(">%s\n%s\n" %(k,v))

        OUT=DIR+"/analysis/09_CIPHER/" + p + "/" + p + "_noncanonical_all_orfs_FPKM1_notduplis.fa"
        with open(OUT, 'w') as out:
            length = 0
            for k, v in clean_dict.items():
                length = length + len(v)
                for orf, seq in v.items():
                    out.write('>%s_%s\n%s\n' %(k,orf,seq))
            summary.write("norepetead: "+str(length))


In [None]:
# For every patient, write the unique identifiers found in the merged ORF FASTA:
# STRG/RSTRG records contribute their "<prefix>.<n>" part, ENST records their first 15 characters.
for p in patients_id:
    INDIR=DIR+"/analysis/09_CIPHER/" + p 
    filename= p + "_noncanonical_all_orfs_FPKM1.fa"
    file = os.path.join(INDIR, filename)
    outname= p + "_noncanonical_id_list_FPKM1.csv" #list with all the noncanonical orfs
    outfile= os.path.join(INDIR,outname)

    # dict used as an ordered set: unique values in first-seen order, so the output is
    # reproducible between runs (iteration order of a set of strings is not).
    unique_ids = {}
    for seq_record in SeqIO.parse(file, 'fasta'):
        if seq_record.id.startswith("STRG") or seq_record.id.startswith("RSTRG"):
            name=re.findall(r"[^.]*.[^.]*", seq_record.id)[0]
        elif seq_record.id.startswith("ENST"):
            # Shorten BEFORE de-duplicating; shortening afterwards wrote the same
            # ENST identifier once per ORF.
            name=seq_record.id[0:15]
        else:
            name=seq_record.id
        unique_ids[name] = None

    with open(outfile, 'w') as out:
        for name in unique_ids:
            out.write(name + "\n")

**Translate the ORFs into potential protein sequences**

In [None]:
# Translate the non-redundant ORFs of every patient into protein sequences
# (<p>_noncanonical_all_orfs_FPKM1_notduplis_PROTEIN.fa).
for p in patients_id:
    INDIR=DIR+"/analysis/09_CIPHER/"+p

    file= p + "_noncanonical_all_orfs_FPKM1_notduplis.fa"
    outfile = file[:-3] + "_PROTEIN.fa"
    with open (os.path.join(INDIR,outfile), 'w') as out:
        for seq_record in SeqIO.parse(os.path.join(INDIR,file), "fasta"):
            identif=str(seq_record.id)
            sequence=str(seq_record.seq.translate())
            # Remove the terminal stop symbol only when it is there; an unconditional
            # [:-1] dropped the last residue of ORFs that do not end in a stop codon.
            if sequence.endswith("*"):
                sequence = sequence[:-1]
            out.write(">" + identif + "\n" + sequence + "\n")
