## Evolutionary origin

Macaca - Mouse - Human

In [9]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
GENOMEDIR = "/genomics/users/marta/genomes"
species=["mouse","macaca","platypus","chicken","opossum"]

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

## evoDir
evoDir=os.path.join(users_dir,"EvolutionaryOrigin_MMseqs")

def translate_dna_to_protein(dna_seq):
    return str(Seq(dna_seq).translate())

# Function to save DF in fasta format
def create_seqrecord_PROT(row):
    return SeqRecord(Seq(row['protein']), id=row['header'][:-4], description="")
def create_seqrecord_DNA(row):
    return SeqRecord(Seq(row['seq']), id=row['header'], description="")



In [10]:
## get repre sequences - both DNA and PROTEINS
for s in species:
    print(s)
    riboseq_dir = os.path.join(users_dir,s,"RiboSeq/RiboQC_RiboNovel")

    ## get all candidates
    print("Reading all the candidate ORF sequences")
    df = pd.read_csv(os.path.join(riboseq_dir,"Annotation/candidateORF.fa"), header=None, sep="\t")
    ## convert to df
    candidates_fasta = pd.DataFrame({'header':df[0].iloc[::2].values, 'seq':df[0].iloc[1::2].values})
    candidates_fasta['header'] = candidates_fasta['header'].str[1:]
    candidates_fasta = candidates_fasta[~candidates_fasta['header'].str.contains("iORF")]

    all_repre_candidates = pd.DataFrame()
    for sample in os.listdir(os.path.join(riboseq_dir,"RibORF")):
        if sample.endswith("r1"):
            print(sample)

            repre = pd.read_csv(os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.txt"), sep="\t")
            repre_fasta = candidates_fasta[candidates_fasta['header'].isin(repre.orfID.values.tolist())]
            repre_fasta['header'] = s + "_" +repre_fasta['header']

            ## keep only the coding and the non-coding
            repre_fasta = repre_fasta[repre_fasta['header'].str.contains("canonical")| repre_fasta['header'].str.contains("noncoding")]

            repre_fasta['protein'] = repre_fasta['seq'].apply(translate_dna_to_protein)
            all_repre_candidates = pd.concat([all_repre_candidates, repre_fasta])

            ## translate to protein
            print("Translating to protein")
            out_proteins = os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.PROTEIN.fa")
            seq_records_proteins = repre_fasta.apply(create_seqrecord_PROT, axis=1).tolist()
            SeqIO.write(seq_records_proteins, out_proteins, "fasta")

            ## keep the dna sequences as well
            print("Saving the original DNA ORF")
            out_dna = os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.DNA.fa")
            seq_records_dna = repre_fasta.apply(create_seqrecord_DNA, axis=1).tolist()
            SeqIO.write(seq_records_dna, out_dna, "fasta")

            ## get only the nonredundant
            print("Getting the non-redundant")
            all_repre_candidates.drop_duplicates(inplace=True)
            out_nonredundant_proteins = os.path.join(riboseq_dir,"RibORF")+"/"+s+"_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa"
            seq_records_proteins = all_repre_candidates.apply(create_seqrecord_PROT, axis=1).tolist()
            SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

mouse
Reading all the candidate ORF sequences
mouse_liver_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_liver_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_roundSpermatids_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_liver_ribo_5_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_brain_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_brain_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_roundSpermatids_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_spermatozoa_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_testis_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_testis_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_elongatingSpermatids_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_elongatingSpermatids_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_spermatocytes_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_testis_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_brain_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_spermatocytes_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_spermatozoa_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_liver_ribo_4_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
mouse_liver_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaca
Reading all the candidate ORF sequences
macaque_liver_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_liver_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_liver_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_testis_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_brain_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_testis_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_testis_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_brain_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
macaque_brain_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus
Reading all the candidate ORF sequences
platypus_testis_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_brain_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_brain_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_liver_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_testis_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_liver_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_liver_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_brain_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
platypus_testis_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken
Reading all the candidate ORF sequences
chicken_brain_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_brain_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_liver_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_testis_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_liver_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_testis_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_testis_ribo_2_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_liver_ribo_1_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_liver_ribo_4_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_liver_ribo_5_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
chicken_brain_ribo_3_r1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repre_fasta['header'] = s + "_" +repre_fasta['header']


Translating to protein
Saving the original DNA ORF
Getting the non-redundant
opossum
Reading all the candidate ORF sequences


OSError: [Errno 40] Too many levels of symbolic links: '/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/Annotation/candidateORF.fa'

## MMseqs2

Against all expressed in mouse, macaca, platypus, opossum in any tissue (non-redunant set)

#### TESTIS-EXPRESSED

In [4]:
## Testis-expressed ORFs human
TestisExpressed_translatedINtestis = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanTestis_in1.csv")
TestisExpressed_translatedINtestis

def create_seqrecord_PROThuman(row):
    return SeqRecord(Seq(row['ORFpep']), id=row['orfID'], description="")

out_nonredundant_proteins = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa"
seq_records_proteins = TestisExpressed_translatedINtestis.apply(create_seqrecord_PROThuman, axis=1).tolist()
SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

15040

Creating database of human (expressed in testis)

In [12]:
%%bash -s "$evoDir" "$users_dir"

# mkdir $1/MMseqsDB

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa $1/MMseqsDB/ribORF_humanTestis_in1_DB


createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin/MMseqsDB/ribORF_humanTestis_in1_DB 

MMseqs Version:       	14.7e284
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[=
Time for merging to ribORF_humanTestis_in1_DB_h: 0h 0m 0s 518ms
Time for merging to ribORF_humanTestis_in1_DB: 0h 0m 0s 708ms
Database type: Aminoacid
Time for processing: 0h 0m 4s 150ms


Creating a joint database with all the specie, everything translated, no matter the tissue

In [11]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

# species="platypus mouse macaca chicken"
# for s in $species; do
#     echo $s

#     ### create DB
#     mmseqs createdb $2/$s/RiboSeq/RiboQC_RiboNovel/RibORF/repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa $1/MMseqsDB/${s}_noredunantProts_DB

# done

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa $1/MMseqsDB/speciesDB



/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/speciesDB exists and will be overwritten
createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/platypus/RiboSe

Creating a joint database with all the specie AND human for clustering, everything translated for the species, no matter the tissue, but only translated in 1 for testis

In [12]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa $1/MMseqsDB/species_and_humanDB



/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanDB exists and will be overwritten
createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/platy

Searching

In [15]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2
mkdir -p $1/Searching

rm $1/Searching/resultDB*
## The alignment consists of two steps the prefilter and alignment. To run the search, type:
mmseqs search -a -s 6 $1/MMseqsDB/ribORF_humanTestis_in1_DB $1/MMseqsDB/speciesDB $1/Searching/resultDB $1/tmp

# Then, convert the result database into a BLAST tab formatted file (option -m 8 in legacy blast, -outfmt 6 in blast+):
# Format alignment output	query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits

mmseqs convertalis $1/MMseqsDB/ribORF_humanTestis_in1_DB $1/MMseqsDB/speciesDB $1/Searching/resultDB $1/Searching/resultDB.m8

search -a -s 6 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanTestis_in1_DB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/speciesDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/Searching/resultDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	true
Alignment mode                         	2
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold                      	0.001
Seq. id. threshold                     	0
Min alignment length                   	0
Seq. id. mode                          	0
Altern

Clustering

In [16]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mkdir -p $1/Clustering

rm -r $1/Clustering/species_and_human_cluDB*

## cluster all species and human
mmseqs cluster -c 0.5 --min-seq-id 0.5 $1/MMseqsDB/species_and_humanDB $1/Clustering/species_and_human_cluDB $1/tmp
mmseqs createtsv $1/MMseqsDB/species_and_humanDB $1/MMseqsDB/species_and_humanDB $1/Clustering/species_and_human_cluDB $1/Clustering/species_and_human_cluDB.tsv

cluster -c 0.5 --min-seq-id 0.5 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/Clustering/species_and_human_cluDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                     	14.7e284
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit            

## Analysis

In [33]:
transcript_gene

Unnamed: 0,gene_id,transcript_id,gene_type,gene_name
0,ENSG00000210049,ENST00000387314,Mt_tRNA,MT-TF
1,ENSG00000211459,ENST00000389680,Mt_rRNA,MT-RNR1
2,ENSG00000210077,ENST00000387342,Mt_tRNA,MT-TV
3,ENSG00000210082,ENST00000387347,Mt_rRNA,MT-RNR2
4,ENSG00000209082,ENST00000386347,Mt_tRNA,MT-TL1
...,...,...,...,...
62857,XLOC_000009,TCONS_00000010,novel,XLOC_000009
62858,XLOC_000009,TCONS_00000008,novel,XLOC_000009
62859,XLOC_000010,TCONS_00000011,novel,XLOC_000010
62860,XLOC_000011,TCONS_00000013,novel,XLOC_000011


In [36]:
clustering = pd.read_csv(os.path.join(evoDir,"Clustering/species_and_human_cluDB.tsv"), header=None, sep="\t")
clustering = clustering[clustering[0].str.startswith("ENST")]
clustering['transcript_id'] = clustering[0].str.split(":", expand=True)[0]
clustering['transcript_id'] = clustering[0].str.split(".", expand=True)[0]

clustering = clustering.merge(transcript_gene, on="transcript_id")
clustering = clustering[["gene_name","gene_type",0,1]]
## non-coding
noncoding = clustering[clustering[0].str.contains("noncoding")]
noncoding.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_speciesDB.tsv"), header=None, index=None, sep="\t")

noncoding_conserved = noncoding[~noncoding[1].str.startswith("ENST")]
noncoding_conserved.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved noncoding human ORFs")
print(len(set(noncoding_conserved[0].values.tolist())))

noncoding_NonConserved = noncoding[noncoding[1].str.startswith("ENST")]
noncoding_NonConserved.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved noncoding human ORFs")
print(len(set(noncoding_NonConserved[0].values.tolist())))


## canonical
canonical = clustering[clustering[0].str.contains("canonical")]
canonical.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_speciesDB.tsv"), header=None, index=None, sep="\t")

canonical_conserved = canonical[~canonical[1].str.startswith("ENST")]
canonical_conserved.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs")
print(len(set(canonical_conserved[0].values.tolist())))

canonical_conserved_noncoding = canonical_conserved[~canonical_conserved[1].str.contains("noncoding")]
canonical_conserved_noncoding.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_ConservedAsNoncoding.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as noncoding in other species")
print(len(set(canonical_conserved_noncoding[0].values.tolist())))

canonical_conserved_canonical = canonical_conserved[~canonical_conserved[1].str.contains("canonical")]
canonical_conserved_canonical.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_ConservedAsCanonical.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as canonical in other species")
print(len(set(canonical_conserved_canonical[0].values.tolist())))

canonical_NonConserved = canonical[canonical[1].str.startswith("ENST")]
canonical_NonConserved.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved canonical human ORFs")
print(len(set(canonical_NonConserved[0].values.tolist())))


Conserved noncoding human ORFs
81
Non-Conserved noncoding human ORFs
1580
Conserved canonical human ORFs
2025
Conserved canonical human ORFs as noncoding in other species
1999
Conserved canonical human ORFs as canonical in other species
353
Non-Conserved canonical human ORFs
2427
