## Evolutionary origin

Macaca - Mouse - Human

In [1]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

species_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47"

GENOMEDIR = "/genomics/users/marta/genomes"
species=["mouse","macaca","platypus","chicken","opossum"]

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")

## evoDir
evoDir=os.path.join(users_dir,"EvolutionaryOrigin_MMseqs")

def translate_dna_to_protein(dna_seq):
    return str(Seq(dna_seq).translate())

# Function to save DF in fasta format
def create_seqrecord_PROT(row):
    return SeqRecord(Seq(row['protein']), id=row['header'][:-4], description="")
def create_seqrecord_DNA(row):
    return SeqRecord(Seq(row['seq']), id=row['header'], description="")

species_and_years_dict = {"specie" : ["human","macaca","mouse","opossum","platypus","chicken"],
                          "age" : [0,25,90,180,200,310]}
species_and_years = pd.DataFrame(data=species_and_years_dict, index=[1,2,3,4,5,6])
species_and_years


Unnamed: 0,specie,age
1,human,0
2,macaca,25
3,mouse,90
4,opossum,180
5,platypus,200
6,chicken,310


In [None]:
## get repre sequences - both DNA and PROTEINS
for s in species:
    print(s)
    riboseq_dir = os.path.join(users_dir,s,"RiboSeq/RiboQC_RiboNovel")

    ## get all candidates
    print("Reading all the candidate ORF sequences")
    df = pd.read_csv(os.path.join(riboseq_dir,"Annotation/candidateORF.fa"), header=None, sep="\t")
    ## convert to df
    candidates_fasta = pd.DataFrame({'header':df[0].iloc[::2].values, 'seq':df[0].iloc[1::2].values})
    candidates_fasta['header'] = candidates_fasta['header'].str[1:]
    candidates_fasta = candidates_fasta[~candidates_fasta['header'].str.contains("iORF")]

    all_repre_candidates = pd.DataFrame()
    for sample in os.listdir(os.path.join(riboseq_dir,"RibORF")):
        if sample.endswith("r1"):
            print(sample)

            repre = pd.read_csv(os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.txt"), sep="\t")
            repre_fasta = candidates_fasta[candidates_fasta['header'].isin(repre.orfID.values.tolist())]
            repre_fasta['header'] = s + "_" +repre_fasta['header']

            ## keep only the coding and the non-coding
            repre_fasta = repre_fasta[repre_fasta['header'].str.contains("canonical")| repre_fasta['header'].str.contains("noncoding")]

            repre_fasta['protein'] = repre_fasta['seq'].apply(translate_dna_to_protein)
            all_repre_candidates = pd.concat([all_repre_candidates, repre_fasta])

            ## translate to protein
            print("Translating to protein")
            out_proteins = os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.PROTEIN.fa")
            seq_records_proteins = repre_fasta.apply(create_seqrecord_PROT, axis=1).tolist()
            SeqIO.write(seq_records_proteins, out_proteins, "fasta")

            ## keep the dna sequences as well
            print("Saving the original DNA ORF")
            out_dna = os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.DNA.fa")
            seq_records_dna = repre_fasta.apply(create_seqrecord_DNA, axis=1).tolist()
            SeqIO.write(seq_records_dna, out_dna, "fasta")

            ## get only the nonredundant
            print("Getting the non-redundant")
            all_repre_candidates.drop_duplicates(inplace=True)
            out_nonredundant_proteins = os.path.join(riboseq_dir,"RibORF")+"/"+s+"_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa"
            seq_records_proteins = all_repre_candidates.apply(create_seqrecord_PROT, axis=1).tolist()
            SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

## MMseqs2

Against all expressed in mouse, macaca, platypus, opossum in any tissue (non-redunant set)

#### TESTIS-EXPRESSED

In [3]:
## Testis-expressed ORFs human
TestisExpressed_translatedINtestis = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.csv")
TestisExpressed_translatedINtestis

def create_seqrecord_PROThuman(row):
    return SeqRecord(Seq(row['ORFpep']), id=row['orfID'], description="")

out_nonredundant_proteins = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa"
seq_records_proteins = TestisExpressed_translatedINtestis.apply(create_seqrecord_PROThuman, axis=1).tolist()
SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

Creating database of human (expressed in testis)

In [2]:
%%bash -s "$evoDir" "$users_dir"

# mkdir $1/MMseqsDB

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mkdir -p $1/MMseqsDB

mmseqs createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa $1/MMseqsDB/ribORF_humanTestis_in1_DB


Process is interrupted.



# >>>>>>>>>>>>>>>>>>>>>> ERROR REPORT <<<<<<<<<<<<<<<<<<<<<<

    Traceback (most recent call last):
      File "/soft/system/software/Miniconda3/4.9.2/lib/python3.8/site-packages/conda/exceptions.py", line 1124, in __call__
        return func(*args, **kwargs)
      File "/soft/system/software/Miniconda3/4.9.2/lib/python3.8/site-packages/conda/cli/main.py", line 94, in main_sourced
        print(activator.execute(), end="")
    BrokenPipeError: [Errno 32] Broken pipe

`$ /soft/system/software/Miniconda3/4.9.2/bin/conda shell.posix activate MMseqs2`

  environment variables:
                 CIO_TEST=<not set>
                CONDA_EXE=/soft/system/software/Miniconda3/4.9.2/bin/conda
         CONDA_PYTHON_EXE=/soft/system/software/Miniconda3/4.9.2/bin/python
               CONDA_ROOT=/soft/system/software/Miniconda3/4.9.2
              CONDA_SHLVL=0
           CURL_CA_BUNDLE=<not set>
               LD_PRELOAD=<not set>
  LMOD_DEFAULT_MODULEPATH=/soft/system/modules/all:/etc/lmod/modu

Creating a joint database with all the specie, everything translated, no matter the tissue

In [13]:
%%bash -s "$evoDir" "$species_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

# species="platypus mouse macaca chicken"
# for s in $species; do
#     echo $s

#     ### create DB
#     mmseqs createdb $2/$s/RiboSeq/RiboQC_RiboNovel/RibORF/repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa $1/MMseqsDB/${s}_noredunantProts_DB

# done

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa $1/MMseqsDB/speciesDB



createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/platypus/RiboSeq/RiboQC_RiboNovel/RibORF/platypus_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_Transcriptom

Creating a joint database with all the specie AND human for clustering, everything translated for the species, no matter the tissue, but only translated in 1 for testis

In [15]:
%%bash -s "$evoDir" "$species_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa $1/MMseqsDB/species_and_humanDB



createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/platypus/RiboSeq/RiboQC_RiboNovel/RibORF/platypus_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/wi

Searching

In [16]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2
mkdir -p $1/Searching

rm $1/Searching/resultDB*
## The alignment consists of two steps the prefilter and alignment. To run the search, type:
mmseqs search -a -s 6 $1/MMseqsDB/ribORF_humanTestis_in1_DB $1/MMseqsDB/speciesDB $1/Searching/resultDB $1/tmp

# Then, convert the result database into a BLAST tab formatted file (option -m 8 in legacy blast, -outfmt 6 in blast+):
# Format alignment output	query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits

mmseqs convertalis $1/MMseqsDB/ribORF_humanTestis_in1_DB $1/MMseqsDB/speciesDB $1/Searching/resultDB $1/Searching/resultDB.m8

rm: cannot remove '/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Searching/resultDB*': No such file or directory


Create directory /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/tmp
search -a -s 6 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanTestis_in1_DB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/speciesDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Searching/resultDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	true
Alignment mode                         	2
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold             

Clustering

In [17]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mkdir -p $1/Clustering

rm -r $1/Clustering/species_and_human_cluDB*

## cluster all species and human
mmseqs cluster -c 0.5 --min-seq-id 0.5 $1/MMseqsDB/species_and_humanDB $1/Clustering/species_and_human_cluDB $1/tmp
mmseqs createtsv $1/MMseqsDB/species_and_humanDB $1/MMseqsDB/species_and_humanDB $1/Clustering/species_and_human_cluDB $1/Clustering/species_and_human_cluDB.tsv

rm: cannot remove '/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Clustering/species_and_human_cluDB*': No such file or directory


cluster -c 0.5 --min-seq-id 0.5 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Clustering/species_and_human_cluDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                     	14.7e284
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit

## Analysis

In [18]:
clustering = pd.read_csv(os.path.join(evoDir,"Clustering/species_and_human_cluDB.tsv"), header=None, sep="\t")
clustering = clustering[clustering[0].str.startswith("ENST")]
print("all human")
print(len(clustering))
clustering['transcript_id'] = clustering[0].str.split(":", expand=True)[0]
clustering['transcript_id'] = clustering[0].str.split(".", expand=True)[0]

clustering = clustering.merge(transcript_gene, on="transcript_id")
print("all human after annotation")
print(len(clustering))
clustering = clustering[["gene_name","gene_type",0,1]]

## non-coding
noncoding = clustering[clustering[0].str.contains("noncoding")]
noncoding.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_speciesDB.tsv"), header=None, index=None, sep="\t")
print("noncoding all")
print(len(noncoding))
print(len(set(noncoding[0].values.tolist())))

noncoding_conserved = noncoding[~noncoding[1].str.startswith("ENST")]
noncoding_conserved.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved noncoding human ORFs")
print(len(set(noncoding_conserved[0].values.tolist())))
noncoding_conserved.columns = ['gene_name','gene_type','orfID',1]

noncoding_NonConserved = noncoding[noncoding[1].str.startswith("ENST")]
noncoding_NonConserved.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved noncoding human ORFs")
print(len(set(noncoding_NonConserved[0].values.tolist())))
noncoding_NonConserved.columns = ['gene_name','gene_type','orfID',1]


## canonical
canonical = clustering[clustering[0].str.contains("canonical")]
print("Canonical all")
print(len(canonical))
print(len(set(canonical[0].values.tolist())))
canonical.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_speciesDB.tsv"), header=None, index=None, sep="\t")

canonical_conserved = canonical[~canonical[1].str.startswith("ENST")]
canonical_conserved.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs")
print(len(set(canonical_conserved[0].values.tolist())))
canonical_conserved.columns = ['gene_name','gene_type','orfID',1]

canonical_conserved_noncoding = canonical_conserved[~canonical_conserved[1].str.contains("canonical")]
canonical_conserved_noncoding.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_ConservedAsNoncoding.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as noncoding in other species")
print(len(set(canonical_conserved_noncoding['orfID'].values.tolist())))

canonical_conserved_canonical = canonical_conserved[canonical_conserved[1].str.contains("canonical")]
canonical_conserved_canonical.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_ConservedAsCanonical.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as canonical in other species")
print(len(set(canonical_conserved_canonical['orfID'].values.tolist())))

canonical_NonConserved = canonical[canonical[1].str.startswith("ENST")]
canonical_NonConserved.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved canonical human ORFs")
print(len(set(canonical_NonConserved[0].values.tolist())))
canonical_NonConserved.columns = ['gene_name','gene_type','orfID',1]


all human
25675
all human after annotation
25675
noncoding all
3766
3027
Conserved noncoding human ORFs
216
Non-Conserved noncoding human ORFs
3027
Canonical all
14204
2899
Conserved canonical human ORFs
2585
Conserved canonical human ORFs as noncoding in other species
309
Conserved canonical human ORFs as canonical in other species
2573
Non-Conserved canonical human ORFs
2899


In [20]:
noncoding_conserved

Unnamed: 0,gene_name,gene_type,orfID,1,specie
9,ENSG00000288700,lncRNA,ENST00000683452.1:3:+|19|3753:358:1843|noncodi...,macaca_ENSMMUT00000083577:2:+|3|2913:26:2021|c...,macaca
11,ENSG00000288700,lncRNA,ENST00000683452.1:3:+|19|3753:358:1843|noncodi...,chicken_ENSGALT00010059861:9:+|25|2797:297:243...,chicken
12,ENSG00000288700,lncRNA,ENST00000683452.1:3:+|19|3753:358:1843|noncodi...,opossum_ENSMODT00000064442:4:+|17|2707:380:235...,opossum
13,ENSG00000288700,lncRNA,ENST00000683452.1:3:+|19|3753:358:1843|noncodi...,mouse_ENSMUST00000162960.8:chr9:-|12|5861:295:...,mouse
14,ENSG00000288700,lncRNA,ENST00000683452.1:3:+|19|3753:358:1843|noncodi...,platypus_ENSOANT00000040792:1:+|44|2106:562:21...,platypus
...,...,...,...,...,...
25597,ENSG00000258435,lncRNA,ENST00000718702.1:12:+|6|955:107:200|noncoding...,TCONS_00000607:12:+|6|619:107:200|noncoding|ATG,TCONS
25602,ENSG00000298182,lncRNA,ENST00000753709.1:11:-|44|1051:563:803|noncodi...,TCONS_00000477:11:-|4|587:101:341|noncoding|ATG,TCONS
25609,ENSG00000303265,lncRNA,ENST00000793280.1:18:+|12|1392:147:285|noncodi...,TCONS_00001196:18:+|16|1060:196:334|noncoding|ATG,TCONS
25612,ENSG00000305541,lncRNA,ENST00000811623.1:1:-|1|798:54:429|noncoding|CTG,TCONS_00000085:1:-|1|798:54:429|noncoding|CTG,TCONS


In [21]:
noncoding_conserved_to_matrix = noncoding_conserved
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
# Assuming noncoding_conserved_to_matrix is already defined
noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[['orfID', 'specie']]
noncoding_conserved_to_matrix['presence'] = 1
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[noncoding_conserved_to_matrix['specie'] != "TCONS"]

print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
noncoding_conserved_to_matrix

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_conserved_matrix = noncoding_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
noncoding_conserved_matrix = noncoding_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
noncoding_conserved_matrix = noncoding_conserved_matrix.reset_index()

# Rename columns as necessary
noncoding_conserved_matrix.columns = ["orfID", "chicken", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix_ordered.sort_values(by="sum", ascending=False)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
noncoding_conserved_matrix_ordered =  noncoding_conserved_matrix_ordered.merge(noncoding_conserved[['gene_name','gene_type','orfID']], on="orfID")
noncoding_conserved_matrix_ordered.drop_duplicates(inplace=True)

noncoding_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))


216
174
174
174


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_to_matrix['presence'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_matrix_ordered['sum'] = noncoding

In [23]:
### all noncoding
noncoding_conserved_touse = noncoding_conserved
noncoding_conserved_touse['specie'] = noncoding_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_NonConserved.drop("specie",axis=1, inplace=True)
noncoding_to_matrix_raw = pd.concat([noncoding_conserved, noncoding_NonConserved])

noncoding_to_matrix = noncoding_to_matrix_raw[['orfID', 'specie']]
noncoding_to_matrix = noncoding_to_matrix[noncoding_to_matrix['specie'] != "TCONS"]
noncoding_to_matrix['presence'] = np.where(noncoding_to_matrix['specie'].isna(), 0, 1)


human = pd.DataFrame({
    'orfID': noncoding_to_matrix['orfID'].unique(),
    'specie': 'human',
    'presence': 1
})

noncoding_to_matrix = pd.concat([human, noncoding_to_matrix], ignore_index=True)
noncoding_to_matrix.dropna(inplace=True)

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_matrix = noncoding_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
noncoding_matrix
# Fill NaN values with 0
noncoding_matrix = noncoding_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
noncoding_matrix = noncoding_matrix.reset_index()

# Rename columns as necessary
noncoding_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
noncoding_matrix_ordered = noncoding_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_matrix_ordered = noncoding_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
noncoding_matrix_ordered =  noncoding_matrix_ordered.merge(noncoding_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
noncoding_matrix_ordered.drop_duplicates(inplace=True)

noncoding_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_matrix_ordered.orfID.values.tolist())))

3027


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_touse['specie'] = noncoding_conserved_touse[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)


In [24]:
canonical_conserved_to_matrix = canonical_conserved

# Assuming noncoding_conserved_to_matrix is already defined
canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
canonical_conserved_to_matrix = canonical_conserved_to_matrix[['orfID', 'specie']]
canonical_conserved_to_matrix['presence'] = 1
canonical_conserved_to_matrix = canonical_conserved_to_matrix[canonical_conserved_to_matrix['specie'] != "TCONS"]

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_conserved_matrix = canonical_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
canonical_conserved_matrix = canonical_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_conserved_matrix = canonical_conserved_matrix.reset_index()

# Rename columns as necessary
canonical_conserved_matrix.columns = ["orfID", "chicken", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
canonical_conserved_matrix_ordered = canonical_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_conserved_matrix_ordered['sum'] = canonical_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_conserved_matrix_ordered = canonical_conserved_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_conserved_matrix_ordered =  canonical_conserved_matrix_ordered.merge(canonical_conserved[['gene_name','gene_type','orfID']], on="orfID")
canonical_conserved_matrix_ordered.drop_duplicates(inplace=True)

canonical_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_Conserved_matrix.tsv"), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['presence'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_matrix_ordered['sum'] = canonical

In [25]:
### all canonical
canonical_conserved_touse = canonical_conserved
canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_NonConserved.drop("specie",axis=1, inplace=True)
canonical_to_matrix_raw = pd.concat([canonical_conserved, canonical_NonConserved])

canonical_to_matrix = canonical_to_matrix_raw[['orfID', 'specie']]
canonical_to_matrix = canonical_to_matrix[canonical_to_matrix['specie'] != "TCONS"]
canonical_to_matrix['presence'] = np.where(canonical_to_matrix['specie'].isna(), 0, 1)


human = pd.DataFrame({
    'orfID': canonical_to_matrix['orfID'].unique(),
    'specie': 'human',
    'presence': 1
})

canonical_to_matrix = pd.concat([human, canonical_to_matrix], ignore_index=True)
canonical_to_matrix.dropna(inplace=True)

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_matrix = canonical_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
canonical_matrix
# Fill NaN values with 0
canonical_matrix = canonical_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_matrix = canonical_matrix.reset_index()

# Rename columns as necessary
canonical_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
canonical_matrix_ordered = canonical_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_matrix_ordered['sum'] = canonical_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_matrix_ordered = canonical_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_matrix_ordered =  canonical_matrix_ordered.merge(canonical_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
canonical_matrix_ordered.drop_duplicates(inplace=True)

canonical_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_matrix.tsv"), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_matrix_ordered['sum'] = canonical_matrix_ordered.select_dtypes(include='number').sum(axis=1)


Find the oldest specie containg the ORF translated

In [26]:
species_ordered = ['chicken','platypus','opossum','mouse','macaca','human']
def find_last_species(row):
    for sp in species_ordered:
        if row[sp] == 1.0:  # Check if the value is 1.0
            return sp
    return None  # Return None if no species has a 1

In [27]:
# Apply the function to each row to create a new column 'oldest_species'
canonical_matrix_ordered['oldest_species'] = canonical_matrix_ordered.apply(find_last_species, axis=1)
canonical_matrix_ordered = species_and_years.merge(canonical_matrix_ordered, left_on=["specie"], right_on="oldest_species")
canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

noncoding_matrix_ordered['oldest_species'] = noncoding_matrix_ordered.apply(find_last_species, axis=1)
noncoding_matrix_ordered = species_and_years.merge(noncoding_matrix_ordered, left_on=["specie"], right_on="oldest_species")
noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

complete_matrix_age = pd.concat([noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']], canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']]])
complete_matrix_age.to_csv(os.path.join(evoDir,"Clustering/allHuman_matrix_oldestAge.tsv"), sep="\t", index=None)


## Liver-EXPRESSED

In [6]:
## Liver-expressed ORFs human
LiverExpressed_translatedINliver = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanLiver_translated.csv")
LiverExpressed_translatedINliver

def create_seqrecord_PROThuman(row):
    return SeqRecord(Seq(row['ORFpep']), id=row['orfID'], description="")

out_nonredundant_proteins = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanLiver_translated.fa"
seq_records_proteins = LiverExpressed_translatedINliver.apply(create_seqrecord_PROThuman, axis=1).tolist()
SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

KeyError: 'ORFpep'

Creating database of human (expressed in liver)


In [62]:
%%bash -s "$evoDir" "$users_dir"

# mkdir $1/MMseqsDB

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanLiver.fa $1/MMseqsDB/ribORF_humanLiver_in1_DB


/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanLiver_in1_DB exists and will be overwritten
createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanLiver.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanLiver_in1_DB 

MMseqs Version:       	14.7e284
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[=
Time for merging to ribORF_humanLiver_in1_DB_h: 0h 0m 7s 140ms
Time for merging to ribORF_humanLiver_in1_DB: 0h 0m 20s 525ms
Database type: Aminoacid
Time for processing: 0h 1m 59s 895ms


createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/platypus/RiboSeq/RiboQC_RiboNovel/RibORF/platypus_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/wi

In [63]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanLiver.fa $1/MMseqsDB/species_and_humanLiverDB



/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanLiverDB exists and will be overwritten
createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/

In [64]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

## cluster all species and human
mmseqs cluster -c 0.5 --min-seq-id 0.5 $1/MMseqsDB/species_and_humanLiverDB $1/Clustering/species_and_humanLiver_cluDB $1/tmp
mmseqs createtsv $1/MMseqsDB/species_and_humanLiverDB $1/MMseqsDB/species_and_humanLiverDB $1/Clustering/species_and_humanLiver_cluDB $1/Clustering/species_and_humanLiver_cluDB.tsv

cluster -c 0.5 --min-seq-id 0.5 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanLiverDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/Clustering/species_and_humanLiver_cluDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                     	14.7e284
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit  

Analysis

In [65]:
clustering = pd.read_csv(os.path.join(evoDir,"Clustering/species_and_humanLiver_cluDB.tsv"), header=None, sep="\t")
clustering = clustering[clustering[0].str.startswith("ENST")]
print("all human")
print(len(clustering))
clustering['transcript_id'] = clustering[0].str.split(":", expand=True)[0]
clustering['transcript_id'] = clustering[0].str.split(".", expand=True)[0]

clustering = clustering.merge(transcript_gene, on="transcript_id")
print("all human after annotation")
print(len(clustering))
clustering = clustering[["gene_name","gene_type",0,1]]

## non-coding
noncoding = clustering[clustering[0].str.contains("noncoding")]
noncoding.to_csv(os.path.join(evoDir,"Clustering/LIVER_noncodingHuman_speciesDB.tsv"), header=None, index=None, sep="\t")
print("noncoding all")
print(len(noncoding))
print(len(set(noncoding[0].values.tolist())))

noncoding_conserved = noncoding[~noncoding[1].str.startswith("ENST")]
noncoding_conserved.to_csv(os.path.join(evoDir,"Clustering/LIVER_noncodingHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved noncoding human ORFs")
print(len(set(noncoding_conserved[0].values.tolist())))
noncoding_conserved.columns = ['gene_name','gene_type','orfID',1]

noncoding_NonConserved = noncoding[noncoding[1].str.startswith("ENST")]
noncoding_NonConserved.to_csv(os.path.join(evoDir,"Clustering/LIVER_noncodingHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved noncoding human ORFs")
print(len(set(noncoding_NonConserved[0].values.tolist())))
noncoding_NonConserved.columns = ['gene_name','gene_type','orfID',1]


## canonical
canonical = clustering[clustering[0].str.contains("canonical")]
print("Canonical all")
print(len(canonical))
print(len(set(canonical[0].values.tolist())))
canonical.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_speciesDB.tsv"), header=None, index=None, sep="\t")

canonical_conserved = canonical[~canonical[1].str.startswith("ENST")]
canonical_conserved.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs")
print(len(set(canonical_conserved[0].values.tolist())))
canonical_conserved.columns = ['gene_name','gene_type','orfID',1]

canonical_conserved_noncoding = canonical_conserved[~canonical_conserved[1].str.contains("noncoding")]
canonical_conserved_noncoding.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_ConservedAsNoncoding.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as noncoding in other species")
print(len(set(canonical_conserved_noncoding['orfID'].values.tolist())))

canonical_conserved_canonical = canonical_conserved[~canonical_conserved[1].str.contains("canonical")]
canonical_conserved_canonical.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_ConservedAsCanonical.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as canonical in other species")
print(len(set(canonical_conserved_canonical['orfID'].values.tolist())))

canonical_NonConserved = canonical[canonical[1].str.startswith("ENST")]
canonical_NonConserved.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved canonical human ORFs")
print(len(set(canonical_NonConserved[0].values.tolist())))
canonical_NonConserved.columns = ['gene_name','gene_type','orfID',1]


all human
10317
all human after annotation
10317
noncoding all
798
683
Conserved noncoding human ORFs
36
Non-Conserved noncoding human ORFs
683
Canonical all
9519
1952
Conserved canonical human ORFs
1663
Conserved canonical human ORFs as noncoding in other species
1643
Conserved canonical human ORFs as canonical in other species
204
Non-Conserved canonical human ORFs
1952


In [66]:
noncoding_conserved

Unnamed: 0,gene_name,gene_type,orfID,1
12,AC002116.8,lncRNA,ENST00000473572.2:19:-|56|1130:683:1046|noncod...,opossum_ENSMODT00000033099:4:-|1|879:6:639|can...
13,AC002116.8,lncRNA,ENST00000473572.2:19:-|56|1130:683:1046|noncod...,macaca_ENSMMUT00000038552:19:-|1|801:1:802|can...
15,AC002116.8,lncRNA,ENST00000473572.2:19:-|56|1130:683:1046|noncod...,platypus_ENSOANT00000056035:5:-|30|1579:380:11...
200,RP11-197N18.2,lncRNA,ENST00000540866.2:12:+|141|5618:1896:2205|nonc...,mouse_ENSMUST00000130140.8:chr5:+|94|1838:1216...
217,RP11-15H20.7,lncRNA,ENST00000594653.1:19:-|10|574:128:269|noncodin...,macaca_ENSMMUT00000104102:15:-|19|2098:297:444...
...,...,...,...,...
9232,AC016292.3,processed_pseudogene,ENST00000451056.1:17:-|1|408:1:409|noncoding|ATG,opossum_ENSMODT00000062873:2:+|76|1618:965:161...
9704,RP11-186B7.4,lncRNA,ENST00000581621.1:17:+|261|4554:3517:4042|nonc...,mouse_ENSMUST00000018918.12:chr11:-|4|1230:92:...
9706,RP11-186B7.4,lncRNA,ENST00000581621.1:17:+|261|4554:3517:4042|nonc...,macaca_ENSMMUT00000018950:16:+|2|3788:41:1106|...
9707,RP11-186B7.4,lncRNA,ENST00000581621.1:17:+|261|4554:3517:4042|nonc...,platypus_ENSOANT00000072290:X5:-|32|2765:470:1...


In [67]:
noncoding_conserved_to_matrix = noncoding_conserved
noncoding_conserved_to_matrix
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
# Assuming noncoding_conserved_to_matrix is already defined
noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[~noncoding_conserved_to_matrix['specie'].str.contains("TCONS")]
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[['orfID', 'specie']]
noncoding_conserved_to_matrix['presence'] = 1
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
print(set(noncoding_conserved_to_matrix.specie.values.tolist()))
## Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_conserved_matrix = noncoding_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max').reset_index()
# Rename columns as necessary
noncoding_conserved_matrix.columns = ["orfID", "chicken", "macaca", "mouse", "opossum", "platypus"]
# Fill NaN values with 0
noncoding_conserved_matrix = noncoding_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index

# Reorder columns for the final matrix
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix_ordered.sort_values(by="sum", ascending=False)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
noncoding_conserved_matrix_ordered =  noncoding_conserved_matrix_ordered.merge(noncoding_conserved[['gene_name','gene_type','orfID']], on="orfID")
noncoding_conserved_matrix_ordered.drop_duplicates(inplace=True)

noncoding_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/LIVER_noncodingHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))


36
36
{'mouse', 'macaca', 'opossum', 'platypus', 'chicken'}
36


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)


36


In [68]:
### all noncoding
# noncoding_conserved_touse = noncoding_conserved
# noncoding_conserved_touse['specie'] = noncoding_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_conserved_touse = noncoding_conserved_touse[~noncoding_conserved_touse['specie'].str.contains("TCONS")]

noncoding_to_matrix_raw = pd.concat([noncoding_conserved, noncoding_NonConserved])
noncoding_to_matrix_raw
noncoding_to_matrix = noncoding_to_matrix_raw[['orfID', 'specie']]
# noncoding_to_matrix
noncoding_to_matrix['presence'] = np.where(noncoding_to_matrix['specie'].isna(), 0, 1)


human = pd.DataFrame({
    'orfID': noncoding_to_matrix['orfID'].unique(),
    'specie': 'human',
    'presence': 1
})

noncoding_to_matrix = pd.concat([human, noncoding_to_matrix], ignore_index=True)
noncoding_to_matrix.dropna(inplace=True)
noncoding_to_matrix = noncoding_to_matrix[~noncoding_to_matrix['specie'].str.contains("TCONS")]

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_matrix = noncoding_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
noncoding_matrix
# Fill NaN values with 0
noncoding_matrix = noncoding_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
noncoding_matrix = noncoding_matrix.reset_index()
noncoding_matrix
# Rename columns as necessary
noncoding_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
noncoding_matrix_ordered = noncoding_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_matrix_ordered = noncoding_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
noncoding_matrix_ordered =  noncoding_matrix_ordered.merge(noncoding_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
noncoding_matrix_ordered.drop_duplicates(inplace=True)

noncoding_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/LIVER_noncodingHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_matrix_ordered.orfID.values.tolist())))

683


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_to_matrix['presence'] = np.where(noncoding_to_matrix['specie'].isna(), 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)


In [69]:
canonical_conserved_to_matrix = canonical_conserved

# Assuming noncoding_conserved_to_matrix is already defined
canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
canonical_conserved_to_matrix = canonical_conserved_to_matrix[['orfID', 'specie']]
canonical_conserved_to_matrix['presence'] = 1

canonical_conserved_to_matrix = canonical_conserved_to_matrix[~canonical_conserved_to_matrix['specie'].str.contains("TCONS")]

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_conserved_matrix = canonical_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
canonical_conserved_matrix = canonical_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_conserved_matrix = canonical_conserved_matrix.reset_index()

# Rename columns as necessary
canonical_conserved_matrix.columns = ["orfID", "chicken", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
canonical_conserved_matrix_ordered = canonical_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_conserved_matrix_ordered['sum'] = canonical_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_conserved_matrix_ordered = canonical_conserved_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_conserved_matrix_ordered =  canonical_conserved_matrix_ordered.merge(canonical_conserved[['gene_name','gene_type','orfID']], on="orfID")
canonical_conserved_matrix_ordered.drop_duplicates(inplace=True)

canonical_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_Conserved_matrix.tsv"), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['presence'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_matrix_ordered['sum'] = canonical

In [70]:
### all canonical
canonical_conserved_touse = canonical_conserved
canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_NonConserved.drop("specie",axis=1, inplace=True)
canonical_to_matrix_raw = pd.concat([canonical_conserved, canonical_NonConserved])

canonical_to_matrix = canonical_to_matrix_raw[['orfID', 'specie']]
canonical_to_matrix['presence'] = np.where(canonical_to_matrix['specie'].isna(), 0, 1)


human = pd.DataFrame({
    'orfID': canonical_to_matrix['orfID'].unique(),
    'specie': 'human',
    'presence': 1
})

canonical_to_matrix = pd.concat([human, canonical_to_matrix], ignore_index=True)
canonical_to_matrix.dropna(inplace=True)
canonical_to_matrix = canonical_to_matrix[~canonical_to_matrix['specie'].str.contains("TCONS")]
# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_matrix = canonical_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
canonical_matrix
# Fill NaN values with 0
canonical_matrix = canonical_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_matrix = canonical_matrix.reset_index()

# Rename columns as necessary
canonical_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
canonical_matrix_ordered = canonical_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_matrix_ordered['sum'] = canonical_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_matrix_ordered = canonical_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_matrix_ordered =  canonical_matrix_ordered.merge(canonical_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
canonical_matrix_ordered.drop_duplicates(inplace=True)

canonical_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_matrix.tsv"), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_to_matrix['presence'] = np.where(canonical_to_matrix['specie'].isna(), 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_matrix_ord

Find the oldest specie containg the ORF translated

In [71]:
species_ordered = ['chicken','platypus','opossum','mouse','macaca','human']
def find_last_species(row):
    for sp in species_ordered:
        if row[sp] == 1.0:  # Check if the value is 1.0
            return sp
    return None  # Return None if no species has a 1

In [72]:
# Apply the function to each row to create a new column 'oldest_species'
canonical_matrix_ordered['oldest_species'] = canonical_matrix_ordered.apply(find_last_species, axis=1)
canonical_matrix_ordered = species_and_years.merge(canonical_matrix_ordered, left_on=["specie"], right_on="oldest_species")
canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/LIVER_canonicalHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

noncoding_matrix_ordered['oldest_species'] = noncoding_matrix_ordered.apply(find_last_species, axis=1)
noncoding_matrix_ordered = species_and_years.merge(noncoding_matrix_ordered, left_on=["specie"], right_on="oldest_species")
noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/LIVER_noncodingHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

complete_matrix_age = pd.concat([noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']], canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']]])
complete_matrix_age.to_csv(os.path.join(evoDir,"Clustering/LIVER_allHuman_matrix_oldestAge.tsv"), sep="\t", index=None)


## Brain-EXPRESSED

In [73]:
## Brain-expressed ORFs human
BrainExpressed_translatedINbrain = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanBrain_in1.csv")
BrainExpressed_translatedINbrain

def create_seqrecord_PROThuman(row):
    return SeqRecord(Seq(row['ORFpep']), id=row['orfID'], description="")

out_nonredundant_proteins = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanBrain.fa"
seq_records_proteins = BrainExpressed_translatedINbrain.apply(create_seqrecord_PROThuman, axis=1).tolist()
SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

13297

In [74]:
%%bash -s "$evoDir" "$users_dir"

# mkdir $1/MMseqsDB

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanBrain.fa $1/MMseqsDB/ribORF_humanBrain_in1_DB


/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanBrain_in1_DB exists and will be overwritten
createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanBrain.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanBrain_in1_DB 

MMseqs Version:       	14.7e284
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[=
Time for merging to ribORF_humanBrain_in1_DB_h: 0h 0m 7s 812ms
Time for merging to ribORF_humanBrain_in1_DB: 0h 0m 11s 606ms
Database type: Aminoacid
Time for processing: 0h 1m 28s 705ms


In [75]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q1_TestisORFs/human/ribORF_humanBrain.fa $1/MMseqsDB/species_and_humanBrainDB



/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanBrainDB exists and will be overwritten
createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/

In [76]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

## cluster all species and human
mmseqs cluster -c 0.5 --min-seq-id 0.5 $1/MMseqsDB/species_and_humanBrainDB $1/Clustering/species_and_humanBrain_cluDB $1/tmp
mmseqs createtsv $1/MMseqsDB/species_and_humanBrainDB $1/MMseqsDB/species_and_humanBrainDB $1/Clustering/species_and_humanBrain_cluDB $1/Clustering/species_and_humanBrain_cluDB.tsv

cluster -c 0.5 --min-seq-id 0.5 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanBrainDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/Clustering/species_and_humanBrain_cluDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                     	14.7e284
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit  

Analysis

In [77]:
clustering = pd.read_csv(os.path.join(evoDir,"Clustering/species_and_humanBrain_cluDB.tsv"), header=None, sep="\t")
clustering = clustering[clustering[0].str.startswith("ENST")]
print("all human")
print(len(clustering))
clustering['transcript_id'] = clustering[0].str.split(":", expand=True)[0]
clustering['transcript_id'] = clustering[0].str.split(".", expand=True)[0]

clustering = clustering.merge(transcript_gene, on="transcript_id")
print("all human after annotation")
print(len(clustering))
clustering = clustering[["gene_name","gene_type",0,1]]

## non-coding
noncoding = clustering[clustering[0].str.contains("noncoding")]
noncoding.to_csv(os.path.join(evoDir,"Clustering/BRAIN_noncodingHuman_speciesDB.tsv"), header=None, index=None, sep="\t")
print("noncoding all")
print(len(noncoding))
print(len(set(noncoding[0].values.tolist())))

noncoding_conserved = noncoding[~noncoding[1].str.startswith("ENST")]
noncoding_conserved.to_csv(os.path.join(evoDir,"Clustering/BRAIN_noncodingHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved noncoding human ORFs")
print(len(set(noncoding_conserved[0].values.tolist())))
noncoding_conserved.columns = ['gene_name','gene_type','orfID',1]

noncoding_NonConserved = noncoding[noncoding[1].str.startswith("ENST")]
noncoding_NonConserved.to_csv(os.path.join(evoDir,"Clustering/BRAIN_noncodingHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved noncoding human ORFs")
print(len(set(noncoding_NonConserved[0].values.tolist())))
noncoding_NonConserved.columns = ['gene_name','gene_type','orfID',1]


## canonical
canonical = clustering[clustering[0].str.contains("canonical")]
print("Canonical all")
print(len(canonical))
print(len(set(canonical[0].values.tolist())))
canonical.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_speciesDB.tsv"), header=None, index=None, sep="\t")

canonical_conserved = canonical[~canonical[1].str.startswith("ENST")]
canonical_conserved.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs")
print(len(set(canonical_conserved[0].values.tolist())))
canonical_conserved.columns = ['gene_name','gene_type','orfID',1]

canonical_conserved_noncoding = canonical_conserved[~canonical_conserved[1].str.contains("noncoding")]
canonical_conserved_noncoding.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_ConservedAsNoncoding.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as noncoding in other species")
print(len(set(canonical_conserved_noncoding['orfID'].values.tolist())))

canonical_conserved_canonical = canonical_conserved[~canonical_conserved[1].str.contains("canonical")]
canonical_conserved_canonical.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_ConservedAsCanonical.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs as canonical in other species")
print(len(set(canonical_conserved_canonical['orfID'].values.tolist())))

canonical_NonConserved = canonical[canonical[1].str.startswith("ENST")]
canonical_NonConserved.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved canonical human ORFs")
print(len(set(canonical_NonConserved[0].values.tolist())))
canonical_NonConserved.columns = ['gene_name','gene_type','orfID',1]


all human
12417
all human after annotation
12417
noncoding all
1400
1175
Conserved noncoding human ORFs
72
Non-Conserved noncoding human ORFs
1175
Canonical all
11017
2194
Conserved canonical human ORFs
1872
Conserved canonical human ORFs as noncoding in other species
1852
Conserved canonical human ORFs as canonical in other species
235
Non-Conserved canonical human ORFs
2194


In [78]:
noncoding_conserved_to_matrix = noncoding_conserved
noncoding_conserved_to_matrix
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
# Assuming noncoding_conserved_to_matrix is already defined
noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[~noncoding_conserved_to_matrix['specie'].str.contains("TCONS")]
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[['orfID', 'specie']]
noncoding_conserved_to_matrix['presence'] = 1
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
print(set(noncoding_conserved_to_matrix.specie.values.tolist()))
## Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_conserved_matrix = noncoding_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max').reset_index()
# Rename columns as necessary
noncoding_conserved_matrix.columns = ["orfID", "chicken", "macaca", "mouse", "opossum", "platypus"]
# Fill NaN values with 0
noncoding_conserved_matrix = noncoding_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index

# Reorder columns for the final matrix
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix_ordered.sort_values(by="sum", ascending=False)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
noncoding_conserved_matrix_ordered =  noncoding_conserved_matrix_ordered.merge(noncoding_conserved[['gene_name','gene_type','orfID']], on="orfID")
noncoding_conserved_matrix_ordered.drop_duplicates(inplace=True)

noncoding_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/BRAIN_noncodingHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))


72
72
{'mouse', 'macaca', 'opossum', 'platypus', 'chicken'}
72


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)


72


In [79]:
### all noncoding
# noncoding_conserved_touse = noncoding_conserved
# noncoding_conserved_touse['specie'] = noncoding_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_conserved_touse = noncoding_conserved_touse[~noncoding_conserved_touse['specie'].str.contains("TCONS")]

noncoding_to_matrix_raw = pd.concat([noncoding_conserved, noncoding_NonConserved])
noncoding_to_matrix_raw
noncoding_to_matrix = noncoding_to_matrix_raw[['orfID', 'specie']]
# noncoding_to_matrix
noncoding_to_matrix['presence'] = np.where(noncoding_to_matrix['specie'].isna(), 0, 1)


human = pd.DataFrame({
    'orfID': noncoding_to_matrix['orfID'].unique(),
    'specie': 'human',
    'presence': 1
})

noncoding_to_matrix = pd.concat([human, noncoding_to_matrix], ignore_index=True)
noncoding_to_matrix.dropna(inplace=True)
noncoding_to_matrix = noncoding_to_matrix[~noncoding_to_matrix['specie'].str.contains("TCONS")]

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_matrix = noncoding_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
noncoding_matrix
# Fill NaN values with 0
noncoding_matrix = noncoding_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
noncoding_matrix = noncoding_matrix.reset_index()
noncoding_matrix
# Rename columns as necessary
noncoding_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
noncoding_matrix_ordered = noncoding_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_matrix_ordered = noncoding_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
noncoding_matrix_ordered =  noncoding_matrix_ordered.merge(noncoding_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
noncoding_matrix_ordered.drop_duplicates(inplace=True)

noncoding_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/BRAIN_noncodingHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_matrix_ordered.orfID.values.tolist())))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_to_matrix['presence'] = np.where(noncoding_to_matrix['specie'].isna(), 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)


1175


In [80]:
canonical_conserved_to_matrix = canonical_conserved

# Assuming noncoding_conserved_to_matrix is already defined
canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
canonical_conserved_to_matrix = canonical_conserved_to_matrix[['orfID', 'specie']]
canonical_conserved_to_matrix['presence'] = 1

canonical_conserved_to_matrix = canonical_conserved_to_matrix[~canonical_conserved_to_matrix['specie'].str.contains("TCONS")]

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_conserved_matrix = canonical_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
canonical_conserved_matrix = canonical_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_conserved_matrix = canonical_conserved_matrix.reset_index()

# Rename columns as necessary
canonical_conserved_matrix.columns = ["orfID", "chicken", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
canonical_conserved_matrix_ordered = canonical_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_conserved_matrix_ordered['sum'] = canonical_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_conserved_matrix_ordered = canonical_conserved_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_conserved_matrix_ordered =  canonical_conserved_matrix_ordered.merge(canonical_conserved[['gene_name','gene_type','orfID']], on="orfID")
canonical_conserved_matrix_ordered.drop_duplicates(inplace=True)

canonical_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_Conserved_matrix.tsv"), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['presence'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_matrix_ordered['sum'] = canonical

In [81]:
### all canonical
canonical_conserved_touse = canonical_conserved
canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_NonConserved.drop("specie",axis=1, inplace=True)
canonical_to_matrix_raw = pd.concat([canonical_conserved, canonical_NonConserved])

canonical_to_matrix = canonical_to_matrix_raw[['orfID', 'specie']]
canonical_to_matrix['presence'] = np.where(canonical_to_matrix['specie'].isna(), 0, 1)


human = pd.DataFrame({
    'orfID': canonical_to_matrix['orfID'].unique(),
    'specie': 'human',
    'presence': 1
})

canonical_to_matrix = pd.concat([human, canonical_to_matrix], ignore_index=True)
canonical_to_matrix.dropna(inplace=True)
canonical_to_matrix = canonical_to_matrix[~canonical_to_matrix['specie'].str.contains("TCONS")]
# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_matrix = canonical_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
canonical_matrix
# Fill NaN values with 0
canonical_matrix = canonical_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_matrix = canonical_matrix.reset_index()

# Rename columns as necessary
canonical_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
canonical_matrix_ordered = canonical_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_matrix_ordered['sum'] = canonical_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_matrix_ordered = canonical_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_matrix_ordered =  canonical_matrix_ordered.merge(canonical_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
canonical_matrix_ordered.drop_duplicates(inplace=True)

canonical_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_matrix.tsv"), sep="\t", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_to_matrix['presence'] = np.where(canonical_to_matrix['specie'].isna(), 0, 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_matrix_ord

Find the oldest specie containg the ORF translated

In [82]:
species_ordered = ['chicken','platypus','opossum','mouse','macaca','human']
def find_last_species(row):
    for sp in species_ordered:
        if row[sp] == 1.0:  # Check if the value is 1.0
            return sp
    return None  # Return None if no species has a 1

In [83]:
# Apply the function to each row to create a new column 'oldest_species'
canonical_matrix_ordered['oldest_species'] = canonical_matrix_ordered.apply(find_last_species, axis=1)
canonical_matrix_ordered = species_and_years.merge(canonical_matrix_ordered, left_on=["specie"], right_on="oldest_species")
canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/BRAIN_canonicalHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

noncoding_matrix_ordered['oldest_species'] = noncoding_matrix_ordered.apply(find_last_species, axis=1)
noncoding_matrix_ordered = species_and_years.merge(noncoding_matrix_ordered, left_on=["specie"], right_on="oldest_species")
noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/BRAIN_noncodingHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

complete_matrix_age = pd.concat([noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']], canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']]])
complete_matrix_age.to_csv(os.path.join(evoDir,"Clustering/BRAIN_allHuman_matrix_oldestAge.tsv"), sep="\t", index=None)
