## Evolutionary origin

Macaca - Mouse - Human

In [1]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

species_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47"

GENOMEDIR = "/genomics/users/marta/genomes"
species=["mouse","macaca","platypus","chicken","opossum"]

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")

## evoDir
evoDir=os.path.join(users_dir,"EvolutionaryOrigin_MMseqs")

def translate_dna_to_protein(dna_seq):
    return str(Seq(dna_seq).translate())

# Function to save DF in fasta format
def create_seqrecord_PROT(row):
    return SeqRecord(Seq(row['protein']), id=row['header'][:-4], description="")
def create_seqrecord_DNA(row):
    return SeqRecord(Seq(row['seq']), id=row['header'], description="")

species_and_years_dict = {"specie" : ["human","macaca","mouse","opossum","platypus","chicken"],
                          "age" : [0,25,90,180,200,310]}
species_and_years = pd.DataFrame(data=species_and_years_dict, index=[1,2,3,4,5,6])
species_and_years


Unnamed: 0,specie,age
1,human,0
2,macaca,25
3,mouse,90
4,opossum,180
5,platypus,200
6,chicken,310


In [None]:
## get repre sequences - both DNA and PROTEINS
for s in species:
    print(s)
    riboseq_dir = os.path.join(users_dir,s,"RiboSeq/RiboQC_RiboNovel")

    ## get all candidates
    print("Reading all the candidate ORF sequences")
    df = pd.read_csv(os.path.join(riboseq_dir,"Annotation/candidateORF.fa"), header=None, sep="\t")
    ## convert to df
    candidates_fasta = pd.DataFrame({'header':df[0].iloc[::2].values, 'seq':df[0].iloc[1::2].values})
    candidates_fasta['header'] = candidates_fasta['header'].str[1:]
    candidates_fasta = candidates_fasta[~candidates_fasta['header'].str.contains("iORF")]

    all_repre_candidates = pd.DataFrame()
    for sample in os.listdir(os.path.join(riboseq_dir,"RibORF")):
        if sample.endswith("r1"):
            print(sample)

            repre = pd.read_csv(os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.txt"), sep="\t")
            repre_fasta = candidates_fasta[candidates_fasta['header'].isin(repre.orfID.values.tolist())]
            repre_fasta['header'] = s + "_" +repre_fasta['header']

            ## keep only the coding and the non-coding
            repre_fasta = repre_fasta[repre_fasta['header'].str.contains("canonical")| repre_fasta['header'].str.contains("noncoding")]

            repre_fasta['protein'] = repre_fasta['seq'].apply(translate_dna_to_protein)
            all_repre_candidates = pd.concat([all_repre_candidates, repre_fasta])

            ## translate to protein
            print("Translating to protein")
            out_proteins = os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.PROTEIN.fa")
            seq_records_proteins = repre_fasta.apply(create_seqrecord_PROT, axis=1).tolist()
            SeqIO.write(seq_records_proteins, out_proteins, "fasta")

            ## keep the dna sequences as well
            print("Saving the original DNA ORF")
            out_dna = os.path.join(riboseq_dir,"RibORF",sample,"repre.valid.pred.pvalue.parameters.DNA.fa")
            seq_records_dna = repre_fasta.apply(create_seqrecord_DNA, axis=1).tolist()
            SeqIO.write(seq_records_dna, out_dna, "fasta")

            ## get only the nonredundant
            print("Getting the non-redundant")
            all_repre_candidates.drop_duplicates(inplace=True)
            out_nonredundant_proteins = os.path.join(riboseq_dir,"RibORF")+"/"+s+"_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa"
            seq_records_proteins = all_repre_candidates.apply(create_seqrecord_PROT, axis=1).tolist()
            SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

## MMseqs2

Against all expressed in mouse, macaca, platypus, opossum in any tissue (non-redunant set)

#### TESTIS-EXPRESSED

In [2]:
## Testis-expressed ORFs human
TestisExpressed_translatedINtestis = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.csv")
TestisExpressed_translatedINtestis

def create_seqrecord_PROThuman(row):
    return SeqRecord(Seq(row['ORFpep']), id=row['orfID'], description="")

out_nonredundant_proteins = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa"
seq_records_proteins = TestisExpressed_translatedINtestis.apply(create_seqrecord_PROThuman, axis=1).tolist()
SeqIO.write(seq_records_proteins, out_nonredundant_proteins, "fasta")

27106

Creating database of human (expressed in testis)

In [3]:
%%bash -s "$evoDir" "$users_dir"

# mkdir $1/MMseqsDB

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mkdir -p $1/MMseqsDB

mmseqs createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa $1/MMseqsDB/ribORF_humanTestis_in1_DB


/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanTestis_in1_DB exists and will be overwritten
createdb /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanTestis_in1_DB 

MMseqs Version:       	14.7e284
Database type         	0
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
[==
Time for merging to ribORF_humanTestis_in1_DB_h: 0h 0m 3s 213ms
Time for merging to ribORF_humanTestis_in1_DB: 0h 0m 10s 476ms
Database type: Aminoacid
Time for processing: 0h 0m 58s 317ms


Creating a joint database with all the specie, everything translated, no matter the tissue

In [4]:
%%bash -s "$evoDir" "$species_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

# species="platypus mouse macaca chicken"
# for s in $species; do
#     echo $s

#     ### create DB
#     mmseqs createdb $2/$s/RiboSeq/RiboQC_RiboNovel/RibORF/repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa $1/MMseqsDB/${s}_noredunantProts_DB

# done

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa $1/MMseqsDB/speciesDB



/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/speciesDB exists and will be overwritten
createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/platypus/Ri

Creating a joint database with all the specie AND human for clustering, everything translated for the species, no matter the tissue, but only translated in 1 for testis

In [5]:
%%bash -s "$evoDir" "$species_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mmseqs createdb $2/*/RiboSeq/RiboQC_RiboNovel*/RibORF/*_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q1_TestisORFs/human/ribORF_humanTestis_in1.fa $1/MMseqsDB/species_and_humanDB



/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanDB exists and will be overwritten
createdb /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/chicken/RiboSeq/RiboQC_RiboNovel/RibORF/chicken_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/macaca/RiboSeq/RiboQC_RiboNovel/RibORF/macaca_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/mouse/RiboSeq/RiboQC_RiboNovel/RibORF/mouse_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/opossum/RiboSeq/RiboQC_RiboNovel/RibORF/opossum_repre.valid.pred.pvalue.parameters.PROTEIN.noredunant.fa /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/p

Searching

In [None]:
# %%bash -s "$evoDir" "$users_dir"

# module load Miniconda3/4.9.2
# source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2
# mkdir -p $1/Searching

# rm $1/Searching/resultDB*
# ## The alignment consists of two steps the prefilter and alignment. To run the search, type:
# mmseqs search -a -s 6 $1/MMseqsDB/ribORF_humanTestis_in1_DB $1/MMseqsDB/speciesDB $1/Searching/resultDB $1/tmp

# # Then, convert the result database into a BLAST tab formatted file (option -m 8 in legacy blast, -outfmt 6 in blast+):
# # Format alignment output	query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits

# mmseqs convertalis $1/MMseqsDB/ribORF_humanTestis_in1_DB $1/MMseqsDB/speciesDB $1/Searching/resultDB $1/Searching/resultDB.m8

rm: cannot remove '/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Searching/resultDB*': No such file or directory


Create directory /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/tmp
search -a -s 6 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/ribORF_humanTestis_in1_DB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/speciesDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Searching/resultDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	true
Alignment mode                         	2
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold             

Clustering

In [6]:
%%bash -s "$evoDir" "$users_dir"

module load Miniconda3/4.9.2
source /soft/system/software/Miniconda3/4.9.2/bin/activate MMseqs2

mkdir -p $1/Clustering

rm -r $1/Clustering/species_and_human_cluDB*

## cluster all species and human
mmseqs cluster -c 0.5 --min-seq-id 0.5 $1/MMseqsDB/species_and_humanDB $1/Clustering/species_and_human_cluDB $1/tmp
mmseqs createtsv $1/MMseqsDB/species_and_humanDB $1/MMseqsDB/species_and_humanDB $1/Clustering/species_and_human_cluDB $1/Clustering/species_and_human_cluDB.tsv

cluster -c 0.5 --min-seq-id 0.5 /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/MMseqsDB/species_and_humanDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/Clustering/species_and_human_cluDB /users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/EvolutionaryOrigin_MMseqs/tmp 

MMseqs Version:                     	14.7e284
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit

## Analysis

In [71]:
clustering = pd.read_csv(os.path.join(evoDir,"Clustering/species_and_human_cluDB.tsv"), header=None, sep="\t")
clustering = clustering[clustering[0].str.startswith("ENST") | clustering[0].str.startswith("TCONS")]
print("all human")
print(len(clustering))
clustering['transcript_id'] = clustering[0].str.split(":", expand=True)[0]
clustering['transcript_id'] = clustering["transcript_id"].str.split(".", expand=True)[0]

clustering = clustering.merge(transcript_gene, on="transcript_id")
print("all human after annotation")
print(len(clustering))
small_clust = clustering[["gene_name","gene_type",0,1]]
small_clust.groupby("gene_type").count()


all human
25985
all human after annotation
25985


Unnamed: 0_level_0,gene_name,0,1
gene_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lncRNA,2311,2311,2311
novel,310,310,310
processed_pseudogene,1455,1455,1455
protein_coding,21909,21909,21909


In [72]:
small_clust

Unnamed: 0,gene_name,gene_type,0,1
0,STARD3NL,protein_coding,ENST00000009041.12:7:+|10|1703:194:239|ouORF|ATG,ENST00000009041.12:7:+|10|1703:194:239|ouORF|ATG
1,AKAP11,protein_coding,ENST00000025301.4:13:+|2|9915:45:105|uORF|ATG,ENST00000025301.4:13:+|2|9915:45:105|uORF|ATG
2,NFE2L3,protein_coding,ENST00000056233.4:7:+|17|3740:302:2387|canonic...,ENST00000056233.4:7:+|17|3740:302:2387|canonic...
3,NFE2L3,protein_coding,ENST00000056233.4:7:+|17|3740:302:2387|canonic...,mouse_ENSMUST00000005103.12:chr6:+|24|2544:238...
4,NFE2L3,protein_coding,ENST00000056233.4:7:+|17|3740:302:2387|canonic...,macaca_ENSMMUT00000005121:3:-|22|2456:375:2457...
...,...,...,...,...
25980,POLG-DT,lncRNA,ENST00000837147.1:15:+|17|1247:426:630|noncodi...,ENST00000837147.1:15:+|17|1247:426:630|noncodi...
25981,XLOC_000656,novel,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG
25982,XLOC_000656,novel,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG,ENST00000685848.2:13:+|17|1854:344:740|noncodi...
25983,XLOC_002098,novel,TCONS_00002320:7:+|2|352:56:149|noncoding|ATG,TCONS_00002320:7:+|2|352:56:149|noncoding|ATG


In [73]:
## non-coding
noncoding = clustering[clustering[0].str.contains("noncoding")]
noncoding.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_speciesDB.tsv"), header=None, index=None, sep="\t")
print("noncoding all")
print(len(noncoding))
print(len(set(noncoding[0].values.tolist())))

## altORFs
altORFs = clustering[clustering[0].str.contains("ORF")]
altORFs.to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_speciesDB.tsv"), header=None, index=None, sep="\t")
print("altORFs all")
print(len(altORFs))
print(len(set(altORFs[0].values.tolist())))

## canonical
canonical = clustering[clustering[0].str.contains("canonical")]
print("Canonical all")
print(len(canonical))
print(len(set(canonical[0].values.tolist())))
canonical.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_speciesDB.tsv"), header=None, index=None, sep="\t")

noncoding all
4076
3279
altORFs all
7705
7369
Canonical all
14204
2899


#### NONCODING

In [74]:
# if column 1 starts with a specie, it is conserved
noncoding_conserved = noncoding[~noncoding[1].str.startswith("ENST")]
noncoding_conserved = noncoding_conserved[~noncoding_conserved[1].str.startswith("TCONS")]
noncoding_conserved.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved noncoding human ORFs")
print(len(set(noncoding_conserved[0].values.tolist())))
noncoding_conserved = noncoding_conserved[['gene_name','gene_type',0,1]]
noncoding_conserved.columns = ['gene_name','gene_type','orfID',1]

# if column 1 starts with a human gene, it is not conserved
noncoding_NonConserved = noncoding[noncoding[1].str.startswith("ENST") | noncoding[1].str.startswith("TCONS")]
noncoding_NonConserved.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Non-Conserved noncoding human ORFs")
print(len(set(noncoding_NonConserved[0].values.tolist())))
noncoding_NonConserved = noncoding_NonConserved[['gene_name','gene_type',0,1]]
noncoding_NonConserved.columns = ['gene_name','gene_type','orfID',1]
noncoding_NonConserved

Conserved noncoding human ORFs
178
Non-Conserved noncoding human ORFs
3279


Unnamed: 0,gene_name,gene_type,orfID,1
170,ENSG00000217612,processed_pseudogene,ENST00000404707.2:6:-|73|1879:1490:1748|noncod...,ENST00000404707.2:6:-|73|1879:1490:1748|noncod...
182,ENSG00000232952,processed_pseudogene,ENST00000415279.1:1:+|4|1779:134:167|noncoding...,ENST00000415279.1:1:+|4|1779:134:167|noncoding...
183,ENSG00000232952,processed_pseudogene,ENST00000415279.1:1:+|6|1779:167:503|noncoding...,ENST00000415279.1:1:+|6|1779:167:503|noncoding...
184,ENSG00000232952,processed_pseudogene,ENST00000415279.1:1:+|90|1779:1594:1684|noncod...,ENST00000415279.1:1:+|90|1779:1594:1684|noncod...
185,ENSG00000232952,processed_pseudogene,ENST00000415279.1:1:+|25|1779:571:1231|noncodi...,ENST00000415279.1:1:+|25|1779:571:1231|noncodi...
...,...,...,...,...
25980,POLG-DT,lncRNA,ENST00000837147.1:15:+|17|1247:426:630|noncodi...,ENST00000837147.1:15:+|17|1247:426:630|noncodi...
25981,XLOC_000656,novel,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG
25982,XLOC_000656,novel,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG,ENST00000685848.2:13:+|17|1854:344:740|noncodi...
25983,XLOC_002098,novel,TCONS_00002320:7:+|2|352:56:149|noncoding|ATG,TCONS_00002320:7:+|2|352:56:149|noncoding|ATG


In [75]:
## conserved non-coding
noncoding_conserved_to_matrix = noncoding_conserved
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))
# Assuming noncoding_conserved_to_matrix is already defined
noncoding_conserved_to_matrix['specie'] = noncoding_conserved_to_matrix[1].str.split("_", expand=True)[0]
noncoding_conserved_to_matrix = noncoding_conserved_to_matrix[['orfID', 'specie']]
noncoding_conserved_to_matrix.groupby('specie').count()
noncoding_conserved_to_matrix['presence'] = 1
## control
print(len(set(noncoding_conserved_to_matrix.orfID.values.tolist())))

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_conserved_matrix = noncoding_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
noncoding_conserved_matrix = noncoding_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
noncoding_conserved_matrix = noncoding_conserved_matrix.reset_index()

# Reorder columns for the final matrix
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_conserved_matrix_ordered = noncoding_conserved_matrix_ordered.sort_values(by="sum", ascending=False)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
noncoding_conserved_matrix_ordered =  noncoding_conserved_matrix_ordered.merge(noncoding_conserved[['gene_name','gene_type','orfID']], on="orfID")
noncoding_conserved_matrix_ordered.drop_duplicates(inplace=True)

noncoding_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
noncoding_conserved_matrix_ordered =  noncoding_conserved_matrix_ordered.merge(noncoding_conserved[['gene_name','gene_type','orfID']], on="orfID")
noncoding_conserved_matrix_ordered.drop_duplicates(inplace=True)

noncoding_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_conserved_matrix_ordered.orfID.values.tolist())))


178
178
178
178
178


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_to_matrix['presence'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_conserved_matrix_ordered['sum'] = noncoding_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)


In [76]:
### all noncoding
noncoding_conserved_touse = noncoding_conserved
noncoding_conserved_touse['specie'] = noncoding_conserved_touse[1].str.split("_", expand=True)[0]
# noncoding_NonConserved.drop("specie",axis=1, inplace=True)
noncoding_to_matrix_raw = pd.concat([noncoding_conserved, noncoding_NonConserved])
noncoding_to_matrix = noncoding_to_matrix_raw[['orfID', 'specie']]
noncoding_to_matrix = noncoding_to_matrix.fillna('human').drop_duplicates()
noncoding_to_matrix['presence'] = 1
print(noncoding_to_matrix.groupby("specie").count())
noncoding_to_matrix

          orfID  presence
specie                   
chicken      42        42
human      3279      3279
macaca      116       116
mouse        93        93
opossum      59        59
platypus     50        50


Unnamed: 0,orfID,specie,presence
189,ENST00000416599.5:13:-|4|3494:38:398|noncoding...,macaca,1
360,ENST00000612689.1:19:+|37|2813:798:1206|noncod...,macaca,1
784,ENST00000659090.2:12:-|18|2245:444:516|noncodi...,macaca,1
786,ENST00000659090.2:12:-|16|2245:365:443|noncodi...,macaca,1
1366,ENST00000399435.2:3:+|1|453:3:255|noncoding|TTG,opossum,1
...,...,...,...
25978,ENST00000819284.1:1:+|11|1237:225:420|noncodin...,human,1
25979,ENST00000828720.1:19:-|9|1805:71:146|noncoding...,human,1
25980,ENST00000837147.1:15:+|17|1247:426:630|noncodi...,human,1
25981,TCONS_00000740:13:+|17|1854:344:740|noncoding|CTG,human,1


In [77]:
# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_matrix = noncoding_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
# Fill NaN values with 0
noncoding_matrix = noncoding_matrix.fillna(0)
# Reset index so 'orfID' becomes a column instead of an index
noncoding_matrix = noncoding_matrix.reset_index()

# Rename columns as necessary
noncoding_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]
print(len(noncoding_matrix))
# Reorder columns for the final matrix
noncoding_matrix_ordered = noncoding_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# # Calculate sum across species columns to identify how many species are present
noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_matrix_ordered = noncoding_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
noncoding_matrix_ordered = noncoding_matrix_ordered.merge(noncoding_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
noncoding_matrix_ordered.drop_duplicates(inplace=True)

noncoding_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_matrix_ordered.orfID.values.tolist())))
noncoding_matrix_ordered

3279


3279


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)


Unnamed: 0,orfID,human,macaca,mouse,opossum,platypus,chicken,sum,gene_name,gene_type
0,ENST00000605113.1:17:+|9|538:101:536|noncoding...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,NDUFB8P2,processed_pseudogene
7,ENST00000406554.2:6:-|1|418:16:253|noncoding|ATG,1.0,1.0,1.0,1.0,1.0,1.0,6.0,ENSG00000220522,processed_pseudogene
14,ENST00000540175.1:12:+|1|1423:34:1423|noncodin...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,ENSG00000256663,processed_pseudogene
28,ENST00000433675.1:2:-|1|447:1:448|noncoding|ATG,1.0,1.0,1.0,1.0,1.0,1.0,6.0,RPS15P4,processed_pseudogene
37,ENST00000433698.2:9:-|1|843:1:685|noncoding|ATG,1.0,1.0,1.0,1.0,1.0,1.0,6.0,EEF1DP2,processed_pseudogene
...,...,...,...,...,...,...,...,...,...,...
4069,ENST00000514947.1:5:-|23|1732:433:601|noncodin...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ENSG00000250080,processed_pseudogene
4070,ENST00000515045.1:5:-|3|853:61:256|noncoding|ATG,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ENSG00000290968,lncRNA
4072,ENST00000515049.1:17:+|13|698:192:531|noncodin...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ENSG00000249176,processed_pseudogene
4073,ENST00000515049.1:17:+|43|698:572:620|noncodin...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ENSG00000249176,processed_pseudogene


In [78]:

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
noncoding_matrix = noncoding_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
noncoding_matrix
# Fill NaN values with 0
noncoding_matrix = noncoding_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
noncoding_matrix = noncoding_matrix.reset_index()

# Rename columns as necessary
noncoding_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]

# Reorder columns for the final matrix
noncoding_matrix_ordered = noncoding_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
noncoding_matrix_ordered = noncoding_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
noncoding_matrix_ordered =  noncoding_matrix_ordered.merge(noncoding_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
noncoding_matrix_ordered.drop_duplicates(inplace=True)

noncoding_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(noncoding_matrix_ordered.orfID.values.tolist())))

3279


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding_matrix_ordered['sum'] = noncoding_matrix_ordered.select_dtypes(include='number').sum(axis=1)


### alternative ORFs

In [79]:
# if column 1 starts with a specie, it is conserved
altORFs_conserved = altORFs[~altORFs[1].str.startswith("ENST")]
altORFs_conserved.to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved altORFs human ORFs")
print(len(set(altORFs_conserved[0].values.tolist())))
altORFs_conserved = altORFs_conserved[['gene_name','gene_type',0,1]]
altORFs_conserved.columns = ['gene_name','gene_type','orfID',1]

# if column 1 starts with a human gene, it is not conserved
altORFs_NonConserved = altORFs[altORFs[1].str.startswith("ENST")]
altORFs_NonConserved.to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("Not-conserved altORFs human ORFs")
print(len(set(altORFs_NonConserved[0].values.tolist())))
altORFs_NonConserved = altORFs_NonConserved[['gene_name','gene_type',0,1]]
altORFs_NonConserved.columns = ['gene_name','gene_type','orfID',1]


Conserved altORFs human ORFs
106
Not-conserved altORFs human ORFs
7369


In [80]:
## conserved altORFs
altORFs_conserved_to_matrix = altORFs_conserved
print(len(set(altORFs_conserved_to_matrix.orfID.values.tolist())))
# Assuming noncoding_conserved_to_matrix is already defined
altORFs_conserved_to_matrix['specie'] = altORFs_conserved_to_matrix[1].str.split("_", expand=True)[0]
altORFs_conserved_to_matrix = altORFs_conserved_to_matrix[['orfID', 'specie']]
altORFs_conserved_to_matrix.groupby('specie').count()
altORFs_conserved_to_matrix['presence'] = 1
## control
print(len(set(altORFs_conserved_to_matrix.orfID.values.tolist())))

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
altORFs_conserved_matrix = altORFs_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
altORFs_conserved_matrix = altORFs_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
altORFs_conserved_matrix = altORFs_conserved_matrix.reset_index()

# Reorder columns for the final matrix
altORFs_conserved_matrix_ordered = altORFs_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
altORFs_conserved_matrix_ordered['sum'] = altORFs_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
altORFs_conserved_matrix_ordered = altORFs_conserved_matrix_ordered.sort_values(by="sum", ascending=False)
print(len(set(altORFs_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
altORFs_conserved_matrix_ordered =  altORFs_conserved_matrix_ordered.merge(altORFs_conserved[['gene_name','gene_type','orfID']], on="orfID")
altORFs_conserved_matrix_ordered.drop_duplicates(inplace=True)

altORFs_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(altORFs_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
altORFs_conserved_matrix_ordered =  altORFs_conserved_matrix_ordered.merge(altORFs_conserved[['gene_name','gene_type','orfID']], on="orfID")
altORFs_conserved_matrix_ordered.drop_duplicates(inplace=True)

altORFs_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(altORFs_conserved_matrix_ordered.orfID.values.tolist())))


106
106
106
106
106


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  altORFs_conserved_to_matrix['presence'] = 1


In [81]:
### all altORFs
altORFs_conserved_touse = altORFs_conserved
altORFs_conserved_touse['specie'] = altORFs_conserved_touse[1].str.split("_", expand=True)[0]
altORFs_to_matrix_raw = pd.concat([altORFs_conserved, altORFs_NonConserved])
altORFs_to_matrix = altORFs_to_matrix_raw[['orfID', 'specie']]
altORFs_to_matrix = altORFs_to_matrix.fillna('human').drop_duplicates()
altORFs_to_matrix['presence'] = 1
altORFs_to_matrix = altORFs_to_matrix.replace("TCONS", "human")
print(altORFs_to_matrix.groupby("specie").count())


          orfID  presence
specie                   
chicken       8         8
human      7371      7371
macaca       44        44
mouse        47        47
opossum      25        25
platypus     13        13


In [82]:
# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
altORFs_matrix = altORFs_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
# Fill NaN values with 0
altORFs_matrix = altORFs_matrix.fillna(0)
# Reset index so 'orfID' becomes a column instead of an index
altORFs_matrix = altORFs_matrix.reset_index()
altORFs_matrix
# Rename columns as necessary
altORFs_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]
print(len(altORFs_matrix))
# Reorder columns for the final matrix
altORFs_matrix_ordered = altORFs_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# # Calculate sum across species columns to identify how many species are present
altORFs_matrix_ordered['sum'] = altORFs_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
altORFs_matrix_ordered = altORFs_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
altORFs_matrix_ordered = altORFs_matrix_ordered.merge(altORFs_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
altORFs_matrix_ordered.drop_duplicates(inplace=True)

altORFs_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(altORFs_matrix_ordered.orfID.values.tolist())))
altORFs_matrix_ordered

7369
7369


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  altORFs_matrix_ordered['sum'] = altORFs_matrix_ordered.select_dtypes(include='number').sum(axis=1)


Unnamed: 0,orfID,human,macaca,mouse,opossum,platypus,chicken,sum,gene_name,gene_type
0,ENST00000618666.4:12:-|46|6915:1015:2341|dORF|ATG,1.0,1.0,1.0,1.0,1.0,1.0,6.0,CCNT1,protein_coding
6,ENST00000532219.5:5:+|456|8246:7706:7940|odORF...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,ANKHD1-EIF4EBP3,protein_coding
19,ENST00000619721.6:11:-|203|3949:2829:3561|dORF...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,MFRP,protein_coding
26,ENST00000695752.1:1:+|41|1548:541:1222|odORF|ATG,1.0,1.0,1.0,1.0,0.0,1.0,5.0,C1QC,protein_coding
31,ENST00000643349.2:11:-|42|2861:727:1270|dORF|ATG,1.0,1.0,1.0,0.0,1.0,1.0,5.0,ENSG00000284779,protein_coding
...,...,...,...,...,...,...,...,...,...,...
7700,ENST00000339697.5:6:-|159|3125:2425:2530|dORF|CTG,1.0,0.0,0.0,0.0,0.0,0.0,1.0,SERINC1,protein_coding
7701,ENST00000339656.8:19:+|3|6885:22:76|uORF|ATG,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ZNF587,protein_coding
7702,ENST00000339656.8:19:+|113|6885:1827:2277|odOR...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ZNF587,protein_coding
7703,ENST00000339618.8:17:+|109|3828:1745:1805|odOR...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ALDH3A2,protein_coding



### CANONICAL

In [83]:
# if column 1 starts with a specie, it is conserved
canonical_conserved = canonical[~canonical[1].str.startswith("ENST")]
canonical_conserved.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_Conserved.tsv"), header=None, index=None, sep="\t")
print("Conserved canonical human ORFs")
print(len(set(canonical_conserved[0].values.tolist())))
canonical_conserved = canonical_conserved[['gene_name','gene_type',0,1]]
canonical_conserved.columns = ['gene_name','gene_type','orfID',1]

# if column 1 starts with a human gene, it is not conserved
canonical_NonConserved = canonical[canonical[1].str.startswith("ENST")]
canonical_NonConserved.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_NonConserved.tsv"), header=None, index=None, sep="\t")
print("AltORFs noncoding human ORFs")
print(len(set(canonical_NonConserved[0].values.tolist())))
canonical_NonConserved = canonical_NonConserved[['gene_name','gene_type',0,1]]
canonical_NonConserved.columns = ['gene_name','gene_type','orfID',1]


Conserved canonical human ORFs
2585
AltORFs noncoding human ORFs
2899


In [84]:
## conserved canonical
canonical_conserved_to_matrix = canonical_conserved
print(len(set(canonical_conserved_to_matrix.orfID.values.tolist())))
# Assuming noncoding_conserved_to_matrix is already defined
canonical_conserved_to_matrix['specie'] = canonical_conserved_to_matrix[1].str.split("_", expand=True)[0]
canonical_conserved_to_matrix = canonical_conserved_to_matrix[['orfID', 'specie']]
canonical_conserved_to_matrix.groupby('specie').count()
canonical_conserved_to_matrix['presence'] = 1
## control
print(len(set(canonical_conserved_to_matrix.orfID.values.tolist())))

# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_conserved_matrix = canonical_conserved_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')

# Fill NaN values with 0
canonical_conserved_matrix = canonical_conserved_matrix.fillna(0)

# Reset index so 'orfID' becomes a column instead of an index
canonical_conserved_matrix = canonical_conserved_matrix.reset_index()

# Reorder columns for the final matrix
canonical_conserved_matrix_ordered = canonical_conserved_matrix[["orfID", "macaca", "mouse", "opossum", "platypus", "chicken"]]

# Calculate sum across species columns to identify how many species are present
canonical_conserved_matrix_ordered['sum'] = canonical_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_conserved_matrix_ordered = canonical_conserved_matrix_ordered.sort_values(by="sum", ascending=False)
print(len(set(canonical_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
canonical_conserved_matrix_ordered =  canonical_conserved_matrix_ordered.merge(canonical_conserved[['gene_name','gene_type','orfID']], on="orfID")
canonical_conserved_matrix_ordered.drop_duplicates(inplace=True)

canonical_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(canonical_conserved_matrix_ordered.orfID.values.tolist())))

# Display the result
canonical_conserved_matrix_ordered =  canonical_conserved_matrix_ordered.merge(canonical_conserved[['gene_name','gene_type','orfID']], on="orfID")
canonical_conserved_matrix_ordered.drop_duplicates(inplace=True)

canonical_conserved_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_Conserved_matrix.tsv"), sep="\t", index=None)
print(len(set(canonical_conserved_matrix_ordered.orfID.values.tolist())))


2585
2585
2585
2585
2585


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_to_matrix['presence'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_conserved_matrix_ordered['sum'] = canonical_conserved_matrix_ordered.select_dtypes(include='number').sum(axis=1)


In [85]:
### all canonical
canonical_conserved_touse = canonical_conserved
canonical_conserved_touse['specie'] = canonical_conserved_touse[1].str.split("_", expand=True)[0]
canonical_to_matrix_raw = pd.concat([canonical_conserved, canonical_NonConserved])
canonical_to_matrix = canonical_to_matrix_raw[['orfID', 'specie']]
canonical_to_matrix = canonical_to_matrix.fillna('human').drop_duplicates()
canonical_to_matrix['presence'] = 1
canonical_to_matrix = canonical_to_matrix.replace("TCONS", "human")
print(canonical_to_matrix.groupby("specie").count())


          orfID  presence
specie                   
chicken    1440      1440
human      2899      2899
macaca     2316      2316
mouse      1954      1954
opossum    1610      1610
platypus   1473      1473


In [86]:
# Pivoting the data without creating an 'id', and keeping 'orfID' as the unique identifier
canonical_matrix = canonical_to_matrix.pivot_table(index='orfID', columns='specie', values='presence', aggfunc='max')
# Fill NaN values with 0
canonical_matrix = canonical_matrix.fillna(0)
# Reset index so 'orfID' becomes a column instead of an index
canonical_matrix = canonical_matrix.reset_index()
canonical_matrix
# Rename columns as necessary
canonical_matrix.columns = ["orfID", "chicken", "human", "macaca", "mouse", "opossum", "platypus"]
print(len(canonical_matrix))
# Reorder columns for the final matrix
canonical_matrix_ordered = canonical_matrix[["orfID", "human","macaca", "mouse", "opossum", "platypus", "chicken"]]

# # Calculate sum across species columns to identify how many species are present
canonical_matrix_ordered['sum'] = canonical_matrix_ordered.select_dtypes(include='number').sum(axis=1)

# Sort by the sum column to order by the number of species present
canonical_matrix_ordered = canonical_matrix_ordered.sort_values(by="sum", ascending=False)

# Display the result
canonical_matrix_ordered = canonical_matrix_ordered.merge(canonical_to_matrix_raw[['gene_name','gene_type','orfID']], on="orfID")
canonical_matrix_ordered.drop_duplicates(inplace=True)

canonical_matrix_ordered.to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_matrix.tsv"), sep="\t", index=None)
print(len(set(canonical_matrix_ordered.orfID.values.tolist())))
canonical_matrix_ordered

2899


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_matrix_ordered['sum'] = canonical_matrix_ordered.select_dtypes(include='number').sum(axis=1)


2899


Unnamed: 0,orfID,human,macaca,mouse,opossum,platypus,chicken,sum,gene_name,gene_type
0,ENST00000440944.6:9:+|1|3566:158:2186|canonica...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,GPSM1,protein_coding
11,ENST00000334701.11:14:-|9|3510:283:2848|canoni...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,HSP90AA1,protein_coding
23,ENST00000451261.7:X:+|24|3485:439:2335|canonic...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,DCAF8L2,protein_coding
34,ENST00000335025.12:22:+|6|4929:92:1655|canonic...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,PPIL2,protein_coding
40,ENST00000449105.8:2:-|7|4144:303:1932|canonica...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,HNRNPLL,protein_coding
...,...,...,...,...,...,...,...,...,...,...
14196,ENST00000370518.4:X:+|5|408:61:355|canonical|ATG,1.0,0.0,0.0,0.0,0.0,0.0,1.0,SPANXA2,protein_coding
14200,ENST00000488965.1:1:+|2|3926:48:228|canonical|ATG,1.0,0.0,0.0,0.0,0.0,0.0,1.0,SCP2,protein_coding
14201,ENST00000432920.2:1:-|6|1625:169:1249|canonica...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,ENSG00000255835,protein_coding
14202,ENST00000240316.5:17:-|2|2634:32:1763|canonica...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,COIL,protein_coding


Find the oldest specie containg the ORF translated

In [87]:
species_ordered = ['chicken','platypus','opossum','mouse','macaca','human']
def find_last_species(row):
    for sp in species_ordered:
        if row[sp] == 1.0:  # Check if the value is 1.0
            return sp
    return None  # Return None if no species has a 1

In [88]:
# Apply the function to each row to create a new column 'oldest_species'
canonical_matrix_ordered['oldest_species'] = canonical_matrix_ordered.apply(find_last_species, axis=1)
canonical_matrix_ordered = species_and_years.merge(canonical_matrix_ordered, left_on=["specie"], right_on="oldest_species")
canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/canonicalHuman_matrix_oldestAge.tsv"), sep="\t", index=None)

noncoding_matrix_ordered['oldest_species'] = noncoding_matrix_ordered.apply(find_last_species, axis=1)
noncoding_matrix_ordered = species_and_years.merge(noncoding_matrix_ordered, left_on=["specie"], right_on="oldest_species")
noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/noncodingHuman_matrix_oldestAge.tsv"), sep="\t", index=None)


altORFs_matrix_ordered['oldest_species'] = altORFs_matrix_ordered.apply(find_last_species, axis=1)
altORFs_matrix_ordered = species_and_years.merge(altORFs_matrix_ordered, left_on=["specie"], right_on="oldest_species")
altORFs_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']].to_csv(os.path.join(evoDir,"Clustering/altORFsHuman_matrix_oldestAge.tsv"), sep="\t", index=None)


complete_matrix_age = pd.concat([noncoding_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']],altORFs_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']], canonical_matrix_ordered[['orfID','gene_name','gene_type','oldest_species','age']]])
complete_matrix_age.to_csv(os.path.join(evoDir,"Clustering/allHuman_matrix_oldestAge.tsv"), sep="\t", index=None)
