## PHYLOSTRATIGRAPHY

In [29]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
GENOMEDIR = "/genomics/users/marta/genomes"

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

## evoDir
evoDir=os.path.join(users_dir,"EvolutionaryOrigin")
phyloDir = os.path.join(users_dir,"Phylostratigraphy")

lineage=["cellular organisms", "Eukaryota", "Opisthokonta", "Metazoa", "Eumetazoa",
    "Bilateria", "Deuterostomia", "Chordata", "Vertebrata",
    "Gnathostomata", "Teleostomi", "Euteleostomi", "Sarcopterygii",
    "Dipnotetrapodomorpha", "Tetrapoda", "Amniota", "Mammalia", "Theria",
    "Eutheria", "Boreoeutheria", "Euarchontoglires", "Primates",
    "Simiiformes", "Catarrhini", "Hominoidea", "Hominidae", "Homininae", "Homo sapiens"]
print(len(lineage))

28


In [30]:
ribORF_in_2 = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.csv")

ribORF_in_2_noncanonical = ribORF_in_2[ribORF_in_2['gene_type'] != "protein_coding"]
ribORF_in_2_noncanonical.to_csv(os.path.join(evoDir,"ribORF_humanTestis_in2.noncanonical.csv"), index=None)

ribORF_in_2_canonical = ribORF_in_2[ribORF_in_2['gene_type'] == "protein_coding"]
print(len(set(ribORF_in_2_canonical.gene_name.values.tolist())))
ribORF_in_2_canonical.to_csv(os.path.join(evoDir,"ribORF_humanTestis_in2.canonical.csv"), index=None)

11827


In [31]:
input_file = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.fa"
output_file_canonical = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.canonical.fa"
output_file_noncanonical = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa"

fasta_sequences = SeqIO.parse(open(input_file),'fasta')

with open(output_file_canonical, "w") as out_c:
    with open(output_file_noncanonical, "w") as out_nc:
        for fasta in fasta_sequences:
            if "noncoding" in fasta.id:
                out_nc.write(">%s\n%s\n" %(fasta.id, str(fasta.seq)))
            else:
                out_c.write(">%s\n%s\n" %(fasta.id, str(fasta.seq)))


In [32]:
DENSE=pd.read_csv("/projects_eg/projects/marta/DENSE_AnneLopes/denovotable.csv")
DENSE_candidates = DENSE[DENSE['Gene'].isin(ribORF_in_2_canonical.gene_name.values.tolist())]
DENSE_candidates = DENSE_candidates[['Gene','Phylostratum']].drop_duplicates()
print(len(DENSE_candidates))
print(len(set(DENSE_candidates.Gene.values.tolist())))
DENSE_candidates.Phylostratum = DENSE_candidates.Phylostratum.astype("category")
DENSE_candidates.Phylostratum = DENSE_candidates.Phylostratum.cat.set_categories(lineage)
  ## 'sort' changed to 'sort_values'
DENSE_candidates.sort_values(["Phylostratum"]).to_csv(os.path.join(phyloDir, "DENSE_PCGs.csv"), index=None)


missing_PCGs_in_DENSE = ribORF_in_2_canonical[~ribORF_in_2_canonical['gene_name'].isin(DENSE.Gene.values.tolist())]
print("Missing PCGs in DENSE")
print(len(set(missing_PCGs_in_DENSE.gene_name.values.tolist())))
missing_PCGs_in_DENSE.to_csv(os.path.join(phyloDir, "DENSE_missingPCGs.csv"), index=None)



14341
11511
Missing PCGs in DENSE
316


In [33]:
print(len(DENSE_candidates))
summary = DENSE_candidates.groupby("Phylostratum").count().reset_index()
summary.columns = ['Phylostratum','n_PCGs']
summary = summary[summary['Phylostratum'] != "Unknown"]
summary = summary[summary["Phylostratum"] != "Possible contamination or HGT"]

# Sorting the DataFrame
summary.Phylostratum = summary.Phylostratum.astype("category")
summary.Phylostratum = summary.Phylostratum.cat.set_categories(lineage)
summary.sort_values(["Phylostratum"])  ## 'sort' changed to 'sort_values'
summary.to_csv(os.path.join(phyloDir, "summary_DENSE_PCGs.csv"), index=None)

14341


## GenEra 

`https://github.com/josuebarrera/GenEra/wiki/Running-GenEra`



In [34]:
%%bash -s "$phyloDir"

OUTDIR=$1/GenEra
mkdir -p $OUTDIR
mkdir -p $OUTDIR/output

module load  Miniconda3/24.5.0
eval "$(/soft/system/software/Miniconda3/24.5.0/bin/conda shell.bash hook)"
conda activate genEra

genEra -q /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa \
    -t 9606 -b /datasets/diamond_databases/nr/nr -d /datasets/diamond_databases/nr/nr -n 20 -e 1e-4 -r $OUTDIR/ncbi_lineages_2022-07-28.csv

# genEra -q /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa \
#     -t 9606 -p /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_16006/9606_Diamond_results.bout -d /datasets/diamond_databases/nr/nr -n 20 -e 1e-4 -r $OUTDIR/ncbi_lineages_2022-07-28.csv    

genEra v1.4.1 (C) Max Planck Society for the Advancement of Science
Starting time of run:
jue 25 jul 2024 15:50:31 CEST

Your temporary files will be stored in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_3188

STARTING STEP 1: SEARCHING FOR HOMOLOGS WITHIN THE DATABASE USING DIAMOND
--------------------------------------------------
Matching the query genes against themselves
--------------------------------------------------
Searching for homologs against the DIAMOND database
DIAMOND results written!
--------------------------------------------------
Step 1 finished!
The DIAMOND/MMseqs2 table can be found in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_3188/9606_Diamond_results.bout
This file is usually HUGE, please dispose of it if you no longer find it useful
It can still be used (-p) in case the user wants to re-run genEra while skipping step 1

STARTING STEP 2: GENERATING TAXONOMIC DATABASE FOR THE PHYLOSTRATIGRAPHIC ASSIGNMENT OF YOUR GENES
--------------

[mclIO] writing </home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_3188/tmp_9606.mci>
.......................................
[mclIO] wrote native interchange 1530x1530 matrix with 2012 entries to stream </home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_3188/tmp_9606.mci>
[mclIO] wrote 1530 tab entries to stream </home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_3188/tmp_9606.tab>
[mcxload] tab has 1530 entries
[mclIO] reading </home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_3188/tmp_9606.mcl>
.......................................
[mclIO] read native interchange 1530x1350 matrix with 1530 entries


--------------------------------------------------
Establishing the age and number of gene-family founder events
--------------------------------------------------
Step 3 finished!
The age assignment for your individual genes can be found in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/9606_gene_ages.tsv
The possible ages for the genes with a taxonomic representativeness below 30 percent can be found in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/9606_ambiguous_phylostrata.tsv
The estimation of gene family founder events can be found in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/9606_founder_events.tsv
The number of individual genes that could be assigned to each phylostratum are summarized in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/9606_gene_age_summary.tsv
The number of of gene family founder events per phylostratum are summarized in /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/9606_founder_summary.tsv

genEra finished at:
jue 25 jul 2024 18:09:15 CEST

Enjoy y

In [36]:
PCGs_ages = pd.read_csv(os.path.join(phyloDir, "summary_DENSE_PCGs.csv"))
PCGs_ages

ncORFs_ages = pd.read_csv(os.path.join(phyloDir, "GenEra/output/9606_gene_age_summary.tsv"), sep="\t")
ncORFs_ages = ncORFs_ages[["#number_of_genes","phylostratum"]]
ncORFs_ages.columns = ['n_ncORFs','Phylostratum']

ages = PCGs_ages.merge(ncORFs_ages, on="Phylostratum")
ages.Phylostratum = ages.Phylostratum.astype("category")
ages.Phylostratum = ages.Phylostratum.cat.set_categories(lineage)
ages.sort_values(["Phylostratum"], inplace=True)  ## 'sort' changed to 'sort_values'
ages.drop_duplicates(inplace=True)
ages.to_csv(os.path.join(phyloDir,"summary_all_genes_ages.csv"), index=None)

In [39]:
ncORFs_genes_ages = pd.read_csv(os.path.join(phyloDir, "GenEra/output/9606_gene_ages.tsv"), sep="\t", header=None)
ncORFs_genes_ages['transcript_id'] = ncORFs_genes_ages[0].str.split(":", expand=True)[0]
ncORFs_genes_ages['transcript_id'] = ncORFs_genes_ages['transcript_id'].str.split(".", expand=True)[0]
ncORFs_genes_ages = ncORFs_genes_ages.merge(transcript_gene, on="transcript_id")
ncORFs_genes_ages = ncORFs_genes_ages[['gene_name','gene_type',1,0]]
ncORFs_genes_ages.columns=['gene_name','gene_type','Phylostratum','orfID']
ncORFs_genes_ages



PCGs_gene_ages = pd.read_csv(os.path.join(phyloDir, "DENSE_PCGs.csv"))
PCGs_gene_ages.columns=['gene_name','Phylostratum']
PCGs_gene_ages['gene_type'] = "protein_coding"
PCGs_gene_ages['orfID'] = PCGs_gene_ages['gene_name'] + "_ORF1"


all_genes = pd.concat([PCGs_gene_ages, ncORFs_genes_ages])

all_genes.Phylostratum = all_genes.Phylostratum.astype("category")
all_genes.Phylostratum = all_genes.Phylostratum.cat.set_categories(lineage)
all_genes.sort_values(["Phylostratum"], inplace=True)  ## 'sort' changed to 'sort_values'
all_genes.drop_duplicates(inplace=True)
all_genes.to_csv(os.path.join(phyloDir,"all_genes_phyostratum.csv"), index=None)

In [40]:
all_genes

Unnamed: 0,gene_name,Phylostratum,gene_type,orfID
0,A2M,cellular organisms,protein_coding,A2M_ORF1
5846,ZNF814,cellular organisms,protein_coding,ZNF814_ORF1
5845,NEDD4L,cellular organisms,protein_coding,NEDD4L_ORF1
5844,PEX5,cellular organisms,protein_coding,PEX5_ORF1
5843,QDPR,cellular organisms,protein_coding,QDPR_ORF1
...,...,...,...,...
1577,XLOC_000607,,novel,TCONS_00000707:15:+|540|22729:8123:8150|noncod...
1582,XLOC_000828,,novel,TCONS_00000993:18:-|64|1055:791:806|noncoding|GTG
1588,XLOC_000954,,novel,TCONS_00001162:2:+|62|9974:848:863|noncoding|TTG
1596,XLOC_001498,,novel,TCONS_00001784:6:-|13|690:130:331|noncoding|ATG
