## PHYLOSTRATIGRAPHY

In [2]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
GENOMEDIR = "/genomics/users/marta/genomes"

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

## evoDir
evoDir=os.path.join(users_dir,"EvolutionaryOrigin")
phyloDir = os.path.join(users_dir,"Phylostratigraphy")

lineage=["cellular organisms", "Eukaryota", "Opisthokonta", "Metazoa", "Eumetazoa",
    "Bilateria", "Deuterostomia", "Chordata", "Vertebrata",
    "Gnathostomata", "Teleostomi", "Euteleostomi", "Sarcopterygii",
    "Dipnotetrapodomorpha", "Tetrapoda", "Amniota", "Mammalia", "Theria",
    "Eutheria", "Boreoeutheria", "Euarchontoglires", "Primates",
    "Simiiformes", "Catarrhini", "Hominoidea", "Hominidae", "Homininae", "Homo sapiens"]
print(len(lineage))

28


## Testis

In [30]:
ribORF_in_2 = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.csv")

ribORF_in_2_noncanonical = ribORF_in_2[ribORF_in_2['gene_type'] != "protein_coding"]
ribORF_in_2_noncanonical.to_csv(os.path.join(evoDir,"ribORF_humanTestis_in2.noncanonical.csv"), index=None)

ribORF_in_2_canonical = ribORF_in_2[ribORF_in_2['gene_type'] == "protein_coding"]
print(len(set(ribORF_in_2_canonical.gene_name.values.tolist())))
ribORF_in_2_canonical.to_csv(os.path.join(evoDir,"ribORF_humanTestis_in2.canonical.csv"), index=None)

11827


In [31]:
input_file = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.fa"
output_file_canonical = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.canonical.fa"
output_file_noncanonical = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa"

fasta_sequences = SeqIO.parse(open(input_file),'fasta')

with open(output_file_canonical, "w") as out_c:
    with open(output_file_noncanonical, "w") as out_nc:
        for fasta in fasta_sequences:
            if "noncoding" in fasta.id:
                out_nc.write(">%s\n%s\n" %(fasta.id, str(fasta.seq)))
            else:
                out_c.write(">%s\n%s\n" %(fasta.id, str(fasta.seq)))


In [32]:
DENSE=pd.read_csv("/projects_eg/projects/marta/DENSE_AnneLopes/denovotable.csv")
DENSE_candidates = DENSE[DENSE['Gene'].isin(ribORF_in_2_canonical.gene_name.values.tolist())]
DENSE_candidates = DENSE_candidates[['Gene','Phylostratum']].drop_duplicates()
print(len(DENSE_candidates))
print(len(set(DENSE_candidates.Gene.values.tolist())))
DENSE_candidates.Phylostratum = DENSE_candidates.Phylostratum.astype("category")
DENSE_candidates.Phylostratum = DENSE_candidates.Phylostratum.cat.set_categories(lineage)
  ## 'sort' changed to 'sort_values'
DENSE_candidates.sort_values(["Phylostratum"]).to_csv(os.path.join(phyloDir, "DENSE_PCGs.csv"), index=None)


missing_PCGs_in_DENSE = ribORF_in_2_canonical[~ribORF_in_2_canonical['gene_name'].isin(DENSE.Gene.values.tolist())]
print("Missing PCGs in DENSE")
print(len(set(missing_PCGs_in_DENSE.gene_name.values.tolist())))
missing_PCGs_in_DENSE.to_csv(os.path.join(phyloDir, "DENSE_missingPCGs.csv"), index=None)



14341
11511
Missing PCGs in DENSE
316


In [33]:
print(len(DENSE_candidates))
summary = DENSE_candidates.groupby("Phylostratum").count().reset_index()
summary.columns = ['Phylostratum','n_PCGs']
summary = summary[summary['Phylostratum'] != "Unknown"]
summary = summary[summary["Phylostratum"] != "Possible contamination or HGT"]

# Sorting the DataFrame
summary.Phylostratum = summary.Phylostratum.astype("category")
summary.Phylostratum = summary.Phylostratum.cat.set_categories(lineage)
summary.sort_values(["Phylostratum"])  ## 'sort' changed to 'sort_values'
summary.to_csv(os.path.join(phyloDir, "summary_DENSE_PCGs.csv"), index=None)

14341


## GenEra 

`https://github.com/josuebarrera/GenEra/wiki/Running-GenEra`



In [6]:
%%bash -s "$phyloDir"

OUTDIR=$1/GenEra
mkdir -p $OUTDIR
mkdir -p $OUTDIR/output

module load  Miniconda3/24.5.0
eval "$(/soft/system/software/Miniconda3/24.5.0/bin/conda shell.bash hook)"
conda activate genEra

genEra -q /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa \
    -t 9606 -b /datasets/diamond_databases/nr/nr -d /datasets/diamond_databases/nr/nr -n 20 -e 1e-4 -r $OUTDIR/ncbi_lineages_2022-07-28.csv

# genEra -q /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa \
#     -t 9606 -p /home/marta/PROJECT_SCRIPTS/TestisRestricted_MP/tmp_9606_16006/9606_Diamond_results.bout -d /datasets/diamond_databases/nr/nr -n 20 -e 1e-4 -r $OUTDIR/ncbi_lineages_2022-07-28.csv    

Process is interrupted.



# >>>>>>>>>>>>>>>>>>>>>> ERROR REPORT <<<<<<<<<<<<<<<<<<<<<<

    Traceback (most recent call last):
      File "/soft/system/software/Miniconda3/24.5.0/lib/python3.12/site-packages/conda/exception_handler.py", line 18, in __call__
        return func(*args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^
      File "/soft/system/software/Miniconda3/24.5.0/lib/python3.12/site-packages/conda/cli/main.py", line 110, in main_sourced
        print(activator.execute(), end="")
    BrokenPipeError: [Errno 32] Broken pipe

`$ /soft/system/software/Miniconda3/24.5.0/bin/conda shell.bash hook`

  environment variables:
                 CIO_TEST=<not set>
               CONDA_ROOT=/soft/system/software/Miniconda3/24.5.0
           CURL_CA_BUNDLE=<not set>
               LD_PRELOAD=<not set>
  LMOD_DEFAULT_MODULEPATH=/soft/system/modules/all:/etc/lmod/modules:/usr/share/lmod/lmod/module
                          files
                  MANPATH=/soft/system/software/Miniconda3/24.5.0/share/man:/us

In [36]:
PCGs_ages = pd.read_csv(os.path.join(phyloDir, "summary_DENSE_PCGs.csv"))
PCGs_ages

ncORFs_ages = pd.read_csv(os.path.join(phyloDir, "GenEra/output/9606_gene_age_summary.tsv"), sep="\t")
ncORFs_ages = ncORFs_ages[["#number_of_genes","phylostratum"]]
ncORFs_ages.columns = ['n_ncORFs','Phylostratum']

ages = PCGs_ages.merge(ncORFs_ages, on="Phylostratum")
ages.Phylostratum = ages.Phylostratum.astype("category")
ages.Phylostratum = ages.Phylostratum.cat.set_categories(lineage)
ages.sort_values(["Phylostratum"], inplace=True)  ## 'sort' changed to 'sort_values'
ages.drop_duplicates(inplace=True)
ages.to_csv(os.path.join(phyloDir,"summary_all_genes_ages.csv"), index=None)

In [39]:
ncORFs_genes_ages = pd.read_csv(os.path.join(phyloDir, "GenEra/output/9606_gene_ages.tsv"), sep="\t", header=None)
ncORFs_genes_ages['transcript_id'] = ncORFs_genes_ages[0].str.split(":", expand=True)[0]
ncORFs_genes_ages['transcript_id'] = ncORFs_genes_ages['transcript_id'].str.split(".", expand=True)[0]
ncORFs_genes_ages = ncORFs_genes_ages.merge(transcript_gene, on="transcript_id")
ncORFs_genes_ages = ncORFs_genes_ages[['gene_name','gene_type',1,0]]
ncORFs_genes_ages.columns=['gene_name','gene_type','Phylostratum','orfID']
ncORFs_genes_ages



PCGs_gene_ages = pd.read_csv(os.path.join(phyloDir, "DENSE_PCGs.csv"))
PCGs_gene_ages.columns=['gene_name','Phylostratum']
PCGs_gene_ages['gene_type'] = "protein_coding"
PCGs_gene_ages['orfID'] = PCGs_gene_ages['gene_name'] + "_ORF1"


all_genes = pd.concat([PCGs_gene_ages, ncORFs_genes_ages])

all_genes.Phylostratum = all_genes.Phylostratum.astype("category")
all_genes.Phylostratum = all_genes.Phylostratum.cat.set_categories(lineage)
all_genes.sort_values(["Phylostratum"], inplace=True)  ## 'sort' changed to 'sort_values'
all_genes.drop_duplicates(inplace=True)
all_genes.to_csv(os.path.join(phyloDir,"all_genes_phyostratum.csv"), index=None)

In [40]:
all_genes

Unnamed: 0,gene_name,Phylostratum,gene_type,orfID
0,A2M,cellular organisms,protein_coding,A2M_ORF1
5846,ZNF814,cellular organisms,protein_coding,ZNF814_ORF1
5845,NEDD4L,cellular organisms,protein_coding,NEDD4L_ORF1
5844,PEX5,cellular organisms,protein_coding,PEX5_ORF1
5843,QDPR,cellular organisms,protein_coding,QDPR_ORF1
...,...,...,...,...
1577,XLOC_000607,,novel,TCONS_00000707:15:+|540|22729:8123:8150|noncod...
1582,XLOC_000828,,novel,TCONS_00000993:18:-|64|1055:791:806|noncoding|GTG
1588,XLOC_000954,,novel,TCONS_00001162:2:+|62|9974:848:863|noncoding|TTG
1596,XLOC_001498,,novel,TCONS_00001784:6:-|13|690:130:331|noncoding|ATG


## Phylostratigraphy Tumor-expressed

In [3]:
all_genes = pd.read_csv(os.path.join(phyloDir,"all_genes_phyostratum.csv"))
all_genes

Unnamed: 0,gene_name,Phylostratum,gene_type,orfID
0,A2M,cellular organisms,protein_coding,A2M_ORF1
1,ZNF814,cellular organisms,protein_coding,ZNF814_ORF1
2,NEDD4L,cellular organisms,protein_coding,NEDD4L_ORF1
3,PEX5,cellular organisms,protein_coding,PEX5_ORF1
4,QDPR,cellular organisms,protein_coding,QDPR_ORF1
...,...,...,...,...
15935,XLOC_000607,,novel,TCONS_00000707:15:+|540|22729:8123:8150|noncod...
15936,XLOC_000828,,novel,TCONS_00000993:18:-|64|1055:791:806|noncoding|GTG
15937,XLOC_000954,,novel,TCONS_00001162:2:+|62|9974:848:863|noncoding|TTG
15938,XLOC_001498,,novel,TCONS_00001784:6:-|13|690:130:331|noncoding|ATG


In [4]:
tumorexpressed = pd.read_csv(os.path.join(users_dir, "cancers/tumorexpressed/cancertypes/tumor_1FPKM_n10percent_pancancer.csv"))
tumorexpressed

Unnamed: 0,transcript_id,gene_id,gene_name,gene_type,Length,n,ctype
0,ENST00000000412,ENSG00000003056,M6PR,protein_coding,2450,131,BRCA
1,ENST00000001008,ENSG00000004478,FKBP4,protein_coding,3715,131,BRCA
2,ENST00000001146,ENSG00000003137,CYP26B1,protein_coding,4556,83,BRCA
3,ENST00000002165,ENSG00000001036,FUCA2,protein_coding,2385,131,BRCA
4,ENST00000002596,ENSG00000002587,HS3ST1,protein_coding,7160,63,BRCA
...,...,...,...,...,...,...,...
191011,TCONS_00002143,XLOC_001856,XLOC_001856,novel,244,140,COAD
191012,TCONS_00002144,XLOC_001848,XLOC_001848,novel,237,143,COAD
191013,TCONS_00002148,XLOC_001844,XLOC_001844,novel,249,110,COAD
191014,TCONS_00002149,XLOC_001845,XLOC_001845,novel,301,142,COAD


In [5]:
phylo_tumorexpressed = all_genes[all_genes['gene_name'].isin(tumorexpressed.gene_name.values.tolist())]
phylo_tumorexpressed.Phylostratum = phylo_tumorexpressed.Phylostratum.astype("category")
phylo_tumorexpressed.Phylostratum = phylo_tumorexpressed.Phylostratum.cat.set_categories(lineage)
phylo_tumorexpressed.sort_values(["Phylostratum"], inplace=True)  ## 'sort' changed to 'sort_values'
phylo_tumorexpressed.to_csv(os.path.join(phyloDir,"all_genes_phyostratum.tumorexpressed.csv"), index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phylo_tumorexpressed.Phylostratum = phylo_tumorexpressed.Phylostratum.astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phylo_tumorexpressed.Phylostratum = phylo_tumorexpressed.Phylostratum.cat.set_categories(lineage)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phylo_tumorexpressed.sort_values(["Phylostratum"], inplace=True)  ## 'sort'