## PHYLOSTRATIGRAPHY

In [52]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

## Project root and genome directory
users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
GENOMEDIR = "/genomics/users/marta/genomes"

## Reconstructed human reference: GTF annotation path and the
## transcript/gene table (both live under the same sub-directory of users_dir)
reference_dir = users_dir + "/human/newReference_Resconstructed"
annotation = reference_dir + "/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene = pd.read_csv(reference_dir + "/1transcript_1gene.reconstructed.csv")

## Directories derived from users_dir (evoDir, phyloDir)
evoDir = os.path.join(users_dir, "EvolutionaryOrigin")
phyloDir = os.path.join(users_dir, "Phylostratigraphy")

## Lineage labels, listed from "cellular organisms" down to "Homo sapiens".
## The list order is meaningful: it is the order in which the labels are kept.
lineage = [
    "cellular organisms",
    "Eukaryota",
    "Opisthokonta",
    "Metazoa",
    "Eumetazoa",
    "Bilateria",
    "Deuterostomia",
    "Chordata",
    "Vertebrata",
    "Gnathostomata",
    "Teleostomi",
    "Euteleostomi",
    "Sarcopterygii",
    "Dipnotetrapodomorpha",
    "Tetrapoda",
    "Amniota",
    "Mammalia",
    "Theria",
    "Eutheria",
    "Boreoeutheria",
    "Euarchontoglires",
    "Primates",
    "Simiiformes",
    "Catarrhini",
    "Hominoidea",
    "Hominidae",
    "Homininae",
    "Homo sapiens",
]
n_levels = len(lineage)
print(n_levels)

28


In [14]:
## Load the ribORF human-testis table and split its rows on `gene_type`:
## "protein_coding" rows are the canonical set, everything else is noncanonical.
ribORF_in_2 = pd.read_csv(
    "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.csv"
)
is_protein_coding = ribORF_in_2["gene_type"] == "protein_coding"

## Noncanonical rows -> CSV in evoDir
ribORF_in_2_noncanonical = ribORF_in_2[~is_protein_coding]
noncanonical_csv = os.path.join(evoDir, "ribORF_humanTestis_in2.noncanonical.csv")
ribORF_in_2_noncanonical.to_csv(noncanonical_csv, index=None)

## Canonical rows: report the number of distinct gene names, then -> CSV in evoDir
ribORF_in_2_canonical = ribORF_in_2[is_protein_coding]
canonical_gene_names = set(ribORF_in_2_canonical["gene_name"].tolist())
print(len(canonical_gene_names))
canonical_csv = os.path.join(evoDir, "ribORF_humanTestis_in2.canonical.csv")
ribORF_in_2_canonical.to_csv(canonical_csv, index=None)

11265


In [11]:
input_file = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.fa"
output_file_canonical = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.canonical.fa"
output_file_noncanonical = "/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/ribORF_humanTestis_in2.noncanonical.fa"

## Split the input FASTA into two files based on the record identifier:
## records whose id contains "noncoding" go to the noncanonical file, all
## other records go to the canonical file. Each record is written as a
## two-line entry (">id" followed by the sequence on a single line).
##
## All three files are opened in one `with` so the input handle is closed
## too; the original passed a bare open() to SeqIO.parse and never closed it.
with open(input_file) as fasta_in, \
        open(output_file_canonical, "w") as out_c, \
        open(output_file_noncanonical, "w") as out_nc:
    fasta_sequences = SeqIO.parse(fasta_in, 'fasta')
    for fasta in fasta_sequences:
        destination = out_nc if "noncoding" in fasta.id else out_c
        destination.write(">%s\n%s\n" % (fasta.id, str(fasta.seq)))


In [54]:
## Load the DENSE table and keep the (Gene, Phylostratum) pairs of the genes
## present in the canonical ribORF set.
DENSE = pd.read_csv("/projects_eg/projects/marta/DENSE_AnneLopes/denovotable.csv")

canonical_gene_names = ribORF_in_2_canonical["gene_name"].tolist()
found_in_canonical = DENSE["Gene"].isin(canonical_gene_names)
DENSE_candidates = DENSE.loc[found_in_canonical, ["Gene", "Phylostratum"]].drop_duplicates()

## Number of unique (Gene, Phylostratum) rows, then number of distinct genes
print(DENSE_candidates.shape[0])
print(len(set(DENSE_candidates["Gene"].tolist())))
DENSE_candidates.to_csv(os.path.join(phyloDir, "DENSE_PCGs.csv"), index=None)


## Canonical ribORF rows whose gene name does not appear in DENSE at all
dense_gene_names = DENSE["Gene"].tolist()
absent_from_dense = ~ribORF_in_2_canonical["gene_name"].isin(dense_gene_names)
missing_PCGs_in_DENSE = ribORF_in_2_canonical[absent_from_dense]
print("Missing PCGs in DENSE")
print(len(set(missing_PCGs_in_DENSE["gene_name"].tolist())))
missing_PCGs_in_DENSE.to_csv(os.path.join(phyloDir, "DENSE_missingPCGs.csv"), index=None)



13668
10968
Missing PCGs in DENSE
297


In [56]:
print(len(DENSE_candidates))

## Count the rows of DENSE_candidates per phylostratum.
## (DENSE_candidates is expected to hold Gene/Phylostratum columns only, so
## the single count column is renamed to n_PCGs.)
summary = DENSE_candidates.groupby("Phylostratum").count().reset_index()
summary.columns = ['Phylostratum', 'n_PCGs']

## Drop the labels that are not part of the lineage; copy so the column
## assignment below works on an independent frame rather than a slice.
excluded_labels = ["Unknown", "Possible contamination or HGT"]
summary = summary[~summary["Phylostratum"].isin(excluded_labels)].copy()

## Order rows by position in `lineage` rather than alphabetically. Any label
## that is not in `lineage` becomes NaN and sorts last.
summary["Phylostratum"] = (
    summary["Phylostratum"].astype("category").cat.set_categories(lineage, ordered=True)
)
## sort_values returns a new DataFrame: the result must be assigned, otherwise
## the CSV is written in the unsorted order.
summary = summary.sort_values("Phylostratum")
summary.to_csv(os.path.join(phyloDir, "summary_DENSE_PCGs.csv"), index=False)

13668
