In [1]:
import os,re,glob
import pandas as pd
import numpy as np

def save_to_fasta(df, filename):
    with open(filename, 'w') as f:
        for index, row in df.iterrows():
            f.write(f">{row['orfID']}\n")
            f.write(f"{row['ORFpep']}\n")
# dir="/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q2_TestisRestricted/human"
dir="/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human"

In [2]:
testisSp = pd.read_csv(os.path.join(dir,"testisRestricted_GTEx_translatedONLYtestis.csv"))
testisSp

## select protein coding sequences
ProteinCoding = testisSp[testisSp['geneORFtype'] == "protein_coding_canonical"]
ProteinCoding = ProteinCoding[['orfID','ORFpep']]
ProteinCoding['orfID'] = ProteinCoding.orfID.str.split("canonical", expand=True)[0]
ProteinCoding.drop_duplicates(inplace=True)
print(len(ProteinCoding))

## save the fasta sequences
save_to_fasta(ProteinCoding, os.path.join(dir,"testisRestricted_GTEx_translatedONLYtestis.PCoding.fa"))

## select non-canonical sequences
NonCanonical = testisSp[testisSp['geneORFtype'] != "protein_coding_canonical"]
NonCanonical['orfID_gene_type'] = NonCanonical['gene_type'] + "|" + NonCanonical['orfID']
NonCanonical = NonCanonical[['orfID_gene_type','ORFpep','geneORFtype']]
NonCanonical.columns = ['orfID','ORFpep','geneORFtype']
NonCanonical.drop_duplicates(inplace=True)
print(len(NonCanonical))

## save the fasta sequences
save_to_fasta(NonCanonical, os.path.join(dir,"testisRestricted_GTEx_translatedONLYtestis.NonCanonical.fa"))


366
706


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NonCanonical['orfID_gene_type'] = NonCanonical['gene_type'] + "|" + NonCanonical['orfID']


In [3]:
NonCanonical['gene_type'] = NonCanonical.orfID.str.split("|", expand=True)[0]
print("lncRNA: ", len(NonCanonical[NonCanonical['gene_type'] == "lncRNA"]))
print("Novel: ", len(NonCanonical[NonCanonical['gene_type'] == "novel"]))
print("processed_pseudogene: ", len(NonCanonical[NonCanonical['gene_type'] == "processed_pseudogene"]))
print("uORFs: ", len(NonCanonical[NonCanonical['geneORFtype'] == "protein_coding_uORF"]))
print("dORFs: ", len(NonCanonical[NonCanonical['geneORFtype'] == "protein_coding_dORF"]))
print("ouORFs: ", len(NonCanonical[NonCanonical['geneORFtype'] == "protein_coding_ouORF"]))
print("odORFs: ", len(NonCanonical[NonCanonical['geneORFtype'] == "protein_coding_odORF"]))

lncRNA:  309
Novel:  224
processed_pseudogene:  30
uORFs:  66
dORFs:  33
ouORFs:  33
odORFs:  11


BLASTP so that sequences from ncORFs do not come from CDS

In [4]:
%%bash -s "$dir"

## create blastp database
module load BLAST+/2.12.0-Linux_x86_64

mkdir -p $1/blastp

makeblastdb -in $1/testisRestricted_GTEx_translatedONLYtestis.PCoding.fa -out $1/blastp/testisRestricted_GTEx_translatedONLYtestis.PCoding -parse_seqids -dbtype prot




Building a new DB, current time: 12/11/2024 15:18:54
New DB name:   /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/blastp/testisRestricted_GTEx_translatedONLYtestis.PCoding
New DB title:  /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.PCoding.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/blastp/testisRestricted_GTEx_translatedONLYtestis.PCoding
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 366 sequences in 0.029448 seconds.




In [5]:
%%bash -s "$dir"

module load BLAST+/2.12.0-Linux_x86_64

## testis-expressed vs MACACA
blastp -query $1/testisRestricted_GTEx_translatedONLYtestis.NonCanonical.fa \
-db $1/blastp/testisRestricted_GTEx_translatedONLYtestis.PCoding \
-task 'blastp-short' \
-out $1/blastp/TestisRestrictedGTEx_ncORFsvsCDS.txt \
-evalue 0.0001 \
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qframe sframe sstrand qcovs"


3 ncORFs from processed pseudogenes and 1 from lncRNA had homology with a protein-coding sequence, so we removed them

In [6]:
%%bash -s "$dir"

## create blastp database
module load BLAST+/2.12.0-Linux_x86_64

mkdir -p $1/blastp

makeblastdb -in /data/genomics/marta/genomes/GRCh38/Homo_sapiens.GRCh38.pep.all.fa -out /data/genomics/marta/genomes/GRCh38/Homo_sapiens.GRCh38.pep.all.blastpdb -parse_seqids -dbtype prot




Building a new DB, current time: 12/11/2024 15:18:57
New DB name:   /data/genomics/marta/genomes/GRCh38/Homo_sapiens.GRCh38.pep.all.blastpdb
New DB title:  /data/genomics/marta/genomes/GRCh38/Homo_sapiens.GRCh38.pep.all.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /data/genomics/marta/genomes/GRCh38/Homo_sapiens.GRCh38.pep.all.blastpdb
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 110788 sequences in 2.16732 seconds.




In [7]:
%%bash -s "$dir"

module load BLAST+/2.12.0-Linux_x86_64

blastp -query $1/testisRestricted_GTEx_translatedONLYtestis.NonCanonical.fa \
-db /data/genomics/marta/genomes/GRCh38/Homo_sapiens.GRCh38.pep.all.blastpdb \
-task 'blastp-short' \
-out $1/blastp/TestisRestrictedGTEx_ncORFsvsProteome.txt \
-evalue 0.0001 \
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qframe sframe sstrand qcovs"



If, instead, we do it against all the proteome (GRCh38), we would remove 3 lncRNA-ORFs, 2 novel-ORFs and 20 processedPseudogene ORFs from 17 processed psuedogenes

In [8]:
blastp_proteome = pd.read_csv(os.path.join(dir,"blastp/TestisRestrictedGTEx_ncORFsvsProteome.txt"), sep="\t", header=None)
blastp_proteome['orfID'] = blastp_proteome[0].str.split("|", n=1).str.get(1)
blastp_proteome
print(len(testisSp))
testisSp_noProteome = testisSp[~testisSp['orfID'].isin(blastp_proteome.orfID.values.tolist())]
print(len(testisSp_noProteome))
testisSp_noProteome.to_csv(os.path.join(dir,"testisRestricted_GTEx_translatedONLYtestis.noProteome.csv"))
testisSp_noProteome
print("lncRNA: ", len(testisSp_noProteome[testisSp_noProteome['gene_type'] == "lncRNA"]))
print("Novel: ", len(testisSp_noProteome[testisSp_noProteome['gene_type'] == "novel"]))
print("processed_pseudogene: ", len(testisSp_noProteome[testisSp_noProteome['gene_type'] == "processed_pseudogene"]))
print("uORFs: ", len(testisSp_noProteome[testisSp_noProteome['geneORFtype'] == "protein_coding_uORF"]))
print("dORFs: ", len(testisSp_noProteome[testisSp_noProteome['geneORFtype'] == "protein_coding_dORF"]))
print("ouORFs: ", len(testisSp_noProteome[testisSp_noProteome['geneORFtype'] == "protein_coding_ouORF"]))
print("odORFs: ", len(testisSp_noProteome[testisSp_noProteome['geneORFtype'] == "protein_coding_odORF"]))

1072
1034
lncRNA:  306
Novel:  222
processed_pseudogene:  11
uORFs:  65
dORFs:  25
ouORFs:  33
odORFs:  6
