In [1]:
import zipfile
import numpy as np
import pandas as pd
import gzip
import shutil
from Bio import SeqIO
import matplotlib.pyplot as plt
import os
from os import listdir
from os.path import isfile, join
from matplotlib.pyplot import figure
import pickle

In [2]:
# Preload every cached intermediate up front: the pickles were either created in an
# order that does not follow the narrative below, or generated in other files.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.

# Objects stored with the standard-library pickle module
with open('./MS2p_psdgU.pkl', 'rb') as handle:
    MS2p_psdgU = pickle.load(handle)
with open('./MS3up1_psdgU.pkl', 'rb') as handle:
    MS3up1_psdgU = pickle.load(handle)

# Objects read back through pandas
OP_df_altprot_ms0_psdg = pd.read_pickle("./OP16_df_altprot_ms0_psdg.pkl")
MART95_df_psdg = pd.read_pickle("./MART95_df_psdg.pkl")
MART110_df_psdg = pd.read_pickle("./MART_df_psdg.pkl")
MART110_df = pd.read_pickle("./MART_df.pkl")

## Introduction

Number of different types of pseudogenes, Figure 1

In [3]:
# Annotation table for Ensembl release 110 (presumably a BioMart export - confirm).
# The same file is also read in the preload cell at the top of the notebook.
MART110_df=pd.read_pickle("./MART_df.pkl") #Ensembl 110

In [4]:
# Pseudogene annotation table for Ensembl release 110; used below for the per-gene
# and per-transcript counts. The same file is also read in the preload cell.
MART110_df_psdg=pd.read_pickle("./MART_df_psdg.pkl") #Ensembl 110

In [5]:
# Tally pseudogene biotypes at the gene level: keep one row per gene ID,
# then count how often each "Gene type" value occurs.
psdg_counts = (
    MART110_df_psdg
    .drop_duplicates(subset=["Gene stable ID"])["Gene type"]
    .value_counts()
)

In [6]:
# Number of protein-coding genes (one row per gene ID in the full annotation).
protcod_count = (
    MART110_df
    .drop_duplicates(subset=["Gene stable ID"])["Gene type"]
    .value_counts()
    .loc["protein_coding"]
)

# Pool the per-biotype pseudogene counts into three families.
# .loc with a list of labels raises KeyError if a biotype is absent from psdg_counts.
unproc_count = psdg_counts.loc[
    ["unprocessed_pseudogene", "transcribed_unprocessed_pseudogene"]
].values.sum()
proc_count = psdg_counts.loc[
    [
        "processed_pseudogene",
        "transcribed_processed_pseudogene",
        "translated_processed_pseudogene",
    ]
].values.sum()
unitar_count = psdg_counts.loc[
    ["unitary_pseudogene", "transcribed_unitary_pseudogene"]
].values.sum()

In [7]:
# Same tally as the gene-level one, but keeping one row per transcript ID
# so that every pseudogene transcript is counted once.
psdgtrx_counts = (
    MART110_df_psdg
    .drop_duplicates(subset=["Transcript stable ID"])["Gene type"]
    .value_counts()
)

In [8]:
# Report the gene-level counts for each family.
print(f'Numbers of protein-coding genes = {protcod_count}')
print(f'Number of unprocessed pseudogenes = {unproc_count}')
print(f'Number of processed pseudogenes = {proc_count}')
print(f'Number of unitary pseudogenes = {unitar_count}')
# "Other" = every pseudogene gene not falling in the three families above.
print(f'Number of Other pseudogenes = {psdg_counts.sum() - (unproc_count + proc_count + unitar_count)}')
# Share of unprocessed + processed among all pseudogene genes, as a percentage.
print(f'Persent of unprocessed and processed pseudogenes (including others) = {((unproc_count + proc_count) / psdg_counts.sum()) * 100}')
# Total number of pseudogene transcripts (sum over all biotypes).
print(f"Number of pseudogenes transcripts {psdgtrx_counts.sum()}")

Numbers of protein-coding genes = 23217
Number of unprocessed pseudogenes = 4604
Number of processed pseudogenes = 11470
Number of unitary pseudogenes = 267
Number of Other pseudogenes = 920
Persent of unprocessed and processed pseudogenes (including others) = 93.12322576907479
Number of pseudogenes transcripts 18852


In [30]:
# Distinct genes having at least one transcript typed "protein_coding_LoF".
f'Number of polimorphic pseudogenes {len(set(MART110_df.loc[MART110_df["Transcript type"].eq("protein_coding_LoF"), "Gene stable ID"]))}'

'Number of polimorphic pseudogenes 85'

In [34]:
# Distinct transcript IDs whose transcript type is "protein_coding".
f'Number of protein coding transcripts = {len(set(MART110_df.loc[MART110_df["Transcript type"].eq("protein_coding"), "Transcript stable ID"]))}'

'Number of protein coding transcripts = 99312'

In [164]:
# Ratio of distinct protein-coding transcripts to protein-coding genes.
# Both terms are derived from the loaded annotation instead of being copied by hand
# from earlier outputs, so the ratio stays correct if the annotation file changes.
# `protcod_count` is the protein-coding gene count computed in the Introduction cells.
protcod_trx_count = len(set(MART110_df.loc[MART110_df["Transcript type"]=="protein_coding","Transcript stable ID"]))
f"Protein-coding genes: Fold change of sequence diversity = {protcod_trx_count/protcod_count}"

'Protein-coding genes: Fold change of sequence diversity = 4.277555239695051'

In [132]:
# Entry count copied by hand from the UniProt human reference proteome page:
# https://www.uniprot.org/proteomes/UP000005640 (release 2024_01)
"Number of UniProt entries = 82,485"

'Number of UniProt entries = 82,485'

In [166]:
# Ratio of protein-coding transcripts to UniProt entries.
# Both numbers are hard-coded: 99312 is the protein-coding transcript count printed
# earlier in this section, 82485 is the UniProt entry count quoted just above.
f"Proteins: Fold change of sequence diversity = {99312/82485}"

'Proteins: Fold change of sequence diversity = 1.2040007274049827'

##  Evidence of the coding nature of some pseudogenes 

Expression of POU5F1 and NANOG pseudogenes, Figure 3

In [None]:
#See file "gtex_analysis.R"

## Challenges to assess pseudogenes’ coding nature 

### The burden of an inept definition for a doppelganger

Pseudogene identity with parental gene and protein

In [None]:
#See file "psdg_pg_ident.ipynb"

Number of loci impossible to detect unambiguously by RNA-seq, multi-mapping reads and regions of pseudogenes

In [None]:
#See folder "find_multimap_sites"

### Co-elution of homologous peptides

Number of pseudogenic peptides that are likely to co-elute

In [None]:
#See file "co_elution.ipynb"

### Short sequences

Figure 5

In [None]:
#See file "figure5.ipynb"

## Proteomics offer an unmatched solution to solve the pseudogene conundrum 

### Large-scale repositories reanalysis

Evidence of translated pseudogenes in OpenProt

In [4]:
#These files were generated through SQL request to OpenProt
# Only the first sheet (sheet_name=0) of each workbook is read.
# MS3up1 / MS2p presumably stand for "1 unique peptide in >=3 MS datasets" and
# ">=2 peptides", matching the sentence built later in this section - confirm.
MS3up1_df=pd.read_excel("OP_alt_MS3_Upep1_r79.xlsx",sheet_name=0)
MS2p_df=pd.read_excel("OP_alt_MS2pep_r79.xlsx",sheet_name=0)

In [5]:
# Pseudogene annotation table used to match OpenProt transcripts
# (described further down as Ensembl 95, compatible with OpenProt 1.6).
MART95_df_psdg=pd.read_pickle("./MART95_df_psdg.pkl")

In [6]:
# Keep only the OpenProt rows whose transcript ID is annotated as a pseudogene transcript.
MS3up1_psdgtrx = MS3up1_df.loc[
    MS3up1_df["Transcript_id"].isin(MART95_df_psdg['Transcript stable ID'])
]
MS2p_psdgtrx = MS2p_df.loc[
    MS2p_df["Transcript_id"].isin(MART95_df_psdg['Transcript stable ID'])
]

In [7]:
# Collapse each selection to its set of distinct transcript IDs.
MS3up1_psdgUtrx = set(MS3up1_psdgtrx["Transcript_id"])
MS2p_psdgUtrx = set(MS2p_psdgtrx["Transcript_id"])

In [8]:
#Extract them as genes
# Build the transcript -> gene lookup once instead of scanning the whole annotation
# table for every transcript. drop_duplicates keeps the first row per transcript,
# which is the row the previous `.values[0]` lookup returned.
_psdg_trx_unique = MART95_df_psdg.drop_duplicates(subset=["Transcript stable ID"])
_trx_to_gene = dict(zip(_psdg_trx_unique["Transcript stable ID"], _psdg_trx_unique["Gene stable ID"]))
# One gene ID per unique transcript (a gene appears once per detected transcript).
MS3up1_psdgU = [_trx_to_gene[psdgUtrx] for psdgUtrx in MS3up1_psdgUtrx]
MS2p_psdgU = [_trx_to_gene[psdgUtrx] for psdgUtrx in MS2p_psdgUtrx]
#Calculate number of each
# Number of distinct pseudogene genes in each evidence class.
MS3up1_psdgUcount = len(set(MS3up1_psdgU))
MS2p_psdgUcount = len(set(MS2p_psdgU))

In [9]:
# Cache both gene lists on disk; the preload cell at the top reads these files back.
for pkl_path, gene_ids in (
    ('./MS3up1_psdgU.pkl', MS3up1_psdgU),
    ('./MS2p_psdgU.pkl', MS2p_psdgU),
):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(gene_ids, handle)

In [217]:
# Sentence for the manuscript built from the two gene-level counts.
# NOTE(review): the total is the plain sum of both counts; a gene present in both
# lists would be counted twice - confirm the two gene sets are disjoint.
f"OpenProt reports hundreds ({(MS3up1_psdgUcount+MS2p_psdgUcount)}) of pseudogenes as translated. It reports {MS2p_psdgUcount} pseudogenes detected with at least 2 unique peptides, and {MS3up1_psdgUcount} pseudogenes detected with a single unique peptide in at least 3 independent datasets, after the reanalysis of 102 mass spectrometry datasets in humans. "

'OpenProt reports hundreds (1546) of pseudogenes as translated. It reports 1356 pseudogenes detected with at least 2 unique peptides, and 190 pseudogenes detected with a single unique peptide in at least 3 independent datasets, after the reanalysis of 102 mass spectrometry datasets in humans. '

### Proteogenomics approaches

OpenCustomDB identification of pseudogenes

In [47]:
#Supplementary Data 1 from https://doi.org/10.1021/acs.jproteome.3c00054
# Path is relative to the notebook's working directory.
OpenCustomDB_Suppl1="pr3c00054_si_002.xlsx"

In [48]:
# Read the third sheet of the workbook (sheet_name is a 0-based position).
OS1_df=pd.read_excel(OpenCustomDB_Suppl1,sheet_name=2)

In [49]:
# Gene names reported in the supplementary sheet, as a plain list (duplicates kept).
OS1_genes = OS1_df["Gene"].tolist()

In [50]:
# Annotation table for Ensembl release 110, re-read here so this section can be
# followed on its own.
MART110_df=pd.read_pickle("./MART_df.pkl") #Ensembl 110

In [51]:
# Look up the gene type of every reported gene name, one row per gene name.
OS1_genetype = (
    MART110_df[MART110_df["Gene name"].isin(OS1_genes)]
    .drop_duplicates(subset=["Gene name"])["Gene type"]
)

In [52]:
# Number of reported genes per gene type. Gene names absent from the annotation
# table are not represented in this tally.
pd.DataFrame(OS1_genetype).value_counts()

Gene type                         
protein_coding                        24
processed_pseudogene                   1
transcribed_unprocessed_pseudogene     1
unprocessed_pseudogene                 1
dtype: int64

### Noncanonical ORF repositories

OpenProt length limit to identify short (<30 aa) pseudogenic ORFs

In [53]:
# Inputs for this section: the pseudogene annotation and an OpenProt 1.6 table.
# Judging by its name the OpenProt table holds alternative proteins without an
# MS-evidence filter, restricted to pseudogenes - TODO confirm.
MART95_df_psdg=pd.read_pickle("./MART95_df_psdg.pkl") #Ensembl 95 compatible with OpenProt version 1.6
OP_df_altprot_ms0_psdg=pd.read_pickle("./OP16_df_altprot_ms0_psdg.pkl")

In [54]:
# Every distinct pseudogene gene name in the annotation.
MART95all_psdg = set(MART95_df_psdg["Gene name"])

In [55]:
# Strip the version suffix from each transcript accession (text before the first ".")
# and store it in a new "trxstableid" column so it can be matched to annotation IDs.
OP_df_altprot_ms0_psdg["trxstableid"] = [
    accession.partition(".")[0]
    for accession in OP_df_altprot_ms0_psdg['transcript accession']
]

In [56]:
# Gene symbols of the OpenProt entries whose transcript is an annotated pseudogene transcript.
OPincluded_psdg = set(
    OP_df_altprot_ms0_psdg.loc[
        OP_df_altprot_ms0_psdg["trxstableid"].isin(MART95_df_psdg['Transcript stable ID']),
        "gene symbol",
    ]
)

In [57]:
# Pseudogenes absent from OpenProt = annotated pseudogene names minus those found in OpenProt.
# NOTE(review): this subtracts set sizes, which assumes every OpenProt "gene symbol"
# also appears among the annotation "Gene name" values - confirm the naming matches.
f"Note that {len(MART95all_psdg)-len(OPincluded_psdg)} pseudogenes do not harbor an ORF longer than 30 codons and are therefore excluded from OpenProt analyses."

'Note that 4066 pseudogenes do not harbor an ORF longer than 30 codons and are therefore excluded from OpenProt analyses.'

### Enrichment techniques

Percent of pseudogenes with ORF length > 100 aa

In [None]:
# Re-read the OpenProt table from disk; this discards columns added in memory by
# earlier cells (e.g. "trxstableid").
OP_df_altprot_ms0_psdg=pd.read_pickle("./OP16_df_altprot_ms0_psdg.pkl")

In [263]:
# Entries whose protein is strictly longer than 100 amino acids.
OP_df_altprot_ms0_psdg_len100plus = OP_df_altprot_ms0_psdg.loc[
    OP_df_altprot_ms0_psdg['protein length (a.a.)'].gt(100)
]

In [271]:
# Percentage of gene symbols in the OpenProt table that have at least one entry above 100 a.a.
psdg_len100plus_perc = (
    len(set(OP_df_altprot_ms0_psdg_len100plus['gene symbol']))
    / len(set(OP_df_altprot_ms0_psdg['gene symbol']))
    * 100
)

In [272]:
# Sentence for the manuscript; the percentage is rounded to two decimals.
# The denominator is the set of gene symbols present in the OpenProt table,
# not all annotated pseudogenes.
f"One should note that {round(psdg_len100plus_perc,2)}% of human pseudogenes have at least one ORF longer than 100 codons, a common size threshold in the aforementioned studies."

'One should note that 36.16% of human pseudogenes have at least one ORF longer than 100 codons, a common size threshold in the aforementioned studies.'

Number of pseudogenes detected by MicroID (Supplementary Data 1) was calculated by hand

The list of selected pseudogenes:

Nucleus (Fib): NDUFB4P12, RPL18P13 (2), EEF1A1P7, PGK1P2, NAP1L4P1;

Chromatin (H2B): EEF1A1P7, NAP1L4P1, ASS1P1, HNRNPLP2;

Nuclear envelope (Lamin): - ;

Whole nucleus (NLS): RPS7P4, ASS1P1, RPL7AP5, YWHAEP1;

### Immunopeptidomics

Number of pseudogenic peptides identified in Chong et al. study

In [1]:
#https://doi.org/10.1038%2Fs41467-020-14968-9
#We filtered "Supplementary Data 5", excluding "TE" and "ribo" in column "HLAp category" , leaving in "IsDecoy" column only "False" values, 
#and non-empty cells in column "ORF_category".
#The "Sequence" column of the remaining rows (file "Chong_et_al_filtered") was checked for uniqueness in neXtProt (https://www.nextprot.org/tools/peptide-uniqueness-checker).
#Resulting output depicts peptides detected in previous studies ("Chong_et_al_non-unique").
#They will be removed from filtered data and remaining peptides will be checked if they are assigned to pseudogene transcript or not.

In [2]:
# Input files prepared by hand as described in the comments of the previous cell:
# the filtered supplementary table and the peptides flagged as non-unique.
Chongfilt_file="Chong_et_al_filtered.txt"
ChongNonU_file="Chong_et_al_non-unique.txt"

In [17]:
#Load as a data frame (tab-separated, first line is the header)
Chongfilt_df=pd.read_csv(Chongfilt_file,sep="\t")
#Load as a list: the file has no header row and only its first column is kept
ChongNonU_list=list(pd.read_csv(ChongNonU_file,sep="\t",header=None)[0])

In [20]:
# Drop every row whose peptide sequence appears in the non-unique list.
Chongfilt_dffilt = Chongfilt_df.loc[~Chongfilt_df['Sequence'].isin(ChongNonU_list)]

In [28]:
#Transform a data frame to a dictionary, where key is a peptide and value is a list of transcripts
# Single pass over the rows instead of re-filtering the whole frame for every row.
# Keys keep their first-appearance order and each list keeps row order.
Chong_dict={}
for key, trx in zip(Chongfilt_dffilt["Sequence"], Chongfilt_dffilt["Transcript_ID"]):
    # Keep the first 15 characters of the transcript ID
    # (presumably the stable ID without its version suffix - confirm).
    Chong_dict.setdefault(key, []).append(trx[0:15])

In [30]:
#Load Ensembl 110 annotation of pseudogenes
MART110_df_psdg=pd.read_pickle("./MART_df_psdg.pkl") 

In [33]:
#Count peptides from pseudogenes
# Build the set of pseudogene transcript IDs once; the previous version rebuilt a
# list and scanned it linearly for every transcript.
_psdg_trx_ids = set(MART110_df_psdg["Transcript stable ID"])
# NOTE(review): this counts every (peptide, transcript) hit, so a peptide mapped to
# several pseudogene transcripts contributes more than once - confirm this is intended.
Chong_psdg_peptide = sum(
    trx in _psdg_trx_ids
    for trx_list in Chong_dict.values()
    for trx in trx_list
)

In [35]:
# Sentence for the manuscript. The total of 369 peptides is hard-coded;
# only the pseudogene count is computed in this notebook.
f"Chong et al. identified 369  peptides from noncanonical proteins in the immunopeptidome of human melanoma cell lines, including {Chong_psdg_peptide} from pseudogenes."

'Chong et al. identified 369  peptides from noncanonical proteins in the immunopeptidome of human melanoma cell lines, including 89 from pseudogenes.'