In [4]:
import zipfile
import numpy as np
import pandas as pd
import gzip
import shutil
from Bio import SeqIO
import matplotlib.pyplot as plt
import os
from os import listdir
from os.path import isfile, join
from matplotlib.pyplot import figure
import pickle

In [5]:
#Load all intermediate pickles (created in another notebook).
#Refer to "figure5.ipynb".

#Pseudogenic proteins (Alternative Proteins (AltProt)) from Openprot v1.6, not filtered.
OP_df_altprot_ms0_psdg=pd.read_pickle("./OP16_df_altprot_ms0_psdg.pkl")
#Canonical proteins (Reference Proteins (RefProt)) from Openprot v1.6, not filtered.
OP_df_refprot_ms0=pd.read_pickle("./OP16_df_refprot_ms0.pkl")

In [None]:
#Load all pickles created downstream (by the cells further down in this notebook),
#so the analysis can be resumed without recomputing them.
#NOTE: unpickling can execute arbitrary code; only load files produced by this project.

#psiCube dataset in dictionary, not filtered.
with open('PSICUBE_dict.pickle', 'rb') as handle:
    PSICUBE_dict = pickle.load(handle)
#Openprot pseudogenic proteins filtered by overlap with psiCube (by pseudogene transcript accession) and by longest transcript (according to Openprot).
#Note, if pseudogene transcript accession was present in Openprot, but corresponding parental gene transcript accession was absent (and vice versa), the pair was not included in .pkl.
OP_df_altprot_ms0_psdg_psicube_lngstorf=pd.read_pickle("./OP16_df_altprot_ms0_psdg_psicube_lngstorf.pkl")
#Openprot reference proteins filtered by overlap with psiCube (by parental gene transcript accession) and by longest transcript (according to Openprot).
#Note, if pseudogene transcript accession was present in Openprot, but corresponding parental gene transcript accession was absent (and vice versa), the pair was not included in .pkl.
OP_df_refprot_ms0_psicube_lngstorf=pd.read_pickle("./OP16_df_refprot_ms0_psicube_lngstorf.pkl")
#Filtered (above) pseudogenic proteins sequence (SeqIO dictionary).
with open('./OP16fasta_psdgpsicube_lngstorf.pickle','rb') as handle:
    OPfasta_psdgpsicube_lngstorf= pickle.load(handle)
#Filtered (above) reference proteins sequence (SeqIO dictionary).
with open('./OP16fasta_refprotpsicube_lngstorf.pickle','rb') as handle:
    OPfasta_refprotpsicube_lngstorf= pickle.load(handle)

Upload psiCube dataset

In [12]:
#psiCube pseudogene/parent table, read from a local copy in the working directory.
#Source URL of the file:
#http://www.pseudogene.org/psicube/data/gencode.v10.pgene.parents.txt
PSICUBE_orig=pd.read_table("./gencode.v10.pgene.parents.txt")

In [3]:
#Transform psiCube dataset to dictionary (one entry per row of PSICUBE_orig).
#Keys: PseudogeneTranscriptStableId_ParentalGeneStableId_ParentalGeneTranscriptStableId
#"Stable" id = the text before the first "." of the accession.
#Absent parent accessions are marked with the string 'NaN'.
PSICUBE_tmp=PSICUBE_orig.to_dict('index')
PSICUBE_dict={}
for key, row in PSICUBE_tmp.items():
    psdg=row['ID'].split('.')[0]
    #A missing value is not a string (presumably float NaN from pandas), so .split raises
    #AttributeError. Only that error is caught, so other problems (e.g. a missing
    #column -> KeyError) are not silently turned into 'NaN'.
    try:
        pg=row['Parent gene'].split('.')[0]
    except AttributeError:
        pg='NaN'
    try:
        ptrx=row['Parent transcript'].split('.')[0]
    except AttributeError:
        ptrx='NaN'
    newkey=f'{psdg}_{pg}_{ptrx}'
    PSICUBE_dict[newkey]=row

In [6]:
#with open(f'./PSICUBE_dict.pickle','wb') as handle:
#            pickle.dump(PSICUBE_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
#            handle.close()

Load the Openprot datasets with pseudogenes and reference proteins. See "figure5.ipynb" for how they were created.

In [6]:
#Pseudogenic proteins (AltProt) from Openprot v1.6, not filtered.
OP_df_altprot_ms0_psdg=pd.read_pickle("./OP16_df_altprot_ms0_psdg.pkl")

In [7]:
#Reference proteins (RefProt) from Openprot v1.6, not filtered.
OP_df_refprot_ms0=pd.read_pickle("./OP16_df_refprot_ms0.pkl")

Filter Openprot subsets with known psiCube parental gene (refprot) - pseudogene (psdg) pairs

In [29]:
#Split every psiCube dictionary key on "_" and keep field 0 (pseudogene transcript ID)
#and field 2 (parental transcript ID): once as (field 0, field 2) tuples, and once as two separate lists.
pairs_trx=[(fields[0],fields[2]) for fields in (psicube_key.split("_") for psicube_key in PSICUBE_dict)]
psdg_trx=[pair[0] for pair in pairs_trx]
pg_trx=[pair[1] for pair in pairs_trx]

In [14]:
#Add a "trxstableid" column to both Openprot subsets:
#the transcript accession truncated at its first "." (i.e. the stable accession).
for openprot_subset in (OP_df_altprot_ms0_psdg, OP_df_refprot_ms0):
    openprot_subset["trxstableid"]=[
        accession.split(".")[0] for accession in openprot_subset['transcript accession']
    ]

In [17]:
#Keep only the Openprot rows whose stable transcript accession is present in the psiCube dataset.
psdg_in_psicube=OP_df_altprot_ms0_psdg["trxstableid"].isin(psdg_trx)
OP_df_altprot_ms0_psdg_psicube=OP_df_altprot_ms0_psdg.loc[psdg_in_psicube]
refprot_in_psicube=OP_df_refprot_ms0["trxstableid"].isin(pg_trx)
OP_df_refprot_ms0_psicube=OP_df_refprot_ms0.loc[refprot_in_psicube]

In [28]:
#Number of distinct stable transcript accessions in each filtered subset (pseudogenes, parental genes).
#Different numbers indicate that there are more pseudogenes than parental genes.
len(set(OP_df_altprot_ms0_psdg_psicube["trxstableid"])),len(set(OP_df_refprot_ms0_psicube["trxstableid"])) 

(8189, 3910)

In [34]:
#Check if all selected pseudogenes have a parental gene (and vice versa) in the corresponding Openprot subsets.
#nopg_pair:   pseudogene transcript is present, its parental transcript is absent.
#nopsdg_pair: parental transcript is present, its pseudogene transcript is absent.
#incomplete_pairs collects every such (pseudogene, parent) pair.

#The lookup sets do not change inside the loop, so build them once.
psdg_present=set(OP_df_altprot_ms0_psdg_psicube["trxstableid"])
pg_present=set(OP_df_refprot_ms0_psicube["trxstableid"])

nopg_pair=0
nopsdg_pair=0
incomplete_pairs=[]
for pair in pairs_trx:
    has_psdg=pair[0] in psdg_present
    has_pg=pair[1] in pg_present
    if has_psdg and not has_pg:
        nopg_pair+=1
        incomplete_pairs.append(pair)
    elif has_pg and not has_psdg:
        nopsdg_pair+=1
        incomplete_pairs.append(pair)

print(nopg_pair,nopsdg_pair,len(incomplete_pairs))

814 1861 2675


In [37]:
#Delete these incomplete pairs from the Openprot subsets.
#Every transcript that takes part in an incomplete pair is removed from its subset:
#pseudogene transcripts from the AltProt subset, parental transcripts from the RefProt subset.
#Rows are filtered by value in one pass (instead of one DataFrame.drop per pair),
#so the result does not depend on the index labels being unique.
incomplete_psdg_trx={pair[0] for pair in incomplete_pairs}
incomplete_pg_trx={pair[1] for pair in incomplete_pairs}
OP_df_altprot_ms0_psdg_psicube=OP_df_altprot_ms0_psdg_psicube[
    ~OP_df_altprot_ms0_psdg_psicube["trxstableid"].isin(incomplete_psdg_trx)
]
OP_df_refprot_ms0_psicube=OP_df_refprot_ms0_psicube[
    ~OP_df_refprot_ms0_psicube["trxstableid"].isin(incomplete_pg_trx)
]

In [40]:
#Number of distinct stable transcript accessions in each subset at this point (pseudogenes, parental genes).
len(set(OP_df_altprot_ms0_psdg_psicube["trxstableid"])),len(set(OP_df_refprot_ms0_psicube["trxstableid"]))

(7375, 2802)

Select longest transcripts for Openprot subsets

In [41]:
#For pseudogenic proteins: keep one row per gene symbol, the one with the longest ORF
#(stop transcript coordinate - start transcript coordinate).
#Ties keep the row that comes first in the subset.
#Implemented as a stable sort by descending ORF length followed by drop_duplicates,
#instead of growing a DataFrame with pd.concat inside a loop (quadratic, and the
#row -> to_frame().transpose() round trip turned every column into object dtype).
#Original index labels are kept; rows end up ordered by descending ORF length.
psdg_orf_length=(
    OP_df_altprot_ms0_psdg_psicube['stop transcript coordinates']
    -OP_df_altprot_ms0_psdg_psicube['start transcript coordinates']
)
#Negating the lengths gives a descending order while kind="stable" preserves the original order of ties.
psdg_longest_first=np.argsort(-psdg_orf_length.to_numpy(),kind="stable")
OP_df_altprot_ms0_psdg_psicube_lngstorf=(
    OP_df_altprot_ms0_psdg_psicube
    .iloc[psdg_longest_first]
    .drop_duplicates(subset='gene symbol',keep='first')
)

In [42]:
#Number of distinct stable transcript accessions in the pseudogene subset.
#NOTE(review): this counts OP_df_altprot_ms0_psdg_psicube (before the longest-ORF selection),
#not OP_df_altprot_ms0_psdg_psicube_lngstorf — confirm which frame was meant to be checked here.
len(set(OP_df_altprot_ms0_psdg_psicube["trxstableid"]))

7375

In [43]:
#Save the longest-ORF pseudogenic protein subset to disk.
OP_df_altprot_ms0_psdg_psicube_lngstorf.to_pickle("./OP16_df_altprot_ms0_psdg_psicube_lngstorf.pkl")

In [44]:
#For reference proteins (transcribed from parental genes): keep one row per gene symbol,
#the one with the longest ORF (stop transcript coordinate - start transcript coordinate).
#Ties keep the row that comes first in the subset.
#Implemented as a stable sort by descending ORF length followed by drop_duplicates,
#instead of growing a DataFrame with pd.concat inside a loop (quadratic, and the
#row -> to_frame().transpose() round trip turned every column into object dtype).
#Original index labels are kept; rows end up ordered by descending ORF length.
refprot_orf_length=(
    OP_df_refprot_ms0_psicube['stop transcript coordinates']
    -OP_df_refprot_ms0_psicube['start transcript coordinates']
)
#Negating the lengths gives a descending order while kind="stable" preserves the original order of ties.
refprot_longest_first=np.argsort(-refprot_orf_length.to_numpy(),kind="stable")
OP_df_refprot_ms0_psicube_lngstorf=(
    OP_df_refprot_ms0_psicube
    .iloc[refprot_longest_first]
    .drop_duplicates(subset='gene symbol',keep='first')
)

In [46]:
#Save the longest-ORF reference protein subset to disk.
OP_df_refprot_ms0_psicube_lngstorf.to_pickle("./OP16_df_refprot_ms0_psicube_lngstorf.pkl")

Upload Openprot fasta file and select protein accession numbers based on filtered Openprot subsets.

In [76]:
#Name of the Openprot FASTA file (relative path, so it is looked up in the working directory).
OPfasta_file="human-openprot-r1_6-refprots+altprots+isoforms-+uniprot2019_03_01.fasta"

In [None]:
#Unarchive OP_file
#with zipfile.ZipFile(f'{OPfasta_file}.zip', 'r') as zip_ref:
#    zip_ref.extractall("./")

In [77]:
#Parse fasta to dictionary (record id -> SeqRecord).
#The with-block closes the file as soon as SeqIO.to_dict has consumed every record;
#the previous version left the handle open for the lifetime of the kernel.
with open(OPfasta_file) as input_file:
    OPfasta_orig = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))

In [78]:
#Collect the FASTA records of the selected pseudogenic proteins.
#Records are looked up under the key "<protein accession>|TX=9606";
#accessions without such a key are remembered in nokeylist (nokey = how many).
OP_psdgpsicube_lngstorf_protacc=list(set(OP_df_altprot_ms0_psdg_psicube_lngstorf['protein accession numbers']))
OPfasta_psdgpsicube_lngstorf={}
nokeylist=[]
for protacc in OP_psdgpsicube_lngstorf_protacc:
    fasta_key=f"{protacc}|TX=9606"
    if fasta_key not in OPfasta_orig:
        nokeylist.append(protacc)
        continue
    OPfasta_psdgpsicube_lngstorf[fasta_key]=OPfasta_orig[fasta_key]
nokey=len(nokeylist)
print(nokey,len(OPfasta_psdgpsicube_lngstorf))

4 7324


In [79]:
#Some keys are absent because proteins have several protein accessions.
#Other protein accessions are written in the "protein accession (others)" column (";"-separated).
#Try each alternative accession and add every one that exists in the FASTA dictionary.
#stillnokey counts the accessions for which NO alternative was found. It is updated once
#per accession, so several matching alternatives can no longer cancel out an unresolved one.
stillnokey=0
for ip in nokeylist:
    otherips=OP_df_altprot_ms0_psdg_psicube_lngstorf.loc[
        OP_df_altprot_ms0_psdg_psicube_lngstorf["protein accession numbers"]==ip,
        "protein accession (others)",
    ].iloc[0]
    #A non-string value (e.g. a missing entry) means there is no alternative accession to try.
    otherips=otherips.split(";") if isinstance(otherips,str) else []
    found=False
    for ip2 in otherips:
        key=f"{ip2}|TX=9606"
        if key in OPfasta_orig:
            OPfasta_psdgpsicube_lngstorf[key]=OPfasta_orig[key]
            found=True
    if not found:
        stillnokey+=1
print(stillnokey,len(OPfasta_psdgpsicube_lngstorf))

0 7328


In [80]:
#Save the selected pseudogenic protein records for later reuse.
#The with-block closes the file on exit, so no explicit handle.close() is needed.
with open('./OP16fasta_psdgpsicube_lngstorf.pickle','wb') as handle:
    pickle.dump(OPfasta_psdgpsicube_lngstorf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [81]:
#Collect the FASTA records of the selected reference proteins.
#Records are looked up under the key "<protein accession>|TX=9606";
#accessions without such a key are remembered in nokeylist (nokey = how many).
OP_refprotpsicube_lngstorf_protacc=list(set(OP_df_refprot_ms0_psicube_lngstorf['protein accession numbers']))
OPfasta_refprotpsicube_lngstorf={}
nokeylist=[]
for protacc in OP_refprotpsicube_lngstorf_protacc:
    fasta_key=f"{protacc}|TX=9606"
    if fasta_key not in OPfasta_orig:
        nokeylist.append(protacc)
        continue
    OPfasta_refprotpsicube_lngstorf[fasta_key]=OPfasta_orig[fasta_key]
nokey=len(nokeylist)
print(nokey,len(OPfasta_refprotpsicube_lngstorf))

2419 43


In [82]:
#Reference proteins whose primary accession is not a FASTA key: try the alternative
#accessions from the ";"-separated "protein accession (others)" column and add every
#one that exists in the FASTA dictionary.
#stillnokey counts the accessions for which NO alternative was found. It is updated once
#per accession, so several matching alternatives can no longer cancel out an unresolved one.
stillnokey=0
for ip in nokeylist:
    otherips=OP_df_refprot_ms0_psicube_lngstorf.loc[
        OP_df_refprot_ms0_psicube_lngstorf["protein accession numbers"]==ip,
        "protein accession (others)",
    ].iloc[0]
    #A non-string value (e.g. a missing entry) means there is no alternative accession to try.
    otherips=otherips.split(";") if isinstance(otherips,str) else []
    found=False
    for ip2 in otherips:
        key=f"{ip2}|TX=9606"
        if key in OPfasta_orig:
            OPfasta_refprotpsicube_lngstorf[key]=OPfasta_orig[key]
            found=True
    if not found:
        stillnokey+=1
print(stillnokey,len(OPfasta_refprotpsicube_lngstorf))

0 2461


In [83]:
#Save the selected reference protein records for later reuse.
#The with-block closes the file on exit, so no explicit handle.close() is needed.
with open('./OP16fasta_refprotpsicube_lngstorf.pickle','wb') as handle:
    pickle.dump(OPfasta_refprotpsicube_lngstorf, handle, protocol=pickle.HIGHEST_PROTOCOL)

Digest peptides

Calculate the % of pseudogenic peptides with 1 or 2 amino acid differences from the corresponding parental peptides (excluding non-unique peptides)