In [395]:
import zipfile
import numpy as np
import pandas as pd
import gzip
import shutil
from Bio import SeqIO
import os
import pickle
from itertools import chain
from scipy import stats
import re
import subprocess
from Bio.Blast import NCBIXML

Note, to repeat the analysis you will need to install standalone BLAST+ (https://www.ncbi.nlm.nih.gov/books/NBK52640/)

In [None]:
#Load pickles that were generated in other files 
with open('./PSICUBE_dict.pickle', 'rb') as handle:
    PSICUBE_dict = pickle.load(handle)
OP16_df_altprot_ms0_psdg_psicube=pd.read_pickle("OP16_df_altprot_ms0_psdg_psicube.pkl")
OP16_df_refprot_ms0_psicube=pd.read_pickle("OP16_df_refprot_ms0_psicube.pkl")
with open(f'./OP16fasta_orig.pickle','rb') as handle:
    OPfasta_orig= pickle.load(handle)

In [4]:
#Load pickles that were generated below
with open('./MART95fna_orig.pickle', 'rb') as handle:
    MART95fna_orig=pickle.load(handle)
with open(f'./OP16fasta_psdgpsicube_allprotacc.pickle','rb') as handle:
    OPfasta_psdgpsicube_allprotacc=pickle.load(handle)
with open(f'./OP16fasta_pgpsicube_allprotacc.pickle','rb') as handle:
    OPfasta_pgpsicube_allprotacc=pickle.load(handle)
with open("./ProtAcc_dict_dkey.pkl","rb") as f:
    ProtAcc_dict_dkey=pickle.load(f)
with open(f'./OP16psi_pepseqalignd_ms2p_blast.pickle','rb') as handle:
    OPpsi_pepseqalignd_ms2p_blast=pickle.load(handle)  
with open(f'./PSIms2p_unproc_seq.pickle','rb') as handle:
    PSIms2p_unproc_seq=pickle.load(handle)
with open(f'./MART95fasta_psims2_orig.pickle','rb') as handle:
    MART95fasta_psims2_orig=pickle.load(handle)
with open(f'./PSIms2p_proc_seq.pickle','rb') as handle:
    PSIms2p_proc_seq=pickle.load(handle)
with open(f'./PSIms2p_proc_align.pickle','rb') as handle:
    PSIms2p_proc_align= pickle.load(handle)
with open(f'./PSIms2p_unproc_align.pickle','rb') as handle:
    PSIms2p_unproc_align= pickle.load(handle)

The following code calculates identity percent between (1) pseudogenic proteins and proteins from parental genes; (2) pseudogenes and parental genes. For the proteins, first all predicted pseudogenic ORFs per transcript were considered. Then only thouse with lower identity percent were filtered out. For the genes, pseudogenes were classified by nature of origin (processed and unprocessed). Processed pseudogenes were compared to cDNA of parental genes transcript. Unprocessed pseudogenes were compared to the genomic sequence of parental gene. Aligment was done with blastp (for proteins) and blastn (for genes).

Pairs are defined by database psiCube. Only pseudogenes with detected unique peptides (with at least 2 unique peptides, and with a single unique peptide in at least 3 independent datasets) are concidered in the analysis.

In [69]:
#Import fasta file with genes sequences. Ensemble 95, compatible with OpenProt v1.6
MART95fna_file="./mart95fasta_export.txt"

In [70]:
#Unarchive
with gzip.open(f"{MART95fna_file}.gz", 'rb') as f_in:
    with open(MART95fna_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [72]:
#Parce to dictionary
input_file = open(MART95fna_file)
MART95fna_orig = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))

In [73]:
#with open('./MART95fna_orig.pickle', 'wb') as handle:
#    pickle.dump(MART95fna_orig, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [84]:
#Select pseudogenes transcript ID and protein coding genes ID
MART95fna_protcodgeneid=[key.split("|")[0] for key in MART95fna_orig.keys() if key.split("|")[3]=="protein_coding"]
MART95fna_psdgtrxid=[key.split("|")[1].split(";") for key in MART95fna_orig.keys() if "pseudogene" in key.split("|")[3]]
MART95fna_psdgtrxid=list(chain.from_iterable(MART95fna_psdgtrxid))

In [114]:
#Get protein coding transcript IDs
MART95fna_protcodtrxid=[key.split("|")[1].split(";") for key in MART95fna_orig.keys()  if key.split("|")[3]=="protein_coding"]
MART95fna_protcodtrxid=list(chain.from_iterable(MART95fna_protcodtrxid))

In [115]:
#Define pseudogene - psrental TRANSCRIPT pairs, filter out version incompatibilies and incompleat pairs.
pairs_trx=[(k.split("_")[0],k.split("_")[2]) for k in PSICUBE_dict.keys()]
PSIfilt95_pairstrx=pairs_trx.copy()
for pair in pairs_trx:
    if pair[0] not in MART95fna_psdgtrxid or pair[1] not in MART95fna_protcodtrxid:
        PSIfilt95_pairstrx.remove(pair)

In [603]:
len(PSIfilt95_pairstrx),len(pairs_trx) #around 1600 paires were filtered out

(8783, 10371)

In [77]:
#Dictionary that maps transcript IDs to gene IDs
MART95_acss={}
for key in MART95fna_orig.keys():
    gene=key.split("|")[0]
    trxlist=key.split("|")[1].split(";")
    for trx in trxlist:
        MART95_acss[trx]=gene

(1) calculate **protein** identity percent

In [278]:
#Load pseudogenes, that have detected at least 2 unique peptides, 
#and with a single unique peptide in at least 3 independent datasets.
#See "reported_values.ipynb"
with open(f'./MS3up1_psdgU.pkl','rb') as handle:
    MS3up1_psdgU= pickle.load(handle)
with open(f'./MS2p_psdgU.pkl','rb') as handle:
    MS2p_psdgU= pickle.load(handle)

In [287]:
#Merge two lists
MSall_psdgU=list(set(MS3up1_psdgU+MS2p_psdgU))

In [288]:
#Map transcript IDs
MSall_psdgUtrx=[trx for gene in MSall_psdgU for trx in MART95_acss.keys() if gene==MART95_acss[trx]] #all trx for each psdg

In [284]:
#Select pseudogenes transcripts from psiCube
psi_psdgtrx=[k.split("_")[0] for k in PSICUBE_dict.keys()]

In [289]:
#Filter out transcripts, that are not in psiCube
MSall_psdgUtrx_psi=[trx for trx in MSall_psdgUtrx if trx in psi_psdgtrx]

In [290]:
len(MSall_psdgUtrx_psi)

1279

In [604]:
#Load OpenProt .tsv data about pseudogenic predicted ORFs and reference (canonical) CDS.
#Both have transcripts that overlap psiCube pairs transcripts.
#See "co_elution.ipynb"
OP16_df_altprot_ms0_psdg_psicube=pd.read_pickle("OP16_df_altprot_ms0_psdg_psicube.pkl")
OP16_df_refprot_ms0_psicube=pd.read_pickle("OP16_df_refprot_ms0_psicube.pkl")
#Load OpenProt fasta file with proteins sequences
with open(f'./OP16fasta_orig.pickle','rb') as handle:
    OPfasta_orig= pickle.load(handle)

In [294]:
#Map gene IDs
OP16_df_refprot_ms0_psicube["genestableid"]=[MART95_acss[trx] for trx in list(OP16_df_refprot_ms0_psicube['trxstableid'])]
OP16_df_altprot_ms0_psdg_psicube["genestableid"]=[MART95_acss[trx] for trx in list(OP16_df_altprot_ms0_psdg_psicube['trxstableid'])]

In [307]:
#Select only detected pseudogenic proteins
OP16_df_altprot_ms0_psdg_psicube_MSalltrx=OP16_df_altprot_ms0_psdg_psicube.loc[OP16_df_altprot_ms0_psdg_psicube["trxstableid"].isin(MSall_psdgUtrx_psi),]

In [308]:
#Filter out ORFs with MS score less than 2
OP16_df_altprot_ms0_psdg_psicube_MSalltrx_2p=OP16_df_altprot_ms0_psdg_psicube_MSalltrx.loc[OP16_df_altprot_ms0_psdg_psicube_MSalltrx["MS score"]>=2,]

In [326]:
#Filter psiCube transcript pairs by presence in OpenProt and save protein accession numbers
PSIfilt95_pairstrx_protacc=[]
PSIfilt95_pairstrx_real=[]
#some pairs may be absent not only in version Ensemble 95 in general but also in Openprot, as gene/trx with ORF<30 aa are not included in it, or may not contain ORF at all
for pair in PSIfilt95_pairstrx:
    psdg_protacc=OP16_df_altprot_ms0_psdg_psicube_MSalltrx_2p.loc[OP16_df_altprot_ms0_psdg_psicube_MSalltrx_2p['trxstableid']==pair[0],'protein accession numbers'].values
    pg_protacc=OP16_df_refprot_ms0_psicube.loc[OP16_df_refprot_ms0_psicube['trxstableid']==pair[1],'protein accession numbers'].values
    if len(psdg_protacc)==0 or len(pg_protacc)==0: #as df previously was filterd on incomplete pairs, just skip them
        next
    else:
        PSIfilt95_pairstrx_protacc.append(tuple([psdg_protacc,pg_protacc])) #Each pair has a list of protein accetions to filter after
        PSIfilt95_pairstrx_real.append(pair)

In [322]:
len(PSIfilt95_pairstrx_protacc)

682

In [336]:
#creat all combinations of peptides to filter after by highest identity percent later
ProtAcc_dict={}
for peppair,trxpair in zip(PSIfilt95_pairstrx_protacc,PSIfilt95_pairstrx_real):
    peppsdglist=list(peppair[0])
    peppglist=list(peppair[1])
    for i in range(len(peppsdglist)):
        for j in range(len(peppglist)):
            ProtAcc_dict[(peppsdglist[i], peppglist[j])]=trxpair

In [337]:
len(ProtAcc_dict)

862

In [341]:
#Select only keys with pseudogenic proteins.
OP_psdgpsicube_allprotacc=[psdgprotacc for pair in PSIfilt95_pairstrx_protacc for psdgprotacc in pair[0]]
OPfasta_psdgpsicube_allprotacc={}
nokey=0
nokeylist=[]
for ip in OP_psdgpsicube_allprotacc:
    key=f"{ip}|TX=9606"
    if key in OPfasta_orig.keys():
        OPfasta_psdgpsicube_allprotacc[key]=OPfasta_orig[key]
    else:
        nokey+=1
        nokeylist.append(ip)
print(nokey,len(OPfasta_psdgpsicube_allprotacc.keys()),len(set(OP_psdgpsicube_allprotacc)))

0 856 856


In [351]:
#with open(f'./OP16fasta_psdgpsicube_allprotacc.pickle','wb') as handle:
    pickle.dump(OPfasta_psdgpsicube_allprotacc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [372]:
#Select only keys with canonical proteins.
OP_pgpsicube_allprotacc=[pgprotacc for pair in PSIfilt95_pairstrx_protacc for pgprotacc in pair[1]]
OPfasta_pgpsicube_allprotacc={}
nokey=0
nokeylist=[]
for ip in OP_pgpsicube_allprotacc:
    key=f"{ip}|TX=9606"
    if key in OPfasta_orig.keys():
        OPfasta_pgpsicube_allprotacc[key]=OPfasta_orig[key]
    else:
        nokey+=1
        nokeylist.append(ip)
print(nokey,len(OPfasta_pgpsicube_allprotacc.keys()),len(set(OP_pgpsicube_allprotacc))) 

679 6 457


In [373]:
ProtAcc_dict_dkey=ProtAcc_dict.copy() #Make a copy of dictionary to change absent keys to real one (those present in fasta)

In [374]:
#Some keys are absent because proteins have several protein accessions. 
#Other protein accessions are writen in "protein accession (others)" column. 
#Find the right accession and add it to dictionary.
stillnokey=0
for ip in nokeylist:
    otherips=list(OP16_df_refprot_ms0_psicube.loc[OP16_df_refprot_ms0_psicube["protein accession numbers"]==ip,"protein accession (others)"])[0].split(";")
    stillnokey+=1
    for ip2 in otherips:
        key=f"{ip2}|TX=9606"
        if key in OPfasta_orig.keys():
            OPfasta_pgpsicube_allprotacc[key]=OPfasta_orig[key]
            #substitute key
            ProtAcc_dict_dkey={ (tuple([pair[0],ip2]) if pair[1]==ip else pair):ProtAcc_dict_dkey[pair] for pair in ProtAcc_dict_dkey.keys() }
            stillnokey-=1
print(stillnokey,len(OPfasta_pgpsicube_allprotacc.keys()))

0 437


In [377]:
#with open(f'./OP16fasta_pgpsicube_allprotacc.pickle','wb') as handle:
    pickle.dump(OPfasta_pgpsicube_allprotacc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [376]:
#with open("./ProtAcc_dict_dkey.pkl","wb") as f:
    pickle.dump(ProtAcc_dict_dkey,f, protocol=pickle.HIGHEST_PROTOCOL)
    f.close

In [None]:
#Function that saves one sequence to fasta file
def several_seq_to_fasta(seqstr,filename,key): #input_flag: reference or query            
    with open(filename, "w") as file:
        header="".join([">",key,"\n"])
        seq="".join([seqstr,"\n"])
        file.writelines([header,seq])
    file.close()

In [None]:
#Do balstp
OPpsi_pepseqalignd_ms2p_blast={}
for pair in ProtAcc_dict_dkey.keys():
    print(pair,list(ProtAcc_dict_dkey.keys()).index(pair),len(ProtAcc_dict_dkey.keys()))
    pep_psdgkey=f"{pair[0]}|TX=9606"
    pep_pgkey=f"{pair[1]}|TX=9606"
    pep_psdg_seq=str(OPfasta_psdgpsicube_allprotacc[pep_psdgkey].seq)
    pep_pg_seq=str(OPfasta_pgpsicube_allprotacc[pep_pgkey].seq)
    blast_queryfile=f"{pair[0]}_query.fna"
    several_seq_to_fasta(pep_psdg_seq,blast_queryfile,pair[0])
    blast_subjectfile=f"{pair[1]}_subject.fna"
    several_seq_to_fasta(pep_pg_seq,blast_subjectfile,pair[1])
    blast_outfile=f"{pair[0]}_{pair[1]}_results.xml"
    subprocess.run(f"blastp -query {blast_queryfile} -subject {blast_subjectfile} -out {blast_outfile} -word_size 4 -outfmt 5", shell=True)
    os.remove(blast_queryfile)
    os.remove(blast_subjectfile)
    OPpsi_pepseqalignd_ms2p_blast[pair]={}
    for record in NCBIXML.parse(open(blast_outfile)): 
        if record.alignments: 
            for align in record.alignments: 
                for hsp in align.hsps:
                    OPpsi_pepseqalignd_ms2p_blast[pair]["match"]=hsp.match.replace(" ","-")
                    OPpsi_pepseqalignd_ms2p_blast[pair]["score"]=hsp.score
                    OPpsi_pepseqalignd_ms2p_blast[pair]["expect"]=hsp.expect
                    OPpsi_pepseqalignd_ms2p_blast[pair]["gaps"]=hsp.gaps
    os.remove(blast_outfile)

In [None]:
#with open(f'./OP16psi_pepseqalignd_ms2p_blast.pickle','wb') as handle:
    pickle.dump(OPpsi_pepseqalignd_ms2p_blast, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [477]:
#Calculate identity fraction and how many comparisons did not give any aligment
nohit=0
for key in OPpsi_pepseqalignd_ms2p_blast.keys():
    if len(OPpsi_pepseqalignd_ms2p_blast[key])==0:
        nohit+=1
        OPpsi_pepseqalignd_ms2p_blast[key]["ident"]=0#np.nan
    else:
        seq=OPpsi_pepseqalignd_ms2p_blast[key]["match"]
        OPpsi_pepseqalignd_ms2p_blast[key]["ident"]=len(re.findall('[A-Z]', seq))/len(seq)

In [478]:
nohit #That may comparisins did not give any aligment

27

In [479]:
#Select one pseudogenic ORFs per transcript with highest identity fraction 
OPpsi_pepseqalignd_ms2p_blast_maxident={}
for trxpair in PSIfilt95_pairstrx_real:
    peppairs_list=[]
    for peppairs in ProtAcc_dict_dkey.keys():
        if ProtAcc_dict_dkey[peppairs]==trxpair:
            peppairs_list.append(peppairs)
    ident_list=[]
    for peppairs in peppairs_list:
        ident_list.append(OPpsi_pepseqalignd_ms2p_blast[peppairs]["ident"])
    peppairs_maxident=peppairs_list[ident_list.index(np.max(ident_list))]
    OPpsi_pepseqalignd_ms2p_blast_maxident[peppairs_maxident]=OPpsi_pepseqalignd_ms2p_blast[peppairs_maxident]

In [487]:
values=[OPpsi_pepseqalignd_ms2p_blast_maxident[key]["ident"] for key in OPpsi_pepseqalignd_ms2p_blast_maxident.keys()]

In [488]:
np.mean(values)

0.5433849591266984

In [605]:
values.count(0)

18

In [489]:
values2=values.copy() #Remove comparisons with no aligment
values2.remove(0)

In [608]:
np.mean(values2)*100 #everage identity psercent

54.41828812399536

(2) calculate **genes** identity percent

In [501]:
#Map gene types to filtered psiCube pairs present in Ensemble 95 and OpenProt
PSIfilt95_real_pairpsdggene=[tuple([MART95_acss[pair[0]],pair[1]]) for pair in PSIfilt95_pairstrx_real]
PSIfilt95_real_type={pair:key.split("|")[3] for key in MART95fna_orig.keys() for pair in PSIfilt95_real_pairpsdggene if key.split("|")[0]==pair[0]}

In [606]:
#Sort processed and unprocessed parental genes
PSIfilt95_real_unproc=[key for key in PSIfilt95_real_type if "unprocessed" in PSIfilt95_real_type[key]]
PSIfilt95_real_proc=[pair for pair in PSIfilt95_real_type if pair not in PSIfilt95_real_unproc]
print(len(PSIfilt95_real_proc),len(PSIfilt95_real_unproc),len(PSIfilt95_real_type))

630 52 682


In [583]:
#Save genes sequences for unprocessed pseudogenes
PSIms2p_unproc_seq={}
for pair in PSIfilt95_real_unproc:
    psdg_gene=pair[0]
    pg_gene=MART95_acss[pair[1]]
    key_psdg=[key for key in MART95fna_orig.keys() if key.split("|")[0]==psdg_gene][0]
    key_pg=[key for key in MART95fna_orig.keys() if key.split("|")[0]==pg_gene][0]
    PSIms2p_unproc_seq[pair]={}
    PSIms2p_unproc_seq[pair]["psdg_seq"]=str(MART95fna_orig[key_psdg].seq)
    PSIms2p_unproc_seq[pair]["pg_seq"]=str(MART95fna_orig[key_pg].seq)

In [585]:
#with open(f'./PSIms2p_unproc_seq.pickle','wb') as handle:
    pickle.dump(PSIms2p_unproc_seq, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [520]:
#get psiCube genes of processed pseudogenes
PSIfilt95_real_procgene=[tuple([pair[0],MART95_acss[pair[1]]]) for pair in PSIfilt95_real_proc]
#get a flat list of accessions for serch in BioMart
geneacc=list(chain.from_iterable([[pair[0]]+[pair[1]] for pair in PSIfilt95_real_procgene]))
#get psiCube transcripts of processed pseudogenes
PSIfilt95_real_proctrx=[pair for pair,key in zip(PSIfilt95_pairstrx_real,PSIfilt95_real_type) if key not in PSIfilt95_real_unproc]
trxacc=list(chain.from_iterable([[pair[0]]+[pair[1]] for pair in PSIfilt95_real_proctrx]))

In [568]:
#Run this loop and copy output to Gene ID filter in BioMart (Ensembl version 95)
#for i in set(geneacc):
#    print(i)

In [524]:
#parce MART 95 with genes accesseions only from PSIfilt95_real_procgene
MART95fasta_psims2_file="mart95fasta_psims2_export.txt"
#Unarchive
with gzip.open(f"{MART95fasta_psims2_file}.gz", 'rb') as f_in:
    with open(MART95fasta_psims2_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
#Parce to dictionary
input_file = open(MART95fasta_psims2_file)
MART95fasta_psims2_orig = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))

In [609]:
#with open(f'./MART95fasta_psims2_orig.pickle','wb') as handle:
    pickle.dump(MART95fasta_psims2_orig, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [566]:
len(set(geneacc)),len(set([key.split("|")[0] for key in MART95fasta_psims2_real.keys()])) #for some reason not all genes were taken from BioMart

(998, 930)

In [527]:
#Select transcripts from pairs
MART95fasta_psims2_real={}
for key in MART95fasta_psims2_orig:
    if key.split("|")[1] in set(trxacc):
        MART95fasta_psims2_real[key]=MART95fasta_psims2_orig[key]

In [528]:
len(MART95fasta_psims2_real),len(MART95fasta_psims2_orig)

(972, 4485)

In [580]:
#Save pairs transcripts sequences
no=[]
PSIms2p_proc_seq={}
for pair in PSIfilt95_real_proctrx:
    psdg_trx=pair[0]
    pg_trx=pair[1]
    key_psdg=[key for key in MART95fasta_psims2_real.keys() if key.split("|")[1]==psdg_trx]
    key_pg=[key for key in MART95fasta_psims2_real.keys() if key.split("|")[1]==pg_trx]
    if len(key_psdg)==0 or len(key_pg)==0: #for genes that were not taken from BioMart
        no.append(pair)
        next
    else:
        PSIms2p_proc_seq[pair]={}
        PSIms2p_proc_seq[pair]["psdg_seq"]=str(MART95fasta_psims2_real[key_psdg[0]].seq)
        PSIms2p_proc_seq[pair]["pg_seq"]=str(MART95fasta_psims2_real[key_pg[0]].seq)

In [582]:
#with open(f'./PSIms2p_proc_seq.pickle','wb') as handle:
    pickle.dump(PSIms2p_proc_seq, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [None]:
#Function to perform blastn
def blastn_pairs(inputdict):
    outputdict=inputdict.copy()
    for pair in inputdict.keys():
        print(pair,list(inputdict.keys()).index(pair),len(inputdict.keys()))
        psdg_seq=inputdict[pair]["psdg_seq"]
        pg_seq=inputdict[pair]["pg_seq"]
        blast_queryfile=f"{pair[0]}_query.fna"
        several_seq_to_fasta(psdg_seq,blast_queryfile,pair[0])
        blast_subjectfile=f"{pair[1]}_subject.fna"
        several_seq_to_fasta(pg_seq,blast_subjectfile,pair[1])
        blast_outfile=f"{pair[0]}_{pair[1]}_results.xml"
        subprocess.run(f"blastn -query {blast_queryfile} -subject {blast_subjectfile} -out {blast_outfile} -word_size 15 -dust no -outfmt 5", shell=True)
        os.remove(blast_queryfile)
        os.remove(blast_subjectfile)
        for record in NCBIXML.parse(open(blast_outfile)): 
            if record.alignments:
                for align in record.alignments:
                    for hsp in align.hsps:
                        outputdict[pair]["match"]=hsp.match.replace(" ","-")
                        outputdict[pair]["score"]=hsp.score
                        outputdict[pair]["expect"]=hsp.expect
                        outputdict[pair]["gaps"]=hsp.gaps
        os.remove(blast_outfile)
    return outputdict

In [None]:
#Run blastn on processed pseudogenes pairs
PSIms2p_proc_align=blastn_pairs(PSIms2p_proc_seq)

In [None]:
#with open(f'./PSIms2p_proc_align.pickle','wb') as handle:
    pickle.dump(PSIms2p_proc_align, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [None]:
#Run blastn on unprocessed pseudogenes pairs
PSIms2p_unproc_align=blastn_pairs(PSIms2p_unproc_seq)

In [None]:
#with open(f'./PSIms2p_unproc_align.pickle','wb') as handle:
    pickle.dump(PSIms2p_unproc_align, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

In [594]:
#Calculate identity fraction for processed pseudogene pairs
noproc=0
proc_ident=[]
for key in PSIms2p_proc_align:
    if "match" not in PSIms2p_proc_align[key].keys():
        noproc+=1
    else:
        seq=PSIms2p_proc_align[key]["match"]
        proc_ident.append((seq.count("|")/len(seq)))
        PSIms2p_proc_align[key]["ident"]=(seq.count("|")/len(seq))

In [595]:
len(proc_ident),noproc #15 sequences didn't have aligment

(570, 15)

In [596]:
#Calculate identity fraction for unprocessed pseudogene pairs
nounproc=0
unproc_ident=[]
for key in PSIms2p_unproc_align:
    if "match" not in PSIms2p_unproc_align[key].keys():
        nounproc+=1
    else:
        seq=PSIms2p_unproc_align[key]["match"]
        unproc_ident.append((seq.count("|")/len(seq)))
        PSIms2p_unproc_align[key]["ident"]=(seq.count("|")/len(seq))

In [598]:
len(unproc_ident),nounproc #all sequenses had aligment

(52, 0)

In [607]:
np.mean(proc_ident+unproc_ident)*100 #get combined everage identity percent

91.2580217327255