# Process protein data from uniprot
The UniProt ID mapping tool (https://www.uniprot.org/id-mapping) was previously used to map the protein IDs returned from proteomics to PP_/4-letter codes. This notebook parses that tool's output and saves a table for converting between the formats.

In [1]:
import edd_utils as eddu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import random
# Fix the stdlib RNG seed for reproducibility.
# NOTE(review): no cell visible in this notebook draws random numbers — confirm the seed is still needed.
random.seed(1)

Import uniprot results and dataframe with original protein labels

In [2]:
# UniProt results (tab-separated file).
uniprot_df = pd.read_csv('./data/uniprot_db_DBTL0_DBTL1_proteins.tsv', sep='\t')
# Original proteomics labels alongside the protein ID extracted from each label.
edd_df = pd.read_csv('./data/protein_id_conversion_df_init.csv')

In [3]:
# Preview the first three rows of the UniProt results.
uniprot_df.head(n=3)

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Gene Names (ordered locus),Gene Names (primary),Gene Names (synonym)
0,Q88P31,Q88P31,unreviewed,Q88P31_PSEPK,Glucose-6-phosphate 1-dehydrogenase (G6PD) (EC...,zwfA zwf PP_1022,Pseudomonas putida (strain ATCC 47054 / DSM 61...,489.0,PP_1022,zwfA,zwf
1,Q88C32,Q88C32,unreviewed,Q88C32_PSEPK,Glucose-6-phosphate 1-dehydrogenase (G6PD) (EC...,zwf PP_5351,Pseudomonas putida (strain ATCC 47054 / DSM 61...,485.0,PP_5351,zwf,
2,Q88RL1,Q88RL1,reviewed,ZNUC_PSEPK,Zinc import ATP-binding protein ZnuC (EC 7.2.2...,znuC PP_0118,Pseudomonas putida (strain ATCC 47054 / DSM 61...,257.0,PP_0118,znuC,


In [4]:
# Preview the first three original-label / extracted-ID pairs.
edd_df.head(n=3)

Unnamed: 0,original,extracted
0,P0AE22,P0AE22
1,sp|A9GAJ9|A9GAJ9_SORC5 Mcm,A9GAJ9
2,sp|K4JH65|K4JH65_9ACTN Gdnd,K4JH65


Rename uniprot df columns

In [5]:
# UniProt export header -> snake_case column name. Columns that are not listed
# here (e.g. 'From', 'Organism') keep their original names.
uniprot_col_rename = {
    'Entry Name': 'entry_name',
    'Protein names': 'protein_names',
    'Gene Names': 'gene_names',
    'Gene Names (ordered locus)': 'locus_name',
    'Gene Names (primary)': 'primary_name',
    'Gene Names (synonym)': 'synonym_name',
}
uniprot_df = uniprot_df.rename(columns=uniprot_col_rename)
uniprot_df

Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,primary_name,synonym_name
0,Q88P31,Q88P31,unreviewed,Q88P31_PSEPK,Glucose-6-phosphate 1-dehydrogenase (G6PD) (EC...,zwfA zwf PP_1022,Pseudomonas putida (strain ATCC 47054 / DSM 61...,489.0,PP_1022,zwfA,zwf
1,Q88C32,Q88C32,unreviewed,Q88C32_PSEPK,Glucose-6-phosphate 1-dehydrogenase (G6PD) (EC...,zwf PP_5351,Pseudomonas putida (strain ATCC 47054 / DSM 61...,485.0,PP_5351,zwf,
2,Q88RL1,Q88RL1,reviewed,ZNUC_PSEPK,Zinc import ATP-binding protein ZnuC (EC 7.2.2...,znuC PP_0118,Pseudomonas putida (strain ATCC 47054 / DSM 61...,257.0,PP_0118,znuC,
3,Q88F24,Q88F24,reviewed,ZIPA_PSEPK,Cell division protein ZipA,zipA PP_4275,Pseudomonas putida (strain ATCC 47054 / DSM 61...,297.0,PP_4275,zipA,
4,Q88E29,Q88E29,unreviewed,Q88E29_PSEPK,P-loop guanosine triphosphatase-dependent zinc...,zinU PP_4639,Pseudomonas putida (strain ATCC 47054 / DSM 61...,323.0,PP_4639,zinU,
...,...,...,...,...,...,...,...,...,...,...,...
2733,P04264,P04264,reviewed,K2C1_HUMAN,"Keratin, type II cytoskeletal 1 (67 kDa cytoke...",KRT1 KRTA,Homo sapiens (Human),644.0,,KRT1,KRTA
2734,Q835L3,Q835L3,unreviewed,Q835L3_ENTFA,Acetyl-CoA acetyltransferase/hydroxymethylglut...,EF_1364,Enterococcus faecalis (strain ATCC 700802 / V583),803.0,EF_1364,,
2735,Q88P68,Q88P68,,Q88P68_PSEPK,deleted,,,,,,
2736,P00761,P00761,reviewed,TRYP_PIG,Trypsin (EC 3.4.21.4),,Sus scrofa (Pig),231.0,,,


## Look at uniprot output

In [6]:
# Tally rows per source organism to see how many entries are not from P. putida.
# value_counts() skips NaN by default, so rows without an organism are not counted here.
uniprot_df['Organism'].value_counts()

Organism
Pseudomonas putida (strain ATCC 47054 / DSM 6125 / CFBP 8728 / NCIMB 11950 / KT2440)                               2712
Homo sapiens (Human)                                                                                                  6
Caulobacter vibrioides (strain ATCC 19089 / CB15) (Caulobacter crescentus)                                            1
Enterococcus faecalis (strain ATCC 700802 / V583)                                                                     1
Ovis aries (Sheep)                                                                                                    1
Gallus gallus (Chicken)                                                                                               1
Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)                                                 1
Hevea brasiliensis (Para rubber tree) (Siphonia brasiliensis)                                                         1
Pseudomonas aeruginosa         

Create flag for whether or not entry is from putida

In [7]:
# Exact organism label used for P. putida KT2440 in the UniProt results;
# the same string is compared against again when edd_df is annotated.
pp_string = 'Pseudomonas putida (strain ATCC 47054 / DSM 6125 / CFBP 8728 / NCIMB 11950 / KT2440)'
# Element-wise equality; rows whose Organism is missing compare unequal and are flagged False.
uniprot_df['is_putida'] = uniprot_df['Organism'].eq(pp_string)

In [8]:
# Rows that are NOT flagged as P. putida (all columns kept).
uniprot_df[~uniprot_df['is_putida']]

Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,primary_name,synonym_name,is_putida
88,Q9A9Z2,Q9A9Z2,reviewed,XYLD_CAUVC,D-xylonate dehydratase (XyDHT) (EC 4.2.1.82) (...,xylD CC_0819,Caulobacter vibrioides (strain ATCC 19089 / CB...,595.0,CC_0819,xylD,,False
552,Q3L885,Q3L885,unreviewed,Q3L885_MYCS2,Polyketide synthase (Type I modular polyketide...,pks MSMEG_0408,Mycolicibacterium smegmatis (strain ATCC 70008...,3652.0,MSMEG_0408,pks,,False
686,P00552,P00552,reviewed,KKA2_KLEPN,Aminoglycoside 3'-phosphotransferase (EC 2.7.1...,neo kan nptII,Klebsiella pneumoniae,264.0,,neo,kan nptII,False
690,Q7WTF4,Q7WTF4,unreviewed,Q7WTF4_9ACTN,NanA2,nanA2,Streptomyces nanchangensis,2223.0,,nanA2,,False
698,Q8PW39,Q8PW39,unreviewed,Q8PW39_METMA,Mevalonate kinase (MK) (MVK) (EC 2.7.1.36),mvk MM_1762,Methanosarcina mazei (strain ATCC BAA-159 / DS...,301.0,MM_1762,mvk,,False
699,Q9FD71,Q9FD71,reviewed,HMGCS_ENTFL,Hydroxymethylglutaryl-CoA synthase (HMG-CoA sy...,mvaS,Enterococcus faecalis (Streptococcus faecalis),383.0,,mvaS,,False
792,A9GAJ9,A9GAJ9,unreviewed,A9GAJ9_SORC5,Methylmalonyl-CoA mutase (EC 5.4.99.2),mcm sce2716,Sorangium cellulosum (strain So ce56) (Polyang...,882.0,sce2716,mcm,,False
832,Q30CS2,Q30CS2,unreviewed,Q30CS2_KITAU,LipPks1,lipPks1,Kitasatospora aureofaciens (Streptomyces aureo...,2259.0,,lipPks1,,False
1036,K4JH65,K4JH65,unreviewed,K4JH65_9ACTN,3 hydroxyacyl CoA dehydrogenase,gdnD,Streptomyces sp. K01-0509,289.0,,gdnD,,False
1338,Q99ZW2,Q99ZW2,reviewed,CAS9_STRP1,CRISPR-associated endonuclease Cas9/Csn1 (EC 3...,cas9 csn1 SPy_1046,Streptococcus pyogenes serotype M1,1368.0,SPy_1046,cas9,csn1,False


In [9]:
# Rows flagged as P. putida (all columns kept).
uniprot_df[uniprot_df['is_putida']]

Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,primary_name,synonym_name,is_putida
0,Q88P31,Q88P31,unreviewed,Q88P31_PSEPK,Glucose-6-phosphate 1-dehydrogenase (G6PD) (EC...,zwfA zwf PP_1022,Pseudomonas putida (strain ATCC 47054 / DSM 61...,489.0,PP_1022,zwfA,zwf,True
1,Q88C32,Q88C32,unreviewed,Q88C32_PSEPK,Glucose-6-phosphate 1-dehydrogenase (G6PD) (EC...,zwf PP_5351,Pseudomonas putida (strain ATCC 47054 / DSM 61...,485.0,PP_5351,zwf,,True
2,Q88RL1,Q88RL1,reviewed,ZNUC_PSEPK,Zinc import ATP-binding protein ZnuC (EC 7.2.2...,znuC PP_0118,Pseudomonas putida (strain ATCC 47054 / DSM 61...,257.0,PP_0118,znuC,,True
3,Q88F24,Q88F24,reviewed,ZIPA_PSEPK,Cell division protein ZipA,zipA PP_4275,Pseudomonas putida (strain ATCC 47054 / DSM 61...,297.0,PP_4275,zipA,,True
4,Q88E29,Q88E29,unreviewed,Q88E29_PSEPK,P-loop guanosine triphosphatase-dependent zinc...,zinU PP_4639,Pseudomonas putida (strain ATCC 47054 / DSM 61...,323.0,PP_4639,zinU,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2720,Q88RW0,Q88RW0,unreviewed,Q88RW0_PSEPK,Transcriptional regulator,PP_0019,Pseudomonas putida (strain ATCC 47054 / DSM 61...,232.0,PP_0019,,,True
2721,Q88RW1,Q88RW1,unreviewed,Q88RW1_PSEPK,TIGR04255 family protein,PP_0018,Pseudomonas putida (strain ATCC 47054 / DSM 61...,280.0,PP_0018,,,True
2722,Q88RW2,Q88RW2,unreviewed,Q88RW2_PSEPK,Transcriptional regulator,PP_0017,Pseudomonas putida (strain ATCC 47054 / DSM 61...,119.0,PP_0017,,,True
2723,P0A149,P0A149,reviewed,Y002_PSEPK,Uncharacterized protein PP_0002,PP_0002,Pseudomonas putida (strain ATCC 47054 / DSM 61...,263.0,PP_0002,,,True


## Count how many entries are missing locus names or primary gene names

In [12]:
# Boolean mask of rows without a locus name, computed once and reused below.
locus_is_missing = uniprot_df['locus_name'].isna()
# Series.sum() counts the True values in one vectorised call, rather than
# iterating the Series element by element with the builtin sum().
n_locus_missing_putida = locus_is_missing[uniprot_df['is_putida']].sum()
n_locus_missing_total = locus_is_missing.sum()
print(f"For p-putida proteins, there are N = {n_locus_missing_putida} rows where locus_name is NaN")
print(f"There are N = {n_locus_missing_total} rows where locus_name is NaN")
# Show the rows that have no locus name.
uniprot_df.loc[locus_is_missing]

For p-putida proteins, there are N = 0 rows where locus_name is NaN
There are N = 18 rows where locus_name is NaN


Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,primary_name,synonym_name,is_putida
686,P00552,P00552,reviewed,KKA2_KLEPN,Aminoglycoside 3'-phosphotransferase (EC 2.7.1...,neo kan nptII,Klebsiella pneumoniae,264.0,,neo,kan nptII,False
690,Q7WTF4,Q7WTF4,unreviewed,Q7WTF4_9ACTN,NanA2,nanA2,Streptomyces nanchangensis,2223.0,,nanA2,,False
699,Q9FD71,Q9FD71,reviewed,HMGCS_ENTFL,Hydroxymethylglutaryl-CoA synthase (HMG-CoA sy...,mvaS,Enterococcus faecalis (Streptococcus faecalis),383.0,,mvaS,,False
832,Q30CS2,Q30CS2,unreviewed,Q30CS2_KITAU,LipPks1,lipPks1,Kitasatospora aureofaciens (Streptomyces aureo...,2259.0,,lipPks1,,False
1036,K4JH65,K4JH65,unreviewed,K4JH65_9ACTN,3 hydroxyacyl CoA dehydrogenase,gdnD,Streptomyces sp. K01-0509,289.0,,gdnD,,False
1491,P23181,P23181,reviewed,AACC1_PSEAI,Gentamicin 3-N-acetyltransferase (EC 2.3.1.60)...,aacC1,Pseudomonas aeruginosa,177.0,,aacC1,,False
1492,O82803,O82803,reviewed,SRPP_HEVBR,Small rubber particle protein (HbSRPP) (22 kDa...,SRPP HEVB3,Hevea brasiliensis (Para rubber tree) (Siphoni...,204.0,,SRPP,HEVB3,False
2726,P00698,P00698,reviewed,LYSC_CHICK,"Lysozyme C (EC 3.2.1.17) (1,4-beta-N-acetylmur...",LYZ,Gallus gallus (Chicken),147.0,,LYZ,,False
2727,P35527,P35527,reviewed,K1C9_HUMAN,"Keratin, type I cytoskeletal 9 (Cytokeratin-9)...",KRT9,Homo sapiens (Human),623.0,,KRT9,,False
2728,P78386,P78386,reviewed,KRT85_HUMAN,"Keratin, type II cuticular Hb5 (Hair keratin K...",KRT85 KRTHB5,Homo sapiens (Human),507.0,,KRT85,KRTHB5,False


In [13]:
# Boolean mask of rows without a primary gene name, computed once and reused below.
primary_is_missing = uniprot_df['primary_name'].isna()
# Series.sum() counts the True values in one vectorised call, rather than
# iterating the Series element by element with the builtin sum().
n_primary_missing_putida = primary_is_missing[uniprot_df['is_putida']].sum()
n_primary_missing_total = primary_is_missing.sum()
print(f"For p-putida proteins, there are N = {n_primary_missing_putida} rows where primary_name is NaN")
print(f"There are N = {n_primary_missing_total} rows where primary_name is NaN")
# Show the rows that have no primary gene name.
uniprot_df.loc[primary_is_missing]

For p-putida proteins, there are N = 1231 rows where primary_name is NaN
There are N = 1235 rows where primary_name is NaN


Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,primary_name,synonym_name,is_putida
1493,Q88BX7,Q88BX7,unreviewed,Q88BX7_PSEPK,"Transcriptional regulator, DeoR family",PP_5410,Pseudomonas putida (strain ATCC 47054 / DSM 61...,258.0,PP_5410,,,True
1494,Q88BY6,Q88BY6,unreviewed,Q88BY6_PSEPK,PIN domain-containing protein,PP_5401,Pseudomonas putida (strain ATCC 47054 / DSM 61...,213.0,PP_5401,,,True
1495,Q88BY7,Q88BY7,unreviewed,Q88BY7_PSEPK,IrrE N-terminal-like domain-containing protein,PP_5400,Pseudomonas putida (strain ATCC 47054 / DSM 61...,355.0,PP_5400,,,True
1496,Q88BY9,Q88BY9,unreviewed,Q88BY9_PSEPK,N(4)-bis(aminopropyl)spermidine synthase C-ter...,PP_5395,Pseudomonas putida (strain ATCC 47054 / DSM 61...,281.0,PP_5395,,,True
1497,Q88BZ1,Q88BZ1,unreviewed,Q88BZ1_PSEPK,Beta-propeller fold lactonase family protein,PP_5392,Pseudomonas putida (strain ATCC 47054 / DSM 61...,340.0,PP_5392,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2723,P0A149,P0A149,reviewed,Y002_PSEPK,Uncharacterized protein PP_0002,PP_0002,Pseudomonas putida (strain ATCC 47054 / DSM 61...,263.0,PP_0002,,,True
2734,Q835L3,Q835L3,unreviewed,Q835L3_ENTFA,Acetyl-CoA acetyltransferase/hydroxymethylglut...,EF_1364,Enterococcus faecalis (strain ATCC 700802 / V583),803.0,EF_1364,,,False
2735,Q88P68,Q88P68,,Q88P68_PSEPK,deleted,,,,,,,False
2736,P00761,P00761,reviewed,TRYP_PIG,Trypsin (EC 3.4.21.4),,Sus scrofa (Pig),231.0,,,,False


## Create dataframe to convert between proteomics label and different uniprot labels

In [14]:
# Display the label table (original proteomics label and extracted ID) before the UniProt fields are added.
edd_df

Unnamed: 0,original,extracted
0,P0AE22,P0AE22
1,sp|A9GAJ9|A9GAJ9_SORC5 Mcm,A9GAJ9
2,sp|K4JH65|K4JH65_9ACTN Gdnd,K4JH65
3,sp|O77727,O77727
4,sp|O82803,O82803
...,...,...
2733,tr|Q88QV1|Q88QV1_PSEPK,Q88QV1
2734,tr|Q88QV2|Q88QV2_PSEPK,Q88QV2
2735,tr|Q88RH1|Q88RH1_PSEPK,Q88RH1
2736,tr|Q88RH2|Q88RH2_PSEPK,Q88RH2


In [24]:
# One lookup row per queried ID, indexed by the 'From' column. keep='first'
# reproduces the previous "take the first matching row" behaviour in case the
# UniProt results contain several rows for the same ID.
uniprot_by_id = uniprot_df.drop_duplicates(subset='From', keep='first').set_index('From')

# Every extracted ID must be present in the UniProt results. The exception type
# is kept (IndexError, as raised before by `.values[0]` on an empty selection),
# but the message now names the IDs that could not be found.
unmapped_ids = edd_df.loc[~edd_df['extracted'].isin(uniprot_by_id.index), 'extracted']
if not unmapped_ids.empty:
    raise IndexError(
        f"{unmapped_ids.nunique(dropna=False)} extracted ID(s) not found in uniprot_df['From'], "
        f"e.g. {unmapped_ids.unique()[:10].tolist()}"
    )

# A single index-based lookup per column replaces a full scan of uniprot_df
# for every row of edd_df.
edd_df['locus'] = edd_df['extracted'].map(uniprot_by_id['locus_name'])
edd_df['primary_name'] = edd_df['extracted'].map(uniprot_by_id['primary_name'])
edd_df['organism'] = edd_df['extracted'].map(uniprot_by_id['Organism'])
# Same exact-match organism test that is applied to uniprot_df.
edd_df['is_putida'] = edd_df['organism'] == pp_string

In [25]:
# Spot-check the first three rows after adding the UniProt-derived columns.
edd_df.head(n=3)

Unnamed: 0,original,extracted,locus,primary_name,organism,is_putida
0,P0AE22,P0AE22,b4055 JW4015,aphA,Escherichia coli (strain K12),False
1,sp|A9GAJ9|A9GAJ9_SORC5 Mcm,A9GAJ9,sce2716,mcm,Sorangium cellulosum (strain So ce56) (Polyang...,False
2,sp|K4JH65|K4JH65_9ACTN Gdnd,K4JH65,,gdnD,Streptomyces sp. K01-0509,False


In [26]:
# Save the translation table; the column header row is written by default and the index is omitted.
edd_df.to_csv('./data/proteomics_id_translator.csv', index=False)