# Process protein data from uniprot
Previously used this tool https://www.uniprot.org/id-mapping to map the protein IDs returned from proteomics to PP_/4-letter codes. This script parses the output and saves a table to convert between formats.

In [1]:
import edd_utils as eddu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import random
random.seed(1)

Import uniprot results and dataframe with original protein labels

In [2]:
uniprot_df = pd.read_csv('./data/proteomics_id_translator.tsv', delimiter='\t')
edd_df = pd.read_csv('./data/protein_id_conversion_df_init.csv')

In [3]:
uniprot_df.head(3)

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Gene Names (ordered locus),Gene Names (ORF),Gene Names (primary),Gene Names (synonym)
0,P0AE22,P0AE22,reviewed,APHA_ECOLI,Class B acid phosphatase (CBAP) (EC 3.1.3.2),aphA napA yjbP b4055 JW4015,Escherichia coli (strain K12),237.0,b4055 JW4015,,aphA,napA yjbP
1,A9GAJ9,A9GAJ9,unreviewed,A9GAJ9_SORC5,Methylmalonyl-CoA mutase (EC 5.4.99.2),mcm sce2716,Sorangium cellulosum (strain So ce56) (Polyang...,882.0,sce2716,,mcm,
2,J5JYK1,J5JYK1,unreviewed,J5JYK1_BEAB2,Polyketide synthase,BBA_03810,Beauveria bassiana (strain ARSEF 2860) (White ...,2413.0,,BBA_03810,,


In [4]:
edd_df.head(3)

Unnamed: 0,orig,has_6letter_substring,extracted
0,P0AE22,True,P0AE22
1,sp|A0A0M4F6K2,False,
2,sp|A0A140FVW8,False,


Rename uniprot df columns

In [5]:
uniprot_col_rename = {'Entry Name': 'entry_name', 'Protein names': 'protein_names',
                      'Gene Names': 'gene_names', 'Gene Names (ordered locus)': 'locus_name',
                     'Gene Names (primary)': 'primary_name', 'Gene Names (synonym)': 'synonym_name'}
uniprot_df = uniprot_df.rename(columns=uniprot_col_rename)
uniprot_df

Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,Gene Names (ORF),primary_name,synonym_name
0,P0AE22,P0AE22,reviewed,APHA_ECOLI,Class B acid phosphatase (CBAP) (EC 3.1.3.2),aphA napA yjbP b4055 JW4015,Escherichia coli (strain K12),237.0,b4055 JW4015,,aphA,napA yjbP
1,A9GAJ9,A9GAJ9,unreviewed,A9GAJ9_SORC5,Methylmalonyl-CoA mutase (EC 5.4.99.2),mcm sce2716,Sorangium cellulosum (strain So ce56) (Polyang...,882.0,sce2716,,mcm,
2,J5JYK1,J5JYK1,unreviewed,J5JYK1_BEAB2,Polyketide synthase,BBA_03810,Beauveria bassiana (strain ARSEF 2860) (White ...,2413.0,,BBA_03810,,
3,K4JH65,K4JH65,unreviewed,K4JH65_9ACTN,3 hydroxyacyl CoA dehydrogenase,gdnD,Streptomyces sp. K01-0509,289.0,,,gdnD,
4,O77727,O77727,reviewed,K1C15_SHEEP,"Keratin, type I cytoskeletal 15 (Cytokeratin-1...",KRT15,Ovis aries (Sheep),453.0,,,KRT15,
...,...,...,...,...,...,...,...,...,...,...,...,...
2875,Q88QV2,Q88QV2,unreviewed,Q88QV2_PSEPK,5-aminopentanamidase (EC 3.5.1.30),davA PP_0382,Pseudomonas putida (strain ATCC 47054 / DSM 61...,264.0,PP_0382,,davA,
2876,Q88RH1,Q88RH1,unreviewed,Q88RH1_PSEPK,CoA-transferase family III (EC 2.8.3.-),PP_0159,Pseudomonas putida (strain ATCC 47054 / DSM 61...,406.0,PP_0159,,,
2877,Q88RH2,Q88RH2,unreviewed,Q88RH2_PSEPK,glutaryl-CoA dehydrogenase (ETF) (EC 1.3.8.6),gcdH PP_0158,Pseudomonas putida (strain ATCC 47054 / DSM 61...,393.0,PP_0158,,gcdH,
2878,Q8PW39,Q8PW39,unreviewed,Q8PW39_METMA,Mevalonate kinase (MK) (MVK) (EC 2.7.1.36),mvk MM_1762,Methanosarcina mazei (strain ATCC BAA-159 / DS...,301.0,MM_1762,,mvk,


## Look at uniprot output

In [6]:
uniprot_df['Organism'].value_counts()

Organism
Pseudomonas putida (strain ATCC 47054 / DSM 6125 / CFBP 8728 / NCIMB 11950 / KT2440)                               2843
Homo sapiens (Human)                                                                                                 11
Ovis aries (Sheep)                                                                                                    3
Enterococcus faecalis (Streptococcus faecalis)                                                                        2
Enterococcus faecalis (strain ATCC 700802 / V583)                                                                     2
Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)                                                 1
Caulobacter vibrioides (strain ATCC 19089 / CIP 103742 / CB 15) (Caulobacter crescentus)                              1
Streptococcus pyogenes serotype M1                                                                                    1
Streptomyces nanchangensis     

Create flag for whether or not entry is from putida

In [7]:
pp_string = 'Pseudomonas putida (strain ATCC 47054 / DSM 6125 / CFBP 8728 / NCIMB 11950 / KT2440)'
uniprot_df['is_putida'] = uniprot_df['Organism'] == pp_string

In [8]:
uniprot_df.loc[~uniprot_df['is_putida'], :]

Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,Gene Names (ORF),primary_name,synonym_name,is_putida
0,P0AE22,P0AE22,reviewed,APHA_ECOLI,Class B acid phosphatase (CBAP) (EC 3.1.3.2),aphA napA yjbP b4055 JW4015,Escherichia coli (strain K12),237.0,b4055 JW4015,,aphA,napA yjbP,False
1,A9GAJ9,A9GAJ9,unreviewed,A9GAJ9_SORC5,Methylmalonyl-CoA mutase (EC 5.4.99.2),mcm sce2716,Sorangium cellulosum (strain So ce56) (Polyang...,882.0,sce2716,,mcm,,False
2,J5JYK1,J5JYK1,unreviewed,J5JYK1_BEAB2,Polyketide synthase,BBA_03810,Beauveria bassiana (strain ARSEF 2860) (White ...,2413.0,,BBA_03810,,,False
3,K4JH65,K4JH65,unreviewed,K4JH65_9ACTN,3 hydroxyacyl CoA dehydrogenase,gdnD,Streptomyces sp. K01-0509,289.0,,,gdnD,,False
4,O77727,O77727,reviewed,K1C15_SHEEP,"Keratin, type I cytoskeletal 15 (Cytokeratin-1...",KRT15,Ovis aries (Sheep),453.0,,,KRT15,,False
5,O82803,O82803,reviewed,SRPP_HEVBR,Small rubber particle protein (HbSRPP) (22 kDa...,SRPP HEVB3,Hevea brasiliensis (Para rubber tree) (Siphoni...,204.0,,,SRPP,HEVB3,False
7,P00552,P00552,reviewed,KKA2_KLEPN,Aminoglycoside 3'-phosphotransferase (EC 2.7.1...,neo kan nptII,Klebsiella pneumoniae,264.0,,,neo,kan nptII,False
8,P00698,P00698,reviewed,LYSC_CHICK,"Lysozyme C (EC 3.2.1.17) (1,4-beta-N-acetylmur...",LYZ,Gallus gallus (Chicken),147.0,,,LYZ,,False
9,P00761,P00761,reviewed,TRYP_PIG,Trypsin (EC 3.4.21.4),,Sus scrofa (Pig),231.0,,,,,False
10,P02539,P02539,reviewed,K2M1_SHEEP,"Keratin, type II microfibrillar (Low-sulfur ke...",,Ovis aries (Sheep),109.0,,,,,False


In [9]:
uniprot_df.loc[uniprot_df['is_putida'],:]

Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,Gene Names (ORF),primary_name,synonym_name,is_putida
6,O85207,O85207,reviewed,PHAG_PSEPK,(R)-3-hydroxydecanoyl-ACP:CoA transacylase (EC...,phaG PP_1408,Pseudomonas putida (strain ATCC 47054 / DSM 61...,295.0,PP_1408,,phaG,,True
13,P0A0Z9,P0A0Z9,reviewed,ARGA_PSEPK,Amino-acid acetyltransferase (EC 2.3.1.1) (N-a...,argA PP_5185,Pseudomonas putida (strain ATCC 47054 / DSM 61...,432.0,PP_5185,,argA,,True
14,P0A101,P0A101,reviewed,PCAJ_PSEPK,3-oxoadipate CoA-transferase subunit B (EC 2.8...,pcaJ PP_3952,Pseudomonas putida (strain ATCC 47054 / DSM 61...,213.0,PP_3952,,pcaJ,,True
15,P0A116,P0A116,reviewed,DNAA_PSEPK,Chromosomal replication initiator protein DnaA,dnaA PP_0010,Pseudomonas putida (strain ATCC 47054 / DSM 61...,506.0,PP_0010,,dnaA,,True
16,P0A118,P0A118,reviewed,DNAG_PSEPK,DNA primase (EC 2.7.7.101),dnaG PP_0388,Pseudomonas putida (strain ATCC 47054 / DSM 61...,660.0,PP_0388,,dnaG,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2873,Q88PL6,Q88PL6,unreviewed,Q88PL6_PSEPK,Sec translocon accessory complex subunit YajC,yajC PP_0834,Pseudomonas putida (strain ATCC 47054 / DSM 61...,111.0,PP_0834,,yajC,,True
2874,Q88QV1,Q88QV1,unreviewed,Q88QV1_PSEPK,Tryptophan 2-monooxygenase (EC 1.13.12.3),davB PP_0383,Pseudomonas putida (strain ATCC 47054 / DSM 61...,560.0,PP_0383,,davB,,True
2875,Q88QV2,Q88QV2,unreviewed,Q88QV2_PSEPK,5-aminopentanamidase (EC 3.5.1.30),davA PP_0382,Pseudomonas putida (strain ATCC 47054 / DSM 61...,264.0,PP_0382,,davA,,True
2876,Q88RH1,Q88RH1,unreviewed,Q88RH1_PSEPK,CoA-transferase family III (EC 2.8.3.-),PP_0159,Pseudomonas putida (strain ATCC 47054 / DSM 61...,406.0,PP_0159,,,,True


## Count how many genes have locus names or primary gene names

In [10]:
print(f"For p-putida proteins, there are N = {sum(uniprot_df.loc[uniprot_df['is_putida'], 'locus_name'].isna())} rows where locus_name is NaN")
print(f"There are N = {sum(uniprot_df['locus_name'].isna())} rows where locus_name is NaN")
uniprot_df.loc[uniprot_df['locus_name'].isna()]

For p-putida proteins, there are N = 0 rows where locus_name is NaN
There are N = 28 rows where locus_name is NaN


Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,Gene Names (ORF),primary_name,synonym_name,is_putida
2,J5JYK1,J5JYK1,unreviewed,J5JYK1_BEAB2,Polyketide synthase,BBA_03810,Beauveria bassiana (strain ARSEF 2860) (White ...,2413.0,,BBA_03810,,,False
3,K4JH65,K4JH65,unreviewed,K4JH65_9ACTN,3 hydroxyacyl CoA dehydrogenase,gdnD,Streptomyces sp. K01-0509,289.0,,,gdnD,,False
4,O77727,O77727,reviewed,K1C15_SHEEP,"Keratin, type I cytoskeletal 15 (Cytokeratin-1...",KRT15,Ovis aries (Sheep),453.0,,,KRT15,,False
5,O82803,O82803,reviewed,SRPP_HEVBR,Small rubber particle protein (HbSRPP) (22 kDa...,SRPP HEVB3,Hevea brasiliensis (Para rubber tree) (Siphoni...,204.0,,,SRPP,HEVB3,False
7,P00552,P00552,reviewed,KKA2_KLEPN,Aminoglycoside 3'-phosphotransferase (EC 2.7.1...,neo kan nptII,Klebsiella pneumoniae,264.0,,,neo,kan nptII,False
8,P00698,P00698,reviewed,LYSC_CHICK,"Lysozyme C (EC 3.2.1.17) (1,4-beta-N-acetylmur...",LYZ,Gallus gallus (Chicken),147.0,,,LYZ,,False
9,P00761,P00761,reviewed,TRYP_PIG,Trypsin (EC 3.4.21.4),,Sus scrofa (Pig),231.0,,,,,False
10,P02539,P02539,reviewed,K2M1_SHEEP,"Keratin, type II microfibrillar (Low-sulfur ke...",,Ovis aries (Sheep),109.0,,,,,False
11,P02768,P02768,reviewed,ALBU_HUMAN,Albumin,ALB GIG20 GIG42 PRO0903 PRO1708 PRO2044 PRO261...,Homo sapiens (Human),609.0,,GIG20 GIG42 PRO0903 PRO1708 PRO2044 PRO2619 PR...,ALB,,False
12,P04264,P04264,reviewed,K2C1_HUMAN,"Keratin, type II cytoskeletal 1 (67 kDa cytoke...",KRT1 KRTA,Homo sapiens (Human),644.0,,,KRT1,KRTA,False


In [11]:
print(f"For p-putida proteins, there are N = {sum(uniprot_df.loc[uniprot_df['is_putida'], 'primary_name'].isna())} rows where primary_name is NaN")
print(f"There are N = {sum(uniprot_df['primary_name'].isna())} rows where primary_name is NaN")
uniprot_df.loc[uniprot_df['primary_name'].isna()]

For p-putida proteins, there are N = 1323 rows where primary_name is NaN
There are N = 1332 rows where primary_name is NaN


Unnamed: 0,From,Entry,Reviewed,entry_name,protein_names,gene_names,Organism,Length,locus_name,Gene Names (ORF),primary_name,synonym_name,is_putida
2,J5JYK1,J5JYK1,unreviewed,J5JYK1_BEAB2,Polyketide synthase,BBA_03810,Beauveria bassiana (strain ARSEF 2860) (White ...,2413.0,,BBA_03810,,,False
9,P00761,P00761,reviewed,TRYP_PIG,Trypsin (EC 3.4.21.4),,Sus scrofa (Pig),231.0,,,,,False
10,P02539,P02539,reviewed,K2M1_SHEEP,"Keratin, type II microfibrillar (Low-sulfur ke...",,Ovis aries (Sheep),109.0,,,,,False
27,P0A149,P0A149,reviewed,Y002_PSEPK,Uncharacterized protein PP_0002,PP_0002,Pseudomonas putida (strain ATCC 47054 / DSM 61...,263.0,PP_0002,,,,True
39,P25690,P25690,reviewed,K1M2_SHEEP,"Keratin, type I microfibrillar, 47.6 kDa (Low-...",,Ovis aries (Sheep),404.0,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849,Q88RW0,Q88RW0,unreviewed,Q88RW0_PSEPK,Transcriptional regulator,PP_0019,Pseudomonas putida (strain ATCC 47054 / DSM 61...,232.0,PP_0019,,,,True
2850,Q88RW1,Q88RW1,unreviewed,Q88RW1_PSEPK,TIGR04255 family protein,PP_0018,Pseudomonas putida (strain ATCC 47054 / DSM 61...,280.0,PP_0018,,,,True
2851,Q88RW2,Q88RW2,unreviewed,Q88RW2_PSEPK,Transcriptional regulator,PP_0017,Pseudomonas putida (strain ATCC 47054 / DSM 61...,119.0,PP_0017,,,,True
2860,Q835L3,Q835L3,unreviewed,Q835L3_ENTFA,Acetyl-CoA acetyltransferase (EC 2.3.1.9) (Ace...,EF_1364,Enterococcus faecalis (strain ATCC 700802 / V583),803.0,EF_1364,,,,False


## Create dataframe to convert between proteomics label and different uniprot labels

In [16]:
edd_df
edd_df = edd_df[edd_df['has_6letter_substring']]

In [17]:
edd_df['locus'] = [uniprot_df.loc[uniprot_df['From'] == x, 'locus_name'].values[0] for x in edd_df['extracted']]
edd_df['primary_name'] = [uniprot_df.loc[uniprot_df['From'] == x, 'primary_name'].values[0] for x in edd_df['extracted']]
edd_df['organism'] = [uniprot_df.loc[uniprot_df['From'] == x, 'Organism'].values[0] for x in edd_df['extracted']]
edd_df['is_putida'] = edd_df['organism'] == pp_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edd_df['locus'] = [uniprot_df.loc[uniprot_df['From'] == x, 'locus_name'].values[0] for x in edd_df['extracted']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edd_df['primary_name'] = [uniprot_df.loc[uniprot_df['From'] == x, 'primary_name'].values[0] for x in edd_df['extracted']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

In [18]:
edd_df.head(3)

Unnamed: 0,orig,has_6letter_substring,extracted,locus,primary_name,organism,is_putida
0,P0AE22,True,P0AE22,b4055 JW4015,aphA,Escherichia coli (strain K12),False
62,sp|A9GAJ9|A9GAJ9_SORC5 Mcm,True,A9GAJ9,sce2716,mcm,Sorangium cellulosum (strain So ce56) (Polyang...,False
63,sp|J5JYK1,True,J5JYK1,,,Beauveria bassiana (strain ARSEF 2860) (White ...,False


In [19]:
edd_df.to_csv('./data/proteomics_id_translator.csv', index=False, header=True)