Proteomics Data Processing Pipeline: UniProt-to-Gene Symbol Mapping, Normalization, and Log Transformation

In [None]:
# Install the mygene package (used for mapping protein IDs to gene symbols)
!pip install mygene



In [None]:
# Import required libraries
import pandas as pd              # Data manipulation and analysis
import mygene                    # Gene annotation and ID mapping
import numpy as np               # Numerical operations

In [None]:
# Google Colab's file-transfer helper (this cell is Colab-specific)
from google.colab import files

# Upload a file from the local machine to the Colab environment.
# The result is kept in `uploaded`; the loading cell below reads the CSV
# identified by its first key.
uploaded = files.upload()

Saving Proteomics_Practical_Case.csv to Proteomics_Practical_Case (1).csv


In [None]:
# Load the uploaded CSV into a pandas DataFrame indexed by 'Protein_IDs'.
import io

if not uploaded:
    raise ValueError('No file was uploaded; re-run the upload cell first.')

# Parse the bytes held in `uploaded` instead of re-opening the file by name:
# the upload log shows a re-uploaded file being saved under a de-duplicated
# name ("... (1).csv"), so a path built from the dict key could point at an
# older copy on disk.
# NOTE(review): assumes files.upload() maps file name -> raw bytes; confirm
# against the Colab version in use.
uploaded_name = next(iter(uploaded))
prot_df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]), index_col='Protein_IDs')

# Display a preview of the proteomics dataset
print('Proteomic Data Preview:')
prot_df.head()

Proteomic Data Preview:


Unnamed: 0_level_0,GeneID,Peptide.counts,Protein.names,Gene.names,Standard1,Standard2,Standard3,Treat1.1,Treat1.2,Treat1.3,Treat2.1,Treat2.2,Treat2.3,Treat3.1,Treat3.2,Treat3.3,Treat4.1,Treat4.2,Treat4.3
Protein_IDs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
P92980,AT4G21990,7,"5-adenylylsulfate reductase 3, chloroplastic",,435850,521480.0,547050.0,351850.0,383920.0,302620,555110.0,539700.0,648260,454270,522010.0,567780,488200,488930.0,513710
Q9M0X9,AT4G05160,5,4-coumarate--CoA ligase-like 7,4CLL7,0,0.0,284160.0,468890.0,417040.0,333720,354980.0,428320.0,415570,0,249540.0,270340,422000,336340.0,383230
Q8L539,AT5G13050,1,"5-formyltetrahydrofolate cyclo-ligase, mitocho...",5FCL,210130,220540.0,224050.0,263790.0,256880.0,168710,150110.0,236730.0,253010,181640,0.0,190150,253680,274010.0,273930
P31167,AT3G08580,16,"ADP,ATP carrier protein 1, mitochondrial",AAC1,33437000,35990000.0,34996000.0,39834000.0,43167000.0,33947000,41727000.0,41795000.0,41877000,34122000,37810000.0,33239000,39471000,41251000.0,37790000
P40941,AT5G13490,10,"ADP,ATP carrier protein 2, mitochondrial",AAC2,3918300,4237900.0,4287100.0,4704900.0,5115200.0,4097000,4153800.0,4802300.0,5167900,4439200,4672500.0,4097100,4944400,5005900.0,5024100


In [None]:
# Create the MyGeneInfo client object; the mapping cell below calls
# mg.querymany() on it to look up gene annotations.
mg = mygene.MyGeneInfo()

In [None]:
# Collect the DataFrame's row labels (the protein IDs) as a plain Python list
protein_ids = prot_df.index.tolist()

In [None]:
# Map the UniProt protein IDs to gene symbols through MyGeneInfo.
#   scopes='uniprot' -> the submitted IDs are UniProt IDs
#   fields='symbol'  -> retrieve gene symbols
#   species=3702     -> restrict the search to Arabidopsis thaliana
mappings = mg.querymany(protein_ids, scopes='uniprot', fields='symbol', species=3702)

INFO:biothings.client:querying 1-1000 ...
INFO:biothings.client:querying 1001-2000 ...
INFO:biothings.client:querying 2001-2424 ...
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [None]:
# Build the protein -> gene symbol lookup from the query results.
prot_to_gene = {}

for hit in mappings:
    # Hits without a 'symbol' key carry no usable mapping; skip them.
    if 'symbol' not in hit:
        continue

    symbol = hit['symbol']
    # Store one string per protein: if a hit reports several symbols as a
    # list, join them so the resulting DataFrame column holds plain strings
    # rather than a mix of strings and lists.
    if isinstance(symbol, list):
        symbol = ';'.join(str(s) for s in symbol)

    # When the same query ID appears in more than one hit, the last one wins.
    prot_to_gene[hit['query']] = symbol

In [None]:
# Print the protein-to-gene mappings
for k, v in prot_to_gene.items():
    print(k, v)

P92980 APR3
Q9M0X9 AT4G05160
Q8L539 5-FCL
P31167 AAC1
P40941 AAC2
O49447 AAC3
Q9SMT7 AAE3
Q8VZF1 AAE7
Q7G193 AO1
Q7G192 AAO2
Q8S4Y1 ACAT2
Q39002 NTT1
P92935 ATNTT2
Q9C826 ABA2
Q9LZB8 ABCB29
Q9LZJ5 ABCC14
Q7DM58 ABCC4
Q8LPJ4 ABCE2
Q9FJH6 ABCF1
Q8H0V6 ABCF3
Q9M1H3 ABCF4
Q9CAF5 NAP7
Q9LQK7 ABCI7
Q9SIU2 ABH1
O04846 ACA1
Q9LF79 ACA8
Q9MA55 ACBP4
P57752 ACBP6
Q9FJI5 G6PD6
O80526 ACLA-3
Q9C522 ACLB-1
Q9FGX1 ACLB-2
Q9SIB9 ACO3
Q41931 ACO2
Q94A28 ACO2
Q06588 EFE
P11829 ACP1
P25701 ACP2
P25702 ACP3
Q9FNP9 AACT1
P53496 ACT11
Q96293 ACT8
Q96329 ACX4
Q9ZSK3 ADF4
P55229 APL1
P06525 ADH1
Q9ZUU1 ADK
Q9SF85 ADK1
Q9ZUY3 PD1
Q8VXZ7 AT3G56310
Q9FVJ3 ZAC
Q9FL69 AGD5
O04379 AGO1
Q9ZVD5 AGO4
Q9FZA2 AGP31
Q56YA5 AGT
O24521 HB2
Q9ZPI6 AIM1
Q9ZSP5 AIR3
Q9LYU8 AK-LYS1
Q9SA18 AK-HSDH I
O81852 AK-HSDH II
Q29Q26 AKR2B
Q84TF0 AKR4C10
Q0PGJ6 ChlAKR
Q9FFF5 AL1
Q9M2B4 AL3
O81488 AL4
Q9LNQ4 AT1G17500
F4I7I0 AlaAT1
Q9LDV4 ALAAT2
Q9STS1 ALDH10A9
Q8VZC3 ALDH12A1
Q9SU63 ALDH2B4
Q56YU0 ALDH2C4
Q70E96 ALDH3F1
Q70DU8 ALDH3H1
Q

In [None]:
# Attach a 'Gene_symbols' column by looking up each row label (protein ID)
# in the prot_to_gene dictionary
prot_df = prot_df.assign(Gene_symbols=prot_df.index.map(prot_to_gene))

# Show the first rows of the table with gene symbols added
print('Linked Data Preview:')
print(prot_df.head())

Linked Data Preview:
                GeneID  Peptide.counts  \
Protein_IDs                              
P92980       AT4G21990               7   
Q9M0X9       AT4G05160               5   
Q8L539       AT5G13050               1   
P31167       AT3G08580              16   
P40941       AT5G13490              10   

                                                 Protein.names Gene.names  \
Protein_IDs                                                                 
P92980            5-adenylylsulfate reductase 3, chloroplastic        NaN   
Q9M0X9                          4-coumarate--CoA ligase-like 7      4CLL7   
Q8L539       5-formyltetrahydrofolate cyclo-ligase, mitocho...       5FCL   
P31167                ADP,ATP carrier protein 1, mitochondrial       AAC1   
P40941                ADP,ATP carrier protein 2, mitochondrial       AAC2   

             Standard1   Standard2   Standard3    Treat1.1    Treat1.2  \
Protein_IDs                                                           

In [None]:
# Calculate column-wise sums over the numeric columns only.
# Note: this also totals 'Peptide.counts', which is numeric but is not a sample.
sample_sum = prot_df.sum(axis=0, numeric_only=True)

# Display the first few totals. head must be *called*: printing the bare
# attribute shows the bound method's repr instead of the first five values.
print(sample_sum.head())

<bound method NDFrame.head of Peptide.counts    1.661100e+04
Standard1         9.091103e+09
Standard2         9.259010e+09
Standard3         9.344477e+09
Treat1.1          8.446065e+09
Treat1.2          8.588726e+09
Treat1.3          8.742227e+09
Treat2.1          9.028471e+09
Treat2.2          8.881304e+09
Treat2.3          8.786647e+09
Treat3.1          9.142383e+09
Treat3.2          9.106444e+09
Treat3.3          9.208144e+09
Treat4.1          8.557670e+09
Treat4.2          8.546808e+09
Treat4.3          8.824787e+09
dtype: float64>


In [None]:
# Normalize protein intensities using Counts Per Million (CPM):
#   CPM = intensity / (column total) * 1,000,000
# Only the sample-intensity columns are normalized. All annotation columns are
# dropped first, including 'Peptide.counts': it is numeric but is not a sample,
# so scaling it to CPM would put a meaningless column into the normalized matrix.
ANNOTATION_COLUMNS = ['GeneID', 'Peptide.counts', 'Protein.names', 'Gene.names', 'Gene_symbols']
CPM_SCALE = 1_000_000

intensity_df = prot_df.drop(columns=ANNOTATION_COLUMNS)
normalized_df = (
    intensity_df
    # Select the matching totals explicitly: DataFrame.div aligns on column
    # labels, so an extra label in sample_sum would come back as an all-NaN column.
    .div(sample_sum[intensity_df.columns], axis=1)
    * CPM_SCALE
)

             Peptide.counts    Standard1    Standard2    Standard3  \
Protein_IDs                                                          
P92980           421.407501    47.942478    56.321358    58.542601   
Q9M0X9           301.005358     0.000000     0.000000    30.409406   
Q8L539            60.201072    23.113807    23.818962    23.976729   
P31167           963.217145  3677.991601  3887.024775  3745.099828   
P40941           602.010716   431.003813   457.705538   458.784360   

                Treat1.1     Treat1.2     Treat1.3     Treat2.1     Treat2.2  \
Protein_IDs                                                                    
P92980         41.658454    44.700460    34.615895    61.484388    60.768104   
Q9M0X9         55.515795    48.556678    38.173341    39.317843    48.227153   
Q8L539         31.232297    29.908976    19.298287    16.626293    26.654870   
P31167       4716.279259  5026.007410  3883.106798  4621.712914  4705.953120   
P40941        557.052324   59

In [None]:
# Show the first rows of the normalized table
print(normalized_df.head())

# log2(x + 1) transform to stabilize variance, then carry the gene symbols
# over from prot_df as a 'Gene_symbols' column
transformed_df = np.log2(normalized_df.add(1)).assign(
    Gene_symbols=prot_df['Gene_symbols']
)

# Print the final normalized and log-transformed table
print('Normalized and Transformed Dataframe:')
print(transformed_df)

Normalized and Transformed Dataframe:
             Peptide.counts  Standard1  Standard2  Standard3   Treat1.1  \
Protein_IDs                                                               
P92980             8.722492   5.613015   5.841001   5.895850   5.414760   
Q9M0X9             8.238430   0.000000   0.000000   4.973125   5.820582   
Q8L539             5.935485   4.591788   4.633371   4.642513   5.010435   
P31167             9.913214  11.845095  11.924822  11.871174  12.203739   
P40941             9.236040   8.754900   8.841425   8.844814   9.124257   
...                     ...        ...        ...        ...        ...   
Q940J9             8.238430   5.616545   5.690247   5.149858   5.896898   
Q681Q7             6.923650   3.964693   3.769956   3.871119   4.059491   
O48737             8.722492   9.663442   9.607436   9.644274   9.733305   
Q8LPS6             8.914710   6.741546   6.843148   6.906771   6.780317   
O23676             6.923650   6.252009   0.000000   0.000000  