In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import linregress

In [2]:
# Folder the input CSV files are read from (relative to the working directory)
input_dir = "Figure_Data"
# Folder name reserved for exported figures/tables
output_dir = "Figure_Export"

In [3]:
# DBTL cycles to load. Only cycle 0 is read here; set this to range(7) to
# incorporate all DBTL cycles (DBTL0.csv ... DBTL6.csv).
dbtl_cycles = [0]
filenames = [f'DBTL{cycle}.csv' for cycle in dbtl_cycles]
input_paths = [os.path.join(input_dir, filename) for filename in filenames]

# Read one frame per file, then stack them row-wise into a single frame
# with a fresh 0..n-1 index.
dataframes = [pd.read_csv(input_path) for input_path in input_paths]
df = pd.concat(dataframes, ignore_index=True)

In [5]:
# Lookup table that links the identifiers used in the proteomics data to gene names
name_df = pd.read_csv(f'{input_dir}/proteomics_id_translator_240305.csv')

# Both lookups are keyed by the 'extracted' proteomics identifier.
# 'primary_name' holds the common gene name and 'locus' the PP_#### tag; the
# common name is usually the nicer one to show in the output.
translator_dict = name_df.set_index('extracted')['primary_name'].to_dict()   # proteomics -> common name
translator_dict2 = name_df.set_index('extracted')['locus'].to_dict()         # proteomics -> locus

def map_protein_group(protein_group):
    """Translate a proteomics identifier into a readable gene name.

    `protein_group` is looked up in `translator_dict` (common name) first and
    in `translator_dict2` (locus) second. The first entry that is neither
    missing nor whitespace-only is returned unchanged.

    Returns None when neither lookup yields a usable name.
    """
    for lookup in (translator_dict, translator_dict2):
        candidate = lookup.get(protein_group)
        if pd.isna(candidate):
            # Identifier absent from this lookup, or its entry is blank in the CSV
            continue
        if candidate.strip():
            return candidate
    return None

# Translate Protein.Group from the proteomics identifier to the common name,
# falling back to the locus; identifiers with neither end up missing.
# NOTE: this overwrites the column it reads from, so re-running this cell
# without reloading `df` feeds the already-translated names back into the map.
df['Protein.Group'] = df['Protein.Group'].map(map_protein_group)

# Count the rows that were left without a translation. Every non-missing value
# produced by the map above comes straight from one of the translator dicts, so
# the untranslated rows are exactly the missing ones. Using isna() avoids
# scanning the values of both dicts once per row.
nontranslated = df['Protein.Group'].isna().sum()
print(f"In total, N = {nontranslated}/{len(df['Protein.Group'])} proteins were not translated to primary/locus names")

In total, N = 828/772692 proteins were not translated to primary/locus names


In [6]:
# Keep only the rows that belong to the 'Control' sample
control_df = df.loc[df['Sample'].eq('Control')]

# Average the Top3 abundance share per protein (one row per identifier combination)
grouped_control_df = (
    control_df
    .groupby(['Protein.Group', 'Protein.Names', 'Protein', 'Protein.Description'])
    [['%_of protein_abundance_Top3-method']]
    .mean()
    .reset_index()
)

# Order from most to least abundant and attach a rank (1 = most abundant);
# method='first' breaks ties by order of appearance, so every rank is distinct
sorted_grouped_control_df = (
    grouped_control_df
    .sort_values(by='%_of protein_abundance_Top3-method', ascending=False)
    .assign(
        Rank=lambda frame: frame['%_of protein_abundance_Top3-method'].rank(
            ascending=False, method='first'
        )
    )
)

# Final table ordered by rank, written to the current working directory
ranked_df = sorted_grouped_control_df.sort_values(by='Rank')
ranked_df.to_csv('ranked_proteins.csv', index=False)

print("Dataframe saved to 'ranked_proteins.csv'")
ranked_df.head(25)

Dataframe saved to 'ranked_proteins.csv'


Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,%_of protein_abundance_Top3-method,Rank
1477,neo,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,3.352559,1.0
5,MVD1,MVD1_YEAST,Mvd1,Diphosphomevalonate decarboxylase,2.507166,2.0
1235,groEL,CH60_PSEPK,Grol,60 kDa chaperonin,2.283663,3.0
805,aacC1,AACC1_PSEAI,Aacc1,Gentamicin 3-N-acetyltransferase,1.973718,4.0
1933,tufB,EFTU2_PSEPK,Tufb,Elongation factor Tu-B,1.813564,5.0
0,EF_1364,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,1.460456,6.0
1754,rplL,RL7_PSEPK,Rpll,50S ribosomal protein L7/L12,1.33431,7.0
1058,dnaK,DNAK_PSEPK,Dnak,Chaperone protein DnaK,1.094669,8.0
1745,rplA,RL1_PSEPK,Rpla,50S ribosomal protein L1,1.03785,9.0
1867,sucC,SUCC_PSEPK,Succ,Succinyl-CoA ligase [ADP-forming] subunit beta,0.991579,10.0
