In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
MAX_ABUNDANCE_THRESHOLD = 0.3

In [3]:
top3_files = []
for file in os.listdir("."):
    if (file.startswith('DBTL') & (file.endswith(".csv"))):
        top3_files.append(file)
top3_files
        

['DBTL0_Plate1_Top3_Full_list_proteins_20230224-053921.csv',
 'DBTL0_Plate2_Top3_Full_list_proteins_20230224-055157.csv',
 'DBTL0_Plate3_Top3_Full_list_proteins_20230303-212530.csv',
 'DBTL1_Plate1_Top3_Full_list_proteins_20230928-015658.csv',
 'DBTL1_Plate2_Top3_Full_list_proteins_20230928-021027.csv',
 'DBTL2_Top3_Full_list_proteins_20231213-155313.csv',
 'DBTL3_Top3_Full_list_proteins_20240221-033259.csv']

## Merge dataframes

### Add plate information for DBTL0
In DBTL0, one of the proteomics plates had way lower intensity

In [4]:
plate_mapping_df = pd.read_excel('../sample injection order.xlsx')
plate_mapping_df['Batch number'].value_counts()

Batch number
3    228
1     96
2     96
Name: count, dtype: int64

In [5]:
plate_mapping_dict = dict(zip(list(plate_mapping_df['Sample Name in order injected']), list(plate_mapping_df['Batch number'])))

In [6]:
df_list = []
for f in top3_files:
    print(f)
    df = pd.read_csv(f)
    current_cycle = int(f[4])
    group_idx = f.find('Plate')
    if group_idx != -1:
        group = f[group_idx + 5]
    else:
        group = None
    print(f'Cycle: {current_cycle}\t Group/plate: {group}')
    df['group'] = group
    df['cycle'] = current_cycle
    df['Line Name'] = df.apply(lambda x: f"{x['Sample']}-{x['Replicate']}", axis = 1)
    if current_cycle == 0:
        df['plate_number'] = df['Line Name'].apply(lambda x: plate_mapping_dict[x])
    else:
        df['plate_number'] = 1
    df_list.append(df)



DBTL0_Plate1_Top3_Full_list_proteins_20230224-053921.csv
Cycle: 0	 Group/plate: 1
DBTL0_Plate2_Top3_Full_list_proteins_20230224-055157.csv
Cycle: 0	 Group/plate: 2
DBTL0_Plate3_Top3_Full_list_proteins_20230303-212530.csv
Cycle: 0	 Group/plate: 3
DBTL1_Plate1_Top3_Full_list_proteins_20230928-015658.csv
Cycle: 1	 Group/plate: 1
DBTL1_Plate2_Top3_Full_list_proteins_20230928-021027.csv
Cycle: 1	 Group/plate: 2
DBTL2_Top3_Full_list_proteins_20231213-155313.csv
Cycle: 2	 Group/plate: None
DBTL3_Top3_Full_list_proteins_20240221-033259.csv
Cycle: 3	 Group/plate: None


In [7]:
df_full = pd.concat(df_list)

## Update df_full

Change replicates from string to number

In [8]:
df_full['Replicate']=df_full['Replicate'].apply(lambda x: x[1:])

Add PP_ numbers to data

In [9]:
translation_df = pd.read_csv('../proteomics_id_translator.csv', header = 0, index_col = False)
translation_df

Unnamed: 0,original,extracted,locus,primary_name,organism,is_putida
0,sp|Q9A9Z2,Q9A9Z2,CC_0819,xylD,Caulobacter vibrioides (strain ATCC 19089 / CB...,False
1,tr|Q835L3|Q835L3_ENTFA,Q835L3,EF_1364,mvaE,Enterococcus faecalis (strain ATCC 700802 / V583),False
2,sp|Q9FD71|HMGCS_ENTFL,Q9FD71,,mvaS,Enterococcus faecalis (Streptococcus faecalis),False
3,P0AE22,P0AE22,b4055 JW4015,aphA,Escherichia coli (strain K12),False
4,sp|P00698|LYSC_CHICK Lysozyme C,P00698,,LYZ,Gallus gallus (Chicken),False
...,...,...,...,...,...,...
2733,tr|Q88QV1|Q88QV1_PSEPK,Q88QV1,PP_0383,davB,Pseudomonas putida (strain ATCC 47054 / DSM 61...,True
2734,tr|Q88QV2|Q88QV2_PSEPK,Q88QV2,PP_0382,davA,Pseudomonas putida (strain ATCC 47054 / DSM 61...,True
2735,tr|Q88RH1|Q88RH1_PSEPK,Q88RH1,PP_0159,,Pseudomonas putida (strain ATCC 47054 / DSM 61...,True
2736,tr|Q88RH2|Q88RH2_PSEPK,Q88RH2,PP_0158,gcdH,Pseudomonas putida (strain ATCC 47054 / DSM 61...,True


In [10]:
ext_to_organism = dict(translation_df[['extracted', 'organism']].values)
ext_to_locus = dict(translation_df[['extracted', 'locus']].values)
ext_to_primary = dict(translation_df[['extracted', 'primary_name']].values)

In [11]:
any(df_full['Protein.Group'].unique() == 'Q88QV1')

True

In [12]:
len(df_full['Protein.Group'].unique())

2118

In [13]:
# [p for p in df_full['Protein.Group'].unique() if p in translation_df['extracted']]
len([p for p in df_full['Protein.Group'].unique() if p in translation_df['extracted'].unique()])

2113

In [14]:
df_full

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,cycle,Line Name,plate_number
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,1.191124e+08,3.112624,0.493127,1,0,Control-R1,1
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,1.491721e+08,3.712705,0.569690,1,0,Control-R2,1
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,1.387009e+08,3.700489,0.568259,1,0,Control-R3,1
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,4,1.003063e+08,2.650788,0.423375,1,0,Control-R4,1
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,5,1.142768e+08,2.835433,0.452619,1,0,Control-R5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305788,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_0368_PP_4189_P4,2,1.318040e+05,0.006630,-2.178455,,3,PP_0368_PP_4189_P4-R2,1
305789,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_0368_PP_4189_P4,3,1.668348e+05,0.008851,-2.052990,,3,PP_0368_PP_4189_P4-R3,1
305790,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_4189_PP_4192,1,1.792609e+05,0.009231,-2.034763,,3,PP_4189_PP_4192-R1,1
305791,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_4189_PP_4192,2,1.231874e+05,0.006716,-2.172877,,3,PP_4189_PP_4192-R2,1


In [15]:
df_full[df_full['Protein'] == 'Neo']

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,cycle,Line Name,plate_number
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,1.191124e+08,3.112624,0.493127,1,0,Control-R1,1
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,1.491721e+08,3.712705,0.569690,1,0,Control-R2,1
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,1.387009e+08,3.700489,0.568259,1,0,Control-R3,1
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,4,1.003063e+08,2.650788,0.423375,1,0,Control-R4,1
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,5,1.142768e+08,2.835433,0.452619,1,0,Control-R5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,PP_0368_PP_4189_P4,2,2.627000e+07,1.321528,0.121077,,3,PP_0368_PP_4189_P4-R2,1
362,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,PP_0368_PP_4189_P4,3,5.870756e+07,3.114706,0.493417,,3,PP_0368_PP_4189_P4-R3,1
363,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,PP_4189_PP_4192,1,8.037235e+07,4.138642,0.616858,,3,PP_4189_PP_4192-R1,1
364,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,PP_4189_PP_4192,2,4.805442e+07,2.619930,0.418290,,3,PP_4189_PP_4192-R2,1


In [16]:
df_full['Protein.Names'].unique()

array(['KKA2_KLEPN', 'TRYP_PIG', 'K2C1_HUMAN', ..., 'Q88PK3_PSEPK',
       'HSCB_PSEPK', 'RS20_PSEPK'], dtype=object)

In [17]:
[p for p in df_full['Protein.Names'].unique() if p.upper().find('ENTFL')>-1]

['HMGCS_ENTFL']

In [18]:
any(df_full['Protein'] == 'ENTFA')

False

In [19]:
proteins_to_keep = ['neo', 'aacC1', 'mvaS', 'mcm', 'aphA', 'MVD1', 'mvk', 'cas9', 'ef_1364']
# proteins_to_keep = ['neo', 'aacC1', 'mvaS', 'mcm', 'aphA', 'MVD1', 'mvk', 'cas9', 'mvaE']

proteins_to_keep_lower = [s.lower() for s in proteins_to_keep]

df_full['protein_lower'] = [p.lower() for p in df_full['Protein']]

In [20]:
[p for p in proteins_to_keep_lower if p in df_full['protein_lower'].unique()]

['neo', 'aacc1', 'mvas', 'apha', 'mvd1', 'mvk', 'cas9', 'ef_1364']

In [21]:
df_full.loc[df_full['Protein.Group'] == 'Q835L3',:]

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,cycle,Line Name,plate_number,protein_lower
3936,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,Control,1,5.645667e+07,1.475316,0.168885,1,0,Control-R1,1,ef_1364
3937,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,Control,2,5.065667e+07,1.260781,0.100640,1,0,Control-R2,1,ef_1364
3938,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,Control,3,5.043000e+07,1.345454,0.128869,1,0,Control-R3,1,ef_1364
3939,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,Control,4,4.569667e+07,1.207623,0.081931,1,0,Control-R4,1,ef_1364
3940,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,Control,5,4.793000e+07,1.189238,0.075269,1,0,Control-R5,1,ef_1364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8047,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,PP_0368_PP_4189_P4,2,1.630529e+07,0.820248,-0.086055,,3,PP_0368_PP_4189_P4-R2,1,ef_1364
8048,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,PP_0368_PP_4189_P4,3,1.904978e+07,1.010678,0.004613,,3,PP_0368_PP_4189_P4-R3,1,ef_1364
8049,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,PP_4189_PP_4192,1,1.827141e+07,0.940856,-0.026477,,3,PP_4189_PP_4192-R1,1,ef_1364
8050,Q835L3,Q835L3_ENTFA,Ef_1364,Acetyl-CoA acetyltransferase/hydroxymethylglut...,PP_4189_PP_4192,2,1.381909e+07,0.753418,-0.122964,,3,PP_4189_PP_4192-R2,1,ef_1364


In [22]:
df_full['Protein.Group'].value_counts()

Protein.Group
P00552    987
Q88MD8    987
Q88MF9    987
Q88MF5    987
Q88MF3    987
         ... 
Q88KP3     96
Q88KN6     96
Q88GW9     96
Q88PC3     96
Q88LY6     96
Name: count, Length: 2118, dtype: int64

In [23]:
df_full.loc[:, 'organism'] = df_full.loc[:, 'Protein.Group'].apply(lambda x: ext_to_organism[x] if x in ext_to_organism.keys() else None)
df_full.loc[:, 'locus'] = df_full.loc[:, 'Protein.Group'].apply(lambda x: ext_to_locus[x] if x in ext_to_locus.keys() else None)
df_full.loc[:, 'primary'] = df_full.loc[:, 'Protein.Group'].apply(lambda x: ext_to_primary[x] if x in ext_to_primary.keys() else None)
df_full.loc[:, 'is_putida'] = df_full.loc[:, 'organism'].apply(lambda x: x == 'Pseudomonas putida (strain ATCC 47054 / DSM 6125 / CFBP 8728 / NCIMB 11950 / KT2440)')

In [24]:
df_full.head(3)

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,cycle,Line Name,plate_number,protein_lower,organism,locus,primary,is_putida
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,119112400.0,3.112624,0.493127,1,0,Control-R1,1,neo,Klebsiella pneumoniae,,neo,False
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,149172100.0,3.712705,0.56969,1,0,Control-R2,1,neo,Klebsiella pneumoniae,,neo,False
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,138700900.0,3.700489,0.568259,1,0,Control-R3,1,neo,Klebsiella pneumoniae,,neo,False


In [25]:
df_full.loc[df_full['protein_lower'].isin(proteins_to_keep_lower), 'Protein'].value_counts()

Protein
Neo        987
Apha       987
Aacc1      987
Mvd1       987
Ef_1364    987
Mvk        987
Cas9       987
Mvas       987
Name: count, dtype: int64

In [26]:
df_full.loc[df_full['primary'].isin(proteins_to_keep_lower), 'Protein'].value_counts()

Protein
Neo     987
Mvk     987
Cas9    987
Name: count, dtype: int64

### Filter out proteins if they aren't putida, in the list of separate proteins to keep, or have 'pp' at the start of their name

In [27]:
df_full.loc[:,'filter_foreign_tokeep'] = (df_full['is_putida'] | df_full['protein_lower'].isin(proteins_to_keep_lower) | [p[:2] == 'pp' for p in df_full['protein_lower']])
print(df_full.loc[~df_full['filter_foreign_tokeep'], 'protein_lower'].unique())
print(f'pre-filter len(df): {len(df_full)}')
df_full = df_full[df_full['filter_foreign_tokeep']]
print(f'post-filter len(df): {len(df_full)}')


  df_full.loc[:,'filter_foreign_tokeep'] = (df_full['is_putida'] | df_full['protein_lower'].isin(proteins_to_keep_lower) | [p[:2] == 'pp' for p in df_full['protein_lower']])


['tryp_pig' 'krt1' 'krt10' 'krt9' 'krt2' 'lippks1' 'krt15' 'ibpb' 'pabb'
 'slyx']
pre-filter len(df): 1716921
post-filter len(df): 1711113


In [28]:
df_full['Protein'].unique()[:50]

array(['Neo', 'Arga', 'Dnaa', 'Dnan', 'Rsmg', 'Ihfa', 'Ihfb', 'Pp_1224',
       'Adk', 'Pal', 'Yidc', 'Hpf', 'Pp_0002', 'Parb', 'Rpll', 'Rpsu',
       'Rpon', 'Tolb', 'Mnme', 'Apha', 'Aacc1', 'Mvd1', 'Alla', 'Argb',
       'Argc2', 'Argd', 'Pp_0396', 'Rpsk', 'Seld', 'Hisd', 'Pp_1352',
       'Argg', 'Argj', 'Argh', 'Infa', 'Ftsz', 'Ef_1364', 'Pp_1157',
       'Atpf', 'Atph', 'Atpa', 'Atpg', 'Atpd', 'Atpc', 'Glmu', 'Glms',
       'Pp_5400', 'Pp_5395', 'Pp_5392', 'Pp_5391'], dtype=object)

## Get final name for proteins


In [29]:
has_locus = ~df_full['locus'].isna()
df_full.loc[has_locus, 'final_protein_name'] = df_full.loc[has_locus, 'locus']
df_full.loc[~has_locus, 'final_protein_name'] = df_full.loc[~has_locus, 'protein_lower']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full.loc[has_locus, 'final_protein_name'] = df_full.loc[has_locus, 'locus']


## Remove proteins that aren't in each run

In [30]:
cycles = [0,1,2,3]
unique_proteins = set(df_full.loc[df_full['cycle'] == cycles[0], 'final_protein_name'])
for c in cycles[1:]:
    unique_proteins = unique_proteins.intersection(
        set(
            df_full.loc[df_full['cycle'] == c, 'final_protein_name'].unique()
        )
    )
shared_unique_proteins = list(unique_proteins)
print(len(shared_unique_proteins))

1427


In [31]:
df_full = df_full.loc[df_full['final_protein_name'].isin(shared_unique_proteins), :]

## Get control mean for each run

In [32]:
df_full.head(3)

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,cycle,Line Name,plate_number,protein_lower,organism,locus,primary,is_putida,filter_foreign_tokeep,final_protein_name
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,119112400.0,3.112624,0.493127,1,0,Control-R1,1,neo,Klebsiella pneumoniae,,neo,False,True,neo
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,149172100.0,3.712705,0.56969,1,0,Control-R2,1,neo,Klebsiella pneumoniae,,neo,False,True,neo
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,138700900.0,3.700489,0.568259,1,0,Control-R3,1,neo,Klebsiella pneumoniae,,neo,False,True,neo


In [33]:
df_full['is_control'] = df_full['Sample'].apply(lambda x: 'ontrol' in x)
control_mean_df = df_full.loc[df_full['is_control'], ['final_protein_name', 'Top_3pep_counts_mean', 'cycle', 'plate_number']].groupby(['final_protein_name', 'cycle', 'plate_number']).mean()
control_mean_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Top_3pep_counts_mean
final_protein_name,cycle,plate_number,Unnamed: 3_level_1
EF_1364,0,1,5.565429e+07
EF_1364,0,2,8.121857e+07
EF_1364,0,3,1.327726e+07
EF_1364,1,1,2.352441e+07
EF_1364,2,1,5.580900e+06
...,...,...,...
pp_0985,0,2,5.270041e+06
pp_0985,0,3,7.070051e+06
pp_0985,1,1,1.039243e+06
pp_0985,2,1,1.352591e+06


In [34]:
divide_by_control = lambda x: x['Top_3pep_counts_mean']/control_mean_df.loc[x['final_protein_name'], x['cycle'], x['plate_number']].values[0]
df_full.loc[:, 'normalized_top3'] = df_full.apply(divide_by_control, axis = 1)

In [35]:
df_full[df_full['is_control']].head(5)

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,...,plate_number,protein_lower,organism,locus,primary,is_putida,filter_foreign_tokeep,final_protein_name,is_control,normalized_top3
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,119112400.0,3.112624,0.493127,1,...,1,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.977287
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,149172100.0,3.712705,0.56969,1,...,1,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,1.223919
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,138700900.0,3.700489,0.568259,1,...,1,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,1.138006
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,4,100306300.0,2.650788,0.423375,1,...,1,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.822988
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,5,114276800.0,2.835433,0.452619,1,...,1,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.937612


In [36]:
df_full['final_line_name'] = df_full.apply(lambda x: f"{x['Sample']}-R{x['Replicate']}_c{x['cycle']}" , axis = 1)

In [37]:
idx_to_drop = (
    (df_full['Sample'] == 'PP_0528_PP_0815_PP_1317') & 
    (df_full['cycle'] == 3)
)
df_full = df_full[~idx_to_drop]

In [38]:
df_full[df_full['cycle'] == 0].value_counts(['final_line_name'])
# df_full[df_full['cycle'] == 0].value_counts(['final_protein_name'])

final_line_name          
PP_3394-R2_c0                1416
PP_0582-R2_c0                1416
PP_1607_NT2_xylS_Pm-R3_c0    1416
PP_1157-R1_c0                1416
PP_1157-R2_c0                1416
                             ... 
PP_1457-R3_c0                1415
PP_1457-R2_c0                1415
PP_1457-R1_c0                1415
PP_1394-R3_c0                1415
PP_5420-R3_c0                1415
Name: count, Length: 420, dtype: int64

## Look at distribution of intensities for each run

In [39]:
# df_full['plate_and_cycle'] = [f"{df_full.loc[i, 'plate']}_{df_full.loc[i,'cycle']}" for i in df_full.index]
# df_full['plate_and_cycle'] = df_full.apply(lambda x: f"{x['plate']}_{x['cycle']}", axis = 1)

In [40]:
# sns.histplot(
#     data = df_full[(df_full['is_control']) & (~df_full['Top_3pep_counts_mean'].isna())],
#     x = 'Top_3pep_counts_mean',
#     hue = 'plate_and_cycle',
#     log_scale = True
# )

In [41]:
# any(df_full.loc[(df_full['is_control']) & (~df_full['Top_3pep_counts_mean'].isna()), 'Top_3pep_counts_mean']==0)


In [42]:
# fig, ax= plt.subplots()
# sns.violinplot(
#     data = df_full[(~df_full['is_control']) & (~df_full['Top_3pep_counts_mean'].isna()) & (df_full['Top_3pep_counts_mean']>0)],
#     x = 'Top_3pep_counts_mean',
#     hue = 'plate_and_cycle',
#     log_scale = True
# )

In [43]:
# fig, ax= plt.subplots()
# sns.violinplot(
#     data = df_full[(df_full['is_control']) & (~df_full['Top_3pep_counts_mean'].isna()) & (df_full['Top_3pep_counts_mean']>0)],
#     x = 'Top_3pep_counts_mean',
#     hue = 'plate_and_cycle',
#     log_scale = True
# )

## Make pivot table for single-rep top3 mean counts

### add final name column
Combine condition, DBTL cycle, and rep so that there are no duplicate entries

In [44]:
df_full.head(3)

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,...,protein_lower,organism,locus,primary,is_putida,filter_foreign_tokeep,final_protein_name,is_control,normalized_top3,final_line_name
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,119112400.0,3.112624,0.493127,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.977287,Control-R1_c0
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,149172100.0,3.712705,0.56969,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,1.223919,Control-R2_c0
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,138700900.0,3.700489,0.568259,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,1.138006,Control-R3_c0


Update replicate number for one set of runs specifically

In [45]:
df_full['final_line_name'].value_counts()

final_line_name
PP_0812_PP_2095-R2_c1    1427
PP_0437_PP_0814-R1_c2    1427
PP_0437_PP_0814-R3_c2    1427
PP_0437_PP_1769-R1_c2    1427
PP_0437_PP_1769-R2_c2    1427
                         ... 
PP_0277-R2_c0            1415
PP_0277-R3_c0            1415
PP_0654-R1_c0            1415
PP_0654-R2_c0            1415
PP_0814-R2_c0            1415
Name: count, Length: 981, dtype: int64

In [46]:
df_full

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,group,...,protein_lower,organism,locus,primary,is_putida,filter_foreign_tokeep,final_protein_name,is_control,normalized_top3,final_line_name
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,1,1.191124e+08,3.112624,0.493127,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.977287,Control-R1_c0
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,2,1.491721e+08,3.712705,0.569690,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,1.223919,Control-R2_c0
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,3,1.387009e+08,3.700489,0.568259,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,1.138006,Control-R3_c0
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,4,1.003063e+08,2.650788,0.423375,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.822988,Control-R4_c0
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,5,1.142768e+08,2.835433,0.452619,1,...,neo,Klebsiella pneumoniae,,neo,False,True,neo,True,0.937612,Control-R5_c0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305788,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_0368_PP_4189_P4,2,1.318040e+05,0.006630,-2.178455,,...,csda,Pseudomonas putida (strain ATCC 47054 / DSM 61...,PP_1529,csdA,True,True,PP_1529,False,0.914931,PP_0368_PP_4189_P4-R2_c3
305789,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_0368_PP_4189_P4,3,1.668348e+05,0.008851,-2.052990,,...,csda,Pseudomonas putida (strain ATCC 47054 / DSM 61...,PP_1529,csdA,True,True,PP_1529,False,1.158101,PP_0368_PP_4189_P4-R3_c3
305790,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_4189_PP_4192,1,1.792609e+05,0.009231,-2.034763,,...,csda,Pseudomonas putida (strain ATCC 47054 / DSM 61...,PP_1529,csdA,True,True,PP_1529,False,1.244358,PP_4189_PP_4192-R1_c3
305791,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PP_4189_PP_4192,2,1.231874e+05,0.006716,-2.172877,,...,csda,Pseudomonas putida (strain ATCC 47054 / DSM 61...,PP_1529,csdA,True,True,PP_1529,False,0.855118,PP_4189_PP_4192-R2_c3


### Add isoprenol to the top3 data

In [47]:
isoprenol_df = pd.read_csv('../normalized_proteomics_pivot.csv')
isoprenol_df = isoprenol_df.loc[:, ['line_name_rep_cycle', 'isoprenol']]
isoprenol_df.shape
isoprenol_df['line_name_rep_cycle'] = isoprenol_df['line_name_rep_cycle'].apply(lambda x: f'{x[:-1]}{int(x[-1])-1}')
isoprenol_df['line_name_rep_cycle']
isoprenol_df = isoprenol_df.set_index('line_name_rep_cycle')
isoprenol_df.head(3)

Unnamed: 0_level_0,isoprenol
line_name_rep_cycle,Unnamed: 1_level_1
Control-R10_c0,1.024392
Control-R11_c0,1.120367
Control-R12_c0,1.17892


In [48]:
def merge_isoprenol_data(df_prot, df_isoprenol):
    print(
        f'Found N = {len([line_name for line_name in df_isoprenol.index if line_name in df_prot.index])} matching indices'
    )
    df_out = pd.merge(left = df_prot, right = df_isoprenol, how='inner', left_index=True, right_index=True)
    return df_out

In [49]:
df_pivot_singlerep = df_full.pivot(index = 'final_line_name', columns='final_protein_name', values='Top_3pep_counts_mean')
df_pivot_singlerep_norm = df_full.pivot(index = 'final_line_name', columns='final_protein_name', values='normalized_top3')
df_pivot_singlerep_abundance = df_full.pivot(index = 'final_line_name', columns='final_protein_name', values='%_of protein_abundance_Top3-method')

df_pivot_singlerep = merge_isoprenol_data(df_pivot_singlerep, isoprenol_df)
df_pivot_singlerep_norm = merge_isoprenol_data(df_pivot_singlerep_norm, isoprenol_df)
df_pivot_singlerep_abundance = merge_isoprenol_data(df_pivot_singlerep_abundance, isoprenol_df)

df_pivot_singlerep = df_pivot_singlerep.dropna(axis = 'columns', how = 'any')
df_pivot_singlerep_norm = df_pivot_singlerep_norm.dropna(axis = 'columns', how = 'any')
df_pivot_singlerep_abundance = df_pivot_singlerep_abundance.dropna(axis = 'columns', how = 'any')

df_pivot_singlerep.to_csv('top3_singlerep.csv')
df_pivot_singlerep_norm.to_csv('top3_singlerep_norm.csv')
df_pivot_singlerep_abundance.to_csv('top3_singlerep_abundance.csv')

Found N = 939 matching indices
Found N = 939 matching indices
Found N = 939 matching indices


In [50]:
df_pivot = df_pivot_singlerep.copy()
df_pivot_norm = df_pivot_singlerep_norm.copy()
df_pivot_abundance = df_pivot_singlerep_abundance.copy()

In [51]:
def add_line_cycle(x):
    rep_idx = x.index('-R')
    name = x[:rep_idx]
    c_idx = x.index('c')
    cycle = x[c_idx+1:]
    rep = x[rep_idx+2:c_idx-1]
    name_cycle = f"{name}_c{cycle}"
    assert len(x) == len(name_cycle)+len(rep)+2, print(f'name: {x} name_cycle: {name_cycle}')
    return name_cycle
_=[add_line_cycle(x) for x in df_pivot.index]



In [52]:
df_pivot.head(3)

Unnamed: 0,EF_1364,MM_1762,PP_0001,PP_0002,PP_0003,PP_0004,PP_0005,PP_0006,PP_0010,PP_0011,...,PP_5416,PP_5417,SPy_1046,YNR043W,aacc1,b4055 JW4015,mvas,neo,pp_0985,isoprenol
Control-R10_c0,84580000.0,40336670.0,786768.666667,575493.0,906661.666667,585435.0,246149.4,1861924.0,1097381.0,3876297.0,...,2517358.0,4890000.0,2039252.0,100896700.0,80485510.0,14646670.0,32750000.0,132008500.0,6587633.0,1.024392
Control-R11_c0,82380000.0,41836670.0,795890.566667,502817.8,839299.0,564741.233333,237367.933333,1771418.0,1011488.0,4106196.0,...,2711258.0,5450000.0,1789548.0,111816700.0,83480410.0,16420410.0,34136670.0,128302000.0,5247480.0,1.120367
Control-R12_c0,83663330.0,44023330.0,914329.333333,297034.333333,848661.666667,619902.666667,486974.666667,1911236.0,895361.3,4255051.0,...,2644439.0,4926667.0,1756136.0,106200000.0,84000000.0,12060000.0,32790000.0,144794900.0,4376667.0,1.17892


In [53]:
df_list = [df_pivot, df_pivot_norm, df_pivot_abundance]
df_names = ['top3', 'top3_norm', 'top3_abundance']
for i, df in enumerate(df_list):
    df_list[i] = df_list[i].reset_index(names = 'line_rep_cycle')
    df_list[i]['line_cycle'] = df_list[i]['line_rep_cycle'].apply(lambda x: add_line_cycle(x))
    df_list[i] = df_list[i].drop(columns = 'line_rep_cycle')
    columns_to_mean = df_list[i].columns[:-1]
    df_list[i] = df_list[i].groupby(by = 'line_cycle').mean()
    df_list[i].to_csv(f'{df_names[i]}.csv')