# Compile top3 data

In [1]:
%pip install openpyxl==3.1

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np


In [3]:
# Load the protein-ID translator table; the columns used below are
# 'extracted', 'locus' and 'primary_name'.
translator_df = pd.read_csv('../proteomics_id_translator.csv')

# Preferred output name: the locus when it is present, otherwise the primary name.
translator_df['output_name'] = translator_df['locus'].fillna(translator_df['primary_name'])

# Lookup from the 'extracted' column to the chosen output name. A value is NaN
# when both 'locus' and 'primary_name' are missing for that entry.
translator_dict = dict(zip(translator_df['extracted'], translator_df['output_name']))

In [4]:
# Load the top3 table of every DBTL cycle (DBTL0.csv ... DBTL6.csv) and annotate
# each row with a sample/replicate/cycle label and a translated locus name.
dfs = []
known_protein_groups = list(translator_dict)
for i in range(7):
    print(i)
    df = pd.read_csv(f'DBTL{i}.csv')

    # Label of the form "<Sample>-<Replicate>_c<cycle>", built column-wise
    # rather than with a per-row apply.
    df['line_name_rep_cycle'] = (
        df['Sample'].astype(str) + '-' + df['Replicate'].astype(str) + f'_c{i}'
    )

    # Flag the protein groups that the translator knows about and report coverage.
    df['in_translation'] = df['Protein.Group'].isin(known_protein_groups)
    n_proteins_in_translation_dict = len(df.loc[df['in_translation'], 'Protein.Group'].unique())
    n_proteins_total = len(df['Protein.Group'].unique())
    print(f'N proteins in translation dict: {n_proteins_in_translation_dict}\t N total proteins: {n_proteins_total}\t Pct: {100*n_proteins_in_translation_dict/n_proteins_total:.2f}')

    # Translated name where the protein group is known, the raw protein group
    # otherwise. `where` (not `fillna`) is used so that a NaN translation stays
    # NaN instead of silently falling back to the protein group.
    translated = df['Protein.Group'].map(translator_dict)
    df['locus'] = translated.where(df['in_translation'], df['Protein.Group'])
    dfs.append(df)

0
N proteins in translation dict: 2051	 N total proteins: 2051	 Pct: 100.00
1
N proteins in translation dict: 1643	 N total proteins: 1643	 Pct: 100.00
2
N proteins in translation dict: 1623	 N total proteins: 1623	 Pct: 100.00
3
N proteins in translation dict: 1666	 N total proteins: 1671	 Pct: 99.70
4
N proteins in translation dict: 1286	 N total proteins: 1316	 Pct: 97.72
5
N proteins in translation dict: 1668	 N total proteins: 1710	 Pct: 97.54
6
N proteins in translation dict: 1346	 N total proteins: 1378	 Pct: 97.68


In [5]:
# Stack the per-cycle frames into one long table. The index is not reset here,
# so row labels restart for each input file.
df_top3 = pd.concat(dfs)
df_top3


Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,Cycle,line_name_rep_cycle,in_translation,locus
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R1,1.191124e+08,3.112624,0.493127,0.0,Control-R1_c0,True,neo
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R2,1.491721e+08,3.712705,0.569690,0.0,Control-R2_c0,True,neo
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R3,1.387009e+08,3.700489,0.568259,0.0,Control-R3_c0,True,neo
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R4,1.003063e+08,2.650788,0.423375,0.0,Control-R4_c0,True,neo
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R5,1.142768e+08,2.835433,0.452619,0.0,Control-R5_c0,True,neo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
264571,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R2,5.581366e+05,0.011091,-1.955039,,PRT1093_PP_0815_PP_4191-R2_c6,True,PP_1529
264572,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R3,7.769397e+05,0.015602,-1.806817,,PRT1093_PP_0815_PP_4191-R3_c6,True,PP_1529
264573,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R1,0.000000e+00,0.000010,-5.000000,,PRT1093_PP_1506_PP_4120-R1_c6,True,PP_1529
264574,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R2,0.000000e+00,0.000010,-5.000000,,PRT1093_PP_1506_PP_4120-R2_c6,True,PP_1529


Replace one incorrect entry - turn 'Pp_0985' to 'PP_0985'

In [6]:
# Rows whose 'Protein' is the mis-cased 'Pp_0985' get both 'Protein' and
# 'locus' overwritten with 'PP_0985'.
df_top3.loc[df_top3['Protein'] == 'Pp_0985', ['Protein', 'locus']] = 'PP_0985'

In [7]:
# Which proteins still have no locus assigned?
df_top3.loc[df_top3['locus'].isna(), :].Protein.unique()

array(['Tryp_pig'], dtype=object)

Remove trypsin from data

In [8]:
# Keep every row except the 'Tryp_pig' (trypsin) entries, then renumber the index from 0.
df_top3 = df_top3.loc[df_top3['Protein'] != 'Tryp_pig', :].reset_index(drop = True)

In [9]:
# Number of distinct locus values after removing trypsin.
len(df_top3['locus'].unique())

2188

In [10]:
# NOTE(review): identical to the preceding cell and yields the same count; this cell can be deleted.
len(df_top3['locus'].unique())

2188

In [11]:
# Inspect the frame after the 'PP_0985' fix and trypsin removal (index now runs from 0).
df_top3

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,Cycle,line_name_rep_cycle,in_translation,locus
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R1,1.191124e+08,3.112624,0.493127,0.0,Control-R1_c0,True,neo
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R2,1.491721e+08,3.712705,0.569690,0.0,Control-R2_c0,True,neo
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R3,1.387009e+08,3.700489,0.568259,0.0,Control-R3_c0,True,neo
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R4,1.003063e+08,2.650788,0.423375,0.0,Control-R4_c0,True,neo
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R5,1.142768e+08,2.835433,0.452619,0.0,Control-R5_c0,True,neo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542777,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R2,5.581366e+05,0.011091,-1.955039,,PRT1093_PP_0815_PP_4191-R2_c6,True,PP_1529
2542778,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R3,7.769397e+05,0.015602,-1.806817,,PRT1093_PP_0815_PP_4191-R3_c6,True,PP_1529
2542779,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R1,0.000000e+00,0.000010,-5.000000,,PRT1093_PP_1506_PP_4120-R1_c6,True,PP_1529
2542780,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R2,0.000000e+00,0.000010,-5.000000,,PRT1093_PP_1506_PP_4120-R2_c6,True,PP_1529


In [12]:
# Number of distinct sample/replicate/cycle labels.
len(df_top3['line_name_rep_cycle'].unique())

1551

In [13]:
# Spot-check: all protein rows recorded for a single sample/replicate/cycle label.
df_top3[df_top3['line_name_rep_cycle'] == 'PP_0528_PP_0815_PP_1317-R1_c3']

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,Cycle,line_name_rep_cycle,in_translation,locus
1410501,A0A379PS61,A0A379PS61_PSEPU,Ibpb,Heat shock protein Hsp20,PP_0528_PP_0815_PP_1317,R1,1.383240e+06,0.069396,-1.158663,3.0,PP_0528_PP_0815_PP_1317-R1_c3,False,A0A379PS61
1410684,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,PP_0528_PP_0815_PP_1317,R1,7.049324e+07,3.536609,0.548587,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,neo
1410867,P04264,K2C1_HUMAN,Krt1,"Keratin, type II cytoskeletal 1",PP_0528_PP_0815_PP_1317,R1,1.619953e+04,0.000813,-3.090058,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,KRT1
1411050,P0A0Z9,ARGA_PSEPK,Arga,Amino-acid acetyltransferase,PP_0528_PP_0815_PP_1317,R1,2.691920e+05,0.013505,-1.869498,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,PP_5185
1411233,P0A101,PCAJ_PSEPK,Pcaj,3-oxoadipate CoA-transferase subunit B,PP_0528_PP_0815_PP_1317,R1,6.145607e+04,0.003083,-2.510996,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,PP_3952
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1715196,Q88RW8,MNMG_PSEPK,Mnmg,tRNA uridine 5-carboxymethylaminomethyl modifi...,PP_0528_PP_0815_PP_1317,R1,2.457486e+05,0.012329,-1.909069,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,PP_0004
1715379,Q8PW39,Q8PW39_METMA,Mvk,Mevalonate kinase,PP_0528_PP_0815_PP_1317,R1,2.213333e+07,1.110418,0.045486,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,MM_1762
1715562,Q99ZW2,CAS9_STRP1,Cas9,CRISPR-associated endonuclease Cas9/Csn1,PP_0528_PP_0815_PP_1317,R1,5.943787e+05,0.029820,-1.525497,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,SPy_1046
1715745,Q9FD71,HMGCS_ENTFL,Mvas,Hydroxymethylglutaryl-CoA synthase,PP_0528_PP_0815_PP_1317,R1,2.215000e+07,1.111254,0.045813,3.0,PP_0528_PP_0815_PP_1317-R1_c3,True,mvaS


In [14]:
# Inspect the compiled frame once more before the line names are edited.
df_top3

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,Cycle,line_name_rep_cycle,in_translation,locus
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R1,1.191124e+08,3.112624,0.493127,0.0,Control-R1_c0,True,neo
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R2,1.491721e+08,3.712705,0.569690,0.0,Control-R2_c0,True,neo
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R3,1.387009e+08,3.700489,0.568259,0.0,Control-R3_c0,True,neo
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R4,1.003063e+08,2.650788,0.423375,0.0,Control-R4_c0,True,neo
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R5,1.142768e+08,2.835433,0.452619,0.0,Control-R5_c0,True,neo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542777,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R2,5.581366e+05,0.011091,-1.955039,,PRT1093_PP_0815_PP_4191-R2_c6,True,PP_1529
2542778,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R3,7.769397e+05,0.015602,-1.806817,,PRT1093_PP_0815_PP_4191-R3_c6,True,PP_1529
2542779,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R1,0.000000e+00,0.000010,-5.000000,,PRT1093_PP_1506_PP_4120-R1_c6,True,PP_1529
2542780,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R2,0.000000e+00,0.000010,-5.000000,,PRT1093_PP_1506_PP_4120-R2_c6,True,PP_1529


## Edit top3 data
Some line names need to be edited or dropped

### Remove string from some line names
Some line names have PRT1093 prepended:

In [15]:
def remove_prt(x):
    """Return `x` without a leading 'PRT1093_'; any other name passes through unchanged."""
    prefix_len = 8  # len('PRT1093_')
    return x[prefix_len:] if x[:prefix_len] == 'PRT1093_' else x

def remove_timestamp(x, verbose = False):
    """Cut the fixed-position timestamp segment out of a line name.

    Names containing the marker '20240605' have characters 26 through 40
    removed (``x[:26] + x[41:]``); every other name is returned unchanged.

    Parameters
    ----------
    x : str
        Line name to clean.
    verbose : bool, optional
        When True, print the name before and after editing.

    Returns
    -------
    str
        The edited name, or ``x`` itself when the marker is absent.
    """
    if '20240605' not in x:
        return x
    # NOTE(review): the slice positions are hard-coded, so this presumably
    # assumes the timestamp always occupies x[26:41] -- confirm against the data.
    new_string = x[:26] + x[41:]
    # Report only after the new name has been built; printing it earlier
    # raised UnboundLocalError whenever verbose was True.
    if verbose:
        print(f'editing: {x}')
        print(f'new string: {new_string}')
    return new_string



# Strip the 'PRT1093_' prefix from every sample/replicate/cycle label.
df_top3['line_name_rep_cycle'] = df_top3['line_name_rep_cycle'].apply(remove_prt)
# Timestamp removal is disabled: remove_timestamp is defined above but the call stays commented out.
# df_top3['line_name_rep_cycle'] = df_top3['line_name_rep_cycle'].apply(remove_timestamp, verbose = False)

    

### Remove 'salt' condition

In [16]:
# Report the number of distinct samples before and after removing every
# sample whose name contains 'Salt'.
print(len(df_top3['Sample'].unique()))
is_salt_sample = df_top3['Sample'].str.contains('Salt')
df_top3 = df_top3.loc[~is_salt_sample, :]
print(len(df_top3['Sample'].unique()))


428
427


## Look at non-PP genes

In [17]:
# Inspect the frame after prefix stripping and removal of the 'Salt' samples.
df_top3

Unnamed: 0,Protein.Group,Protein.Names,Protein,Protein.Description,Sample,Replicate,Top_3pep_counts_mean,%_of protein_abundance_Top3-method,log10_%_abundance,Cycle,line_name_rep_cycle,in_translation,locus
0,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R1,1.191124e+08,3.112624,0.493127,0.0,Control-R1_c0,True,neo
1,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R2,1.491721e+08,3.712705,0.569690,0.0,Control-R2_c0,True,neo
2,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R3,1.387009e+08,3.700489,0.568259,0.0,Control-R3_c0,True,neo
3,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R4,1.003063e+08,2.650788,0.423375,0.0,Control-R4_c0,True,neo
4,P00552,KKA2_KLEPN,Neo,Aminoglycoside 3'-phosphotransferase,Control,R5,1.142768e+08,2.835433,0.452619,0.0,Control-R5_c0,True,neo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542777,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R2,5.581366e+05,0.011091,-1.955039,,PP_0815_PP_4191-R2_c6,True,PP_1529
2542778,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_0815_PP_4191,R3,7.769397e+05,0.015602,-1.806817,,PP_0815_PP_4191-R3_c6,True,PP_1529
2542779,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R1,0.000000e+00,0.000010,-5.000000,,PP_1506_PP_4120-R1_c6,True,PP_1529
2542780,Q9Z408,CSD_PSEPK,Csda,Probable cysteine desulfurase,PRT1093_PP_1506_PP_4120,R2,0.000000e+00,0.000010,-5.000000,,PP_1506_PP_4120-R2_c6,True,PP_1529


In [18]:
# Collect every distinct locus whose first three characters are not 'PP_'.
non_pp_genes = []
for locus_name in df_top3['locus'].unique():
    if locus_name[:3] != 'PP_':
        non_pp_genes.append(locus_name)
non_pp_genes

['neo',
 'KRT1',
 'b4055 JW4015',
 'KRT10',
 'aacC1',
 'YNR043W',
 'KRT9',
 'KRT2',
 'EF_1364',
 'MM_1762',
 'SPy_1046',
 'mvaS',
 'lipPks1',
 'KRT15',
 'A0A379PS61',
 'Q88FU7;Q88I85',
 'Q88KG1',
 'Q88M58',
 'Q88NJ9',
 'A0A140FVX0',
 'A0A140FVX4',
 'A0A140FVX8',
 'A0A140FVZ0',
 'A0A140FVZ1',
 'A0A140FVZ6',
 'A0A140FW16',
 'A0A140FW18',
 'A0A140FW22',
 'A0A140FW35',
 'A0A140FW42',
 'A0A140FW43',
 'A0A140FW57',
 'A0A140FWA3',
 'A0A140FWA4',
 'A0A140FWC7',
 'A0A140FWJ5',
 'A0A140FWL3',
 'A0A140FWM3',
 'A0A140FWP7',
 'A0A140FWQ2',
 'A0A140FWQ5',
 'A0A140FWQ8',
 'A0A140FWQ9',
 'A0A140FWS3',
 'A0A140FWS4',
 'A0A140FWS5',
 'A0A140FWU0',
 'Q9FD70',
 'A0A140FW37',
 'A0A140FW51',
 'A0A140FW85',
 'A0A140FW92',
 'A0A140FWJ4',
 'A0A140FWK1',
 'A0A140FWK7',
 'A0A140FWL0',
 'A0A140FWR3',
 'P02768',
 'Q835L4',
 'Q88F54',
 'Q88F77',
 'Q88GY3',
 'Q88KK3']

## Drop unidentified proteins and all other loci that do not start with `PP_`

In [19]:
# Remove every row whose locus is listed in `non_pp_genes`, reporting the
# frame shape before and after the filter.
print(f'Before dropping unidentified proteins: df.shape = {df_top3.shape}')
df_top3 = df_top3.loc[~df_top3['locus'].isin(non_pp_genes), :]
print(f'After dropping unidentified proteins: df.shape = {df_top3.shape}')


Before dropping unidentified proteins: df.shape = (2538651, 13)
Before dropping unidentified proteins: df.shape = (2500332, 13)


## Add isoprenol values

In [20]:
# Only the sample label and its isoprenol value are kept from the normalized pivot table.
isoprenol_df = pd.read_csv('../normalized_proteomics_pivot_c0-c6.csv')[['line_name_rep_cycle', 'isoprenol']]
isoprenol_df

Unnamed: 0,line_name_rep_cycle,isoprenol
0,Control-R10_c0,1.024392
1,Control-R11_c0,1.120367
2,Control-R12_c0,1.178920
3,Control-R13_c0,0.720284
4,Control-R14_c0,1.030501
...,...,...
1507,PP_5419-R2_c0,2.153361
1508,PP_5419-R3_c0,2.397354
1509,PP_5420-R1_c0,0.484521
1510,PP_5420-R2_c0,0.527125


In [21]:
# Distinct sample/replicate/cycle labels present in each data set.
isoprenol_lines = isoprenol_df['line_name_rep_cycle'].unique()
top3_lines = df_top3['line_name_rep_cycle'].unique()

In [22]:
# List the top3 labels that have no isoprenol measurement, grouped by cycle.
# The cycle is identified by the last character of the label.
print('linename/rep/cycle in top3 lines not in isoprenol lines')
isoprenol_line_set = set(isoprenol_lines)  # set membership instead of scanning the array per label
for i in range(7):
    print(f'~~~~~CYCLE: {i}~~~~~')
    for line in top3_lines:
        if line not in isoprenol_line_set and line.endswith(str(i)):
            print(line)

linename/rep/cycle in top3 lines not in isoprenol lines
~~~~~CYCLE: 0~~~~~
PP_1607_NT1-R1_c0
PP_1607_NT1-R2_c0
PP_1607_NT1-R3_c0
PP_1607_NT1_xylS_Pm-R1_c0
PP_1607_NT1_xylS_Pm-R2_c0
PP_1607_NT1_xylS_Pm-R3_c0
PP_1607_NT2-R1_c0
PP_1607_NT2-R2_c0
PP_1607_NT2-R3_c0
PP_1607_NT2_xylS_Pm-R1_c0
PP_1607_NT2_xylS_Pm-R2_c0
PP_1607_NT2_xylS_Pm-R3_c0
PP_1607_NT3-R1_c0
PP_1607_NT3-R2_c0
PP_1607_NT3-R3_c0
PP_1607_NT3_xylS_Pm-R1_c0
PP_1607_NT3_xylS_Pm-R2_c0
PP_1607_NT3_xylS_Pm-R3_c0
PP_1607_NT4-R1_c0
PP_1607_NT4-R2_c0
PP_1607_NT4-R3_c0
PP_1607_NT4_xylS_Pm-R1_c0
PP_1607_NT4_xylS_Pm-R2_c0
PP_1607_NT4_xylS_Pm-R3_c0
PP_4194_NT2-R1_c0
PP_4194_NT2-R2_c0
PP_4194_NT2-R3_c0
PP_4194_NT3-R1_c0
PP_4194_NT3-R2_c0
PP_4194_NT3-R3_c0
PP_1607_NT1-R4_c0
PP_1607_NT1-R5_c0
PP_1607_NT1-R6_c0
~~~~~CYCLE: 1~~~~~
PP_0814_PP_4192_Bat2-R1_c1
PP_0814_PP_4192_Bat2-R2_c1
PP_0814_PP_4192_Bat2-R3_c1
PP_0814_PP_4862_Bat2-R1_c1
PP_0814_PP_4862_Bat2-R2_c1
PP_0814_PP_4862_Bat2-R3_c1
PP_2137_PP_4189_Bat2-R1_c1
PP_2137_PP_4189_Bat2-R2_c1


In [23]:
# Top3 labels for which the isoprenol table has no entry, in top3 order.
missing_isoprenol_lines = []
for top3_line in top3_lines:
    if not (top3_line in isoprenol_lines):
        missing_isoprenol_lines.append(top3_line)
missing_isoprenol_lines

['PP_1607_NT1-R1_c0',
 'PP_1607_NT1-R2_c0',
 'PP_1607_NT1-R3_c0',
 'PP_1607_NT1_xylS_Pm-R1_c0',
 'PP_1607_NT1_xylS_Pm-R2_c0',
 'PP_1607_NT1_xylS_Pm-R3_c0',
 'PP_1607_NT2-R1_c0',
 'PP_1607_NT2-R2_c0',
 'PP_1607_NT2-R3_c0',
 'PP_1607_NT2_xylS_Pm-R1_c0',
 'PP_1607_NT2_xylS_Pm-R2_c0',
 'PP_1607_NT2_xylS_Pm-R3_c0',
 'PP_1607_NT3-R1_c0',
 'PP_1607_NT3-R2_c0',
 'PP_1607_NT3-R3_c0',
 'PP_1607_NT3_xylS_Pm-R1_c0',
 'PP_1607_NT3_xylS_Pm-R2_c0',
 'PP_1607_NT3_xylS_Pm-R3_c0',
 'PP_1607_NT4-R1_c0',
 'PP_1607_NT4-R2_c0',
 'PP_1607_NT4-R3_c0',
 'PP_1607_NT4_xylS_Pm-R1_c0',
 'PP_1607_NT4_xylS_Pm-R2_c0',
 'PP_1607_NT4_xylS_Pm-R3_c0',
 'PP_4194_NT2-R1_c0',
 'PP_4194_NT2-R2_c0',
 'PP_4194_NT2-R3_c0',
 'PP_4194_NT3-R1_c0',
 'PP_4194_NT3-R2_c0',
 'PP_4194_NT3-R3_c0',
 'PP_1607_NT1-R4_c0',
 'PP_1607_NT1-R5_c0',
 'PP_1607_NT1-R6_c0',
 'PP_0814_PP_4192_Bat2-R1_c1',
 'PP_0814_PP_4192_Bat2-R2_c1',
 'PP_0814_PP_4192_Bat2-R3_c1',
 'PP_0814_PP_4862_Bat2-R1_c1',
 'PP_0814_PP_4862_Bat2-R2_c1',
 'PP_0814_PP_4862_Bat2-

In [24]:
# Keep only rows whose label has an isoprenol measurement; print the shape
# before and after.
print(df_top3.shape)
has_isoprenol = ~df_top3['line_name_rep_cycle'].isin(missing_isoprenol_lines)
df_top3 = df_top3.loc[has_isoprenol, :]
print(df_top3.shape)

(2500332, 13)
(2416047, 13)


In [25]:
# List the isoprenol labels that have no top3 proteomics rows, grouped by cycle.
# The cycle is identified by the last character of the label.
print('linename/rep/cycle in isoprenol data not in top3 data')
top3_line_set = set(top3_lines)  # set membership instead of scanning the array per label
for i in range(7):
    print(f'~~~~~CYCLE: {i}~~~~~')
    for line in isoprenol_lines:
        if line not in top3_line_set and line.endswith(str(i)):
            print(line)

linename/rep/cycle in isoprenol data not in top3 data
~~~~~CYCLE: 0~~~~~
~~~~~CYCLE: 1~~~~~
PP_0814_PP_4192-R4_c1
PP_0814_PP_4192-R5_c1
PP_0814_PP_4192-R6_c1
PP_0814_PP_4862-R4_c1
PP_0814_PP_4862-R5_c1
PP_0814_PP_4862-R6_c1
PP_2137_PP_4189-R4_c1
PP_2137_PP_4189-R5_c1
PP_2137_PP_4189-R6_c1
~~~~~CYCLE: 2~~~~~
~~~~~CYCLE: 3~~~~~
~~~~~CYCLE: 4~~~~~
PP_0751_PP_0814_PP_4120_P4-R1_c4
PP_0751_PP_0814_PP_4120_P4-R2_c4
PP_0751_PP_0814_PP_4120_P4-R3_c4
~~~~~CYCLE: 5~~~~~
~~~~~CYCLE: 6~~~~~


In [26]:
# Index the isoprenol table by sample label. The guard keeps the cell
# re-runnable: once the column has become the index, calling set_index on it
# again would raise KeyError.
if 'line_name_rep_cycle' in isoprenol_df.columns:
    isoprenol_df = isoprenol_df.set_index('line_name_rep_cycle')

In [27]:
# Reshape to one row per sample label and one column per locus (values are
# log10 % abundance), then attach the isoprenol column by label.
df_pivot = (
    df_top3
    .pivot(index = 'line_name_rep_cycle', columns = 'locus', values = 'log10_%_abundance')
    .join(isoprenol_df, on = 'line_name_rep_cycle')
)

In [28]:
# Inspect the wide table: one row per sample label, one column per locus plus 'isoprenol'.
df_pivot

Unnamed: 0_level_0,PP_0001,PP_0002,PP_0003,PP_0004,PP_0005,PP_0006,PP_0010,PP_0011,PP_0013,PP_0017,...,PP_5411,PP_5412,PP_5413,PP_5414,PP_5415,PP_5416,PP_5417,PP_5418,PP_5419,isoprenol
line_name_rep_cycle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Control-R10_c0,-1.702623,-1.838430,-1.641024,-1.830991,-2.207271,-1.328508,-1.558112,-1.010053,-1.439384,-1.244749,...,-1.496749,-1.161248,-0.250491,-1.047622,-0.505048,-1.197525,-0.909161,,,1.024392
Control-R11_c0,-1.692394,-1.891837,-1.669331,-1.841398,-2.217825,-1.344926,-1.588287,-0.979808,-1.393492,-1.301998,...,-1.539391,-1.251483,-0.204258,-1.025308,-0.475521,-1.160076,-0.856851,,,1.120367
Control-R12_c0,-1.640501,-2.128797,-1.672869,-1.809280,-1.914097,-1.320289,-1.649605,-0.972699,-1.452411,-1.238412,...,-1.561635,-1.127387,-0.235241,-1.001357,-0.476665,-1.179270,-0.909050,,,1.178920
Control-R13_c0,-1.680365,-1.634646,-1.719868,-1.864548,-2.128575,-1.312061,-1.594387,-0.996562,-1.414306,-1.284785,...,-1.537490,-1.202095,-0.213866,-1.020359,-0.473025,-1.109567,-0.858687,,,0.720284
Control-R14_c0,-1.650709,-1.790164,-1.702430,-1.846582,-2.177776,-1.318577,-1.609326,-1.001895,-1.420389,-1.209198,...,-1.468245,-1.211092,-0.214927,-0.974098,-0.470553,-1.109419,-0.890327,,,1.030501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PP_5419-R2_c0,-1.673511,-3.245886,-1.781190,-2.065708,-1.918742,-1.419010,-1.721185,-0.921180,-1.377231,-1.426777,...,-1.526896,-1.493790,-0.379905,-1.109808,-0.478091,-1.217316,-0.823736,,,2.153361
PP_5419-R3_c0,-1.686083,-2.285501,-1.775359,-1.906116,-1.676041,-1.425076,-1.714134,-0.976799,-1.378902,-1.343794,...,-1.539916,-1.172799,-0.383756,-1.122637,-0.535937,-1.253465,-0.901043,,,2.397354
PP_5420-R1_c0,-1.625546,-2.073687,-1.605030,-2.210190,-2.093708,-1.728035,-1.719898,-1.287797,-1.435522,-1.436350,...,-1.672172,-1.544782,-0.345770,-0.913597,-0.368391,-1.090943,-0.778143,-2.327398,,0.484521
PP_5420-R2_c0,-1.654087,-2.165709,-1.674787,-2.279164,-1.828562,-1.739094,-1.864161,-1.240000,-1.514166,-1.478951,...,-1.661417,-1.597359,-0.290291,-0.930629,-0.343305,-1.017702,-0.726611,-1.596836,,0.527125


Drop proteins (columns) that have a NaN value in any sample

In [29]:
# Drop every column that contains at least one NaN; print the shape before and after.
# The filter applies to all columns, so 'isoprenol' would be dropped too if it had a NaN.
print(df_pivot.shape)
df_pivot = df_pivot.dropna(axis = 'columns', how = 'any')
print(df_pivot.shape)

(1500, 2126)
(1500, 1126)


In [30]:
# Inspect the wide table after the columns containing NaN were dropped.
df_pivot

Unnamed: 0_level_0,PP_0001,PP_0004,PP_0006,PP_0010,PP_0011,PP_0013,PP_0017,PP_0018,PP_0052,PP_0060,...,PP_5395,PP_5409,PP_5411,PP_5412,PP_5413,PP_5414,PP_5415,PP_5416,PP_5417,isoprenol
line_name_rep_cycle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Control-R10_c0,-1.702623,-1.830991,-1.328508,-1.558112,-1.010053,-1.439384,-1.244749,-1.267489,-1.768652,-1.103546,...,-1.466675,-1.732097,-1.496749,-1.161248,-0.250491,-1.047622,-0.505048,-1.197525,-0.909161,1.024392
Control-R11_c0,-1.692394,-1.841398,-1.344926,-1.588287,-0.979808,-1.393492,-1.301998,-1.575737,-2.044064,-1.114329,...,-1.494969,-1.752919,-1.539391,-1.251483,-0.204258,-1.025308,-0.475521,-1.160076,-0.856851,1.120367
Control-R12_c0,-1.640501,-1.809280,-1.320289,-1.649605,-0.972699,-1.452411,-1.238412,-1.547554,-1.947949,-1.110680,...,-1.517119,-1.716223,-1.561635,-1.127387,-0.235241,-1.001357,-0.476665,-1.179270,-0.909050,1.178920
Control-R13_c0,-1.680365,-1.864548,-1.312061,-1.594387,-0.996562,-1.414306,-1.284785,-1.254397,-1.989865,-1.114483,...,-1.562362,-1.796405,-1.537490,-1.202095,-0.213866,-1.020359,-0.473025,-1.109567,-0.858687,0.720284
Control-R14_c0,-1.650709,-1.846582,-1.318577,-1.609326,-1.001895,-1.420389,-1.209198,-1.229003,-2.125779,-1.086317,...,-1.554715,-1.771002,-1.468245,-1.211092,-0.214927,-0.974098,-0.470553,-1.109419,-0.890327,1.030501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PP_5419-R2_c0,-1.673511,-2.065708,-1.419010,-1.721185,-0.921180,-1.377231,-1.426777,-1.334504,-1.876600,-1.056022,...,-1.357945,-1.604740,-1.526896,-1.493790,-0.379905,-1.109808,-0.478091,-1.217316,-0.823736,2.153361
PP_5419-R3_c0,-1.686083,-1.906116,-1.425076,-1.714134,-0.976799,-1.378902,-1.343794,-1.366867,-1.589633,-1.030289,...,-1.383039,-1.625878,-1.539916,-1.172799,-0.383756,-1.122637,-0.535937,-1.253465,-0.901043,2.397354
PP_5420-R1_c0,-1.625546,-2.210190,-1.728035,-1.719898,-1.287797,-1.435522,-1.436350,-1.461992,-1.619231,-1.141654,...,-1.418756,-1.600864,-1.672172,-1.544782,-0.345770,-0.913597,-0.368391,-1.090943,-0.778143,0.484521
PP_5420-R2_c0,-1.654087,-2.279164,-1.739094,-1.864161,-1.240000,-1.514166,-1.478951,-1.757478,-1.594653,-1.198289,...,-1.407750,-1.572827,-1.661417,-1.597359,-0.290291,-0.930629,-0.343305,-1.017702,-0.726611,0.527125


In [31]:
# Write the wide table to CSV; the line_name_rep_cycle index is written as the first column.
df_pivot.to_csv('top3_processed_nounidentifiedproteins.csv')