# Get protein list from EDD
The EDD studies for DBTL0 and DBTL1 list proteins by UniProt identifier. This notebook extracts those identifiers so they can be converted to PP_XXXX / 4-letter codes with the UniProt ID-mapping tool: https://www.uniprot.org/id-mapping

In [1]:
import edd_utils as eddu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import random
# Seed Python's built-in `random` module so any use of it is repeatable.
# NOTE(review): no call into `random` is visible in this part of the notebook.
random.seed(1)

## Import data from EDD

Define the EDD study slugs, server, and username

In [2]:
# Slugs of the seven EDD studies; each one is passed to `eddu.export_study`
# further down to fetch that study's data.
study_slug_1 = 'crispri-automation-for-enhanced-isoprenol-pro-096d'
study_slug_2 = 'crispri-automation-for-enhanced-isoprenol-pro-fca3'
study_slug_3 = 'crispri-automation-for-enhanced-isoprenol-pro-05e7'
study_slug_4 = 'crispri-automation-for-enhanced-isoprenol-pro-a97b'
study_slug_5 = 'crispri-automation-for-enhanced-isoprenol-pro-9d3d'
study_slug_6 = 'crispri-automation-for-enhanced-isoprenol-pro-271b'
study_slug_7 = 'crispri-automation-for-enhanced-isoprenol-pro-6e5e'


# EDD server host and the user name handed to `eddu.login`.
edd_server   = 'edd.jbei.org'
username     = 'pckinnunen'

Open EDD session

In [3]:
# Open an EDD session for `username` on `edd_server`.
try:
    session = eddu.login(edd_server=edd_server, user=username)
except Exception as login_error:
    # Catch `Exception` rather than using a bare `except:` so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit still propagates, and report
    # the underlying cause instead of hiding it.
    print('ERROR! Connection to EDD failed. We will try to load data from disk...')
    print(f'Reason: {type(login_error).__name__}: {login_error}')
else:
    print('OK! Connection to EDD successful. We will try to load data from EDD...')

Password for pckinnunen:  ········


OK! Connection to EDD successful. We will try to load data from EDD...


Import data

In [4]:
def _export_study_or_report(study_number, study_slug):
    """Export one EDD study, printing an error instead of raising on failure.

    Args:
        study_number: 1-based study number, used only in the error message.
        study_slug: Slug of the EDD study to export.

    Returns:
        The value returned by `eddu.export_study` for the study, or None when
        the export raised NameError, AttributeError or KeyError (for example
        because `session` was never created by the login cell).
    """
    try:
        return eddu.export_study(session, study_slug, edd_server=edd_server)
    except (NameError, AttributeError, KeyError):
        print(f'ERROR! Not able to export study {study_number}.')
        return None


study_slugs = [
    study_slug_1,
    study_slug_2,
    study_slug_3,
    study_slug_4,
    study_slug_5,
    study_slug_6,
    study_slug_7,
]

# Keep the individual df1..df7 names because later cells reference them
# directly. A study whose export failed is bound to None (the error has
# already been printed) instead of being left undefined.
df1, df2, df3, df4, df5, df6, df7 = [
    _export_study_or_report(study_number, study_slug)
    for study_number, study_slug in enumerate(study_slugs, start=1)
]

  0%|          | 0/1033140 [00:00<?, ?it/s]

  0%|          | 0/475488 [00:00<?, ?it/s]

  0%|          | 0/420288 [00:00<?, ?it/s]

  0%|          | 0/411567 [00:00<?, ?it/s]

  0%|          | 0/345774 [00:00<?, ?it/s]

  0%|          | 0/418872 [00:00<?, ?it/s]

  0%|          | 0/398001 [00:00<?, ?it/s]

In [8]:
# Collect the exported studies in order and list the protocols present in each.
df_list = [df1, df2, df3, df4, df5, df6, df7]
for study_number, study_df in enumerate(df_list, start=1):
    print(f'i = {study_number}: Protocols: {study_df.Protocol.unique()}')

i = 1: Protocols: ['GC-FID' 'Global Proteomics']
i = 2: Protocols: ['GC-FID' 'Global Proteomics' 'Biolector']
i = 3: Protocols: ['GC-FID' 'Global Proteomics']
i = 4: Protocols: ['Global Proteomics' 'GC-FID']
i = 5: Protocols: ['GC-FID' 'Shotgun (Discovery) Proteomics']
i = 6: Protocols: ['GC-FID' 'Global Proteomics']
i = 7: Protocols: ['GC-FID' 'Global Proteomics']


In [9]:
# Study 5 labels its proteomics rows 'Shotgun (Discovery) Proteomics'; relabel
# them to match the other studies. The write is done in place on purpose:
# `df_list` holds this same DataFrame object, so it sees the change too.
is_shotgun = df5['Protocol'] == 'Shotgun (Discovery) Proteomics'
df5.loc[is_shotgun, 'Protocol'] = 'Global Proteomics'

In [10]:
# List the protocols of every study again to confirm the relabelling of study 5.
for study_number, study_df in enumerate(df_list, start=1):
    print(f'i = {study_number}: Protocols: {study_df.Protocol.unique()}')

i = 1: Protocols: ['GC-FID' 'Global Proteomics']
i = 2: Protocols: ['GC-FID' 'Global Proteomics' 'Biolector']
i = 3: Protocols: ['GC-FID' 'Global Proteomics']
i = 4: Protocols: ['Global Proteomics' 'GC-FID']
i = 5: Protocols: ['GC-FID' 'Global Proteomics']
i = 6: Protocols: ['GC-FID' 'Global Proteomics']
i = 7: Protocols: ['GC-FID' 'Global Proteomics']


## Get unique proteins

In [11]:
# For each study, gather the distinct 'Formal Type' values of its
# 'Global Proteomics' rows.
prot_list = []
for study_df in df_list:
    is_proteomics = study_df['Protocol'] == 'Global Proteomics'
    prot_list.append(study_df.loc[is_proteomics, 'Formal Type'].unique())

# Pool all studies and de-duplicate (np.unique also sorts the result).
pooled_proteins = np.concatenate(prot_list)
unique_proteins = np.unique(pooled_proteins)
unique_proteins[:5]

array(['P0AE22', 'sp|A0A0M4F6K2', 'sp|A0A140FVW8', 'sp|A0A140FVX0',
       'sp|A0A140FVX4'], dtype=object)

## Split each protein entry by delimiter `|`

Protein entries look like `sp|A9GAJ9|A9GAJ9_SORC5 Mcm`, but the number of `|`-separated fields varies from entry to entry.

The goal is to isolate the 6-character code. First, create a list of lists in which each sublist is a single protein entry split on the delimiter.

In [13]:
# One sublist per protein entry, holding that entry's `|`-separated fields.
proteins_split = [
    protein_entry.split('|')
    for protein_entry in unique_proteins
]

Check whether each entry has exactly one split string of length 6:

In [15]:
# Lengths of the `|`-separated fields of every entry, one array per entry.
# Defined in this cell because nothing else in the notebook defines
# `proteins_split_length`; without it a fresh "Restart & Run All" fails here
# with a NameError.
proteins_split_length = [
    np.array([len(field) for field in fields]) for fields in proteins_split
]

# True for entries in which exactly one field is six characters long.
# NOTE(review): entries such as 'sp|A0A0M4F6K2' (fields of length 2 and 10)
# are flagged False and therefore dropped downstream; confirm whether the
# 10-character identifiers should be extracted as well.
any_split_length_6 = [
    (field_lengths == 6).sum() == 1 for field_lengths in proteins_split_length
]
print(f"For N = {len(any_split_length_6)} total proteins, N = {sum(any_split_length_6)} have exactly one entry with length six")

For N = 2943 total proteins, N = 2880 have exactly one entry with length six


In [33]:
# Keep only the split entries flagged as having exactly one 6-character field.
proteins_split_len6 = [
    fields
    for fields, has_one_len6_field in zip(proteins_split, any_split_length_6)
    if has_one_len6_field
]

Iterate through split proteins, identify which index in sublist to save, and append that string to the list of protein ids

In [36]:
# Number of protein entries before filtering (one sublist per unique entry).
len(proteins_split)

2943

In [35]:
# Number of entries kept after requiring exactly one 6-character field.
len(proteins_split_len6)

2880

In [52]:
# Pull the single 6-character field out of every retained entry.
protein_ids_for_uniprot = []
for entry_idx, fields in enumerate(proteins_split_len6):
    # Positions of the fields that are exactly six characters long.
    len6_positions = [
        position for position, field in enumerate(fields) if len(field) == 6
    ]
    # The filtering step should guarantee exactly one match; fail loudly if not.
    assert len(len6_positions) == 1, f"idx = {entry_idx}\t {fields}\nWrong number of protein IDs have the correct length"
    protein_ids_for_uniprot.append(fields[len6_positions[0]])

Convert the list to a DataFrame and export it to CSV

In [54]:
# Single-column frame of extracted IDs, written as one bare ID per line
# (no header row, no index column).
protein_ids_for_uniprot_df = pd.DataFrame(protein_ids_for_uniprot)
protein_ids_for_uniprot_df.to_csv(
    'protein_ids_for_uniprot_tool.csv',
    header=False,
    index=False,
)

Save dataframe consisting of original protein string (from EDD) and extracted protein ID

In [60]:
# Map every original EDD protein string to its extracted ID. 'extracted'
# starts as None everywhere and is filled only for the flagged rows.
protein_id_conversion_df = pd.DataFrame(
    {
        'orig': unique_proteins,
        'has_6letter_substring': any_split_length_6,
        'extracted': None,
    }
)

# `protein_ids_for_uniprot` is in the same order as the flagged rows, so a
# boolean-mask assignment lines the values up positionally.
has_len6_field = protein_id_conversion_df['has_6letter_substring']
protein_id_conversion_df.loc[has_len6_field, 'extracted'] = protein_ids_for_uniprot

In [61]:
# Save the original-string -> extracted-ID table (with header, without index).
protein_id_conversion_df.to_csv(
    './data/protein_id_conversion_df_init.csv',
    header=True,
    index=False,
)