# Import ubiquitination-data from Hansen et al. 

This notebook reads the ubiquitination library files from Hansen et al. (2021) and formats them for analysis with StructureMap.

https://www.nature.com/articles/s41467-020-20509-1

## Load library data

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from pyteomics import fasta
human_fasta = fasta.IndexedUniProt('../data/human_fasta/uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_.fasta')

In [3]:
lib_untreated = pd.read_csv('../data/unformatted_ptm_data/DiGly_lib_U2OS_MG132_untreated.tsv', sep='\t')


In [4]:
lib_treated = pd.read_csv('../data/unformatted_ptm_data/DiGly_lib_U2OS_MG132_treated.tsv', sep='\t')


## Functions for data formatting

In [5]:
def get_all_idx(string, pattern):
    pos = re.finditer('u', string)
    pos_start = [p.start() for p in pos]
    pos_start_norm = [p-(i+1) for i,p in enumerate(pos_start)]
    return(pos_start_norm)

In [6]:
def test_get_all_idx():
    res = get_all_idx(string='VuNDKuNPHVALuYuALEVMESVVK', pattern='u')
    assert res==[0, 3, 9, 10]
    
test_get_all_idx()

In [7]:
def get_peptide_sequence_position(pep, prot, fastafile):
    try:
        seq = fastafile[prot].sequence
    except:
        seq = 'na'
    if seq != 'na':
        pep_pos = re.search(pep,seq)
        if pep_pos==None:
            start_pos = np.nan
        else:
            start_pos = pep_pos.start()
    else:
        start_pos = np.nan
    return(start_pos)

In [8]:
def format_library(lib, fastafile, ptm_pattern='\[GlyGly \(K\)\]', ptm_name='ub'):
    lib = lib.copy(deep=True)
    lib = lib[['ModifiedPeptide','ProteinGroups','UniProtIds']].drop_duplicates().reset_index(drop=True)
    lib['ModifiedPeptide'] = [re.sub('_','',s) for s in lib['ModifiedPeptide']]
    lib['ModifiedPeptide_u'] = [re.sub(ptm_pattern,'u',s) for s in lib['ModifiedPeptide']]
    lib['ModifiedPeptide_u_n'] = [re.sub('\[.+ \(.+\)\]','',s) for s in lib['ModifiedPeptide_u']]
    lib['NakedPeptide'] = [re.sub('\[.+ \(.+\)\]','',s) for s in lib['ModifiedPeptide']]
    # get position indeces for each ptm site
    lib['u_idx'] = [get_all_idx(s, 'u') for s in lib['ModifiedPeptide_u_n']]
    # count number of modifications on each peptide
    lib['len_u'] = [len(l) for l in lib['u_idx']]
    # only keep peptides with at least one ptm of interest
    lib_mod = lib[lib.len_u > 0].reset_index(drop=True)
    # explode protein groups
    lib_mod['all_UniProtIds'] = lib_mod.UniProtIds.str.split(';')
    lib_mod['n_protein_group'] = [len(p) for p in lib_mod['all_UniProtIds']]
    lib_mod_exp = lib_mod.explode('all_UniProtIds').reset_index(drop=True)
    # remove isoforms
    lib_mod_exp['uniprot_noIso'] = [re.sub('-\d+','',s) for s in lib_mod_exp['all_UniProtIds']]
    # keep only essential columns
    lib_mod_exp = lib_mod_exp[['ModifiedPeptide','UniProtIds','NakedPeptide','u_idx','uniprot_noIso','n_protein_group']]
    # explode ptm sites
    lib_mod_exp_exp = lib_mod_exp.explode('u_idx').reset_index(drop=True)
    lib_mod_exp_exp = lib_mod_exp_exp.drop_duplicates().reset_index(drop=True)
    # map peptides to fasta sequence
    lib_mod_exp_exp['pep_start'] = lib_mod_exp_exp.apply(lambda x : get_peptide_sequence_position(x["NakedPeptide"], x["uniprot_noIso"], fastafile), axis=1)
    # get ptm site on protein sequence
    lib_mod_exp_exp['ptm_prot_idx'] = lib_mod_exp_exp['pep_start']+lib_mod_exp_exp['u_idx']
    lib_mod_exp_exp['ptm_prot_site'] = lib_mod_exp_exp['ptm_prot_idx']+1
    
    res = lib_mod_exp_exp[['uniprot_noIso','ptm_prot_site']]
    res['AA'] = np.repeat('K',res.shape[0])
    res[ptm_name] = np.repeat(1,res.shape[0])
    res = res.rename(columns={"uniprot_noIso": "protein_id", "ptm_prot_site": "position"})
    res = res[['protein_id','AA','position',ptm_name]]
    res = res.dropna(subset=['position'])
    
    res = res.drop_duplicates().reset_index(drop=True)
    
    return(res)
    

## Untreated data

In [9]:
lib_untreated_f = format_library(lib=lib_untreated, fastafile=human_fasta, 
                                 ptm_pattern='\[GlyGly \(K\)\]', ptm_name='ub_untreated')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['AA'] = np.repeat('K',res.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res[ptm_name] = np.repeat(1,res.shape[0])


In [10]:
lib_untreated_f

Unnamed: 0,protein_id,AA,position,ub_untreated
0,Q9NZ09,K,89.0,1
1,Q09666,K,1107.0,1
2,Q9Y4D1,K,791.0,1
3,F5GZS6,K,114.0,1
4,P08195,K,145.0,1
...,...,...,...,...
23807,H0Y7H8,K,1130.0,1
23808,Q8NFD5,K,1608.0,1
23809,Q9NWB6,K,258.0,1
23810,O94986,K,1534.0,1


## Treated data

In [11]:
lib_treated_f = format_library(lib=lib_treated, fastafile=human_fasta, 
                               ptm_pattern='\[GlyGly \(K\)\]', ptm_name='ub_treated')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['AA'] = np.repeat('K',res.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res[ptm_name] = np.repeat(1,res.shape[0])


In [12]:
lib_treated_f

Unnamed: 0,protein_id,AA,position,ub_treated
0,O14682,K,318.0,1
1,Q92674,K,612.0,1
2,Q8NCN4,K,176.0,1
3,P39748,K,99.0,1
4,O14950,K,164.0,1
...,...,...,...,...
41767,Q9NR50,K,367.0,1
41768,H3BPE1,K,2415.0,1
41769,O00238,K,114.0,1
41770,A0A087X256,K,394.0,1


## Merge treated and untreated data

In [13]:
lib_merged = lib_treated_f.merge(lib_untreated_f, how='outer', on=['protein_id','AA','position'])
lib_merged = lib_merged.fillna(0)

  key_col = Index(lvals).where(~mask_left, rvals)


### Extract ubi-sites exclusive for the treated condition¶

In [14]:
lib_shared = lib_merged[(lib_merged.ub_treated==1) & (lib_merged.ub_untreated==1)]
lib_shared = lib_shared[['protein_id', 'AA', 'position', 'ub_treated']].reset_index(drop=True)
lib_shared = lib_shared.rename(columns={"ub_treated": "ub_shared"})

In [15]:
lib_shared

Unnamed: 0,protein_id,AA,position,ub_shared
0,O14682,K,318.0,1.0
1,P39748,K,99.0,1.0
2,P62701,K,128.0,1.0
3,Q6NZI2,K,312.0,1.0
4,Q9HCE1,K,339.0,1.0
...,...,...,...,...
14522,P11021,K,573.0,1.0
14523,O00299,K,119.0,1.0
14524,P30260,K,431.0,1.0
14525,F8W9X7,K,45.0,1.0


In [16]:
lib_shared.to_csv('../data/ptm_data/ubi_lib_shared.csv', index=False)

### Extract ubi-sites exclusive for the treated condition

In [17]:
lib_only_treated = lib_merged[(lib_merged.ub_treated==1) & (lib_merged.ub_untreated==0)]
lib_only_treated = lib_only_treated[['protein_id', 'AA', 'position', 'ub_treated']].reset_index(drop=True)
lib_only_treated = lib_only_treated.rename(columns={"ub_treated": "ub_treated_only"})

In [18]:
lib_only_treated

Unnamed: 0,protein_id,AA,position,ub_treated_only
0,Q92674,K,612.0,1.0
1,Q8NCN4,K,176.0,1.0
2,O14950,K,164.0,1.0
3,Q14667,K,185.0,1.0
4,Q9HA64,K,275.0,1.0
...,...,...,...,...
27240,Q9NR50,K,367.0,1.0
27241,H3BPE1,K,2415.0,1.0
27242,O00238,K,114.0,1.0
27243,A0A087X256,K,394.0,1.0


In [19]:
lib_only_treated.to_csv('../data/ptm_data/ubi_lib_only_treated.csv', index=False)

### Extract ubi-sites exclusive for the untreated condition

In [20]:
lib_only_untreated = lib_merged[(lib_merged.ub_treated==0) & (lib_merged.ub_untreated==1)]
lib_only_untreated = lib_only_untreated[['protein_id', 'AA', 'position', 'ub_untreated']].reset_index(drop=True)
lib_only_untreated = lib_only_untreated.rename(columns={"ub_untreated": "ub_untreated_only"})

In [21]:
lib_only_untreated

Unnamed: 0,protein_id,AA,position,ub_untreated_only
0,Q9NZ09,K,89.0,1.0
1,Q969S3,K,146.0,1.0
2,Q7Z401,K,1333.0,1.0
3,B1AK88,K,283.0,1.0
4,Q9UBT2,K,623.0,1.0
...,...,...,...,...
9280,O15350,K,345.0,1.0
9281,H0Y7H8,K,1130.0,1.0
9282,Q8NFD5,K,1608.0,1.0
9283,Q9NWB6,K,258.0,1.0


In [22]:
lib_only_untreated.to_csv('../data/ptm_data/ubi_lib_only_untreated.csv', index=False)