 #### Ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3528536/
 #### Files downloaded from http://www.weizmann.ac.il/Structural_Biology/faculty_pages/ELevy/intDef/interface_def.html

### we will extract surface amino acids (cat 2) from the dataset and check the correlation of their stickiness score with the solubility from eSOL data

In [1]:
import pandas as pd
import numpy as np


In [2]:


df = pd.read_csv('data/matrix_intDef_ec_propensity.tab', sep='\t')
df.head()

Unnamed: 0,ensID,pdbID,pos.ens,pos.pdb,aa,ASA.rel.cplx,ASA.rel.alone,nsub,sym,len,homo,cat,patch.alone.size,patch.cplx.size,patch.alone,patch.cplx
0,P76342,1xdy_5E,63,19,A,112.4,112.4,1,NPS,262,1.0,2,7,7,0.4348,0.063
1,P76342,1xdy_5E,64,20,L,19.3,19.3,1,NPS,262,1.0,1,0,0,-0.9138,0.0
2,P76342,1xdy_5E,65,21,E,98.6,98.6,1,NPS,262,1.0,2,10,10,-1.0007,-0.179
3,P76342,1xdy_5E,66,22,F,29.3,29.3,1,NPS,262,1.0,2,11,11,-2.2957,-0.093
4,P76342,1xdy_5E,67,23,S,54.6,54.6,1,NPS,262,1.0,2,8,8,-1.4256,-0.161


In [3]:
df.shape

(118270, 16)

In [4]:
#taking only the surface aminoacids
df2 = df.loc[df['cat']==2]
df2.head()

Unnamed: 0,ensID,pdbID,pos.ens,pos.pdb,aa,ASA.rel.cplx,ASA.rel.alone,nsub,sym,len,homo,cat,patch.alone.size,patch.cplx.size,patch.alone,patch.cplx
0,P76342,1xdy_5E,63,19,A,112.4,112.4,1,NPS,262,1.0,2,7,7,0.4348,0.063
2,P76342,1xdy_5E,65,21,E,98.6,98.6,1,NPS,262,1.0,2,10,10,-1.0007,-0.179
3,P76342,1xdy_5E,66,22,F,29.3,29.3,1,NPS,262,1.0,2,11,11,-2.2957,-0.093
4,P76342,1xdy_5E,67,23,S,54.6,54.6,1,NPS,262,1.0,2,8,8,-1.4256,-0.161
5,P76342,1xdy_5E,68,24,K,58.7,58.7,1,NPS,262,1.0,2,10,10,-0.3994,-0.158


In [5]:
#stickiness score of aa
st = pd.read_csv('data/CROWDING_EC.mat', sep='\t')
st.sort_index(inplace=True)
st = st.reset_index(drop=True)
st.head()

Unnamed: 0,ensID,pdbID,pos.ens,pos.pdb,aa,rate,ndef,ASA.rel.cplx,ASA.rel.alone,len,patch.compo.400abs,aa.prop,ab.all
0,P76342,1xdy_5E,141,97,I,0.6596,10,7.1,7.1,262,0.0,1.1109,
1,P76342,1xdy_5E,142,98,Y,0.4077,10,0.1,0.1,262,0.0,0.8806,
2,P76342,1xdy_5E,143,99,R,0.2909,10,21.3,21.3,262,0.0,-0.0876,
3,P76342,1xdy_5E,144,100,M,0.8333,10,5.3,5.3,262,0.0,1.0124,
4,P76342,1xdy_5E,140,96,R,0.2909,10,4.2,4.2,262,0.0,-0.0876,


In [6]:
merged = pd.merge(df, st, on=['ensID', 'pos.ens'])

In [7]:
merged_with_surface = merged.loc[merged['cat'] == 2].copy()
merged_with_surface.head()

Unnamed: 0,ensID,pdbID_x,pos.ens,pos.pdb_x,aa_x,ASA.rel.cplx_x,ASA.rel.alone_x,nsub,sym,len_x,...,pos.pdb_y,aa_y,rate,ndef,ASA.rel.cplx_y,ASA.rel.alone_y,len_y,patch.compo.400abs,aa.prop,ab.all
0,P76342,1xdy_5E,63,19,A,112.4,112.4,1,NPS,262,...,19,A,1.889,10,112.4,112.4,262,0.063,0.0062,
2,P76342,1xdy_5E,65,21,E,98.6,98.6,1,NPS,262,...,21,E,2.619,10,98.6,98.6,262,-0.179,-0.7893,
3,P76342,1xdy_5E,66,22,F,29.3,29.3,1,NPS,262,...,22,F,1.585,10,29.3,29.3,262,-0.093,1.2727,
4,P76342,1xdy_5E,67,23,S,54.6,54.6,1,NPS,262,...,23,S,2.155,6,54.6,54.6,262,-0.161,0.1376,
5,P76342,1xdy_5E,68,24,K,58.7,58.7,1,NPS,262,...,24,K,3.383,10,58.7,58.7,262,-0.158,-1.1806,


In [8]:
#these are propensities of surface amino acids

aa_prop = merged_with_surface['aa.prop'].groupby([st.ensID, st.pdbID]).apply(list).reset_index()
aa_prop.head()

Unnamed: 0,ensID,pdbID,aa.prop
0,O32583,1zud4,"[-0.1771, -0.4114, -0.7485, -0.7893, -1.1806, ..."
1,P00448,1ixbB,"[-0.7485, -0.7893, -0.1799, 0.0062, -1.1806, -..."
2,P00452,1r1r_2B,"[0.9138, -0.7893, 0.1204, -0.7485, -1.1806, -0..."
3,P00509,1x2aB,"[-0.2693, 0.0062, -0.1799, 0.0062, 0.9138, -0...."
4,P00803,1b12_3C,"[-0.7893, -1.1806, 1.1109, -1.1806, -1.1806, 1..."


In [9]:
aa_prop['Stickiness score'] = aa_prop['aa.prop'].apply(np.mean)

In [10]:
aa_prop.shape 

(397, 4)

# Merge with solubility from E. coli sequences (eSOL dataset)

In [11]:
ecoli_w3110 = pd.read_csv('data/ecoli_eSOL_JWXXXX.csv')
ecoli_w3110.head()


Unnamed: 0,Accession,Sequence,ECK number,Solubility(%),unknown_bases
0,JW0002 thrB ECK0003,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,ECK0003,32.0,False
1,JW0003 thrC ECK0004,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,ECK0004,18.0,False
2,JW0004 yaaX ECK0005,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,ECK0005,78.0,False
3,JW0005 yaaA ECK0006,MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQI...,ECK0006,7.0,False
4,JW0007 talB ECK0008,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,ECK0008,85.0,False


In [12]:
ecoli_w3110[['JW', 'name', 'ECK_num']] = ecoli_w3110['Accession'].str.split(' ', expand=True)
ecoli_w3110.head()

Unnamed: 0,Accession,Sequence,ECK number,Solubility(%),unknown_bases,JW,name,ECK_num
0,JW0002 thrB ECK0003,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,ECK0003,32.0,False,JW0002,thrB,ECK0003
1,JW0003 thrC ECK0004,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,ECK0004,18.0,False,JW0003,thrC,ECK0004
2,JW0004 yaaX ECK0005,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,ECK0005,78.0,False,JW0004,yaaX,ECK0005
3,JW0005 yaaA ECK0006,MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQI...,ECK0006,7.0,False,JW0005,yaaA,ECK0006
4,JW0007 talB ECK0008,MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIP...,ECK0008,85.0,False,JW0007,talB,ECK0008


In [13]:
ecoli_w3110.shape


(3198, 8)

In [14]:
#Map to ecoli (mapping downloaded from uniprot https://www.uniprot.org/docs/ecoli)

a = pd.read_csv('data/ecoli_mapping.txt',sep='\t', header=None)
#Ordered_Locus_name                      Swiss-Prot_entry_name    AC Length         Gene_name_and_synonyms
a.head()

Unnamed: 0,0
0,b0001;JW4367 LPT_ECOLI P0AD86 21 thrL
1,b0002;JW0001 AK1H_ECOLI P00561 820 thrA;thrA1;...
2,b0003;JW0002 KHSE_ECOLI P00547 310 thrB
3,b0004;JW0003 THRC_ECOLI P00934 428 thrC
4,b0005;JW0004 YAAX_ECOLI P75616 98 yaaX


In [15]:
b = pd.DataFrame()
b[[1,2,3,4,5]] = a[0].str.split(' ', 4, expand=True)
# b.head()
b[['b_num', 'JW']] = b[1].str.split(';', 1, expand=True)
# b.head()
b = b.rename(columns={2:'Swiss-prot_entry_name', 3:'ensID', 4:'Length', 5:'Gene_name_and_synonyms'})
b.head()

Unnamed: 0,1,Swiss-prot_entry_name,ensID,Length,Gene_name_and_synonyms,b_num,JW
0,b0001;JW4367,LPT_ECOLI,P0AD86,21,thrL,b0001,JW4367
1,b0002;JW0001,AK1H_ECOLI,P00561,820,thrA;thrA1;thrA2,b0002,JW0001
2,b0003;JW0002,KHSE_ECOLI,P00547,310,thrB,b0003,JW0002
3,b0004;JW0003,THRC_ECOLI,P00934,428,thrC,b0004,JW0003
4,b0005;JW0004,YAAX_ECOLI,P75616,98,yaaX,b0005,JW0004


In [16]:
mapping = b[['Swiss-prot_entry_name', 'ensID','Length', 'Gene_name_and_synonyms', 'b_num', 'JW']].copy()

## Map aa propensity with sequences

In [17]:
mapped_df = pd.merge(aa_prop, mapping, on=['ensID']) #first map aa_prop
final_df = pd.merge(mapped_df, ecoli_w3110, on=['JW']) #then map with ecoli

In [18]:
# final_df.head()

In [19]:
final_df.shape

(348, 16)

In [20]:
#Correlation

from scipy.stats import spearmanr

spearmanr(final_df['Stickiness score'], final_df['Solubility(%)'])

SpearmanrResult(correlation=-0.05145269119617839, pvalue=0.3385580978337458)

In [21]:
#Solubility has almost no correlation with surface propensity/stickiness

In [22]:
final_df.to_pickle('surface_aminoacid_scoring.pkl.gz')