## Calculation of Sequence Properties

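This notebook computes sequence descriptors for the 29,998 IDRs and writes them, together with the conformational properties from the MD simulations, to `IDRome_DB.csv` and `IDRome_DB.xlsx`.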

Author: Giulio Tesei

Contact: giulio.tesei@bio.ku.dk

In [2]:
import numpy as np 
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import itertools
from localcider.sequenceParameters import SequenceParameters
import time
from ast import literal_eval

def calc_seq_prop(seq_name,df,r):
    """Compute the sequence descriptors of one sequence and store them in df.

    The row `seq_name` of `df` is updated in place with the columns
    fK, fR, fE, fD, faro, mean_lambda, shd, omega, scd, kappa, fcr and ncpr.

    seq_name: index label of the sequence in df
    df: DataFrame to be populated with sequence properties; the row must
        provide the columns 'fasta', 'first', 'last' and 'N_FL'
    r: DataFrame of aa-specific parameters, indexed by one-letter code, with
        the columns 'lambdas' and 'q'; it is only read, never modified

    Raises ValueError if the sequence is empty."""

    sequence = df.loc[seq_name,'fasta']
    fasta = list(sequence)
    N = len(fasta)
    if N == 0:
        raise ValueError('empty sequence for {}'.format(seq_name))
    # copy of the sequence in which charged termini are encoded for kappa
    fasta_kappa = fasta.copy()

    # calculate properties that do not depend on charges
    df.loc[seq_name,'fK'] = fasta.count('K')/N
    df.loc[seq_name,'fR'] = fasta.count('R')/N
    df.loc[seq_name,'fE'] = fasta.count('E')/N
    df.loc[seq_name,'fD'] = fasta.count('D')/N
    df.loc[seq_name,'faro'] = sum(fasta.count(a) for a in ('W','Y','F'))/N
    lambdas = r.lambdas.loc[fasta]
    df.loc[seq_name,'mean_lambda'] = np.mean(lambdas)

    # positions (i<j) of all residue pairs, in the same order as
    # itertools.combinations(range(N),2); both arrays are empty when N == 1
    idx_i, idx_j = np.triu_indices(N,k=1)
    # calculate sequence separations
    ij_dist = (idx_j-idx_i).astype(float)
    # calculate lambda sums
    lambda_values = lambdas.values
    ll = lambda_values[idx_i]+lambda_values[idx_j]
    # calculate SHD
    beta = -1
    df.loc[seq_name,'shd'] = np.sum(ll*np.power(ij_dist,beta))/N
    # omega is computed on the unmodified sequence taken from df
    SeqOb = SequenceParameters(sequence)
    df.loc[seq_name,'omega'] = SeqOb.get_kappa_X(grp1=['F','Y','W'])

    # fix charges: work on a private copy of the per-residue charges so that
    # the shared table r is not altered (astype returns a new array)
    q = r.q.loc[fasta].values.astype(float)
    if df.loc[seq_name,'first'] == 1:
        # sequence starts at residue 1 of the full-length protein:
        # add +1 to the charge of the first residue
        q[0] += 1.
        fasta_kappa[0] = 'K' if q[0] > 0 else 'A'
    if df.loc[seq_name,'last'] == df.loc[seq_name,'N_FL']:
        # sequence ends at the last residue of the full-length protein:
        # add -1 to the charge of the last residue
        q[-1] -= 1.
        fasta_kappa[-1] = 'D' if q[-1] < 0 else 'A'

    # calculate properties that depend on charges
    # calculate charge products
    qq = q[idx_i]*q[idx_j]
    # calculate SCD
    df.loc[seq_name,'scd'] = np.sum(qq*np.sqrt(ij_dist))/N
    SeqOb = SequenceParameters(''.join(fasta_kappa))
    kappa = SeqOb.get_kappa()
    # a kappa of -1 is stored as 0
    # NOTE(review): presumably -1 is localCIDER's "undefined" value - confirm
    df.loc[seq_name,'kappa'] = 0 if kappa==-1 else kappa
    df.loc[seq_name,'fcr'] = np.abs(q).mean()
    df.loc[seq_name,'ncpr'] = q.mean()

In [3]:
# conformational properties from MD simulations (first CSV column is the index)
df_idrome = pd.read_csv('md_simulations/data/conf_prop.csv', index_col=0)
df_idrome['nu'] = df_idrome['nu'].apply(lambda value: round(value, 3))

# aa-specific properties; 'one' is used as the index and kept as a column
r = pd.read_csv('md_simulations/data/residues.csv')
r = r.set_index('one', drop=False)

# sequences, labelled <uniprot>_<first>_<last> to match the index of df_idrome
sequences = pd.read_csv('md_simulations/data/idr_all.csv.gz', header=0, sep=';')
sequences = sequences.sort_values('uniprot')
sequences['seq_name'] = (sequences['uniprot']
                         + '_' + sequences['first'].map('{:g}'.format)
                         + '_' + sequences['last'].map('{:g}'.format))
sequences = sequences.set_index('seq_name')

# copy the per-sequence annotations over (pandas aligns them on seq_name)
df_idrome['UniProt_ID'] = sequences['uniprot']
df_idrome['fasta'] = sequences['seq']
df_idrome['first'] = sequences['first']
df_idrome['last'] = sequences['last']
df_idrome['N_FL'] = sequences['nres_unip'] # number of residues in full-length protein

# gene and protein names from UniProt, looked up once per row by UniProt ID
df_gene_protein_names = pd.read_csv('md_simulations/data/gene_protein_names.csv', index_col=0)
uniprot_annotations = df_gene_protein_names.loc[df_idrome['UniProt_ID']]
df_idrome['protein_name'] = uniprot_annotations['proteinName'].values
df_idrome['gene_name'] = uniprot_annotations['geneName'].values

In [None]:
# Fill in the sequence descriptors of every row of df_idrome, one sequence at
# a time; the running counter is printed on a single line ('0-1-2-...') as a
# simple progress indicator.
for i,seq_name in enumerate(df_idrome.index):
    calc_seq_prop(seq_name,df_idrome,r)
    print(i,end='-')

In [17]:
# N_term: the sequence starts at residue 1 of the full-length protein
df_idrome['N_term'] = df_idrome['first'] == 1
# C_term: the sequence ends at the last residue of the full-length protein
df_idrome['C_term'] = df_idrome['last'].eq(df_idrome['N_FL'])

In [25]:
# rename Rg, ete and rh to Rg/nm, Ree/nm and Rh/nm
df_idrome = df_idrome.rename(columns={'Rg': 'Rg/nm', 'ete': 'Ree/nm', 'rh': 'Rh/nm'})

In [None]:
# Columns written to the database files, in output order.
# NOTE(review): 'omega' is computed by calc_seq_prop but is not listed here;
# 'N', 'nu_svr', 'ete2_Rg2', 'S', 'Delta' and 'fdomain' are not created in the
# visible cells - presumably they come from conf_prop.csv; confirm.
cols = [
    'UniProt_ID',
    'N', 'nu', 'nu_svr', 'ete2_Rg2', 'S', 'Delta',
    'Rg/nm', 'Ree/nm', 'Rh/nm',
    # descriptors filled in by calc_seq_prop
    'fK', 'fR', 'fE', 'fD', 'faro',
    'mean_lambda', 'shd', 'scd', 'kappa', 'fcr', 'ncpr',
    # sequence and its position in the full-length protein
    'fasta', 'N_term', 'C_term', 'first', 'last', 'N_FL',
    'protein_name', 'gene_name',
    'fdomain',
]

In [13]:
# write the selected columns (and the index) to a CSV file
df_idrome.loc[:, cols].to_csv('IDRome_DB.csv')

In [14]:
# write the same selection of columns to an Excel workbook
df_idrome.loc[:, cols].to_excel('IDRome_DB.xlsx')