# Imports
Versions <br>
python - 3.6.7 <br>
Pandas - 0.23.4 <br>
Biopython - 1.73 <br>
Scipy - 1.2.0 <br>
iupred2a - https://iupred2a.elte.hu/download <br>

In [5]:
import os
import ntpath
import datetime
import pandas as pd
from Bio import SeqIO
from scipy import signal
from iupred2a import iupred2a
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from tqdm.auto import tqdm

from multiprocessing import Pool
# Windows
CWD = os.getcwd()

# Statics

In [6]:
# One-letter residue codes for which per-residue features are generated.
RESIDUES = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
            'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Kyte & Doolittle {kd} index of hydrophobicity
# Maps a one-letter residue code to its hydropathy value.
# NOTE(review): 'U' is mapped to 0.0 - presumably a neutral placeholder for
# selenocysteine; confirm. Letters such as 'X' or 'B' have no entry here.
HP = {'A': 1.8, 'R':-4.5, 'N':-3.5, 'D':-3.5, 'C': 2.5,
      'Q':-3.5, 'E':-3.5, 'G':-0.4, 'H':-3.2, 'I': 4.5,
      'L': 3.8, 'K':-3.9, 'M': 1.9, 'F': 2.8, 'P':-1.6,
      'S':-0.8, 'T':-0.7, 'W':-0.9, 'Y':-1.3, 'V': 4.2, 'U': 0.0}

# Classes

In [13]:
import time
class IuPred:
    """Container bundling the three IUPred results computed for one sequence.

    Attributes are stored exactly as passed in:
        glob:  result of the 'glob' prediction mode.
        short: result of the 'short' prediction mode.
        long:  result of the 'long' prediction mode.
    """

    def __init__(self, glob, short, long):
        self.glob, self.short, self.long = glob, short, long


class HydroPhobicIndex:
    """Thin wrapper holding the per-residue hydrophobicity values of a sequence."""

    def __init__(self, hpilist):
        # Kept as-is (no copy), so the caller's list object is shared.
        self.hpilist = hpilist
        
def calc_idrpred(args):
    """Run the three IUPred prediction modes for one sequence.

    Written as a module-level function taking a single tuple so it can be
    mapped over ``(index, sequence)`` pairs by a process pool.

    Parameters:
        args: ``(idx, seq)`` pair; ``seq`` is converted with ``str()``.

    Returns:
        ``(idx, IuPred)`` - the unchanged index together with the 'glob',
        'short' and 'long' results wrapped in an ``IuPred`` object.
    """
    idx, seq = args
    sequence = str(seq)
    # Modes are evaluated in the order expected by the IuPred constructor.
    predictions = [iupred2a.iupred(sequence, mode)
                   for mode in ('glob', 'short', 'long')]
    return idx, IuPred(*predictions)
        

class MakeMatrix:
    def __init__(self, dbfasta):  
        self.df = pd.DataFrame()
        executables = [
             'self.fasta2df(dbfasta)',
#             'self.amino_acid_analysis()',
#             'self.idr_iupred()',
#             'self.hydrophobic()',
#             'self.add_iupred_features()',
#             'self.add_hydrophobic_features()',
#             'self.add_biochemical_combinations()',
#             'self.add_lowcomplexity_features()' ,
#             #'self.add_plaac()'
        ]        
        for e in executables:
            start = time.time()     
            print(e)
            exec(e)
            end = time.time()        
            print(str(round(end - start, 2))+'s '+e)

    def fasta2df(self, dbfasta):
        """Load a FASTA file into ``self.df`` and print the number of records.

        The record description is split on '|':
          * ``sp|ACC|NAME ...`` / ``tr|ACC|NAME ...``: accession is field 1,
            name is the first space-delimited token of field 2.
          * any other header with at least three fields: accession is
            field 0, name is the first token of field 2.
          * headers with fewer than three fields (previously an IndexError):
            ``record.id`` is used for both accession and name.

        Parameters:
            dbfasta: path of the FASTA file to read.

        Side effects:
            Replaces ``self.df`` with a frame holding the columns
            'protein_name', 'uniprot_id' and 'sequence'.
        """
        rows = list()
        with open(dbfasta) as handle:
            # Parse from the handle opened here so the file is read through
            # one descriptor that the ``with`` block closes.
            for record in SeqIO.parse(handle, 'fasta'):
                fields = record.description.split('|')
                if len(fields) >= 3:
                    uniprot_id = fields[1] if fields[0] in ('sp', 'tr') else fields[0]
                    name = fields[2].split(' ')[0]
                else:
                    uniprot_id = name = record.id
                rows.append([name, uniprot_id, str(record.seq)])
        self.df = pd.DataFrame(rows, columns=['protein_name', 'uniprot_id', 'sequence'])
        print(len(self.df))

#     def set_identifiers(self, fastas):
#         self.df['llps'] = 0
#         for i in fastas:
#             fname = ntpath.basename(i).split('.')[0]
#             self.df[fname] = 0
#             with open(i) as f:
#                 for record in SeqIO.parse(f, 'fasta'):
#                     id_split = record.id.split('|')
#                     protid = (id_split[1])
#                     self.df.loc[self.df['uniprot_id'] == protid, fname] = 1
#                     self.df.loc[self.df['uniprot_id'] == protid, 'llps'] = 1

    def idr_iupred(self, processes=32):
        """Compute IUPred predictions for every sequence in parallel.

        Each ``(index, sequence)`` pair of ``self.df['sequence']`` is sent to
        ``calc_idrpred`` in a worker pool; the returned ``IuPred`` object is
        stored in the 'iupred' column at the same index.

        Parameters:
            processes: number of worker processes (default 32, the value
                that used to be hard-coded).
        """
        # Placeholder column of dtype object; every cell is overwritten below.
        self.df['iupred'] = None
        data = list(self.df['sequence'].items())
        # The context manager shuts the workers down once all results have
        # been consumed, so no pool processes are left behind.
        with Pool(processes) as pool:
            for idx, idrpred in tqdm(pool.imap(calc_idrpred, data), total=self.df.shape[0]):
                self.df.at[idx, 'iupred'] = idrpred
            
    def hydrophobic(self):
        """Attach a ``HydroPhobicIndex`` object to every row.

        Each residue of the row's sequence is looked up in ``HP``; the
        resulting list (residues without an entry become NaN via
        ``Series.map``) is wrapped and stored in the 'HydroPhobicIndex'
        column.
        """
        for index, row in self.df.iterrows():
            residues = pd.Series(list(row['sequence']))
            scale_values = residues.map(HP).tolist()
            self.df.loc[index, 'HydroPhobicIndex'] = HydroPhobicIndex(scale_values)
            
    def amino_acid_analysis(self):
        """Add composition and bulk physico-chemical columns to ``self.df``.

        Columns written:
            fraction_<res>: occurrences of each residue in ``RESIDUES``
                divided by the sequence length.
            length: sequence length.
            IEP: isoelectric point from ``ProteinAnalysis``.
            molecular_weight: only for sequences without 'X' or 'B'.
            gravy: only for sequences without 'U', 'X' or 'B'.
        Rows that fail a condition keep NaN in the corresponding column.
        """
        sequences = self.df['sequence']
        # The length is the same for every residue; compute it once.
        lengths = sequences.str.len()
        for res in RESIDUES:
            self.df['fraction_'+res] = sequences.str.count(res) / lengths
        self.df['length'] = lengths
        for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
            seq = row['sequence']
            seqanalysis = ProteinAnalysis(seq)
            self.df.loc[index, 'IEP'] = seqanalysis.isoelectric_point()
            # Sequences containing 'X' or 'B' are skipped for both metrics.
            if 'X' in seq or 'B' in seq:
                continue
            self.df.loc[index, 'molecular_weight'] = seqanalysis.molecular_weight()
            # 'U' has no GRAVY contribution defined, so gravy is skipped.
            if 'U' not in seq:
                self.df.loc[index, 'gravy'] = seqanalysis.gravy()
          

    def add_iupred_features(self):
        """Derive disorder summary columns from the stored IUPred results.

        Uses ``row['iupred'].glob[0]`` (presumably the per-residue scores of
        the 'glob' run - confirm against iupred2a). Columns written per row:
            idr_percetage: number of scores above 0.5 (a count, despite the
                column name).
            idr_50 ... idr_90: number of scores above 0.5 ... 0.9 divided by
                the sequence length.
        """
        fraction_columns = (('idr_50', .5), ('idr_60', .6), ('idr_70', .7),
                            ('idr_80', .8), ('idr_90', .9))
        for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
            scores = list(row['iupred'].glob[0])
            self.df.loc[index, 'idr_percetage'] = sum(score > .5 for score in scores)
            for column, threshold in fraction_columns:
                above = sum(score > threshold for score in scores)
                self.df.loc[index, column] = above / len(str(row['sequence']))

    def add_hydrophobic_features(self):
        """Summarise the smoothed hydrophobicity profile of every sequence.

        The stored ``HydroPhobicIndex.hpilist`` is smoothed with
        ``convolve_signal(window=30)``. For the thresholds -1.5, -2.0 and
        -2.5 the number of smoothed values below the threshold is stored
        both as a fraction of the profile length ('hpi_<T_frac') and as a
        raw count ('hpi_<T').
        """
        # (threshold, fraction column, count column)
        specs = (
            (-1.5, 'hpi_<-1.5_frac', 'hpi_<-1.5'),
            (-2.0, 'hpi_<-2.0_frac', 'hpi_<-2.0'),
            (-2.5, 'hpi_<-2.5_frac', 'hpi_<-2.5'),
        )
        fractions = [[] for _ in specs]
        counts = [[] for _ in specs]
        for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
            smoothed = convolve_signal(row['HydroPhobicIndex'].hpilist, window=30)
            for slot, (threshold, _, _) in enumerate(specs):
                fractions[slot].append(sum(v < threshold for v in smoothed) / len(smoothed))
            for slot, (threshold, _, _) in enumerate(specs):
                counts[slot].append(sum(v < threshold for v in smoothed))
        # Fraction columns are added first, then the count columns.
        for slot, (_, frac_column, _) in enumerate(specs):
            self.df[frac_column] = fractions[slot]
        for slot, (_, _, count_column) in enumerate(specs):
            self.df[count_column] = counts[slot]
            

    def add_biochemical_combinations(self):
        df = self.df
        df = df.assign(Asx=df['fraction_D'] + df['fraction_N'])
        df = df.assign(Glx=df['fraction_E'] + df['fraction_Q'])
        df = df.assign(Xle=df['fraction_I'] + df['fraction_L'])
        df = df.assign(Pos_charge=df['fraction_K'] + df['fraction_R'] + df['fraction_H'])
        df = df.assign(Neg_charge=df['fraction_D'] + df['fraction_E'])
        df = df.assign(Aromatic=df['fraction_F'] + df['fraction_W'] + df['fraction_Y'] + df['fraction_H'])
        df = df.assign(Alipatic=df['fraction_V'] + df['fraction_I'] + df['fraction_L'] + df['fraction_M'])
        df = df.assign(Small=df['fraction_P'] + df['fraction_G'] + df['fraction_A'] + df['fraction_S'])
        df = df.assign(Hydrophilic=(df['fraction_S'] + df['fraction_T'] + df['fraction_H'] + 
                                    df['fraction_N'] + df['fraction_Q'] + df['fraction_E'] +
                                    df['fraction_D'] + df['fraction_K'] + df['fraction_R']))
        df = df.assign(Hydrophobic= (df['fraction_V'] + df['fraction_I'] + df['fraction_L'] +
                                     df['fraction_F'] + df['fraction_W'] + df['fraction_Y'] +
                                     df['fraction_M']))
        
        # Added in version 2
        for dimer in ['GV', 'VG', 'VP', 'PG', 'FG', 'RG', 'GR', 'GG', 'YG', 'GS', 'SG', 'GA', 'GF', 'GD', 'DS']:
            self.df[dimer] = self.df['sequence'].str.count(dimer)
        df = df.assign(alpha_helix=df['fraction_V'] + df['fraction_I'] + df['fraction_Y'] + df['fraction_F']
                      + df['fraction_W'] + df['fraction_L'])
        df = df.assign(beta_turn=df['fraction_N'] + df['fraction_P'] + df['fraction_G'] + df['fraction_S'])
        df = df.assign(beta_sheet=df['fraction_E'] + df['fraction_M'] + df['fraction_A'] + df['fraction_L'])
        #Calculates the aromaticity value of a protein according to Lobry, 1994. 
        # It is simply the relative frequency of Phe+Trp+Tyr.
        df = df.assign(aromaticity=df['fraction_F'] + df['fraction_W'] + df['fraction_Y'])

        
        
        
        self.df = df
        del df
        
    def add_lowcomplexityscore(self):
        lcs_window = 20
        lcs_cutoff = 7
        for index, row in self.df.iterrows():
            seq = str(row['sequence'])
            if len(seq) > lcs_window+1:
                sig = list()
                for i in range(len(seq)):
                    window = (seq[i: i+lcs_window])
                    if len(window) == lcs_window:
                        acid_comp = len(list(set(window)))
                        sig.append(acid_comp)
                score = sum([1 if i <= 7 else 0 for i in sig])
                self.df.loc[index, 'lcs_score'] = score
                self.df.loc[index, 'lcs_fraction'] = score / len(sig)
                
                
    def add_lowcomplexity_features(self):
        n_window = 20
        cutoff = 7       
        n_halfwindow = int(n_window / 2)        
        lcs_lowest_complexity = list()
        lcs_scores = list()
        lcs_fractions = list()
        for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0]):
        #for index, row in self.df.iterrows():            
            # Determine low complexity scores
            seq = str(row['sequence'])
            lcs_acids = list()
            sig = list()
            
            # New
            lc_bool = [False] * len(seq)
            for i in range(len(seq)):
                if i < n_halfwindow:
                    peptide = seq[:n_window]        
                elif i+n_halfwindow > int(len(seq)):
                    peptide = seq[-n_window:]        
                else:
                    peptide = seq[i-n_halfwindow:i+n_halfwindow]       
                complexity = (len(set(peptide)))
                if complexity <= 7:
                    for bool_index in (i-n_halfwindow, i+n_halfwindow):
                        try:
                            lc_bool[bool_index] = True
                        except IndexError:
                            pass
                    lcs_acids.append(seq[i])
                sig.append(complexity)            
            # Adding low complexity scores to list
            low_complexity_list = pd.DataFrame({'bool':lc_bool, 'acid':list(seq)}, index=None)
            lcs_lowest_complexity.append(min(sig))
            lcs_scores.append(len(low_complexity_list.loc[low_complexity_list['bool'] == True]))
            lcs_fractions.append(len(low_complexity_list.loc[low_complexity_list['bool'] == True]) / len(seq))
            low_complexity_list = pd.DataFrame({'bool':lc_bool, 'acid':list(seq)}, index=None)
            if len(lcs_acids) >= n_window:
                for i in RESIDUES:
                    self.df.loc[index ,i+'_lcscore'] = (len(low_complexity_list.loc[
                        (low_complexity_list['bool'] == True) &
                        (low_complexity_list['acid'] == i)])
                    )
                    self.df.loc[index ,i+'_lcfraction'] = (len(low_complexity_list.loc[
                        (low_complexity_list['bool'] == True) & 
                        (low_complexity_list['acid'] == i)]) / len(lcs_acids)
                    )
        self.df['lcs_fractions'] = lcs_fractions
        self.df['lcs_scores'] = lcs_scores
        self.df['lcs_lowest_complexity'] = lcs_lowest_complexity
        
    def add_plaac(self):
        """Merge pre-computed PLAAC results into ``self.df``.

        Reads the tab-separated PLAAC table below ``CWD``, splits its
        'SEQid' column on '|' into database / accession / name, drops the
        columns that are not used as features and joins the remainder onto
        ``self.df`` via 'uniprot_id' == 'accession'. ``pd.merge`` is called
        with its default join type, so rows without a match are dropped.
        """
        table = pd.read_csv(CWD+'/data/plaac/plaac_swissprot140219.tsv', sep='\t')
        table[['database', 'accession', 'name']] = table['SEQid'].str.split('|',expand=True)
        unused_columns = ['SEQid', 'database', 'name', 'PAPAaa', 'STARTaa',
                          'ENDaa', 'COREaa', 'MW', 'MWstart', 'MWend', 'MWlen']
        table = table.drop(unused_columns, axis=1)
        merged = pd.merge(self.df, table, left_on='uniprot_id', right_on='accession')
        # The join key from the PLAAC side duplicates 'uniprot_id'.
        self.df = merged.drop('accession', axis=1)


def convolve_signal(sig, window=25):
    """Smooth ``sig`` with a normalised Hann window.

    Parameters:
        sig: one-dimensional sequence of numbers.
        window: number of points of the Hann window (default 25).

    Returns:
        Array of the same length as ``sig`` (``mode='same'``) holding the
        convolution of ``sig`` with the window, divided by the window sum.
    """
    # ``scipy.signal.hann`` was removed from newer SciPy releases;
    # ``scipy.signal.windows.hann`` is available in old and new versions.
    win = signal.windows.hann(window)
    return signal.convolve(sig, win, mode='same') / sum(win)


def average(l):
    """Return the arithmetic mean of the values in ``l``.

    Raises ZeroDivisionError when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

In [14]:
def main(name, fasta_path, operating_system='Windows'):
    """Build the feature table for a FASTA file, pickle it and return it.

    The pickle is written to 'data/Dataframes/' (relative to the current
    working directory) as '<name>_llps_f2f_<day>-<month>-<year>.pkl', with
    the date parts not zero-padded.

    Parameters:
        name: prefix of the generated file name.
        fasta_path: path of the FASTA file to featurise.
        operating_system: currently unused; kept for existing callers.

    Returns:
        The DataFrame produced by ``MakeMatrix``.
    """
    # Change pathing
    data = MakeMatrix(fasta_path)
    now = datetime.datetime.now()
    date = '-'.join(str(part) for part in (now.day, now.month, now.year))
    filename = name + '_llps_f2f_' + date + '.pkl'
    data.df.to_pickle('data/Dataframes/' + filename)
    print('Generated file: ' + filename)
    return data.df

# Homo sapiens

In [15]:
# Location of the human proteome FASTA file, relative to the working directory.
human_proteome_path = CWD+'/data/fasta/UP000005640_9606.fasta'

In [16]:
# Human: build the feature table for the human proteome; main() also pickles it.
df = main(name='UP000005640_9606', fasta_path=human_proteome_path)

self.fasta2df(dbfasta)
20600
0.31s self.fasta2df(dbfasta)
Generated file: UP000005640_9606_llps_f2f_20-11-2020.pkl


In [18]:
df[df["protein_name"] == "YBX1_HUMAN"]

Unnamed: 0,protein_name,uniprot_id,sequence


In [24]:
!egrep ">.*TFG" data/fasta/UP000005640_9606.fasta

>sp|Q5TFG8|ZC21B_HUMAN Zinc finger C2HC domain-containing protein 1B OS=Homo sapiens (Human) OX=9606 GN=ZC2HC1B PE=1 SV=2
>sp|Q8TB96|TIP_HUMAN T-cell immunomodulatory protein OS=Homo sapiens (Human) OX=9606 GN=ITFG1 PE=1 SV=1
>sp|Q92734|TFG_HUMAN Protein TFG OS=Homo sapiens (Human) OX=9606 GN=TFG PE=1 SV=2
>sp|Q969R8|ITFG2_HUMAN KICSTOR complex protein ITFG2 OS=Homo sapiens (Human) OX=9606 GN=ITFG2 PE=1 SV=1


In [None]:
/data/fasta/UP000005640_9606.fasta