In [64]:
import pandas as pd
from modlamp.descriptors import GlobalDescriptor

In [65]:
def get_mw(sequence):
    """Return the molecular weight of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.calculate_MW``
    (called with ``amide=True``) and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded molecular weight, or ``None`` if any exception is raised
        while computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.calculate_MW(amide=True)
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_isoelectric_point(sequence):
    """Return the isoelectric point of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.isoelectric_point``
    (called with ``amide=True``) and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded isoelectric point, or ``None`` if any exception is raised
        while computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.isoelectric_point(amide=True)
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_charge_density(sequence):
    """Return the charge density of a peptide sequence at pH 7.

    The value is computed with modlamp's ``GlobalDescriptor.charge_density``
    (called with ``ph=7, amide=True``) and rounded to 5 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded charge density, or ``None`` if any exception is raised
        while computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.charge_density(ph=7, amide=True)
        # A single sequence was passed in, so the value sits at [0][0].
        # Five decimals here (not four as in the sibling helpers).
        return round(desc.descriptor[0][0], 5)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_charge(sequence):
    """Return the net charge of a peptide sequence at pH 7.

    The value is computed with modlamp's ``GlobalDescriptor.calculate_charge``
    (called with ``ph=7, amide=True``) and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded charge, or ``None`` if any exception is raised while
        computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.calculate_charge(ph=7, amide=True)
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_instability_index(sequence):
    """Return the instability index of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.instability_index``
    and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded instability index, or ``None`` if any exception is raised
        while computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.instability_index()
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_aromaticity(sequence):
    """Return the aromaticity of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.aromaticity``
    and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded aromaticity, or ``None`` if any exception is raised while
        computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.aromaticity()
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_aliphatic_index(sequence):
    """Return the aliphatic index of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.aliphatic_index``
    and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded aliphatic index, or ``None`` if any exception is raised
        while computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.aliphatic_index()
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_boman_index(sequence):
    """Return the Boman index of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.boman_index``
    and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded Boman index, or ``None`` if any exception is raised while
        computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.boman_index()
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

def get_hydrophobic_ratio(sequence):
    """Return the hydrophobic ratio of a peptide sequence.

    The value is computed with modlamp's ``GlobalDescriptor.hydrophobic_ratio``
    and rounded to 4 decimal places.

    Args:
        sequence: Peptide sequence handed to ``GlobalDescriptor``
            (presumably one-letter amino-acid codes).

    Returns:
        The rounded hydrophobic ratio, or ``None`` if any exception is raised
        while computing the descriptor.
    """
    try:
        desc = GlobalDescriptor([sequence])
        desc.hydrophobic_ratio()
        # A single sequence was passed in, so the value sits at [0][0].
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: failures yield None, but KeyboardInterrupt / SystemExit
        # are deliberately not swallowed so a long run can still be stopped.
        return None

In [66]:
def get_frequency_aa(sequence):
    """Return the percentage of each of 20 residue letters in ``sequence``.

    Args:
        sequence: String whose characters are counted.

    Returns:
        A list of 20 floats (0-100), one per residue in the fixed order
        A, C, D, E, F, G, H, I, N, K, L, M, P, Q, R, S, T, V, W, Y.
        Characters outside that set count towards the length but not
        towards any entry. An empty sequence yields 20 zeros.
    """
    # The order is significant: it has to line up with the residue column
    # names used in coding_df (note that 'N' comes before 'K').
    list_residues = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'K', 'L', 'M', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

    length = len(sequence)
    if length == 0:
        # Nothing to count; avoid dividing by zero and report a zero
        # composition instead of raising ZeroDivisionError.
        return [0.0] * len(list_residues)

    return [sequence.count(residue) / length * 100 for residue in list_residues]


In [67]:
def coding_df(df_data, task):
    """Build a table of physicochemical descriptors for every sequence.

    For each value in ``df_data["sequence"]`` one output row is produced with
    the sequence itself, nine global descriptors (from the ``get_*`` helpers)
    and the 20 residue percentages from ``get_frequency_aa``.

    Args:
        df_data: DataFrame that contains a ``"sequence"`` column.
        task: Label written into the ``"Task"`` column of every output row.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the columns
        ``sequence``, the nine descriptor columns, the 20 residue columns
        and ``Task``. Only the ``"sequence"`` column of ``df_data`` is used.
    """
    descriptor_columns = [
        "sequence", "Molecular weight", "Isoelectric Point",
        "Charge Density", "Charge", "Instability Index", "Aromaticity",
        "Aliphatic Index", "Boman Index", "Hydrophobic Ratio",
    ]
    # Same order as the list returned by get_frequency_aa ('N' before 'K').
    residue_columns = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'K', 'L', 'M',
        'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
    ]

    matrix_description = []

    # Iterate over the column values directly. Looking each value up by its
    # index label (df["sequence"][label]) returns a Series instead of a
    # string as soon as labels are duplicated, e.g. after a concat.
    for sequence in df_data["sequence"]:
        row = [
            sequence,
            get_mw(sequence),
            get_isoelectric_point(sequence),
            get_charge_density(sequence),
            get_charge(sequence),
            get_instability_index(sequence),
            get_aromaticity(sequence),
            get_aliphatic_index(sequence),
            get_boman_index(sequence),
            get_hydrophobic_ratio(sequence),
        ]
        matrix_description.append(row + get_frequency_aa(sequence))

    df_description = pd.DataFrame(
        data=matrix_description,
        columns=descriptor_columns + residue_columns,
    )

    df_description["Task"] = task
    return df_description

In [68]:
# Load the six source datasets; the paths are relative to the directory the
# notebook is run from. Targets and paths are listed in the same order.
(
    df_data_antiviral,
    df_data_hlp,
    df_data_peplife,
    df_data_peptherdia,
    df_data_plifepred,
    df_data_serum,
) = map(pd.read_csv, (
    "Antiviral/antiviral_nomod_canon.csv",
    "HLP/HLP.csv",
    "PEPLIFE/PEP_LIFE_nomod_canon.csv",
    "PepTherDia/PepTherDia_nomod_canon.csv",
    "Plifepred/plifepred_nomod_canon.csv",
    "Serum/serum_nomod_canon.csv",
))

In [69]:
# Show the column names of the antiviral dataset.
print(df_data_antiviral.columns)

Index(['sequence', 'half_life_seconds', 'experimental_characteristics',
       'hl_category'],
      dtype='object')


In [70]:
# Compute the descriptor table for each dataset, tagging rows with its name.
# NOTE(review): the variable names (allergen, celiac, cytotoxic, ...) do not
# match the datasets they hold and look carried over from another notebook.
# They are kept because the concatenation cell refers to them; the "Task"
# label passed to coding_df is what identifies each dataset.
df_allergen_described = coding_df(df_data=df_data_antiviral, task="Antiviral")
df_celiac_described = coding_df(df_data=df_data_hlp, task="HLP")
df_cytotoxic_described = coding_df(df_data=df_data_peplife, task="PEPLIFE")
df_hemolytic_described = coding_df(df_data=df_data_peptherdia, task="PepTherDia")
df_neurotoxin_described = coding_df(df_data=df_data_plifepred, task="Plifepred")
df_toxic_described = coding_df(df_data=df_data_serum, task="Serum")

In [71]:
# Stack the six per-dataset descriptor tables row-wise into one frame.
# Each input carries its own 0..n-1 index, so ignore_index=True is used to
# give the combined frame unique row labels instead of repeated ones.
df_concat = pd.concat([
    df_allergen_described,
    df_celiac_described,
    df_cytotoxic_described,
    df_hemolytic_described,
    df_neurotoxin_described,
    df_toxic_described,
], axis=0, ignore_index=True)

df_concat.head()

  df_concat = pd.concat([


Unnamed: 0,sequence,Molecular weight,Isoelectric Point,Charge Density,Charge,Instability Index,Aromaticity,Aliphatic Index,Boman Index,Hydrophobic Ratio,...,M,P,Q,R,S,T,V,W,Y,Task
0,AAAMSQVTN,891.01,12.25,0.00112,0.996,44.6,0.0,65.5556,0.7033,0.5556,...,11.111111,0.0,11.111111,0.0,11.111111,11.111111,11.111111,0.0,0.0,Antiviral
1,AACEVAKNLNESLIDLQELGKYEQYIKW,3268.71,4.6846,-0.00033,-1.068,46.1643,0.1071,104.6429,1.3254,0.3929,...,0.0,0.0,7.142857,0.0,3.571429,0.0,3.571429,3.571429,7.142857,Antiviral
2,AAGAVVNDL,827.93,6.5,-0.0,-0.004,-0.5444,0.0,141.1111,-0.4456,0.6667,...,0.0,0.0,0.0,0.0,0.0,0.0,22.222222,0.0,0.0,Antiviral
3,AAHLIDALYAEFLGGRVLTT,2130.45,7.6719,5e-05,0.096,23.265,0.1,132.0,-0.0175,0.55,...,0.0,0.0,0.0,5.0,0.0,10.0,5.0,0.0,5.0,Antiviral
4,AAHLIDALYAEFLGGRVLTTPVVHRALFYASAVLRQPFLAGVPSA,4779.56,10.1865,0.00046,2.194,58.3624,0.1111,121.5556,-0.0053,0.5556,...,0.0,6.666667,2.222222,6.666667,4.444444,4.444444,11.111111,0.0,4.444444,Antiviral


In [72]:
# Write the combined descriptor table to disk; the row index is left out.
df_concat.to_csv(path_or_buf="described_dataset_full.csv", index=False)