In [5]:
import pandas as pd
from modlamp.descriptors import GlobalDescriptor

In [6]:
# *args captures any additional positional arguments passed to the function.
# **kwargs captures any additional keyword arguments passed to the function.

In [7]:
def get_calc_desc(sequence, method_name, *args, **kwargs):
    """Compute one ``GlobalDescriptor`` value for a single sequence.

    Args:
        sequence: Sequence to describe; it is wrapped in a one-element list
            before being handed to ``GlobalDescriptor``.
        method_name: Name of the ``GlobalDescriptor`` method to call
            (for example ``"calculate_MW"``).
        *args: Extra positional arguments forwarded to that method.
        **kwargs: Extra keyword arguments forwarded to that method.

    Returns:
        ``desc.descriptor[0][0]`` rounded to 4 decimal places, or ``None`` if
        building the descriptor, resolving the method, or running it raised
        an ordinary exception.
    """
    try:
        desc = GlobalDescriptor([sequence])
        # Resolve the method by name so this one helper serves every descriptor.
        method = getattr(desc, method_name)
        method(*args, **kwargs)
        # The result is read back from the object after the call.
        return round(desc.descriptor[0][0], 4)
    except Exception:
        # Best-effort: a failed descriptor becomes a missing value. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate so a long run can still be stopped.
        return None

In [8]:
def get_frequency_aa(sequence):
    """Return the percentage of each residue letter in ``sequence``.

    Args:
        sequence: String of one-letter residue codes.

    Returns:
        A list of 20 floats, one per letter of ``list_residues`` (in that
        order), each being ``count / len(sequence) * 100``. An empty sequence
        yields 20 zeros instead of raising ``ZeroDivisionError``.
    """
    # Note the order: 'N' comes before 'K'. Consumers that label these values
    # must use the same order.
    list_residues = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'K',
                     'L', 'M', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

    length = len(sequence)
    if length == 0:
        # No residues at all: every frequency is zero by definition.
        return [0.0] * len(list_residues)

    return [sequence.count(residue) / length * 100 for residue in list_residues]


In [9]:
def coding_df(df_data, dataset):
    """Build a descriptor table with one row per sequence in ``df_data``.

    Args:
        df_data: DataFrame providing a ``"sequence"`` column and an
            ``"hl_category"`` column.
        dataset: Value written to the ``"Name Dataset"`` column of every row.

    Returns:
        A new DataFrame (fresh default index) whose columns are the sequence,
        nine descriptors obtained through ``get_calc_desc``, ``hl_category``,
        the 20 residue percentages from ``get_frequency_aa`` and
        ``"Name Dataset"``.
    """
    columns = [
        "sequence", "Molecular weight", "Isoelectric Point",
        "Charge Density", "Charge", "Instability Index", "Aromaticity",
        "Aliphatic Index", "Boman Index", "Hydrophobic Ratio", "hl_category",
        # Residue columns: same order as the list returned by get_frequency_aa.
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'K', 'L', 'M',
        'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
    ]

    matrix_description = []

    # Walk the column values directly instead of looking each row up by index
    # label: a label lookup returns a Series (not a scalar) when labels repeat,
    # e.g. on a frame produced by concatenation.
    for sequence, hl_cat in zip(df_data["sequence"], df_data["hl_category"]):
        row = [
            sequence,
            get_calc_desc(sequence, "calculate_MW", amide=True),
            get_calc_desc(sequence, "isoelectric_point", amide=True),
            get_calc_desc(sequence, "charge_density", ph=7, amide=True),
            get_calc_desc(sequence, "calculate_charge", ph=7, amide=True),
            get_calc_desc(sequence, "instability_index"),
            get_calc_desc(sequence, "aromaticity"),
            get_calc_desc(sequence, "aliphatic_index"),
            get_calc_desc(sequence, "boman_index"),
            get_calc_desc(sequence, "hydrophobic_ratio"),
            hl_cat,
        ]
        matrix_description.append(row + get_frequency_aa(sequence))

    df_description = pd.DataFrame(data=matrix_description, columns=columns)
    df_description["Name Dataset"] = dataset
    return df_description

In [10]:
# Load each source dataset from its CSV file (paths are relative to the
# notebook's working directory).
antiviral = pd.read_csv("../../data/Antiviral.csv")
hlp_10 = pd.read_csv("../../data/HLP_10.csv")
hlp_16 = pd.read_csv("../../data/HLP_16.csv")
peplife = pd.read_csv("../../data//PEP_LIFE.csv")
# NOTE(review): `peptherdia` is loaded here but is not passed to coding_df in
# the cells visible below - confirm whether that omission is intentional.
peptherdia = pd.read_csv("../../data/PepTherDia.csv")
plifepred = pd.read_csv("../../data/Plifepred.csv")
serum = pd.read_csv("../../data/Serum.csv")

In [11]:
# Compute the descriptor table for each loaded dataset, tagging every row
# with the name of the dataset it came from.
df_antiviral_described = coding_df(df_data=antiviral, dataset="Antiviral")
df_hlp_10_described = coding_df(df_data=hlp_10, dataset="HLP_10")
df_hlp_16_described = coding_df(df_data=hlp_16, dataset="HLP_16")
df_peplife_described = coding_df(df_data=peplife, dataset="PEPLIFE")
df_plifepred_described = coding_df(df_data=plifepred, dataset="Plifepred")
df_serum_described = coding_df(df_data=serum, dataset="Serum")

In [12]:
# Stack the per-dataset descriptor tables row-wise into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it every dataset keeps
# its own 0-based labels, so the combined frame would carry repeated index
# labels and label-based lookups would return several rows.
df_concat = pd.concat(
    [
        df_antiviral_described,
        df_hlp_10_described,
        df_hlp_16_described,
        df_peplife_described,
        df_plifepred_described,
        df_serum_described,
    ],
    axis=0,
    ignore_index=True,
)

df_concat.head()

Unnamed: 0,sequence,Molecular weight,Isoelectric Point,Charge Density,Charge,Instability Index,Aromaticity,Aliphatic Index,Boman Index,Hydrophobic Ratio,...,M,P,Q,R,S,T,V,W,Y,Name Dataset
0,AAAMSQVTN,891.01,12.25,0.0011,0.996,44.6,0.0,65.5556,0.7033,0.5556,...,11.111111,0.0,11.111111,0.0,11.111111,11.111111,11.111111,0.0,0.0,Antiviral
1,AACEVAKNLNESLIDLQELGKYEQYIKW,3268.71,4.6846,-0.0003,-1.068,46.1643,0.1071,104.6429,1.3254,0.3929,...,0.0,0.0,7.142857,0.0,3.571429,0.0,3.571429,3.571429,7.142857,Antiviral
2,AAGAVVNDL,827.93,6.5,-0.0,-0.004,-0.5444,0.0,141.1111,-0.4456,0.6667,...,0.0,0.0,0.0,0.0,0.0,0.0,22.222222,0.0,0.0,Antiviral
3,AAHLIDALYAEFLGGRVLTT,2130.46,7.6719,0.0,0.096,23.265,0.1,132.0,-0.0175,0.55,...,0.0,0.0,0.0,5.0,0.0,10.0,5.0,0.0,5.0,Antiviral
4,AAHLIDALYAEFLGGRVLTTPVVHRALFYASAVLRQPFLAGVPSA,4779.56,10.1865,0.0005,2.194,58.3624,0.1111,121.5556,-0.0053,0.5556,...,0.0,6.666667,2.222222,6.666667,4.444444,4.444444,11.111111,0.0,4.444444,Antiviral


In [13]:
# Persist the combined descriptor table; the row index is not written.
df_concat.to_csv(path_or_buf="described_dataset_full.csv", index=False)