In [36]:
import pandas as pd

import importlib.util
import os

# Load the shared helper module directly from its file and expose it as `funct`.
# The relative path is resolved against the current working directory.
funct_path = os.path.abspath('../../utils/functions.py')
spec = importlib.util.spec_from_file_location(name="functions", location=funct_path)
funct = importlib.util.module_from_spec(spec)
# module_from_spec only creates the module object; exec_module runs its source
# so that the helpers become available as attributes of `funct`.
spec.loader.exec_module(funct)

In [37]:
# Read every source dataset from data/regular; the files are read in the
# order listed and bound to one variable per dataset.
antiviral, hlp_10, hlp_16, peplife, peptherdia, plifepred, serum = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/regular/Antiviral.csv',
        "../../data/regular/HLP_10.csv",
        "../../data/regular/HLP_16.csv",
        "../../data/regular/PEP_LIFE.csv",
        "../../data/regular/PepTherDia.csv",
        "../../data/regular/Plifepred.csv",
        "../../data/regular/Serum.csv",
    )
)

In [38]:
# Registry of the loaded datasets keyed by a display label, so the following
# cells can process all of them in a single loop.
dataframes = dict(zip(
    ("Antiviral", "HLP_10", "HLP_16", "PEP_LIFE", "PepTherDia", "Plifepred", "Serum"),
    (antiviral, hlp_10, hlp_16, peplife, peptherdia, plifepred, serum),
))

In [39]:
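# PepTherDia and Plifepred are checked for duplicates on ('sequence', 'modifications');
# every other dataset uses ('sequence', 'experimental_characteristics').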

In [40]:
# Sequences must be strictly shorter than this many characters to be kept.
MAX_SEQUENCE_LENGTH = 50
# Datasets whose duplicates are keyed on 'modifications' instead of
# 'experimental_characteristics'.
MODIFICATION_KEYED_DATASETS = ('PepTherDia', 'Plifepred')

for name, df in dataframes.items():
    # Second column (next to 'sequence') handed to the duplicate report.
    duplicate_key = (
        'modifications'
        if name in MODIFICATION_KEYED_DATASETS
        else 'experimental_characteristics'
    )

    # Duplicate report on the data as loaded, before the length filter.
    funct.show_duplicates(df, name, 'sequence', duplicate_key)

    # The helper column is added to the loaded frame itself (as before), so the
    # original per-dataset variables also carry 'length_sequence'.
    df["length_sequence"] = df["sequence"].str.len()

    # .copy() makes the filtered result an independent frame instead of a slice
    # of the loaded one, so later cells can modify it without ambiguity or
    # SettingWithCopyWarning. Rows whose sequence is missing have a NaN length
    # and are dropped by the comparison as well.
    dataframes[name] = df[df["length_sequence"] < MAX_SEQUENCE_LENGTH].copy()

    # Same report after the filter, to show how many rows the cutoff removed.
    funct.show_duplicates(dataframes[name], name, 'sequence', duplicate_key)

DataFrame: Antiviral, Duplicated Rows Shape: (457, 5)
DataFrame: Antiviral, Unique Rows Shape: (3316, 5)
DataFrame: Antiviral, Duplicated Rows Shape: (439, 6)
DataFrame: Antiviral, Unique Rows Shape: (3281, 6)
DataFrame: HLP_10, Duplicated Rows Shape: (0, 4)
DataFrame: HLP_10, Unique Rows Shape: (189, 4)
DataFrame: HLP_10, Duplicated Rows Shape: (0, 5)
DataFrame: HLP_10, Unique Rows Shape: (189, 5)
DataFrame: HLP_16, Duplicated Rows Shape: (0, 4)
DataFrame: HLP_16, Unique Rows Shape: (186, 4)
DataFrame: HLP_16, Duplicated Rows Shape: (0, 5)
DataFrame: HLP_16, Unique Rows Shape: (186, 5)
DataFrame: PEP_LIFE, Duplicated Rows Shape: (194, 5)
DataFrame: PEP_LIFE, Unique Rows Shape: (937, 5)
DataFrame: PEP_LIFE, Duplicated Rows Shape: (181, 6)
DataFrame: PEP_LIFE, Unique Rows Shape: (884, 6)
DataFrame: PepTherDia, Duplicated Rows Shape: (4, 5)
DataFrame: PepTherDia, Unique Rows Shape: (38, 5)
DataFrame: PepTherDia, Duplicated Rows Shape: (4, 6)
DataFrame: PepTherDia, Unique Rows Shape: (38,

In [41]:
# Deduplicate every dataset and store the result back in the registry.
# NOTE(review): the recorded output of this cell still reports 2 duplicated rows
# for PEP_LIFE after remove_duplicates -- confirm whether that is expected.
for name, df in dataframes.items():
    print(f"Procesando dataframe: {name}")

    # PepTherDia and Plifepred are keyed on 'modifications'; all other
    # datasets are keyed on 'experimental_characteristics'.
    if name in ('PepTherDia', 'Plifepred'):
        key_column = 'modifications'
    else:
        key_column = 'experimental_characteristics'

    deduped = funct.remove_duplicates(df, pd, 'sequence', key_column, 'half_life_seconds')
    # Report what is left so the effect of the deduplication is visible in the output.
    funct.show_duplicates(deduped, name, 'sequence', key_column)
    dataframes[name] = deduped

Procesando dataframe: Antiviral
DataFrame: Antiviral, Duplicated Rows Shape: (0, 6)
DataFrame: Antiviral, Unique Rows Shape: (3456, 6)
Procesando dataframe: HLP_10
DataFrame: HLP_10, Duplicated Rows Shape: (0, 5)
DataFrame: HLP_10, Unique Rows Shape: (189, 5)
Procesando dataframe: HLP_16
DataFrame: HLP_16, Duplicated Rows Shape: (0, 5)
DataFrame: HLP_16, Unique Rows Shape: (186, 5)
Procesando dataframe: PEP_LIFE
DataFrame: PEP_LIFE, Duplicated Rows Shape: (2, 6)
DataFrame: PEP_LIFE, Unique Rows Shape: (893, 6)
Procesando dataframe: PepTherDia
DataFrame: PepTherDia, Duplicated Rows Shape: (0, 6)
DataFrame: PepTherDia, Unique Rows Shape: (38, 6)
Procesando dataframe: Plifepred
DataFrame: Plifepred, Duplicated Rows Shape: (0, 6)
DataFrame: Plifepred, Unique Rows Shape: (248, 6)
Procesando dataframe: Serum
DataFrame: Serum, Duplicated Rows Shape: (0, 6)
DataFrame: Serum, Unique Rows Shape: (254, 6)


In [42]:
# Destination folder for the per-dataset CSVs that contain only unmodified peptides.
NOT_MOD_DIR = "../../data/not_mod"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# folder exists before the loop (previously the export failed on a fresh checkout).
os.makedirs(NOT_MOD_DIR, exist_ok=True)

for key, dfres in dataframes.items():
    # Work on a re-indexed frame instead of mutating in place.
    dfres = dfres.reset_index(drop=True)

    # Explicit comparisons are kept on purpose: a missing 'is_mod' value is
    # counted in neither group.
    count_mod = (dfres['is_mod'] == True).sum()      # modified peptides
    count_nomod = (dfres['is_mod'] == False).sum()   # unmodified peptides
    print(f"{key}: {count_mod} modificados, {count_nomod} no modificados")

    # Derive a category from the half-life value with the shared helper.
    dfres['hl_category'] = dfres['half_life_seconds'].apply(funct.categorize_hl)
    # Store the labelled frame (modified and unmodified rows) back in the registry.
    dataframes[key] = dfres

    # Build the export under its own name so the frame stored in the registry is
    # not shadowed: keep only unmodified peptides and drop the columns that are
    # meaningless for them.
    unmodified = dfres[dfres['is_mod'] == False].drop(columns=['is_mod'])
    if 'modifications' in unmodified.columns:
        unmodified = unmodified.drop(columns=['modifications'])

    # Only write a file when at least one unmodified sequence exists.
    if not unmodified.empty:
        unmodified.to_csv(f"{NOT_MOD_DIR}/{key}_nomod.csv", index=False)
    else:
        print(f"{key} no se ha podido guardar en archivo csv porque no existen secuencias sin modificaciones")

Antiviral: 1242 modificados, 2214 no modificados
HLP_10: 0 modificados, 189 no modificados
HLP_16: 0 modificados, 186 no modificados
PEP_LIFE: 702 modificados, 193 no modificados
PepTherDia: 38 modificados, 0 no modificados
PepTherDia no se ha podido guardar en archivo csv porque no existen secuencias sin modificaciones
Plifepred: 69 modificados, 179 no modificados
Serum: 173 modificados, 81 no modificados
