In [1]:
import pandas as pd

In [2]:
def check_peptipedia(df):
    df_pivote = pd.read_csv("../raw/pivoted_sequences_non_filter.csv")
    df["in_peptipedia"] = df["sequence"].isin(df_pivote["sequence"])

    pep_in_peptipedia = df[df["in_peptipedia"] == True]
    pep_in_peptipedia = pd.merge(pep_in_peptipedia, df_pivote, how="inner", on="sequence")
    pep_non_in_peptipedia = df[df["in_peptipedia"] == False]

    return pep_in_peptipedia, pep_non_in_peptipedia

In [3]:
def canon_or_notcanon(sequence):
     alphabet = set("ACDEFGHIKLMNPQRSTVWY")
     sequence = sequence.strip()
     is_canon = True
     for res in set(sequence):
         if res not in alphabet:
             is_canon = False
     return is_canon

In [4]:
def get_path(name):
    for file in list_file:
        if file['name'] == name:
            return file['path']
    return None

In [5]:
# Cambia los valores NaN de la columna is_mod por False
def mod_false(df):
    df['is_mod'].fillna('False', inplace=True)
    return df

In [6]:
list_file = [
    {"name": "H10", "path": "/HLP_10.csv"},
    {"name": "H16", "path": "/HLP_16.csv"},
    {"name": "PL", "path": "/PEP_LIFE.csv"},
    {"name": "PP", "path": "/plifepred.csv"},
    #{"name": "LN", "path": "/LIST_NULL.csv"},
    #{"name": "LN2", "path": "/LIST_NULL_2.csv"}
    #{"name": "THP", "path": "/thpdb.csv"}
]
data_total_pep=pd.DataFrame()
data_total_nopep=pd.DataFrame()

In [7]:
folder="../datasets"
for file in list_file:
    path = get_path(file['name'])
    
    if path:
        df = pd.read_csv(folder + path)
        print(f"Archivo {file['name']} cargado con éxito.")
        dataset_in_peptipedia, dataset_not_in_peptipedia = check_peptipedia(df)
        data_total_pep=pd.concat([data_total_pep, dataset_in_peptipedia])
        data_total_nopep=pd.concat([data_total_nopep, dataset_not_in_peptipedia])
    else:
        print(f"No se encontró la ruta para {file['name']}.")

Archivo H10 cargado con éxito.
Archivo H16 cargado con éxito.
Archivo PL cargado con éxito.
Archivo PP cargado con éxito.


In [8]:
mod_false(data_total_pep)
mod_false(data_total_nopep)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['is_mod'].fillna('False', inplace=True)


Unnamed: 0,sequence,half_life_seconds,in_peptipedia,modifications,is_mod
0,GSIGAASMEF,2.7013,False,,False
1,IGAASMEFCF,0.0629,False,,False
2,AASMEFCFDV,0.0556,False,,False
3,SMEFCFDVFK,0.0103,False,,False
4,EFCFDVFKEL,0.0088,False,,False
...,...,...,...,...,...
250,EAKSQGGSN,150.0000,False,,False
251,GCRFGTCT,67.5000,False,,False
254,FPRPGGGGNGDFEEIPEEYL,1500.0000,False,,False
258,KCNTATCATQRLANFLVHSSNNFGPILPPTNVGSNTY,2880.0000,False,Amidation,True


In [9]:
data_total_pep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,Anti_HIV,Therapeutic,Anti_nematode,Anti_coronaviridae,Anti_feline_coronavirus,Enzyme_inhibitor,Anti_west_nile_virus,...,Anti_puumala_virus,Anti_amnesic,Cytokine,Anti_hendra_virus,Potentiator,Neuropeptide,Anti_white_spot_syndrome_virus,Anti_herpesviridae,modifications,is_mod
0,YRGGLEPINF,0.0238,True,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,,False
1,ASEKMKILEL,0.0103,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,False
2,VLLPDEVSGL,8.3587,True,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,,False
0,TEWTSSNVMEERKIKV,0.0139,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,False
1,MEERKIKVYLPRMKME,0.0014,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,FNAPFDVGIKLSGVQYQQHSQAL,3906.0000,True,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Amidation,True
105,CSNLSTCVLGKLSQELHKLQTYPRTNTGSGTP,7380.0000,True,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,False
106,CYFQNCPRG,900.0000,True,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Amidation,True
107,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,13680.0000,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Acetylation, Amidation",True


In [10]:
#Ver en orden alfabético
data_total_pep.sort_values(by="sequence", inplace=True)
data_total_pep["is_canon"] = data_total_pep["sequence"].apply(canon_or_notcanon)
data_total_pep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,Anti_HIV,Therapeutic,Anti_nematode,Anti_coronaviridae,Anti_feline_coronavirus,Enzyme_inhibitor,Anti_west_nile_virus,...,Anti_amnesic,Cytokine,Anti_hendra_virus,Potentiator,Neuropeptide,Anti_white_spot_syndrome_virus,Anti_herpesviridae,modifications,is_mod,is_canon
123,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,414.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,,False,True
96,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,258.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,,False,True
215,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,216.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,"C-Terminal Amidation, Cyclic",True,True
85,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,216.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,Amidation,True,True
36,AD,3840.0,True,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,10080.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,"N-Terminal, Chemical Modification: 19 residue ...",True,True
78,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,31680.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,,False,True
23,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,31680.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,,False,True
189,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,5400.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,"C-Terminal Amidation, N-Terminal",True,True


In [11]:
data_total_nopep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,modifications,is_mod
0,GSIGAASMEF,2.7013,False,,False
1,IGAASMEFCF,0.0629,False,,False
2,AASMEFCFDV,0.0556,False,,False
3,SMEFCFDVFK,0.0103,False,,False
4,EFCFDVFKEL,0.0088,False,,False
...,...,...,...,...,...
250,EAKSQGGSN,150.0000,False,,False
251,GCRFGTCT,67.5000,False,,False
254,FPRPGGGGNGDFEEIPEEYL,1500.0000,False,,False
258,KCNTATCATQRLANFLVHSSNNFGPILPPTNVGSNTY,2880.0000,False,Amidation,True


In [12]:
canon_pep_list = []
canon_nopep_list = []
notcanon_nopep_list = []
notcanon_pep_list = []

for sequence in data_total_pep["sequence"].unique():
    if canon_or_notcanon(sequence):
        canon_pep_list.append(sequence)
    else:
        notcanon_pep_list.append(sequence)

for sequence in data_total_nopep["sequence"].unique():
    if canon_or_notcanon(sequence):
        canon_nopep_list.append(sequence)
    else:
        notcanon_nopep_list.append(sequence)
df_total_canon_pep = data_total_pep[data_total_pep["sequence"].isin(canon_pep_list)].drop_duplicates()
df_total_canon_nopep = data_total_nopep[data_total_nopep["sequence"].isin(canon_nopep_list)].drop_duplicates()
df_total_notcanon_nopep = data_total_nopep[data_total_nopep["sequence"].isin(notcanon_nopep_list)].drop_duplicates()
df_total_notcanon_pep = data_total_pep[data_total_pep["sequence"].isin(notcanon_pep_list)].drop_duplicates()

for sequence in data_total_pep["sequence"]:
    if canon_or_notcanon(sequence) == False:
        continue
    else:
        df_total_canon_pep = pd.concat([df_total_canon_pep, data_total_pep[data_total_pep["sequence"] == sequence]])
for sequence in data_total_nopep["sequence"]:
    if canon_or_notcanon(sequence) == False:
        df_total_notcanon_nopep = pd.concat([df_total_notcanon_nopep, data_total_nopep[data_total_nopep["sequence"] == sequence]])
    else:
        df_total_canon_nopep = pd.concat([df_total_canon_nopep, data_total_nopep[data_total_nopep["sequence"] == sequence]])

In [13]:
df_total_canon_pep.reset_index(drop=True, inplace=True)
df_total_canon_nopep.reset_index(drop=True, inplace=True)
df_total_notcanon_nopep.reset_index(drop=True, inplace=True)
df_total_notcanon_pep.reset_index(drop=True, inplace=True)

In [14]:
print(f"Duplicados en data_total_pep: {data_total_pep.duplicated().sum()}")
print(f"Duplicados en data_total_nopep: {data_total_nopep.duplicated().sum()}")

Duplicados en data_total_pep: 43
Duplicados en data_total_nopep: 67


In [15]:
df_total_canon_pep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,Anti_HIV,Therapeutic,Anti_nematode,Anti_coronaviridae,Anti_feline_coronavirus,Enzyme_inhibitor,Anti_west_nile_virus,...,Anti_amnesic,Cytokine,Anti_hendra_virus,Potentiator,Neuropeptide,Anti_white_spot_syndrome_virus,Anti_herpesviridae,modifications,is_mod,is_canon
0,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,414.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,,False,True
1,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,258.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,,False,True
2,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,216.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,"C-Terminal Amidation, Cyclic",True,True
3,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,216.0,True,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,Amidation,True,True
4,AD,3840.0,True,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,13680.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,"Acetylation, Amidation",True,True
312,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,10080.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,"N-Terminal, Chemical Modification: 19 residue ...",True,True
313,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,31680.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,,False,True
314,YTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWF,5400.0,True,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,"C-Terminal Amidation, N-Terminal",True,True


In [16]:
df_total_canon_nopep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,modifications,is_mod
0,GSIGAASMEF,2.7013,False,,False
1,IGAASMEFCF,0.0629,False,,False
2,AASMEFCFDV,0.0556,False,,False
3,SMEFCFDVFK,0.0103,False,,False
4,EFCFDVFKEL,0.0088,False,,False
...,...,...,...,...,...
1070,RRSSCFGGRIDIIGAQSGLGCNSFRY,246.0000,False,,False
1071,SDAAVDTSSEITTKDLKEKKEVVEEAENRKDVY,8400.0000,False,Acetylation,True
1072,FPRPGGGGNGDFEEIPEEYL,1500.0000,False,,False
1073,KCNTATCATQRLANFLVHSSNNFGPILPPTNVGSNTY,2880.0000,False,Amidation,True


In [17]:
if df_total_notcanon_pep.shape[0] == 0:
    print("No hay secuencias no canónicas en peptipedia.")

In [18]:
df_total_notcanon_pep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,Anti_HIV,Therapeutic,Anti_nematode,Anti_coronaviridae,Anti_feline_coronavirus,Enzyme_inhibitor,Anti_west_nile_virus,...,Anti_amnesic,Cytokine,Anti_hendra_virus,Potentiator,Neuropeptide,Anti_white_spot_syndrome_virus,Anti_herpesviridae,modifications,is_mod,is_canon
0,Pyr-HP,180.0,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"C-Terminal Amidation, Chemical Modification: P...",True,False
1,Pyr-HP,1200.0,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Chemical Modification: Pyr=pyroglutamic acid,True,False


In [19]:
df_total_notcanon_nopep

Unnamed: 0,sequence,half_life_seconds,in_peptipedia,modifications,is_mod
0,aAGIGILTv,,False,Stereochemistry,True
1,LP-(NMe)-FFD,86400.0,False,"C-Terminal Amidation, N-Terminal, Chemical Mod...",True
2,LP-(NMe)-FF-(NMe)-D,86400.0,False,"C-Terminal Amidation, N-Terminal, Chemical Mod...",True
3,LP-(NMe)-F-(NMe)-FD,86400.0,False,"C-Terminal Amidation, N-Terminal, Chemical Mod...",True
4,LP-(NMe)-F-(NMe)-F-(NMe)-D,86400.0,False,"C-Terminal Amidation, N-Terminal, Chemical Mod...",True
...,...,...,...,...,...
265,WAGaDASGE,774.0,False,,False
266,WAGaDASGE,612.0,False,Amidation,True
267,H-Aib-EGTFTSDVSSYLEGQAAKEFIAWLVK-Aib-R,35280.0,False,Amidation,True
268,YaGFlC,546.0,False,,False


In [20]:
df_total_canon_pep.to_csv("../datasets/canon/peptipedia_canon.csv", index=False)
df_total_canon_nopep.to_csv("../datasets/canon/no_peptipedia_canon.csv", index=False)
df_total_notcanon_nopep.to_csv("../datasets/no_canon/no_peptipedia_notcanon.csv", index=False)
df_total_notcanon_pep.to_csv("../datasets/no_canon/peptipedia_notcanon.csv", index=False)