In [135]:
import pandas as pd
import importlib.util
import os

# Absolute path of the shared helper module; the relative path is resolved
# against the notebook's current working directory.
helpers_relative_path = '../../utils/functions.py'
funct_path = os.path.abspath(helpers_relative_path)

In [136]:
# Load the project's helper functions straight from their source file, since
# the utils folder is referenced by path rather than imported as a package.
spec = importlib.util.spec_from_file_location(name="functions", location=funct_path)
funct = importlib.util.module_from_spec(spec)
# Run the module body so its functions become attributes of `funct`.
spec.loader.exec_module(funct)

In [137]:
# Load the raw antiviral-peptide spreadsheet; the first row holds the column names.
df = pd.read_excel(
    io='../../data/raw/antiviral_peptides.xlsx',
    header=0,
)

In [138]:
# Shorten the three half-life column names to the organism they refer to.
df = df.rename(
    columns={
        'Half_Life_Mammalian_': 'Mammalian',
        'Half_Life__Yeast_': 'Yeast',
        'Half_Life__E_coli_': 'E_coli',
    }
)

In [139]:
# Turn the organisms into a category: each sequence row carried three half-life
# values (one per organism). Melting puts every value on its own row, so a
# single sequence can now appear with up to three half-lives, one per organism.
descriptor_columns = [
    'Sequence', 'Source', 'Target_Organism', 'Assay', 'Linear_Cyclic',
    'N-terminal_Modification', 'C-terminal_Modification', 'Other_Modification',
    'Binding_Target',
]
organism_columns = ['Mammalian', 'Yeast', 'E_coli']
df_melted = df.melt(
    id_vars=descriptor_columns,
    value_vars=organism_columns,
    var_name='experimental_characteristics',
    value_name='Half_Life',
)

In [140]:
# Sort the rows by sequence so that all entries of one peptide sit together.
# Reassigning (instead of inplace=True) keeps the step explicit about which
# object the name refers to and follows the usual pandas chaining style.
df_melted = df_melted.sort_values(by='Sequence')

In [141]:
# Merge the modification columns into a single column and add a flag telling
# whether the sequence is modified. funct.mod_false lives in utils/functions.py
# (not shown here); its return value is discarded, so it presumably adds the
# `modifications` and `is_mod` columns to df_melted in place -- TODO confirm.
funct.mod_false(df_melted, pd.notna)

# The per-position modification columns are dropped once merged, and the
# sequence column is renamed to lower case.
position_modification_columns = ['N-terminal_Modification', 'C-terminal_Modification', 'Other_Modification']
df_coldrop = df_melted.drop(columns=position_modification_columns).rename(columns={'Sequence': 'sequence'})

# Normalise the sequences to upper case.
df_coldrop['sequence'] = df_coldrop['sequence'].str.upper()

# Remove the 'Free' / 'None' placeholder fragments together with their comma.
placeholder_fragments = [', Free', 'Free,', ', None', 'None,']
df_coldrop['modifications'] = df_coldrop['modifications'].replace(placeholder_fragments, '', regex=True)

In [142]:
# Preview the frame after the modification columns were merged and the sequences upper-cased.
df_coldrop

Unnamed: 0,sequence,Source,Target_Organism,Assay,Linear_Cyclic,Binding_Target,experimental_characteristics,Half_Life,is_mod,modifications
4420,AAAMSQVTN,Synthetic construct,HIV,Protease inhibition assay,Linear,autolysis,E_coli,>10 hour,False,
448,AAAMSQVTN,Synthetic construct,HIV,Protease inhibition assay,Linear,autolysis,Mammalian,4.4 hour,False,
2434,AAAMSQVTN,Synthetic construct,HIV,Protease inhibition assay,Linear,autolysis,Yeast,>20 hour,False,
5584,AACEVAKNLNESLIDLQELGKYEQYIKW,Synthetic construct(derived from SARS-CoV spik...,SARS-CoV,Plaque formation assay,Linear,membrane,E_coli,>10 hour,False,
69,AACEVAKNLNESLIDLQELGKYEQYIKW,Synthetic construct,SARS-CoV,Plaque reduction assay,Linear,cell membrane,Mammalian,4.4 hour,False,
...,...,...,...,...,...,...,...,...,...,...
1559,YAGAVVNDL,Synthetic construct,HSV,Ribonucleotide reductase assay,Linear,ribonucleotide reductase,Mammalian,,True,Acetylation
5530,YAGAVVNDL,Synthetic construct,HSV,Ribonucleotide reductase assay,Linear,ribonucleotide reductase,E_coli,,False,
2124,YAIIXYNKYXNC,Synthetic construct,"ZIKV,DENV,WNV",protease enzymatic inhibition assay,Linear,NS2B-NS3 protease,Yeast,,True,Thioether-acyl moiety (−S-Ac−)(forms a macrocy...
4110,YAIIXYNKYXNC,Synthetic construct,"ZIKV,DENV,WNV",protease enzymatic inhibition assay,Linear,NS2B-NS3 protease,E_coli,,True,Thioether-acyl moiety (−S-Ac−)(forms a macrocy...


In [143]:
# Keep only the columns of interest, in the order listed below.
# DataFrame.filter ignores labels that are not present instead of raising.
df_coldrop = df_coldrop.filter(
    items=[
        'sequence',
        'experimental_characteristics',
        'Assay',
        'Target_Organism',
        'Binding_Target',
        'Source',
        'modifications',
        'is_mod',
        'Half_Life',
    ]
)

In [144]:
# Preview the frame after the column selection and reordering.
df_coldrop

Unnamed: 0,sequence,experimental_characteristics,Assay,Target_Organism,Binding_Target,Source,modifications,is_mod,Half_Life
4420,AAAMSQVTN,E_coli,Protease inhibition assay,HIV,autolysis,Synthetic construct,,False,>10 hour
448,AAAMSQVTN,Mammalian,Protease inhibition assay,HIV,autolysis,Synthetic construct,,False,4.4 hour
2434,AAAMSQVTN,Yeast,Protease inhibition assay,HIV,autolysis,Synthetic construct,,False,>20 hour
5584,AACEVAKNLNESLIDLQELGKYEQYIKW,E_coli,Plaque formation assay,SARS-CoV,membrane,Synthetic construct(derived from SARS-CoV spik...,,False,>10 hour
69,AACEVAKNLNESLIDLQELGKYEQYIKW,Mammalian,Plaque reduction assay,SARS-CoV,cell membrane,Synthetic construct,,False,4.4 hour
...,...,...,...,...,...,...,...,...,...
1559,YAGAVVNDL,Mammalian,Ribonucleotide reductase assay,HSV,ribonucleotide reductase,Synthetic construct,Acetylation,True,
5530,YAGAVVNDL,E_coli,Ribonucleotide reductase assay,HSV,ribonucleotide reductase,Synthetic construct,,False,
2124,YAIIXYNKYXNC,Yeast,protease enzymatic inhibition assay,"ZIKV,DENV,WNV",NS2B-NS3 protease,Synthetic construct,Thioether-acyl moiety (−S-Ac−)(forms a macrocy...,True,
4110,YAIIXYNKYXNC,E_coli,protease enzymatic inhibition assay,"ZIKV,DENV,WNV",NS2B-NS3 protease,Synthetic construct,Thioether-acyl moiety (−S-Ac−)(forms a macrocy...,True,


In [145]:
# Drop rows lacking a sequence or a half-life, then separate half-lives that
# are plain values (e.g. '4.4 hour') from those reported as bounds
# (e.g. '>10 hour').
df_coldrop = df_coldrop.dropna(subset=['sequence', 'Half_Life'])

# Series.str.startswith('>', '<') does NOT test two prefixes: the second
# positional argument is the `na` fill value, so values starting with '<' were
# never routed to df_object. str.match anchors at the start of the string and
# the character class covers both bound markers; na=False treats any
# non-string entry as "not a bound".
is_bound = df_coldrop['Half_Life'].str.match(r'[<>]', na=False)

# .copy() makes both results independent frames, so later in-place operations
# on them do not raise SettingWithCopyWarning.
df_float = df_coldrop[~is_bound].copy()
df_object = df_coldrop[is_bound].copy()

In [146]:
# Preview df_coldrop after rows lacking a sequence or half-life were dropped.
df_coldrop

Unnamed: 0,sequence,experimental_characteristics,Assay,Target_Organism,Binding_Target,Source,modifications,is_mod,Half_Life
4420,AAAMSQVTN,E_coli,Protease inhibition assay,HIV,autolysis,Synthetic construct,,False,>10 hour
448,AAAMSQVTN,Mammalian,Protease inhibition assay,HIV,autolysis,Synthetic construct,,False,4.4 hour
2434,AAAMSQVTN,Yeast,Protease inhibition assay,HIV,autolysis,Synthetic construct,,False,>20 hour
5584,AACEVAKNLNESLIDLQELGKYEQYIKW,E_coli,Plaque formation assay,SARS-CoV,membrane,Synthetic construct(derived from SARS-CoV spik...,,False,>10 hour
69,AACEVAKNLNESLIDLQELGKYEQYIKW,Mammalian,Plaque reduction assay,SARS-CoV,cell membrane,Synthetic construct,,False,4.4 hour
...,...,...,...,...,...,...,...,...,...
3549,YAGAVVNDL,Yeast,Ribonucleotide reductase assay,HSV,ribonucleotide reductase,Synthetic construct,,False,10 min
3550,YAGAVVNDL,Yeast,Ribonucleotide reductase assay,HSV,ribonucleotide reductase,Synthetic construct,Acetylation,True,10 min
5535,YAGAVVNDL,E_coli,Ribonucleotide reductase assay,HSV,ribonucleotide reductase,Synthetic construct,,False,2 min
1563,YAGAVVNDL,Mammalian,Ribonucleotide reductase assay,HSV,ribonucleotide reductase,Synthetic construct,,False,2.8 hour


In [147]:
# Convert half-lives expressed in hours or minutes to seconds.

# Keep only values of the form '<number> hour' or '<number> min'; rows in any
# other unit or format are dropped here. The unit alternation is a
# non-capturing group because str.contains emits a UserWarning when the pattern
# contains capture groups. .copy() detaches the filtered rows so the helper
# below can modify them without a SettingWithCopyWarning.
df_float = df_float[df_float['Half_Life'].str.contains(r'^\d+\.?\d*\s*(?:hour|min)$')].copy()

# Author's note (the helper itself lives in utils/functions.py, not shown here):
# a less efficient conversion was used originally; the current helper is said to
#   - use `apply` to process every value of the column,
#   - use a `lambda` (a small anonymous function) for the per-value logic,
#   - `split` each value into the number (index 0) and the time unit (index 1),
#   - `round` the converted value.
# The return value is discarded, so the column is presumably converted in
# place -- TODO confirm against the helper.
funct.handm_to_seconds(df_float, 'Half_Life')

  df_float = df_float[df_float['Half_Life'].str.contains(r'^\d+\.?\d*\s*(hour|min)$')]


In [148]:
# Renumber both tables from 0 and discard the index left over from the melt,
# sort and filtering steps. Reassignment is used instead of inplace=True so the
# frames obtained by filtering are never mutated in place.
df_float = df_float.reset_index(drop=True)
df_object = df_object.reset_index(drop=True)

In [149]:
# Name the half-life column after its content: numeric seconds in df_float,
# the original bound strings in df_object. rename(..., inplace=True) on a frame
# obtained by boolean filtering raises SettingWithCopyWarning; returning a new
# frame and rebinding the name avoids it.
df_float = df_float.rename(columns={'Half_Life': 'half_life_seconds'})
df_object = df_object.rename(columns={'Half_Life': 'half_life'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_object.rename(columns={'Half_Life': 'half_life'}, inplace=True)


In [150]:
# Remove the assay/source descriptor columns from the numeric table.
descriptor_columns_to_drop = ['Assay', 'Source', 'Target_Organism', 'Binding_Target']
df_float = df_float.drop(descriptor_columns_to_drop, axis='columns')

In [151]:
# Preview the numeric half-life table after the descriptor columns were removed.
df_float

Unnamed: 0,sequence,experimental_characteristics,modifications,is_mod,half_life_seconds
0,AAAMSQVTN,Mammalian,,False,15840.0
1,AACEVAKNLNESLIDLQELGKYEQYIKW,Mammalian,,False,15840.0
2,AACEVAKNLNESLIDLQELGKYEQYIKW,Mammalian,,False,15840.0
3,AAGAVVNDL,Mammalian,,False,15840.0
4,AAHLIDALYAEFLGGRVLTT,Mammalian,,False,15840.0
...,...,...,...,...,...
3768,YAGAVVNDL,Yeast,,False,36000.0
3769,YAGAVVNDL,Yeast,Acetylation,True,36000.0
3770,YAGAVVNDL,E_coli,,False,7200.0
3771,YAGAVVNDL,Mammalian,,False,10080.0


In [152]:
# Write the numeric half-life table to CSV without the row index.
df_float.to_csv(
    path_or_buf='../../data/regular/Antiviral.csv',
    index=False,
)