In [21]:
import pandas as pd
import ast
import importlib.util
import os

funct_path= os.path.abspath('../../utils/functions.py')

In [22]:
# Importa funciones desde archivo externo
spec = importlib.util.spec_from_file_location("functions", funct_path)
funct = importlib.util.module_from_spec(spec)
spec.loader.exec_module(funct)

In [23]:
# Se lee el archivo
with open('../../data/raw/PEP_LIFE.data', 'r') as f:
    content = f.read()

In [24]:
# Convierte una cadena de texto a un diccionario
data = ast.literal_eval(content)

In [25]:
rows = []
for key, value in data.items():
    sequence, modifications = key
    half_life_data = value['half_life_data']
    references = value['references']
    url_sources = value['url_sources']
    
    # Extraer datos de cada entrada en half_life_data
    for entry, url in half_life_data:
        row = {
            'Sequence': sequence,
            'Modifications': modifications,
            'Test Sample': entry['test_sample'],
            'Vivo/Vitro': entry['vivo_vitro'],
            'Half Life (seconds)': entry['half_life_seconds'],
            'Reference URL': references[0] if references else None,
            'Source URL': url,
            'Other Sources': ', '.join(url_sources)
        }
        rows.append(row)

In [26]:
df = pd.DataFrame(data)

In [27]:
df=df.transpose()
df=df.reset_index()

In [28]:
df['half_life_data'] = df['half_life_data'].astype(str).str.replace(r'[\[\]():{}]', '', regex=True)
df['modifications'] = df['modifications'].astype(str).str.replace(r'[\'()]', '', regex=True)
df['modifications'] = df['modifications'].str.rstrip(',')

In [29]:
extracted_data = df['half_life_data'].apply(funct.extract_data)
df_expanded = pd.DataFrame(extracted_data.tolist())
df['half_life_data'] = df_expanded[1]

In [30]:
df_concat = pd.concat([df, df_expanded], axis=1)
df_concat.drop(columns=[1], inplace=True)

In [31]:
#renombra columnas, cambia a mayusculas la secuencia y pone en mayusculas la primera letra de las modificaciones, ademas de agregar columna is_mod
df_concat = df_concat.rename(columns={'level_0': 'sequence', 'half_life_data': 'half_life_seconds', 3: 'experimental_characteristics', 5: 'vivo_or_vitro'})
funct.mod_torf(df_concat)
df_concat['sequence'] = df_concat['sequence'].str.upper()
df_concat['modifications'] = df_concat['modifications'].str.title()

In [32]:
df_concat.dropna(subset=['sequence', 'half_life_seconds'], inplace=True)

In [33]:
df_concat=df_concat.filter(['sequence','experimental_characteristics','modifications','vivo_or_vitro','is_mod','half_life_seconds'])

In [34]:
df_concat

Unnamed: 0,sequence,experimental_characteristics,modifications,vivo_or_vitro,is_mod,half_life_seconds
0,RRWQWR,Human serum,,in vitro,False,1800
1,RRWWRF,Human serum,,in vitro,False,1800
2,RRWQWR,Human serum,C-Terminal Amidation,in vitro,True,1800
3,RRWWRF,Human serum,C-Terminal Amidation,in vitro,True,1800
4,RRWQWR,Human serum,N-Terminal,in vitro,True,3600
...,...,...,...,...,...,...
1132,FNAPFDVGIKLSGAQYQQHGRAL,Intravenous injection of peptide into Sprague...,C-Terminal Amidation,in vivo,True,1320
1133,FNAPFDVGIKLSGVQYQQHSQAL,Mouse plasma,C-Terminal Amidation,in vitro,True,3906
1134,FNAPFDVGIKLSGVQYQQHSQAL,Mouse liver homogenate,"C-Terminal Amidation, Chemical Modification: I...",in vitro,True,906
1135,ASISGRDTHRLTRTLNCSSIVKEIIGKLPEPELKTDDEGPSLRNKS...,Injected intravenously in Balb/c mice blood,"Cyclic, Chemical Modification: Glycosylation A...",in vivo,True,126


In [35]:
# Intenta convertir a float, si no puede, pone False
df_concat['hl_is_float'] = df_concat['half_life_seconds'].apply(funct.float_or_false)
df_float=df_concat[df_concat['hl_is_float']==True]
df_object=df_concat[df_concat['hl_is_float']==False]

In [36]:
df_float['half_life_seconds'] = df_float['half_life_seconds'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_float['half_life_seconds'] = df_float['half_life_seconds'].astype(float)


In [37]:
df_float=df_float.drop(columns=['vivo_or_vitro', 'hl_is_float'])

In [38]:
df_float=df_float.filter(['sequence', 'half_life_seconds', 'is_mod', 'modifications', 'experimental_characteristics'])

In [39]:
df_float

Unnamed: 0,sequence,half_life_seconds,is_mod,modifications,experimental_characteristics
0,RRWQWR,1800.0,False,,Human serum
1,RRWWRF,1800.0,False,,Human serum
2,RRWQWR,1800.0,True,C-Terminal Amidation,Human serum
3,RRWWRF,1800.0,True,C-Terminal Amidation,Human serum
4,RRWQWR,3600.0,True,N-Terminal,Human serum
...,...,...,...,...,...
1132,FNAPFDVGIKLSGAQYQQHGRAL,1320.0,True,C-Terminal Amidation,Intravenous injection of peptide into Sprague...
1133,FNAPFDVGIKLSGVQYQQHSQAL,3906.0,True,C-Terminal Amidation,Mouse plasma
1134,FNAPFDVGIKLSGVQYQQHSQAL,906.0,True,"C-Terminal Amidation, Chemical Modification: I...",Mouse liver homogenate
1135,ASISGRDTHRLTRTLNCSSIVKEIIGKLPEPELKTDDEGPSLRNKS...,126.0,True,"Cyclic, Chemical Modification: Glycosylation A...",Injected intravenously in Balb/c mice blood


In [None]:
df_float.to_csv('../../data/regular/PEP_LIFE.csv', index=False)