In [21]:
import pandas as pd
import ast
import re

In [22]:
def mod_false(df):
    """Flag every row of ``df`` as modified or unmodified, in place.

    Adds (or overwrites) an ``is_mod`` column holding the *strings*
    ``'True'`` / ``'False'``: ``'True'`` wherever ``modifications`` is
    anything other than the empty string, ``'False'`` otherwise.  The flags
    stay strings rather than booleans because the rest of the notebook
    compares the column against the literal ``'False'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``modifications`` column.  Any index is accepted.

    Returns
    -------
    None
        ``df`` is mutated in place.
    """
    # Vectorised comparison instead of a ``range(len(df))`` loop: the loop
    # addressed rows with ``.loc[i, ...]`` and therefore only worked when the
    # index happened to be the default 0..n-1 labels.
    has_modification = df['modifications'].ne('')
    df['is_mod'] = has_modification.map({True: 'True', False: 'False'})

In [23]:
def extract_data(text):
    """Return the contents of every single-quoted span found in ``text``.

    Matching is non-greedy, so each span ends at the nearest following
    quote; the quotes themselves are not included in the results.  An input
    without any quoted span yields an empty list.
    """
    return [quoted.group(1) for quoted in re.finditer(r"'(.*?)'", text)]

In [24]:
# Read the whole raw PEP_LIFE dump into memory as text; the next cell parses it.
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# the file's encoding if this notebook is run on another machine.
with open('../raw/PEP_LIFE.data', 'r') as f:
    content = f.read()

In [25]:
# Parse the file text as a Python literal (no code is executed). Later cells
# call data.items() and unpack each key into (sequence, modifications), so the
# literal is expected to be a dict keyed by 2-tuples.
data = ast.literal_eval(content)

In [26]:
# Flatten the nested mapping into one record per half-life measurement.
# NOTE(review): `rows` is not referenced by any later cell visible in this
# notebook; the frame below is built from `data` directly.
rows = []
for (sequence, modifications), record in data.items():
    measurements = record['half_life_data']
    references = record['references']
    url_sources = record['url_sources']

    # Pull the fields out of each (measurement dict, source URL) pair
    rows += [
        {
            'Sequence': sequence,
            'Modifications': modifications,
            'Test Sample': entry['test_sample'],
            'Vivo/Vitro': entry['vivo_vitro'],
            'Half Life (seconds)': entry['half_life_seconds'],
            'Reference URL': references[0] if references else None,
            'Source URL': url,
            'Other Sources': ', '.join(url_sources)
        }
        for entry, url in measurements
    ]


In [27]:
# Build a frame straight from the parsed mapping: each key of `data` becomes a
# column and the inner dict keys become the row labels (transposed in the next cell).
df = pd.DataFrame(data)

In [28]:
# One row per `data` key: transpose, turn the key levels into ordinary columns,
# then keep only the first key level (renamed to 'sequence' further down) and
# the two payload columns used by the rest of the notebook.
df_t = (
    df.transpose()
    .reset_index()
    .filter(['level_0', 'half_life_data', 'modifications'])
)

In [29]:
# Inspect the transposed frame before any cleaning (values are still raw Python objects).
df_t

Unnamed: 0,level_0,half_life_data,modifications
0,RRWQWR,"[({'half_life_seconds': '1800', 'test_sample':...",()
1,RRWWRF,"[({'half_life_seconds': '1800', 'test_sample':...",()
2,RRWQWR,"[({'half_life_seconds': '1800', 'test_sample':...","(C-Terminal Amidation,)"
3,RRWWRF,"[({'half_life_seconds': '1800', 'test_sample':...","(C-Terminal Amidation,)"
4,RRWQWR,"[({'half_life_seconds': '3600', 'test_sample':...","(N-Terminal,)"
...,...,...,...
1132,FNAPFDVGIKLSGAQYQQHGRAL,"[({'half_life_seconds': '1320', 'test_sample':...","(C-Terminal Amidation,)"
1133,FNAPFDVGIKLSGVQYQQHSQAL,"[({'half_life_seconds': '3906', 'test_sample':...","(C-Terminal Amidation,)"
1134,FNAPFDVGIKLSGVQYQQHSQAL,"[({'half_life_seconds': '906', 'test_sample': ...","(C-Terminal Amidation, Chemical Modification: ..."
1135,ASISGRDTHRLTRTLNCSSIVKEIIGKLPEPELKTDDEGPSLRNKS...,"[({'half_life_seconds': '126', 'test_sample': ...","(Cyclic, Chemical Modification: Glycosylation ..."


In [30]:
# Flatten both columns to plain text so they can be picked apart with string tools.
# half_life_data: stringify, then delete brackets, parentheses, braces and colons.
# Removing ':' also strips it from URLs, which is why they show up as 'http//...' below.
df_t['half_life_data'] = df_t['half_life_data'].astype(str).str.replace(r'[\[\]():{}]', '', regex=True)
# modifications: stringify the tuple and delete its quotes and parentheses ...
df_t['modifications'] = df_t['modifications'].astype(str).str.replace(r'[\'()]', '', regex=True)
# ... then drop the trailing comma left behind by one-element tuples; an empty tuple becomes ''.
df_t['modifications'] = df_t['modifications'].str.rstrip(',')

In [31]:
# Collect the single-quoted tokens of each flattened string and spread them over
# positional columns (one column per token).
extracted_data = df_t['half_life_data'].apply(extract_data)
df_expanded = pd.DataFrame(extracted_data.tolist())
# Column 1 is the token that follows 'half_life_seconds' in the displayed frame;
# convert it to a number, turning anything unparseable into NaN.
df_expanded[1]=pd.to_numeric(df_expanded[1], errors='coerce')
# Overwrite the text column with the numeric half-life.
# NOTE(review): this makes the cell non-re-runnable — on a second run the column
# holds floats and extract_data (a regex search) would fail on them.
# NOTE(review): only the first value per row is kept; confirm no key carries more
# than one measurement.
df_t['half_life_data'] = df_expanded[1]

In [32]:
# Inspect the token table to check that column 1 really holds the half-life values.
df_expanded

Unnamed: 0,0,1,2,3,4,5,6
0,half_life_seconds,1800.0,test_sample,Human serum,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
1,half_life_seconds,1800.0,test_sample,Human serum,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
2,half_life_seconds,1800.0,test_sample,Human serum,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
3,half_life_seconds,1800.0,test_sample,Human serum,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
4,half_life_seconds,3600.0,test_sample,Human serum,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
...,...,...,...,...,...,...,...
1132,half_life_seconds,1320.0,test_sample,Intravenous injection of peptide into Sprague...,vivo_vitro,in vivo,http//crdd.osdd.net/raghava/peplife/display_su...
1133,half_life_seconds,3906.0,test_sample,Mouse plasma,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
1134,half_life_seconds,906.0,test_sample,Mouse liver homogenate,vivo_vitro,in vitro,http//crdd.osdd.net/raghava/peplife/display_su...
1135,half_life_seconds,126.0,test_sample,Injected intravenously in Balb/c mice blood,vivo_vitro,in vivo,http//crdd.osdd.net/raghava/peplife/display_su...


In [33]:
# Give the columns their final names: the first key level is the peptide sequence,
# and half_life_data now holds the numeric half-life.
df_t = df_t.rename(columns={'level_0': 'sequence', 'half_life_data': 'half_life_seconds'})
# Add the is_mod flag column in place (mod_false returns nothing).
mod_false(df_t)

In [34]:
# Inspect the cleaned table: sequence, numeric half-life, modification text and is_mod flag.
df_t

Unnamed: 0,sequence,half_life_seconds,modifications,is_mod
0,RRWQWR,1800.0,,False
1,RRWWRF,1800.0,,False
2,RRWQWR,1800.0,C-Terminal Amidation,True
3,RRWWRF,1800.0,C-Terminal Amidation,True
4,RRWQWR,3600.0,N-Terminal,True
...,...,...,...,...
1132,FNAPFDVGIKLSGAQYQQHGRAL,1320.0,C-Terminal Amidation,True
1133,FNAPFDVGIKLSGVQYQQHSQAL,3906.0,C-Terminal Amidation,True
1134,FNAPFDVGIKLSGVQYQQHSQAL,906.0,"C-Terminal Amidation, Chemical Modification: I...",True
1135,ASISGRDTHRLTRTLNCSSIVKEIIGKLPEPELKTDDEGPSLRNKS...,126.0,"Cyclic, Chemical Modification: Glycosylation a...",True


In [35]:
# Optional view without modified peptides: keep the rows flagged 'False' and
# drop the two modification columns, which carry no information after the filter.
unmodified_mask = df_t['is_mod'] == 'False'
df_filtered = df_t.loc[unmodified_mask].drop(columns=['is_mod', 'modifications'])

In [36]:
# Inspect the unmodified-only subset (original row labels are kept, hence the gaps).
df_filtered

Unnamed: 0,sequence,half_life_seconds
0,RRWQWR,1800.0
1,RRWWRF,1800.0
12,RRWQWRMKKLG,1800.0
16,AAGIGILTV,22.0
22,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV,144.0
...,...,...
1124,HGDGSFSDEMNTILDNLAARDFINWLIQTKITDK,138600.0
1126,RREAQNPQAGAVELGGGLGGLNALALEGPPQKRG,594.0
1128,GGFL,780.0
1130,YRQSMNNFQGLRSFGCRFGTCTVQKLAHQIYQFTDKDKDNVAPRSK...,67.5


In [37]:
# Disabled alternative: export only the unmodified peptides instead of the full table.
#df_filtered.to_csv('../datasets/PEP_LIFE.csv', index=False)
# Export the full table (modified and unmodified peptides, including is_mod)
# without writing the row index.
df_t.to_csv('../PEPLIFE/PEP_LIFE.csv', index=False)