In [186]:
import pandas as pd
import ast
import re

In [187]:
# Read the whole raw PEP_LIFE dump into memory as one string.
with open('raw/PEP_LIFE.data') as raw_file:
    contenido = raw_file.read()

In [188]:
# Parse the file text as a Python literal (no arbitrary code is executed);
# the resulting mapping is what the following cells iterate over and load into pandas.
data = ast.literal_eval(contenido)

In [189]:
# Flatten the nested mapping into one record per half-life measurement.
rows = []
for (sequence, modifications), value in data.items():
    half_life_data = value['half_life_data']
    references = value['references']
    url_sources = value['url_sources']

    # Each half_life_data item is unpacked as (measurement dict, source URL).
    rows.extend(
        {
            'Sequence': sequence,
            'Modifications': modifications,
            'Test Sample': entry['test_sample'],
            'Vivo/Vitro': entry['vivo_vitro'],
            'Half Life (seconds)': entry['half_life_seconds'],
            # Only the first reference is kept; None when the list is empty.
            'Reference URL': references[0] if references else None,
            'Source URL': url,
            'Other Sources': ', '.join(url_sources),
        }
        for entry, url in half_life_data
    )


In [190]:
# Build a DataFrame straight from the parsed mapping: each top-level key becomes a column.
# NOTE(review): this uses `data`, not the `rows` list assembled in the previous cell — confirm
# that is intended.
df = pd.DataFrame(data)

In [191]:
# Inspect the frame's dimensions as (rows, columns).
df.shape

(4, 1137)

In [192]:
# Turn the frame on its side so every original column becomes a row, move the two-level
# index into ordinary columns (level_0, level_1) and discard level_1.
# Every other column is kept.
df_t = (
    df.transpose()
    .reset_index()
    .drop(columns=['level_1'])
)

In [193]:
def extract_data(text):
    """Return every single-quoted substring of ``text``, in order of appearance."""
    return re.findall(r"'(.*?)'", text)


# Render each half_life_data cell as text and strip brackets, parentheses, colons and
# braces so that only the quoted tokens and their separators remain.
df_t['half_life_data'] = (
    df_t['half_life_data']
    .astype(str)
    .str.replace(r'[\[\]():{}]', '', regex=True)
)

# One column per quoted token; rows with fewer tokens are padded with None.
extracted_data = df_t['half_life_data'].map(extract_data)
df_expanded = pd.DataFrame(list(extracted_data))

# Keep only the second quoted token, coerced to a number (non-numeric text becomes NaN),
# and store it back in place of the original cell content.
df_expanded[1] = pd.to_numeric(df_expanded[1], errors='coerce')
df_t['half_life_data'] = df_expanded[1]

In [194]:
# Give the key column and the extracted half-life column their final names,
# then write the table to CSV without the positional index.
df_t = df_t.rename(
    columns={
        'level_0': 'sequence',
        'half_life_data': 'half_life_seconds',
    }
)
df_t.to_csv('datasets/PEP_LIFE.csv', index=False)

In [195]:
# Load the tab-separated 10-mer table, normalise the two column names and export it as CSV.
df = pd.read_csv('raw/10mer-peptides.txt', sep="\t").rename(
    columns={
        '## Peptide Sequence(10mer)': 'sequence',
        'Half-life(Sec)': 'half_life_seconds',
    }
)
df.to_csv('datasets/10mer-peptides.csv', index=False)

In [196]:
# Load the tab-separated 16-mer table, normalise the column names and export it as CSV.
df = pd.read_csv('raw/16mer-peptides.txt', delimiter="\t")
# The mapping previously listed only the '(10mer)' header, copied from the 10-mer cell.
# rename() silently skips keys that are not present, so listing both spellings keeps the
# old behaviour and also renames the column when this file's header says '(16mer)'.
# NOTE(review): confirm the actual header of raw/16mer-peptides.txt.
df = df.rename(columns={
    '## Peptide Sequence(10mer)': 'sequence',
    '## Peptide Sequence(16mer)': 'sequence',
    'Half-life(Sec)': 'half_life_seconds',
})
df.to_csv('datasets/16mer-peptides.csv', index=False)

In [197]:
# Read the FASTA-style file: a line starting with '>' opens a record, and the lines that
# follow (until the next '>') are concatenated into that record's sequence.
with open('raw/allseq', 'r') as file:
    lines = file.readlines()

data = []
current_id = None
current_sequence = []

for raw_line in lines:
    text = raw_line.strip()

    # Non-header lines just accumulate into the record currently being read.
    if not text.startswith('>'):
        current_sequence.append(text)
        continue

    # A new header closes the previous record, if there was one.
    if current_id is not None:
        data.append([current_id, ''.join(current_sequence)])

    # Start the next record; the leading '>' is not part of the ID.
    current_id = text[1:]
    current_sequence = []

# The final record has no following header to close it, so flush it here.
if current_id is not None:
    data.append([current_id, ''.join(current_sequence)])

# Assemble the two-column table.
df = pd.DataFrame(data, columns=['ID', 'Sequence'])

In [198]:
# Display the parsed ID/Sequence table.
df

Unnamed: 0,ID,Sequence
0,Th1001,LVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...
1,Th1002,Heavy chain:QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGV...
2,Th1003,LKIAAFNIQTFGETKMSNATLVSYIVQILSRYDIALVQEVRDSHLT...
3,Th1004,MGADDVVDSSKSFVMENFSSYHGTKPGYVDSIQKGIQKPKSGTQGN...
4,Th1005,LPAQVAFTPYAPEPGSTCRLREYYDQTAQMCCSKCSPGQHAKVFCT...
...,...,...
234,Th1235,ANSLLEETKQGNLERECIEELCNKEEAREVFENDPETDYFYPKYLV...
235,Th1236,N.A
236,Th1237,FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQ...
237,Th1238,AIRRYYLGAVELSWDYRQSELLRELHVDTRFPATAPGALPLGPSVL...
