In [42]:
import pandas as pd
import ast
import re

In [43]:
# Slurp the raw PEP_LIFE dump into memory as a single string.
with open('raw/PEP_LIFE.data') as source_file:
    contenido = source_file.read()

In [44]:
# Parse the file text as a Python literal; literal_eval only accepts literal
# syntax, so nothing in the file is executed.
data = ast.literal_eval(contenido)

In [45]:
# Flatten the parsed mapping into one record per half-life measurement.
# Keys of `data` are (sequence, modifications) pairs; each value carries the
# measurements plus the reference and source URL lists for that peptide.
rows = []
for (sequence, modifications), record in data.items():
    measurements = record['half_life_data']
    reference_list = record['references']
    source_list = record['url_sources']

    # Each measurement is a (details, url) pair; emit one flat dict for it.
    rows.extend(
        {
            'Sequence': sequence,
            'Modifications': modifications,
            'Test Sample': details['test_sample'],
            'Vivo/Vitro': details['vivo_vitro'],
            'Half Life (seconds)': details['half_life_seconds'],
            # Only the first reference is kept; None when there are none.
            'Reference URL': reference_list[0] if reference_list else None,
            'Source URL': measurement_url,
            'Other Sources': ', '.join(source_list)
        }
        for details, measurement_url in measurements
    )



In [46]:
# Build the frame straight from the parsed dict (the `rows` list is not used
# by this cell): every key of `data` becomes a column.
df = pd.DataFrame(data)

In [47]:
# Quick check of the frame's dimensions as (rows, columns).
df.shape

(4, 1137)

In [48]:
# Flip the frame so every original column becomes a row, turn the index
# levels into regular columns, and discard the second level.
df_t = (
    df.transpose()
    .reset_index()
    .drop(columns=['level_1'])
)
# Wider variant kept for reference (not applied):
#df_t.drop(columns=['references', 'url_sources','level_1','modifications'], inplace=True)

In [49]:
# Render each half_life_data cell as text, then delete every bracket,
# parenthesis, brace and colon character from it.
half_life_text = df_t['half_life_data'].astype(str)
df_t['half_life_data'] = half_life_text.str.replace(r'[\[\]():{}]', '', regex=True)
def extract_data(text):
    """Return every single-quoted substring of ``text``, quotes removed.

    Matching is non-greedy, so each opening quote pairs with the nearest
    closing quote; an unmatched trailing quote contributes nothing.
    """
    return [match.group(1) for match in re.finditer(r"'(.*?)'", text)]

# One list of quoted tokens per row of the cleaned text column.
extracted_data = df_t['half_life_data'].apply(extract_data)
# Spread those lists over positional columns 0, 1, 2, ...
df_expanded = pd.DataFrame(extracted_data.tolist())
# NOTE(review): the half-life is taken to be the second quoted token of each
# row; this depends on the textual layout of the raw data — TODO confirm.
# Tokens that do not parse as numbers become NaN.
second_token_numeric = pd.to_numeric(df_expanded[1], errors='coerce')
df_expanded[1] = second_token_numeric
df_t['half_life_data'] = second_token_numeric

In [50]:
# Give the two working columns their final names, then persist the table
# without the positional index.
final_column_names = {'level_0': 'sequence', 'half_life_data': 'half_life_seconds'}
df_t = df_t.rename(columns=final_column_names)
df_t.to_csv('datasets/PEP_LIFE.csv', index=False)

In [51]:
# 10-mer peptide table: tab-separated raw file in, CSV with normalised
# column names out.
renamed_columns = {
    '## Peptide Sequence(10mer)': 'sequence',
    'Half-life(Sec)': 'half_life_seconds',
}
df = pd.read_csv('raw/10mer-peptides.txt', sep="\t").rename(columns=renamed_columns)
df.to_csv('datasets/10mer-peptides-Gorris.csv', index=False)

In [52]:
# 16-mer peptide table: tab-separated raw file in, CSV with normalised
# column names out.
df = pd.read_csv('raw/16mer-peptides.txt', delimiter="\t")
# The sequence header key was copied from the 10-mer cell. The 16-mer file
# presumably labels the column "(16mer)" instead — TODO confirm against the
# raw header. Both spellings are mapped: rename() skips labels that are not
# present, so whichever one the file uses ends up as 'sequence'.
df = df.rename(columns={
    '## Peptide Sequence(10mer)': 'sequence',
    '## Peptide Sequence(16mer)': 'sequence',
    'Half-life(Sec)': 'half_life_seconds',
})
df.to_csv('datasets/16mer-peptides-Gorris.csv', index=False)

In [53]:
with open('raw/allseq', 'r') as file:
    lines = file.readlines()

# Group the file into records. A line starting with '>' opens a new record
# and supplies its ID (the text after '>'); every following line is a piece
# of that record's sequence. Lines seen before the first '>' are ignored.
data = []
for line in map(str.strip, lines):
    if line.startswith('>'):
        data.append([line[1:], []])
    elif data:
        data[-1][1].append(line)

# Glue each record's pieces into a single sequence string.
data = [[record_id, ''.join(pieces)] for record_id, pieces in data]

# Build the two-column table and write it out without the positional index.
df = pd.DataFrame(data, columns=['ID', 'Sequence'])

df.to_csv('datasets/allseq.csv', index=False)

In [54]:
# Load the comma-separated THPdb export, decoding its bytes as Latin-1.
# NOTE(review): the preview output below shows a header rendered as
# "Melting Point (Ê", which hints that Latin-1 may not be the file's real
# encoding — TODO confirm.
df = pd.read_csv('raw/thpdb.csv', delimiter=',', encoding='latin1')


In [55]:
# Preview the first rows of the table loaded above.
df.head()

Unnamed: 0,ID,ThPP ID,Therapeutic peptide Name,Peptide Sequence,Mode of Activity,Molecular Weight (Daltons),Chemical Formula,Isoelectric Point,Hydrophobicity,Melting Point (Ê,...,Specific Indication,Chemical Name,Formulation,Physical Form,Route of administration,Recommended dosage,Contraindications,Side effects,Useful links,Unnamed: 38
0,1001,Th1001,Lepirudin,LVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,Ia,6963.425,C287H440N80O110S6,4.04,-0.777,65,...,heparin-induced thrombocytopenia (HIT) and ass...,"[Leu1, Thr2]-63-desulfohirudin",Each vial of REFLUDAN contains 50 mg lepirudin...,"Sterile, white, freeze-dried powder",Intravenous infusion,Recommended dose is 0.4 mg/kg body weight (up ...,Hypersensitivity,N.A.,http://www.drugs.com/pro/refludan.html,"17381384, 16690967, 16553503, 16466327, 163709..."
1,1002,Th1001,Lepirudin,LVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,Ia,6963.425,C287H440N80O110S6,4.04,-0.777,65,...,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,http://www.drugs.com/drug-interactions/lepirud...,"17381384, 16690967, 16553503, 16466327, 163709..."
2,1003,Th1001,Lepirudin,LVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,Ia,6963.425,C287H440N80O110S6,4.04,-0.777,65,...,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,http://www.rxlist.com/refludan-drug.htm,"17381384, 16690967, 16553503, 16466327, 163709..."
3,1004,Th1001,Lepirudin,LVYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,Ia,6963.425,C287H440N80O110S6,4.04,-0.777,65,...,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,N.A.,"17381384, 16690967, 16553503, 16466327, 163709..."
4,1005,Th1002,Cetuximab,Heavy chain:QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGV...,IIIc,145781.6,C6484H10042N1732O2023S36,8.48,-0.413,71,...,Used for treatment of metastatic colorectal ca...,N.A.,Formulated in a solution with no preservatives...,"Sterile, clear, colorless liquid of pH 7.0-7.4...",Intravenous infusion,Generally given once every week for 6 to 7 wee...,allergic,"Rash (Acne like), Generalized weakness, malais...",http://chemocare.com/chemotherapy/drug-info/er...,"28222590, 28069552, 27033349, 26254368, 261197..."
