In [7]:
from aaindex import aaindex1
  #get full AAindex1 record

# Residue symbols to embed: one-letter codes plus '-'.
sequence = "ACDEFGHIKLMNPQRSTVWY-"

# AAindex1 accession numbers used as per-residue features (example selection).
descriptors = [
    'ARGP820101', 'BIGC670101', 'FAUJ880106', 'CHAM820101',
    'GRAR740102', 'RADA880108', 'FAUJ880111', 'FAUJ880112',
    'BHAR880101', 'CHAM830107', 'FAUJ880109',
]

# One row per symbol of `sequence`; each row holds that symbol's value
# in every descriptor, in the order of `descriptors`.
embedding_seq = [
    [aaindex1[accession]['values'][residue] for accession in descriptors]
    for residue in sequence
]

In [11]:
import json
import numpy as np
import pandas as pd
from aaindex import aaindex1

In [25]:

# --- Configuration -----------------------------------------------------------
INPUT_JSON = "validation_matches_subset_dssp.json"  # path to the input JSON
OUTPUT_XY = "XY_train.csv"   # output path
WINDOW_SIZE = 17             # window size
PADDING_AA = '-'             # character used for padding

# AAindex1 descriptors to use. Defined BEFORE the lookup table so this cell
# does not depend on a `descriptors` variable left over from another cell.
DESCRIPTORS = [
    'ARGP820101', 'BIGC670101', 'FAUJ880106','CHAM820101','GRAR740102',
    'RADA880108','FAUJ880111','FAUJ880112','BHAR880101','CHAM830107','FAUJ880109'
]

# Symbols for which an embedding vector is precomputed (includes '-').
SEQUENCE_AA = "ACDEFGHIKLMNPQRSTVWY-"

# Maps each symbol of SEQUENCE_AA to its list of descriptor values,
# ordered like DESCRIPTORS.
embedding_lookup = {
    aa: [aaindex1[d]['values'][aa] for d in DESCRIPTORS]
    for aa in SEQUENCE_AA
}

In [31]:

def load_json(json_path):
    """Load a JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (the callers in this notebook iterate
        over it as a list of entries).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (e.g. cp1252 on Windows).
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def embed_aa(aa):
    """Return the embedding vector of a single residue.

    Residues that are not keys of ``embedding_lookup`` are mapped to the
    vector stored under '-'.
    """
    fallback = embedding_lookup['-']
    if aa in embedding_lookup:
        return embedding_lookup[aa]
    return fallback

In [32]:
def sequence_to_windows(seq, window_size=17, pad_char='-'):
    """Cut a sequence into one window per position, padding both ends.

    Each window spans ``window_size // 2`` characters on either side of its
    central position, so it holds ``2 * (window_size // 2) + 1`` characters:
    exactly ``window_size`` when that is odd, ``window_size + 1`` when even.

    Args:
        seq: Sequence string to slice.
        window_size: Nominal window width.
        pad_char: Character repeated at both ends so edge positions get
            full-width windows.

    Returns:
        A list with ``len(seq)`` window strings, in sequence order.
    """
    flank = window_size // 2
    margin = pad_char * flank
    padded = margin + seq + margin
    span = 2 * flank + 1
    return [padded[start:start + span] for start in range(len(seq))]

def window_to_flat_embedding(window):
    """Turn a window of residues into a single flat feature list.

    The per-residue vectors returned by ``embed_aa`` are concatenated in the
    order the residues appear in ``window``.
    """
    return [value for aa in window for value in embed_aa(aa)]


In [35]:
# Build the feature rows (X) and their labels (y) directly from the JSON entries.
X, y = [], []
data = load_json(INPUT_JSON)
for entry in data:
    residues = entry['primary_sequence']
    labels = entry['secondary_structure']
    # Note: the window size passed here is the literal 10, not WINDOW_SIZE.
    # zip stops at the shorter input, so surplus windows or labels are ignored.
    for residue_window, label in zip(sequence_to_windows(residues, 10), labels):
        X.append(window_to_flat_embedding(residue_window))
        y.append(label)


In [None]:
def prepare_tabular_dataset(data, window_size=17):
    """Convert a dataset into tabular features and labels.

    Args:
        data: Iterable of dicts holding 'primary_sequence' and
            'secondary_structure'.
        window_size: Window width forwarded to ``sequence_to_windows``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: one flattened window embedding per
        row of ``X`` and the matching label in ``y``. Windows and labels are
        paired with ``zip``, so pairing stops at the shorter of the two.
    """
    features, targets = [], []
    for record in data:
        residues = record['primary_sequence']
        structure = record['secondary_structure']
        for window, label in zip(sequence_to_windows(residues, window_size), structure):
            features.append(window_to_flat_embedding(window))
            targets.append(label)
    return np.array(features), np.array(targets)

In [None]:
if __name__ == "__main__":
    # Read the raw entries, then turn them into the feature matrix X and label vector y
    data = load_json(INPUT_JSON)
    X, y = prepare_tabular_dataset(data, window_size=WINDOW_SIZE)

    # One column per feature plus a trailing 'target' column, saved without the index
    df_X = pd.DataFrame(X).assign(target=y)
    df_X.to_csv(OUTPUT_XY, index=False)

    print(f"✅ Dataset généré : {OUTPUT_XY} | {X.shape[0]} lignes, {X.shape[1]} features")

In [44]:
from Models.embedded_tabular_data import prepare_tabular_dataset

In [45]:
# NOTE(review): the path INPUT_JSON is passed here, whereas the
# prepare_tabular_dataset defined earlier in this notebook iterates over
# already-loaded entries. This call uses the version imported from
# Models.embedded_tabular_data — confirm that one accepts a file path.
# The window size is the literal 10 rather than WINDOW_SIZE.
X,y = prepare_tabular_dataset(INPUT_JSON,10)