In [6]:
import gzip
import json
import os
import re


In [None]:
import json
import re

# One-letter amino-acid codes in alphabetical order. match_subset_to_dssp
# decodes an integer-encoded sequence by indexing into this list.
aa_alphabet = [*'ACDEFGHIKLMNPQRSTVWY']


def normalize_id(protein_id):
    """
    Normalize a protein ID so that only the PDB code and the chain remain.

    Everything up to and including the first '#' is dropped. If the
    remainder starts with a PDB-style code (one digit followed by three
    alphanumerics), optionally followed by a numeric field and a
    one-character chain, the result is '<pdb>_<chain>', or just '<pdb>'
    when no chain is present. Any other identifier is returned unchanged
    (after prefix removal).

    Examples:
        '1XTE_1_A'      -> '1XTE_A'
        '20#1XTE_1_A'   -> '1XTE_A'
        'TBM#T0453'     -> 'T0453'

    Args:
        protein_id: Raw identifier string.

    Returns:
        The normalized identifier string.
    """
    if '#' in protein_id:
        protein_id = protein_id.split('#', 1)[1]
    # The code must start with a digit, as PDB codes do. Accepting any four
    # alphanumerics would truncate non-PDB identifiers such as 'T0453' to
    # 'T045' instead of passing them through.
    match = re.match(r'([0-9][0-9A-Za-z]{3})_?\d*_?([A-Za-z-])?', protein_id)
    if not match:
        return protein_id
    pdb, chain = match.groups()
    return f"{pdb}_{chain}" if chain else pdb

def match_subset_to_dssp(subset_data, dssp_data, save_path=None):
    """
    Pair subset records with DSSP annotations that share the same sequence.

    Each record's integer-encoded 'primary' field is decoded to a string
    through ``aa_alphabet`` and looked up, by exact equality, among the
    'Sequence' values of ``dssp_data``.

    Args:
        subset_data: Sized collection of dicts. Each must provide 'id' and
            'primary' (an iterable of indices into ``aa_alphabet``);
            'mask', 'evolutionary' and 'tertiary' are copied when present
            and default to None otherwise.
        dssp_data: Mapping of DSSP ID to a dict with a 'Sequence' key and
            an optional 'DSSP' key (defaults to '').
        save_path: When truthy, the matches are also written to this path
            as JSON indented by two spaces.

    Returns:
        A list with one dict per matched record, holding the keys
        'subset_id', 'dssp_id', 'primary_sequence', 'secondary_structure',
        'mask', 'evolutionary' and 'tertiary'.

    Side effects:
        Prints three summary lines (subset size, matches, misses).
    """
    # Index the DSSP annotations by sequence. When several entries share a
    # sequence, the one iterated last is the one that is kept.
    annotations_by_sequence = {
        annotation['Sequence']: {
            'dssp_id': annotation_id,
            'DSSP': annotation.get('DSSP', ''),
        }
        for annotation_id, annotation in dssp_data.items()
    }

    matches_info = []
    for rec in subset_data:
        # Decode the integer-encoded residues into a one-letter string.
        decoded = ''.join(aa_alphabet[code] for code in rec['primary'])
        hit = annotations_by_sequence.get(decoded)
        if hit is None:
            continue
        matches_info.append({
            'subset_id': rec['id'],
            'dssp_id': hit['dssp_id'],
            'primary_sequence': decoded,
            'secondary_structure': hit['DSSP'],
            'mask': rec.get('mask'),
            'evolutionary': rec.get('evolutionary'),
            'tertiary': rec.get('tertiary'),
        })

    if save_path:
        with open(save_path, 'w') as handle:
            json.dump(matches_info, handle, indent=2)

    print(f"Total sequences in subset: {len(subset_data)}")
    print(f"Matches found in DSSP: {len(matches_info)}")
    print(f"Sequences not found in DSSP: {len(subset_data) - len(matches_info)}")

    return matches_info

In [None]:
subset_data_json_path = "training_30.json"
dssp_json_path = "full_protein_dssp_annotations.json"

# Load the JSON files. The encoding is passed explicitly so that decoding
# does not depend on the platform's locale default.
with open(subset_data_json_path, 'r', encoding='utf-8') as f:
    subset_data = json.load(f)  # list of dicts with 'id' and 'primary'

with open(dssp_json_path, 'r', encoding='utf-8') as f:
    dssp_data = json.load(f)      # dict: ID -> {"Sequence":..., "DSSP":...}

In [None]:
# Match the loaded subset against the DSSP annotations and save the result.
matches = match_subset_to_dssp(
    subset_data,
    dssp_data,
    save_path='matches_subset_dssp.json',
)