In [3]:
from boltz.rescore.load.protein import Protein
import os

def read_fasta_as_dict(fasta_file_path):
    """
    Parse a FASTA file into a ``{header: sequence}`` mapping.

    Each record starts at a line beginning with ``>``; the rest of that
    line (without the ``>``) becomes the key. All following non-blank
    lines, stripped of surrounding whitespace, are concatenated to form
    the value. Text that appears before the first header is dropped, and
    a repeated header overwrites the earlier record.

    Args:
        fasta_file_path (str): Path to the FASTA file

    Returns:
        dict: Dictionary with headers as keys and sequences as values.
            An empty dict is returned (after printing a message) when
            the file is missing or cannot be read.
    """
    records = {}
    header = None
    chunks = []

    try:
        with open(fasta_file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                text = raw_line.strip()

                # Blank lines carry no information.
                if not text:
                    continue

                # Sequence data: accumulate for the record being built.
                if not text.startswith('>'):
                    chunks.append(text)
                    continue

                # Header line: flush the record collected so far, if any,
                # then begin a new one keyed by the text after '>'.
                if header is not None:
                    records[header] = ''.join(chunks)
                header = text[1:]
                chunks = []

            # The final record has no following header to trigger a flush.
            if header is not None:
                records[header] = ''.join(chunks)

    except FileNotFoundError:
        print(f"Error: File {fasta_file_path} not found.")
        return {}
    except IOError as e:
        print(f"Error reading file: {e}")
        return {}

    return records


# Lookup table driving the comparison loop below: each key is a lowercase
# target name and each value is the PDB ID associated with that target.
# The loop upper-cases the key to find the receptor file ("<KEY>.pdb") and
# uses the value to find the reference FASTA file ("<pdb_id>.fasta").
# NOTE(review): judging by the data path, these look like DUDE-Z benchmark
# targets — confirm.
files = {'aa2ar': '3eml',
       'abl1': '2hzi',
       'aces': '1e66',
       'ada': '2e1w',
       'adrb2': '3ny8' ,
       'ampc': '1l2s',
       'andr': '2am9',
       'csf1r': '3krj',
       'cxcr4': '3odu',
       'def': '1lru',
       'drd4': '5wiu',
       'egfr': '2rgp',
       'fa10': '3kl6',
       'fa7': '1w7x',
       'fabp4': '2nnq',
       'fgfr1': '3c4f',
       'fkb1a': '1j4h',
       'glcm': '2v3f',
       'hdac8': '3f07',
       'hivpr': '1xl2',
       'hmdh': '3ccw',
       'hs90a': '1uyg',
       'ital': '2ica',
       'kit': '3g0e',
       'kith': '2b8t',
       'lck': '2of2',
       'mapk2': '3m2w',
       'mk01': '2ojg',
       'mt1': '6me4',
       'nram': '1b9v',
       'parp1': '3l3m',
       'plk1': '2owb',
       'ppara': '2p54',
       'ptn1': '2azr',
       'pur2': '1njs',
       'reni': '3g6z',
       'rock1': '2etr',
       'src': '3el8',
       'thrb': '1ype',
       'try1': '2ayw',
       'tryb1': '2zec',
       'urok': '1sqt',
       'xiap': '3hl5'
  }

# For every target, print the sequence(s) extracted from the prepared
# receptor PDB file next to the sequence(s) in the reference FASTA file so
# the two can be compared by eye. Nothing is stored; output is print-only.
for k, v in files.items():
    print(f"Processing {k} with PDB ID {v}")
    print("=" * 50)
    
    # Reference sequence(s), keyed by FASTA header. An empty dict means the
    # file was missing or unreadable (read_fasta_as_dict prints the reason).
    fasta_file = f"/ru-auth/local/home/ichen/lyu_scratch/data/FASTA_files/{v}.fasta"
    seq_dict = read_fasta_as_dict(fasta_file)
    
    # Get protein sequence from PDB
    # The receptor file name is the upper-cased target name, not the PDB ID.
    protein = Protein(f'/lustre/fs6/lyu_lab/scratch/ichen/data/dudez_boltz_rescore/raw/rec_crg/{k.upper()}.pdb')
    # NOTE(review): get_sequence() is defined outside this file; it is
    # unpacked into a sized, printable collection (presumably a
    # {chain_id: sequence} dict) and a second value, lig_id, that this loop
    # never uses — confirm against the Protein class.
    seq_dict2, lig_id = protein.get_sequence()

    # Flag structures that yielded more than one sequence entry.
    if len(seq_dict2) > 1:
        print(f"Found {len(seq_dict2)} PDB sequences for {k} ({v})")
    
    # Structure-derived sequence(s) first, then the FASTA reference.
    print(seq_dict2)

    print(seq_dict)


Processing aa2ar with PDB ID 3eml
{'A': 'IMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNYFVVSLAAADIAVGVLAIPFAITISTGFXAAXXGXLFIACFVLVLTQSSIFSLLAIAIDRYIAIRIPLRYNGLVTGTRAKGIIAICWVLSFAIGLTPMLGWNNXGQSQGXGEGQVAXLFEDVVPMNYMVYFNFFACVLVPLLLMLGVYLRIFLAARRQLRSTLQKEVXAAKSLAIIVGLFALCWLPLXIINCFTFFXPDXSXAPLWLMYLAIVLSXTNSVVNPFIYAYRIREFRQTFRKIIRSXVLRQ'}
{'3EML_1|Chain A|Human Adenosine A2A receptor/T4 lysozyme chimera|Homo sapiens (9606)': 'DYKDDDDAMGQPVGAPPIMGSSVYITVELAIAVLAILGNVLVCWAVWLNSNLQNVTNYFVVSLAAADIAVGVLAIPFAITISTGFCAACHGCLFIACFVLVLTQSSIFSLLAIAIDRYIAIRIPLRYNGLVTGTRAKGIIAICWVLSFAIGLTPMLGWNNCGQPKEGKNHSQGCGEGQVACLFEDVVPMNYMVYFNFFACVLVPLLLMLGVYLRIFLAARRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYRSTLQKEVHAAKSLAIIVGLFALCWLPLHIINCFTFFCPDCSHAPLWLMYLAIVLSHTNSVVNPFIYAYRIREFRQTFRKIIRSHVLRQQEPFKAHHHHHHHHHH'}
Processing abl1 with PDB ID 2hzi
{'A': 'DKWEMERTDITMKXKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEV