In [1]:
import os,sys

import pandas as pd
pd.set_option('display.max_columns', 20)
pd.set_option('display.expand_frame_repr', False)

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from timeit import default_timer as timer
import re
import optuna
import matplotlib 
import matplotlib.pyplot as plt
from pathlib import Path


# helper--
class dotdict(dict):
	"""Dictionary whose entries can also be read, written and deleted as attributes.

	``d.key`` reads ``d['key']``, ``d.key = v`` stores ``d['key'] = v`` and
	``del d.key`` removes the entry. Reading a missing key through attribute
	access raises ``AttributeError`` carrying the key name.
	"""

	def __getattr__(self, name):
		# Python only calls __getattr__ after normal attribute lookup has failed,
		# so regular dict methods (items, keys, ...) are unaffected.
		try:
			value = self[name]
		except KeyError:
			raise AttributeError(name)
		return value

	# Attribute deletion / assignment are plain item deletion / assignment.
	__delattr__ = dict.__delitem__
	__setattr__ = dict.__setitem__

def time_to_str(t, mode='min'):
	"""Format a duration given in seconds as a short fixed-width string.

	Args:
		t: duration in seconds; it is truncated with ``int(t)`` first.
		mode: ``'min'`` returns ``'HH hr MM min'`` style text,
			``'sec'`` returns ``'MM min SS sec'`` style text.

	Raises:
		NotImplementedError: for any other ``mode``.
	"""
	if mode not in ('min', 'sec'):
		raise NotImplementedError

	whole_seconds = int(t)
	if mode == 'sec':
		minutes, seconds = divmod(whole_seconds, 60)
		return '%2d min %02d sec'%(minutes, seconds)

	# mode == 'min': work in (fractional) minutes; the %d conversions drop the fraction.
	hours, minutes = divmod(whole_seconds/60, 60)
	return '%2d hr %02d min'%(hours, minutes)

def gpu_memory_use():
    """Return the memory in use on CUDA device 0, rounded to whole GiB.

    "In use" is total minus free as reported by ``torch.cuda.mem_get_info``.
    Returns 0 when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0

    free_bytes, total_bytes = torch.cuda.mem_get_info(torch.device(0))
    used_gib = (total_bytes - free_bytes) / 1024 ** 3
    return int(round(used_gib))

def set_aspect_equal(ax):
	"""Give the x, y and z axes of ``ax`` the same data span.

	Each axis keeps its current centre; all three are widened to the largest
	of the current spans so that one data unit has the same length on every axis.
	``ax`` must provide ``get_/set_xlim``, ``get_/set_ylim`` and ``get_/set_zlim``.
	"""
	limits = (ax.get_xlim(), ax.get_ylim(), ax.get_zlim())

	# Half of the widest current span; used as the half-width of every axis.
	half_span = max(upper - lower for lower, upper in limits) / 2.0

	setters = (ax.set_xlim, ax.set_ylim, ax.set_zlim)
	for apply_limits, axis_limits in zip(setters, limits):
		centre = np.mean(axis_limits)
		apply_limits(centre - half_span, centre + half_span)


# Report the installed PyTorch version and the CUDA version recorded in
# torch.version.cuda, then confirm that the imports above completed.
print('torch',torch.__version__)
print('torch.cuda',torch.version.cuda)

print('IMPORT OK!!!')

torch 2.5.1+cu121
torch.cuda 12.1
IMPORT OK!!!


In [2]:
#tuning
class SequenceStructureDataset(Dataset):
    """Dataset pairing per-sample feature and coordinate arrays stored as ``.npy`` files.

    ``csv_file`` must contain an ``ID`` column. Sample ``i`` is read from
    ``<data_dir>/<ID>_features.npy`` and ``<data_dir>/<ID>_coords.npy`` and
    returned as a dict of float tensors under the keys ``'features'`` and
    ``'coords'``. If ``transform`` is given (and truthy) it is applied to that
    dict and its result is returned instead.
    """

    def __init__(self, csv_file, data_dir, transform=None):
        self.df = pd.read_csv(csv_file)
        self.data_dir = Path(data_dir)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample_id = self.df.iloc[idx]['ID']
        feature_array = np.load(self.data_dir / f"{sample_id}_features.npy")
        coord_array = np.load(self.data_dir / f"{sample_id}_coords.npy")

        sample = {
            'features': torch.from_numpy(feature_array).float(),
            'coords': torch.from_numpy(coord_array).float(),
        }
        if self.transform:
            return self.transform(sample)
        return sample


# Root directory of the competition data used by the paths below.
DATA_KAGGLE_DIR = '/kaggle/input/stanford-rna-3d-folding'

# Build the training and validation datasets. Constructing a dataset only reads
# the label CSV; the per-sample .npy files are opened lazily in __getitem__.
# NOTE(review): assumes train_data/ and validation_data/ contain
# <ID>_features.npy / <ID>_coords.npy files — confirm these directories exist.
train_dataset = SequenceStructureDataset(
    csv_file=f"{DATA_KAGGLE_DIR}/train_labels.csv",
    data_dir=f"{DATA_KAGGLE_DIR}/train_data"
)
val_dataset = SequenceStructureDataset(
    csv_file=f"{DATA_KAGGLE_DIR}/validation_labels.csv",
    data_dir=f"{DATA_KAGGLE_DIR}/validation_data"
)

In [4]:
from datetime import datetime
import pytz
print('LOGGING TIME OF START:',  datetime.strftime(datetime.now(pytz.timezone('Asia/Singapore')), "%Y-%m-%d %H:%M:%S"))


try:
    import Bio
except:
    #for drfold2 --------
    #!pip install biopython
    !pip install /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install biopython
print('PIP INSTALL OK !!!!')

LOGGING TIME OF START: 2025-05-13 06:31:15
Processing /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: biopython
Successfully installed biopython-1.85
Processing /kaggle/input/biopython/biopython-1.85-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
biopython is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
PIP INSTALL OK !!!!


In [5]:
# MODE = 'local' #'local' # submit
MODE = 'submit'

DATA_KAGGLE_DIR = '/kaggle/input/stanford-rna-3d-folding'

# Select the sequences to predict. 'local' also loads ground-truth labels for
# scoring; 'submit' uses the competition test sequences.
if MODE == 'local':
    valid_df = pd.read_csv("/kaggle/input/validation-sequences-clean-csv/validation_sequences_clean.csv")
    label_df = pd.read_csv("/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv")
    # Label IDs look like '<target_id>_<resid>'; strip the trailing residue index.
    label_df['target_id'] = label_df['ID'].apply(lambda x: '_'.join(x.split('_')[:-1]))
elif MODE == 'submit':
    valid_df = pd.read_csv(f'{DATA_KAGGLE_DIR}/test_sequences.csv')
else:
    # Fail here instead of with a NameError on valid_df further down.
    raise ValueError(f"Unknown MODE {MODE!r}; expected 'local' or 'submit'")

print('len(valid_df)',len(valid_df))
print(valid_df.iloc[0])
print('')


# cfg = dotdict(
#     num_conf = 5,
#     max_length=480,
# )
NUM_CONF=5       # number of conformations predicted per target
MAX_LENGTH=480   # sequences longer than this are cropped before inference
DEVICE='cuda' #'cpu'

print('MODE:', MODE)
print('SETTING OK!!!')

len(valid_df) 12
target_id                                                      R1107
sequence           GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUU...
temporal_cutoff                                           2022-05-28
description        CPEB3 ribozyme\nHuman\nhuman CPEB3 HDV-like ri...
all_sequences      >7QR4_1|Chain A|U1 small nuclear ribonucleopro...
Name: 0, dtype: object

MODE: submit
SETTING OK!!!


In [6]:
import os
# Copy the DRfold2 tree from the input dataset into /kaggle/working/drfold.
# NOTE(review): a later cell copies the same tree again with shutil.copytree;
# one of the two copies is probably redundant — confirm before removing either.
os.makedirs('/kaggle/working/drfold', exist_ok=True)
!cp -r /kaggle/input/drfold2/DRfold2/* /kaggle/working/drfold/

In [7]:
# import sys, os
# CFG97 = os.path.join('/kaggle/input/drfold/DRfold2/DRfold2', 'cfg_97')
# assert os.path.isdir(CFG97), f"{CFG97} not found!"
# sys.path.insert(0, CFG97)

# # now this should succeed:
# from EvoMSA2XYZ import MSA2XYZ
# from RNALM2.Model import RNA2nd
# from data         import parse_seq, Get_base, BASE_COOR
# from data         import write_frame_coor_to_pdb, parse_pdb_to_xyz

# print("imported!", MSA2XYZ, RNA2nd)

In [8]:
import os
import numpy as np

# 1) Location of the cfg_97 folder inside the DRfold2 input dataset.
CFG97 = '/kaggle/input/drfold2/DRfold2/cfg_97'

# 2) Load base.npy once; BASE_COOR is later passed to Get_base (see make_data).
BASE_COOR = np.load(os.path.join(CFG97, 'base.npy'))

# 3) Put cfg_97 first on sys.path so its `data` module can be imported.
import sys
sys.path.insert(0, CFG97)
from data import parse_seq, Get_base
# (data.py does not export BASE_COOR, so it is loaded from base.npy above)

# 4) Example call: base coordinates for a short sequence.
# NOTE(review): `sequence` and `base_coords` are demo globals; nothing visible
# in this notebook reads them afterwards.
sequence = "ACGUACGUA"
base_coords = Get_base(sequence, BASE_COOR)
# base_coords.shape == (len(sequence), 3, 3)

In [9]:
import os, sys, shutil

# 1) Make sure a writable copy of DRfold2 exists under /kaggle/working
#    (dirs_exist_ok=True lets this run even if the directory was already created).
shutil.copytree(
  '/kaggle/input/drfold2/DRfold2',
  '/kaggle/working/drfold',
  dirs_exist_ok=True
)

# 2) Put the cfg_97 folder of that copy first on sys.path and import the model code.
# NOTE(review): `data` was already imported from the input-dataset copy in an
# earlier cell, so `from data import ...` here is presumably served from the
# module cache rather than from this working copy — confirm if that matters.
BASE = '/kaggle/working/drfold'
sys.path.insert(0, os.path.join(BASE, 'cfg_97'))
from EvoMSA2XYZ import MSA2XYZ
from RNALM2.Model import RNA2nd
from data import parse_seq, Get_base
from util import outpdb
# parse_pdb_to_xyz and write_frame_coor_to_pdb are defined in this notebook instead of being imported.

import numpy as np

def parse_pdb_to_xyz(pdb_file, atom_name=" P  "):
    """Read the coordinates of one atom type from a PDB file.

    Only ``ATOM`` records are considered. Fields are taken from the fixed PDB
    columns: atom name ``[12:16]``, residue name ``[17:20]``, residue number
    ``[22:26]`` and x/y/z ``[30:38]``, ``[38:46]``, ``[46:54]``.

    Args:
        pdb_file: path of the PDB file to read.
        atom_name: atom name to select; surrounding blanks are ignored.

    Returns:
        ``(coords, resname, resid)`` where ``coords`` is a float32 array of
        shape ``(N, 3)`` — ``(0, 3)`` when no atom matches — and ``resname`` /
        ``resid`` are lists of length ``N``.

    Raises:
        ValueError: if a selected record has non-numeric coordinate or
            residue-number fields.
    """
    wanted = atom_name.strip()
    coords, resid, resname = [], [], []
    with open(pdb_file) as f:
        for line in f:
            if not line.startswith("ATOM") or line[12:16].strip() != wanted:
                continue
            coords.append((float(line[30:38]), float(line[38:46]), float(line[46:54])))
            resid.append(int(line[22:26]))
            resname.append(line[17:20].strip())
    # reshape keeps the (N, 3) contract for N == 0, where np.array([]) alone
    # would give shape (0,) and break callers that index coords[:, k].
    return np.array(coords, dtype=np.float32).reshape(-1, 3), resname, resid

will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint


  RNAlm.load_state_dict(torch.load(saved_model,map_location=torch.device('cpu')),strict=False)


In [10]:

###########################################################3
# Paths used for local evaluation with the USalign binary.
KAGGLE_TRUTH_PDB_DIR ='/kaggle/working/drfold/kaggle-casp15-truth'
USALIGN = '/kaggle/working/USalign' 
# Copy the USalign binary into the working directory and mark it executable.
# NOTE(review): the os.system return codes are ignored, so a missing binary or
# a failing `sudo` goes unnoticed here — best-effort by design, presumably.
os.system('cp /kaggle/input/usalign/USalign /kaggle/working/')
os.system('sudo chmod u+x /kaggle/working/USalign')

# evaluate helper
def get_truth_df(target_id, label_df):
    """Return the rows of ``label_df`` whose ``target_id`` column equals ``target_id``.

    The result is a new frame with a fresh 0-based index; ``label_df`` is not modified.
    """
    is_target = label_df['target_id'] == target_id
    return label_df.loc[is_target].reset_index(drop=True)

def parse_usalign_for_tm_score(output):
    """Extract the second ``TM-score=`` value from USalign text output.

    The second value is the one normalised by the length of the reference
    (second) structure.

    Args:
        output: text printed by USalign.

    Returns:
        The second TM-score as a float.

    Raises:
        ValueError: if fewer than two TM-score values are present. Previously
            this case raised IndexError before the check was reached.
    """
    tm_scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    # Check the count before indexing so the documented ValueError is reachable.
    if len(tm_scores) < 2:
        raise ValueError('No TM score found')
    return float(tm_scores[1])

def parse_usalign_for_transform(output):
    """Extract the transform table that follows USalign's rotation-matrix header.

    After the header line, every line made of an integer index followed by four
    numeric columns is collected; parsing stops at the first blank line after
    the header. The index column is dropped.

    Returns:
        ``np.ndarray`` with one row of four floats per collected line (empty
        array when the header or the rows are absent).
    """
    header = "The rotation matrix to rotate Structure_1 to Structure_2"
    row_pattern = re.compile(r'^\d+\s+[-\d.]+\s+[-\d.]+\s+[-\d.]+\s+[-\d.]+$')

    lines = iter(output.splitlines())

    # Skip everything up to and including the header line.
    for line in lines:
        if header in line:
            break

    # Collect the numeric rows that follow, up to the first blank line.
    rows = []
    for line in lines:
        if row_pattern.match(line):
            rows.append(line)
        elif not line.strip():
            break

    # Drop the leading index column and convert the rest to floats.
    return np.array([[float(value) for value in row.split()[1:]] for row in rows])



# data helper
def make_data(seq):
    """Build the model inputs for one sequence.

    Args:
        seq: nucleotide sequence string.

    Returns:
        ``(msa, base_x, seq_idx)``:
        - ``msa``: float one-hot tensor (6 classes) of the encoded sequence,
          stacked twice along dim 0 (the reason for duplicating the row is not
          documented in this notebook).
        - ``base_x``: float tensor built from ``Get_base(seq, BASE_COOR)``.
        - ``seq_idx``: long tensor holding the 1-based positions ``1..len(seq)``.
    """
    # presumably parse_seq returns an integer numpy array of length len(seq) — TODO confirm
    encoded = parse_seq(seq)
    base_frames = Get_base(seq, BASE_COOR)
    positions = np.arange(len(seq)) + 1

    single_row = torch.from_numpy(encoded[None, :])
    stacked = torch.cat([single_row, single_row], 0)
    msa = F.one_hot(stacked.long(), 6).float()

    return msa, torch.from_numpy(base_frames).float(), torch.from_numpy(positions).long()
    
def make_dummy_solution():
    """Create an empty result container for every row of the global ``valid_df``.

    Returns:
        ``dotdict`` keyed by target id; each value is a ``dotdict`` with the
        fields ``target_id``, ``sequence`` and ``coord`` (an empty list to
        which predicted coordinates are appended later).
    """
    placeholders = dotdict()
    for _, record in valid_df.iterrows():
        placeholders[record.target_id] = dotdict(
            target_id=record.target_id,
            sequence=record.sequence,
            coord=[],
        )
    return placeholders

def solution_to_submit_df(solution):
    """Concatenate the per-target frames of ``solution`` into one submission frame.

    Each value of ``solution`` must expose ``sequence``, ``coord`` and
    ``target_id``; one frame per target is built with ``coord_to_df`` and the
    frames are concatenated in iteration order.
    """
    per_target_frames = [
        coord_to_df(entry.sequence, entry.coord, entry.target_id)
        for entry in solution.values()
    ]
    return pd.concat(per_target_frames)
 

def coord_to_df(sequence, coord, target_id):
    """Build the submission rows for one target.

    Args:
        sequence: residue string; one output row per character.
        coord: list of arrays, one per predicted conformation, each indexable
            as ``[:, 0]``, ``[:, 1]``, ``[:, 2]`` with one row per residue.
        target_id: target identifier used as the ``ID`` prefix.

    Returns:
        DataFrame with columns ``ID`` (``<target_id>_<resid>``), ``resname``,
        ``resid`` (1-based) followed by ``x_k, y_k, z_k`` for each conformation
        ``k = 1..len(coord)``.
    """
    residue_numbers = list(range(1, len(sequence) + 1))
    df = pd.DataFrame({
        'ID': [f'{target_id}_{number}' for number in residue_numbers],
        'resname': list(sequence),
        'resid': residue_numbers,
    })

    for conf_number, xyz in enumerate(coord, start=1):
        for axis_index, axis_name in enumerate('xyz'):
            df[f'{axis_name}_{conf_number}'] = xyz[:, axis_index]
    return df

In [11]:
################### start here !!! #######################################################3
# Output directory for model artefacts, and one empty result slot per target.
out_dir = '/kaggle/working/model-output'
os.makedirs(out_dir, exist_ok=True)
solution = make_dummy_solution()


# Load the RNA language model (these are modified versions, not the same as in the authors' GitHub repo).
rnalm = RNA2nd(dict(
    s_in_dim=5,
    z_in_dim=2,
    s_dim= 512,
    z_dim= 128,
    N_elayers=18,
))
rnalm_file = '/kaggle/working/drfold/model_hub/RCLM/epoch_67000'
print(rnalm_file)
# strict=False because the checkpoint carries keys this model does not define;
# printing the load result shows exactly which keys were missing/unexpected.
print(
    rnalm.load_state_dict(torch.load(rnalm_file, map_location='cpu', weights_only=True), strict=False)
    #Unexpected key(s) in state_dict: "ss_head.linear.weight", "ss_head.linear.bias".
)
rnalm = rnalm.to(DEVICE)
rnalm = rnalm.eval()
# Running totals updated by the inference loop in a later cell.
total_time_taken = 0
max_gpu_mem_used = 0

will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
/kaggle/working/drfold/model_hub/RCLM/epoch_67000
_IncompatibleKeys(missing_keys=[], unexpected_keys=['ss_head.linear.weight', 'ss_head.linear.bias'])


In [12]:
def write_frame_coor_to_pdb(coord, seq, savefile):
    """
    Write a PDB file containing one C1' ATOM record per residue.

    coord: array of shape (L,3,3) from the model; coord[i, 1] is written as the
           C1' position of residue i (channel order presumably P, sugar, N —
           confirm against the model code)
    seq:   string of length L; each character is used as the residue name
    savefile: path of the PDB file to write

    Records follow the fixed-column PDB layout: serial in columns 7-11, atom
    name 13-16, residue name 18-20, chain 'A' in column 22, residue number
    23-26 and x/y/z in columns 31-54. Earlier versions omitted the four blanks
    between the residue number and x, which shifted the coordinates out of
    their columns so that fixed-column readers mis-parsed them.
    """
    with open(savefile, 'w') as f:
        for serial, res in enumerate(seq, start=1):
            x, y, z = coord[serial - 1, 1]   # channel=1 → sugar atom
            # The 4 blanks after the residue number cover the insertion-code
            # column and padding, so that x starts at column 31.
            f.write(
                f"ATOM  {serial:5d}  C1' {res:>3s} A{serial:4d}    "
                f"{x:8.3f}{y:8.3f}{z:8.3f}  1.00  0.00           C\n"
            )
        f.write("TER\n")


In [13]:
# Constructor arguments shared by every MSA2XYZ structure model.
cfg = dict(
    seq_dim=6,
    msa_dim=7,
    N_ensemble=10,   # how many ensemble members
    N_cycle=8,      # how many recycling cycles
    m_dim=64,
    s_dim=64,
    z_dim=64,
)
# One pass per conformation: conformation c is predicted with the c-th of five
# checkpoints (model_0, 1, 2, 8, 9), so NUM_CONF must not exceed 5.
for c in range(NUM_CONF): 
    msa2xyz = MSA2XYZ(**cfg)
    msa2xyz_file = [
        f'/kaggle/working/drfold/model_hub/cfg_97/model_{k}' for k in [0,1,2,8,9]
    ][c]
    print(msa2xyz_file)
    print(
        msa2xyz.load_state_dict(torch.load(msa2xyz_file, map_location='cpu', weights_only=True), strict=True)
    )
    # Share the already-loaded language model instead of the instance created by the constructor.
    msa2xyz.msaxyzone.premsa.rnalm = rnalm
    msa2xyz = msa2xyz.to(DEVICE)
    msa2xyz = msa2xyz.eval()
 
    for i,row in valid_df.iterrows():
        start_timer = timer()
        
        target_id = row.target_id
        sequence = row.sequence
        seq = row.sequence    
        
        # Sequences longer than MAX_LENGTH are cropped to a random window [i0, i1).
        # NOTE(review): no random seed is set, so the window differs between
        # runs and between the NUM_CONF passes over the same target.
        L = len(sequence)
        if L>MAX_LENGTH:
            i0 = np.random.choice(L-MAX_LENGTH+1)
            i1 = i0 + MAX_LENGTH
        else:
            i0 = 0
            i1 = L
        
        seq = sequence[i0:i1]
        print(c,i,target_id, L, seq[:75]+'...')
        
        msa, base_x, seq_idx = make_data(seq)
        msa, base_x, seq_idx = msa.to(DEVICE), base_x.to(DEVICE), seq_idx.to(DEVICE)
        secondary = None #secondary structure
    
        with torch.no_grad(): 
            out = msa2xyz.pred(msa, seq_idx, secondary, base_x, np.array(list(seq)))

        # key = list(out.keys()) # plddt(L,L), coor(L,3,3), dist_p(L,L,38), dist_c, dist_n,
        # for k in key:
        #     print(k, type(out[k]), out[k].shape)
 
        
        # If the sequence was cropped, zero-pad the prediction back to the full
        # length so residues outside the window get coordinates (0, 0, 0).
        if L!=len(seq):
             out['coor'] = np.pad(out['coor'] ,((i0, L - i1), (0, 0), (0, 0)), 'constant', constant_values=0)


        print('out:',  out['coor'].shape)
        time_taken = timer()-start_timer
        total_time_taken += time_taken
        print('time_taken:', time_to_str(time_taken, mode='sec')) 
        
        gpu_mem_used = gpu_memory_use()
        max_gpu_mem_used = max(max_gpu_mem_used,gpu_mem_used)
        print('gpu_mem_used:', gpu_mem_used, 'GB')

        torch.cuda.empty_cache() 
        # Keep only channel 1 of each residue frame (written as C1' in the submission).
        sugar_xyz = out['coor'][:, 1, :]   # shape (L,3)
        solution[target_id].coord.append(sugar_xyz)
    print('')
    
#-----end of conformation generation ----
print('MAX_LENGTH', MAX_LENGTH)
print('### total_time_taken:', time_to_str(total_time_taken, mode='min'))
print('### max_gpu_mem_used:', max_gpu_mem_used, 'GB')
print('')

# Flatten all targets and conformations into the submission table and write it
# to the current working directory.
submit_df = solution_to_submit_df(solution)
submit_df.to_csv(f'submission.csv', index=False)
print(submit_df)
print('SUBMIT OK!!!!!!')
print('')


will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
will do checkpoint
/kaggle/working/drfold/model_hub/cfg_97/model_0
<All keys matched successfully>
0 0 R1107 69 GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUUGCACUCCGGCUGCGAAUUCUGCU...


  return fn(*args, **kwargs)


out: (69, 3, 3)
time_taken:  0 min 12 sec
gpu_mem_used: 2 GB
0 1 R1108 69 GGGGGCCACAGCAGAAGCGUUCACGUCGCGGCCCCUGUCAGCCAUUGCACUCCGGCUGCGAAUUCUGCU...
out: (69, 3, 3)
time_taken:  0 min 10 sec
gpu_mem_used: 2 GB
0 2 R1116 157 CGCCCGGAUAGCUCAGUCGGUAGAGCAGCGGCUAAAACAGCUCUGGGGUUGUACCCACCCCAGAGGCCCACGUGG...
out: (157, 3, 3)
time_taken:  0 min 31 sec
gpu_mem_used: 3 GB
0 3 R1117v2 30 UUGGGUUCCCUCACCCCAAUCAUAAAAAGG...
out: (30, 3, 3)
time_taken:  0 min 10 sec
gpu_mem_used: 2 GB
0 4 R1126 363 GGAAUCUCGCCCGAUGUUCGCAUCGGGAUUUGCAGGUCCAUGGAUUACACCAUGCAACGCAGACCUGUAGAUGCC...
out: (363, 3, 3)
time_taken:  4 min 10 sec
gpu_mem_used: 8 GB
0 5 R1128 238 GGAAUAUCGUCAUGGUGAUUCGUCACCAUGAGGCUAGAUCUCAUAUCUAGCGCUUUCGAGCGCUAGAGUCCUUAU...
out: (238, 3, 3)
time_taken:  1 min 20 sec
gpu_mem_used: 4 GB
0 6 R1136 374 GGAUACGUCUACGCUCAGUGACGGACUCUCUUCGGAGAGUCUGACAUCCGAACCAUACACGGAUGUGCCUCGCCG...
out: (374, 3, 3)
time_taken:  4 min 24 sec
gpu_mem_used: 8 GB
0 7 R1138 720 GCGGGCGUAUAGGUUCGUCUAUACGUCCGCGUUUUCCGAGAAGAGGUA

In [21]:
import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Extract a TM-score from USalign output, printing debug details along the way.

    Returns the second ``TM-score=`` value when at least two are present, the
    only value when exactly one is present, and ``0.0`` when the output is
    blank or contains no TM-score.
    """
    print(f"DEBUG: Raw USalign output:")
    print(f"'{output}'")
    print(f"DEBUG: Output length: {len(output)}")

    if not output.strip():
        print("Warning: Empty output from USalign")
        return 0.0

    # Collect every TM-score value in order of appearance.
    scores = re.findall(r'TM-score=\s*([\d.]+)', output)
    print(f"DEBUG: Found TM-score matches: {scores}")

    n_scores = len(scores)
    if n_scores >= 2:
        print(f"Found {n_scores} TM-scores, using the second one")
        return float(scores[1])
    if n_scores == 1:
        print("Warning: Only one TM-score found, using it")
        return float(scores[0])

    print("Warning: No TM-score found in output")
    return 0.0

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one ``ATOM`` line (newline-terminated) for the scoring PDB files.

    ``chain_id`` is accepted for interface compatibility but is not written to
    the line. Field widths are fixed: serial 5, atom name 5 (left-aligned),
    residue name 3 (left-aligned), residue number 3, coordinates 8.3f each,
    occupancy and b-factor 6.2f each, followed by ``atom_type``.
    """
    identity = f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s} {residue_num:>3d}    '
    position = f'{x_coord:>8.3f}{y_coord:>8.3f}{z_coord:>8.3f}'
    trailer = f'{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    return identity + position + trailer

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write conformation ``xyz_id`` of ``df`` to ``target_path`` as C1' ATOM lines.

    A row is written only when its ``x_<id>``, ``y_<id>`` and ``z_<id>`` values
    are all greater than -1e17 (presumably very negative values mark unresolved
    residues — confirm against the label files; NaN values are skipped too).

    Returns:
        Number of rows written.
    """
    x_col, y_col, z_col = f'x_{xyz_id}', f'y_{xyz_id}', f'z_{xyz_id}'
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, row in df.iterrows():
            x, y, z = row[x_col], row[y_col], row[z_col]
            if not (x > -1e17 and y > -1e17 and z > -1e17):
                continue
            written += 1
            # The residue number doubles as the atom serial.
            pdb_file.write(write_target_line(
                atom_name="C1'",
                atom_serial=int(row['resid']),
                residue_name=row['resname'],
                chain_id='0',
                residue_num=int(row['resid']),
                x_coord=x,
                y_coord=y,
                z_coord=z,
                atom_type='C',
            ))
    return written

def test_usalign():
    """Check that the USalign binary at /kaggle/working/USalign can be run.

    Returns ``False`` when the file is missing or when launching it raises;
    otherwise prints the first 200 characters of its output and returns ``True``.
    If the file is not executable, its mode is set to 0o755 first.
    """
    usalign_path = "/kaggle/working/USalign"

    # The binary must be present.
    if not os.path.exists(usalign_path):
        print(f"ERROR: USalign not found at {usalign_path}")
        return False

    # Make it executable when it is not.
    is_executable = os.access(usalign_path, os.X_OK)
    if not is_executable:
        print(f"ERROR: USalign at {usalign_path} is not executable")
        print("Trying to make it executable...")
        os.chmod(usalign_path, 0o755)

    # Run it without arguments and show the start of whatever it prints.
    try:
        test_output = os.popen(f'{usalign_path} 2>&1').read()
        print(f"USalign test output: {test_output[:200]}...")
    except Exception as e:
        print(f"ERROR testing USalign: {e}")
        return False
    return True

def score_and_report_debug(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score predictions against native structures with USalign, printing debug output.

    For each target, every predicted conformation (1..5) is aligned against
    every native conformation found in ``solution`` (columns ``x_<k>``); the
    best TM-score per prediction is kept. In its current debug form the
    function stops after the first target.

    Args:
        solution: label frame with columns ``ID``, ``resname``, ``resid`` and
            ``x_<k>/y_<k>/z_<k>``.
        submission: prediction frame with the same layout for k = 1..5.

    Returns:
        ``(per_target, overall)``: a dict mapping target id to the list of five
        best TM-scores, and the mean over targets of the best-of-five score.
        ``({}, 0.0)`` when USalign is not usable.

    Side effects:
        Adds a ``target_id`` column to both input frames and writes
        ``native.pdb`` / ``predicted.pdb`` in the current directory.
    """
    print("=== Starting scoring with debug output ===")
    
    # Test USalign first
    if not test_usalign():
        print("USalign test failed, cannot proceed with scoring")
        return {}, 0.0
    
    # IDs have the form '<target_id>_<resid>'. Strip only the trailing residue
    # index so that target ids containing '_' stay intact (this matches how
    # label_df['target_id'] is derived elsewhere in this notebook).
    solution['target_id'] = solution['ID'].str.rsplit('_', n=1).str[0]
    submission['target_id'] = submission['ID'].str.rsplit('_', n=1).str[0]

    # Native conformation numbers k, taken from the x_<k> column names.
    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))
    print(f"Native structure indices: {native_idxs}")

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Test with just the first target for debugging
    target_ids = solution['target_id'].unique()
    print(f"Found {len(target_ids)} targets, testing first one for debugging...")
    
    for target_idx, (tid, grp_nat) in enumerate(solution.groupby('target_id')):
        print(f"\n=== Processing target {tid} ({target_idx+1}/{len(target_ids)}) ===")
        grp_pred = submission[submission['target_id'] == tid]
        
        print(f"Native group shape: {grp_nat.shape}")
        print(f"Predicted group shape: {grp_pred.shape}")
        
        best_of_five = []

        for pred_cnt in range(1, 6):
            print(f"\n--- Testing prediction {pred_cnt} ---")
            best_for_this_pred = 0.0
            
            for nat_cnt in native_idxs:
                print(f"Comparing prediction {pred_cnt} vs native {nat_cnt}")
                
                n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
                
                print(f"Native atoms written: {n_nat}, Predicted atoms written: {n_pred}")
                
                # USalign is only run when both structures contain atoms.
                if n_nat > 0 and n_pred > 0:
                    cmd = f'{usalign} predicted.pdb native.pdb -atom " C1\'"'
                    print(f"Running command: {cmd}")
                    
                    try:
                        out = os.popen(cmd).read()
                        score = parse_tmscore_output(out)
                        print(f"TM-score: {score}")
                        best_for_this_pred = max(best_for_this_pred, score)
                    except Exception as e:
                        print(f"Error running USalign: {e}")
                        continue
                else:
                    print("Skipping due to empty structures")
            
            best_of_five.append(best_for_this_pred)
            print(f"Best score for prediction {pred_cnt}: {best_for_this_pred}")

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, best = {max(best_of_five):.4f}")
        
        # Only process first target for debugging, remove this break for full scoring
        if target_idx == 0:
            print("=== Debug mode: stopping after first target ===")
            break

    overall = np.mean([max(v) for v in per_target.values()]) if per_target else 0.0
    print(f"\n>>> mean best-of-5 TM-score = {overall:.4f}")
    return per_target, overall

# Quick function to check PDB files
def check_pdb_files():
    """Print a short summary of ``native.pdb`` and ``predicted.pdb`` in the current directory.

    For each file that exists, the first 500 characters and the total line
    count are printed; a missing file is reported instead.
    """
    for filename in ('native.pdb', 'predicted.pdb'):
        if not os.path.exists(filename):
            print(f"{filename} does not exist")
            continue

        with open(filename, 'r') as f:
            content = f.read()
            print(f"\n=== {filename} content (first 500 chars) ===")
            print(content[:500])
            print(f"=== {filename} total lines: {len(content.splitlines())} ===")

# Main execution
# Score the written submission against the cleaned validation labels.
# NOTE(review): this rebinds the global name `solution`, which earlier cells use
# for the per-target prediction container, to a DataFrame; re-running earlier
# cells after this one will see the DataFrame instead.
if __name__ == "__main__":
    solution = pd.read_csv(
        "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
    )
    submission = pd.read_csv("submission.csv")

    print("Solution columns:", solution.columns.tolist())
    print("Submission columns:", submission.columns.tolist())
    print("Solution shape:", solution.shape)
    print("Submission shape:", submission.shape)

    # Run debug scoring (adds a 'target_id' column to both frames in place)
    per_target_scores, mean_tm = score_and_report_debug(solution, submission)
    
    # Check PDB files after scoring
    check_pdb_files()

In [None]:
# Save a copy of the submission frame under a second file name.
# NOTE(review): by this point `submission` also carries the 'target_id' column
# added during scoring, and no confidence column is added anywhere in this
# notebook despite the file name — confirm this is the intended content.
submission.to_csv("/kaggle/working/drfold2_submission_with_confidence.csv", index=False)