In [1]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import os, sys, re, joblib
from tqdm.auto import tqdm

# --- PyTorch for Neural Network ---
import torch
import torch.nn as nn

# --- Energy imports ---
from scipy.spatial.distance import cdist
from typing import Dict, List, Tuple, Optional

# --- Helper Functions (Copied from Training Notebook) ---
USALIGN_PATH = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev/USalign/USalign"
TEMP_DIR = "./temp_pdb_inference/"
os.makedirs(TEMP_DIR, exist_ok=True)

def get_coords(df, pred_idx):
    return df[[f'x_{pred_idx}', f'y_{pred_idx}', f'z_{pred_idx}']].values

def calculate_radius_of_gyration(coords):
    center_of_mass = np.mean(coords, axis=0)
    return np.sqrt(np.mean(np.sum((coords - center_of_mass)**2, axis=1)))

def calculate_rmsd(coords1, coords2):
    from scipy.spatial.transform import Rotation as R
    if coords1.shape != coords2.shape: return np.nan
    coords1_centered = coords1 - coords1.mean(axis=0)
    coords2_centered = coords2 - coords2.mean(axis=0)
    rotation, rmsd = R.align_vectors(coords1_centered, coords2_centered)
    return rmsd

# --- Final Evaluation Script (Provided by you) ---
def parse_tmscore_output(output):
    tm_score_match = re.findall(r'TM-score=\s+([\d.]+)', output)
    if len(tm_score_match) > 1:
        return float(tm_score_match[1])
    return np.nan

def write_target_line(atom_name, atom_serial, residue_name, chain_id, residue_num,
                      x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='C'):
    atom_name_padded = f" {atom_name.ljust(3)}" if len(atom_name) < 4 else atom_name
    return (
        f"ATOM  {atom_serial:5d} {atom_name_padded:<4s} {residue_name:<3s} {chain_id}"
        f"{residue_num:4d}    {x_coord:8.3f}{y_coord:8.3f}{z_coord:8.3f}"
        f"{occupancy:6.2f}{b_factor:6.2f}          {atom_type:>2s}  \n"
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str):
    resolved_cnt = 0
    written_resids = set()
    with open(target_path, 'w') as f:
        for _, row in df.iterrows():
            resid = int(row['resid'])
            if resid in written_resids: continue
            x, y, z = row.get(f'x_{xyz_id}'), row.get(f'y_{xyz_id}'), row.get(f'z_{xyz_id}')
            if pd.notna(x) and x > -1e17:
                resolved_cnt += 1
                f.write(write_target_line( "C1'", resid, row['resname'], 'A', resid, x, y, z))
                written_resids.add(resid)
    return resolved_cnt


def get_base_target_id(long_id):
    """Correctly extracts the base target ID (e.g., '9L5R_2') from a full ID ('9L5R_2_1')."""
    return "_".join(str(long_id).split("_")[:-1])

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    solution['target_id'] = solution['ID'].apply(get_base_target_id)
    submission['target_id'] = submission['ID'].apply(get_base_target_id)
    
    native_idxs = sorted(int(c.split('_')[1]) for c in solution.columns if c.startswith('x_'))
    per_target, best_scores = {}, []

    for tid, grp_nat in tqdm(solution.groupby('target_id'), desc="Scoring Targets"):
        grp_pred = submission[submission['target_id'] == tid]
        if grp_pred.empty:
            print(f"Warning: No submission found for target {tid}. Skipping.")
            continue
        
        best_of_five = []
        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            for nat_cnt in native_idxs:
                n_nat = write2pdb(grp_nat, nat_cnt, os.path.join(TEMP_DIR, 'native.pdb'))
                n_pred = write2pdb(grp_pred, pred_cnt, os.path.join(TEMP_DIR, 'predicted.pdb'))
                if n_nat > 0 and n_pred > 0:
                    out = os.popen(f'{USALIGN_PATH} {os.path.join(TEMP_DIR, "predicted.pdb")} {os.path.join(TEMP_DIR, "native.pdb")} -atom " C1\'"').read()
                    score = parse_tmscore_output(out)
                    if score is not None:
                        best_for_this_pred = max(best_for_this_pred, score)
            best_of_five.append(best_for_this_pred)
        
        per_target[tid] = best_of_five
        best_scores.append(max(best_of_five))

    overall = np.mean(best_scores)
    print(f"\n>>> FINAL mean best-of-5 TM-score = {overall:.4f} (scored on {len(best_scores)} targets)")
    return per_target, overall

def calculate_ground_truth_tm(pred_df, pred_idx, native_df):
    """Calculates the true TM-score for one prediction against ALL possible native structures."""
    best_tm_for_this_pred = 0.0
    pred_path = os.path.join(TEMP_DIR, 'predicted_for_gt.pdb')
    native_path = os.path.join(TEMP_DIR, 'native_for_gt.pdb')
    
    n_pred = write2pdb(pred_df, pred_idx, pred_path)
    if n_pred == 0: return 0.0

    native_indices = sorted(int(c.split('_')[1]) for c in native_df.columns if c.startswith('x_'))
    for nat_idx in native_indices:
        n_nat = write2pdb(native_df, nat_idx, native_path)
        if n_nat > 0:
            cmd = f'{USALIGN_PATH} {pred_path} {native_path} -atom " C1\'"'
            output = os.popen(cmd).read()
            tm = parse_tmscore_output(output)
            if tm is not None and tm > best_tm_for_this_pred:
                best_tm_for_this_pred = tm
    return best_tm_for_this_pred

In [2]:
# --- Configuration ---
# Paths to the saved model and scaler from training
MODEL_SAVE_PATH = 'meta_learner_model.pth'
SCALER_SAVE_PATH = 'meta_learner_scaler.pkl'

MODEL_CONFIG = {
    'ProteinX': {
        'predictions_path': '/home/max/Documents/Standford_3DRNA_PredictionData/Protenix_Baseline_Validation/submission.csv',
        'confidences_path': '/home/max/Documents/Standford_3DRNA_PredictionData/Protenix_Baseline_Validation/confidence.csv',
        'ranking_path': '/home/max/Documents/Standford_3DRNA_PredictionData/Protenix_Baseline_Validation/ranking_scores.csv'
    },
    'DrFo2': {
        'predictions_path': '/home/max/Documents/Standford_3DRNA_PredictionData/DRfold2_Baseline_validation/submission.csv',
        'confidences_path': '/home/max/Documents/Standford_3DRNA_PredictionData/DRfold2_Baseline_validation/confidence.csv',
        'ranking_path': None
    },
    'Ribonanza': {
        # Both predictions and confidences are in this one file
        'predictions_path': '/home/max/Documents/Standford_3DRNA_PredictionData/Ribonanza_Baseline_Validation/ribonanzanet2_submission_with_confidence.csv',
        'confidences_path': '/home/max/Documents/Standford_3DRNA_PredictionData/Ribonanza_Baseline_Validation/ribonanzanet2_submission_with_confidence.csv',
        'ranking_path': None
    }
}

SEQUENCES_PATH = '/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/validation_sequences_clean.csv'
LABELS_PATH = '/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/validation_labels_clean.csv'

# Constants
NUM_PREDICTIONS_PER_MODEL = 5
NUCLEOTIDES = ['A', 'C', 'G', 'U']

# --- Define the NN Architecture (must match the trained model) ---
class MetaLearnerNN(nn.Module):
    def __init__(self, input_features):
        super(MetaLearnerNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_features, 64),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(32, 16),
            nn.ReLU(),
            
            nn.Linear(16, 1),
            nn.Sigmoid()
        )
    def forward(self, x): return self.layers(x)

print("✅ Configuration and model architecture defined.")

✅ Configuration and model architecture defined.


In [3]:
from scipy.spatial.distance import cdist
from typing import Dict
import numpy as np

class RNAEnergyScorer:
    def __init__(self, clash_penalty: float = 10.0):
        self.CLASH_PENALTY = clash_penalty
        self.VDW_RADIUS_C1 = 1.7
        self.SOLVATION_CUTOFF = 6.0
        self.BASE_PAIR_CUTOFF = 4.5
        self.BASE_PAIR_ENERGIES = {
            ('A', 'U'): -2.0, ('U', 'A'): -2.0,
            ('G', 'C'): -3.0, ('C', 'G'): -3.0,
            ('G', 'U'): -1.5, ('U', 'G'): -1.5,
        }

    def _calculate_vdw_energy(self, coords: np.ndarray) -> float:
        if coords.shape[0] < 2:
            return 0.0

        distances = cdist(coords, coords)
        sigma = self.VDW_RADIUS_C1 * 2
        r = distances[np.triu_indices_from(distances, k=1)]
        r = r[r > 0.1]

        clashes = np.sum(r < 1.0) * self.CLASH_PENALTY

        ratio = sigma / r
        repulsive_term = np.sum(np.minimum(ratio**12, 1e6))
        attractive_term = np.sum(ratio**6)

        return (repulsive_term - attractive_term) + clashes

    def _calculate_solvation_energy(self, coords: np.ndarray) -> float:
        if coords.shape[0] < 2:
            return 0.0

        distances = cdist(coords, coords)
        neighbor_counts = np.sum((distances > 0) & (distances < self.SOLVATION_CUTOFF), axis=1)
        exposure_factors = 1.0 - np.minimum(neighbor_counts, 12) / 12.0
        return -np.sum(exposure_factors)

    def _calculate_base_pair_energy(self, sequence: str, coords: np.ndarray) -> float:
        if len(sequence) != coords.shape[0]:
            return 0.0

        energy = 0.0
        distances = cdist(coords, coords)

        for i in range(len(sequence)):
            potential_partners = np.where(
                (distances[i] < self.BASE_PAIR_CUTOFF) & (np.abs(np.arange(len(sequence)) - i) > 2)
            )[0]

            for j in potential_partners:
                if i >= j:
                    continue
                pair = (sequence[i], sequence[j])
                if pair in self.BASE_PAIR_ENERGIES:
                    weight = 1.0 - (distances[i, j] / self.BASE_PAIR_CUTOFF)
                    energy += self.BASE_PAIR_ENERGIES[pair] * weight
        return energy

    def calculate_all_energies(self, sequence: str, coords: np.ndarray) -> Dict[str, float]:
        if coords.shape[0] == 0:
            return {'e_vdw': 0.0, 'e_solvation': 0.0, 'e_base_pair': 0.0}

        return {
            'e_vdw': self._calculate_vdw_energy(coords),
            'e_solvation': self._calculate_solvation_energy(coords),
            'e_base_pair': self._calculate_base_pair_energy(sequence, coords)
        }

def extract_energy_features(sequence: str, coords: np.ndarray, scorer: RNAEnergyScorer) -> Dict[str, float]:
    if coords.shape[0] == 0 or len(sequence) == 0:
        return {
            'e_vdw': 0.0, 'e_solvation': 0.0, 'e_base_pair': 0.0,
            'e_balance': 0.0, 'e_attractive_per_res': 0.0, 'e_repulsive_per_res': 0.0
        }

    energies = scorer.calculate_all_energies(sequence, coords)
    e_attractive = energies['e_solvation'] + energies['e_base_pair']
    e_repulsive = energies['e_vdw']

    return {
        'e_vdw': e_repulsive,
        'e_solvation': energies['e_solvation'],
        'e_base_pair': energies['e_base_pair'],
        'e_balance': e_attractive / (e_repulsive + 1e-6),
        'e_attractive_per_res': e_attractive / len(sequence),
        'e_repulsive_per_res': e_repulsive / len(sequence)
    }

In [4]:
# --- Step 1: Load and Pre-process All Data ---
print("--- Step 1: Loading and Pre-processing All Data ---")
try:
    df_sequences = pd.read_csv(SEQUENCES_PATH)
    df_labels = pd.read_csv(LABELS_PATH)
    df_labels['target_id'] = df_labels['ID'].apply(get_base_target_id)
    
    data_dfs = {}
    for model_name, config in MODEL_CONFIG.items():
        if model_name == 'Ribonanza':
            df_raw = pd.read_csv(config['predictions_path'])
            pred_cols = ['ID', 'resname', 'resid'] + [col for col in df_raw.columns if col.startswith(('x_', 'y_', 'z_'))]
            data_dfs[f'{model_name}_preds'] = df_raw[pred_cols].copy()
            rename_dict = {f'confidence_{i}': f'plddt_{i}' for i in range(1, NUM_PREDICTIONS_PER_MODEL + 1)}
            data_dfs[f'{model_name}_conf'] = df_raw.rename(columns=rename_dict)
        else:
            data_dfs[f'{model_name}_preds'] = pd.read_csv(config['predictions_path'])
            data_dfs[f'{model_name}_conf'] = pd.read_csv(config['confidences_path'])
        
        if config.get('ranking_path'):
            data_dfs[f'{model_name}_rank'] = pd.read_csv(config['ranking_path'])

    print("All data files loaded.")
except FileNotFoundError as e:
    print(f"FATAL ERROR: Cannot load a data file: {e}.")
    raise e

--- Step 1: Loading and Pre-processing All Data ---
All data files loaded.


In [5]:
# =============================================================================
# FINAL INFERENCE PIPELINE
# =============================================================================
from sklearn.cluster import DBSCAN

# Helper function to compute the RMSD matrix for clustering
def calculate_pairwise_rmsd_matrix(coords_dict: Dict) -> (np.ndarray, list):
    candidate_keys = list(coords_dict.keys())
    coords_list = [coords_dict[key] for key in candidate_keys]
    n = len(candidate_keys)
    rmsd_matrix = np.zeros((n, n))
    
    for i in range(n):
        for j in range(i + 1, n):
            if coords_list[i].shape[0] > 0 and coords_list[i].shape == coords_list[j].shape:
                rmsd = calculate_rmsd(coords_list[i], coords_list[j])
                rmsd_matrix[i, j] = rmsd
                rmsd_matrix[j, i] = rmsd
            else:
                rmsd_matrix[i, j] = 999.0
                rmsd_matrix[j, i] = 999.0
                
    return rmsd_matrix, candidate_keys

# --- Step 1: Feature Generation ---
print("--- Starting Final Feature Generation for Inference ---")
# Initialize the custom energy scorer
energy_scorer = RNAEnergyScorer()
meta_data_rows = []
all_target_ids_in_sequence_file = df_sequences['target_id'].unique()

for target_id in tqdm(all_target_ids_in_sequence_file, desc="Processing Targets"):
    sequence_row = df_sequences[df_sequences['target_id'] == target_id]
    if sequence_row.empty: continue
    sequence = sequence_row.iloc[0]['sequence']
    
    all_candidate_coords = {}
    for model_name in MODEL_CONFIG:
        target_preds = data_dfs[f'{model_name}_preds'][data_dfs[f'{model_name}_preds']['ID'].str.startswith(target_id)]
        if target_preds.empty: continue
        for i in range(1, NUM_PREDICTIONS_PER_MODEL + 1):
            all_candidate_coords[f'{model_name}_{i}'] = get_coords(target_preds, i)

    cluster_info = {}
    if len(all_candidate_coords) > 1:
        rmsd_matrix, candidate_keys = calculate_pairwise_rmsd_matrix(all_candidate_coords)
        dbscan = DBSCAN(eps=7.5, min_samples=2, metric='precomputed', n_jobs=-1)
        cluster_labels = dbscan.fit_predict(rmsd_matrix)
        num_clusters_found = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
        labels_series = pd.Series(cluster_labels)
        cluster_sizes = labels_series.value_counts().to_dict()
        for k_idx, key in enumerate(candidate_keys):
            label = cluster_labels[k_idx]
            cluster_info[key] = {
                'cluster_label': label,
                'is_outlier': 1 if label == -1 else 0,
                'cluster_size': cluster_sizes.get(label, 0),
                'num_clusters_total': num_clusters_found
            }

    for model_name in MODEL_CONFIG:
        df_conf = data_dfs[f'{model_name}_conf']
        target_confs = df_conf[df_conf['ID'].str.startswith(target_id)]
        if target_confs.empty: continue
        df_rank = data_dfs.get(f'{model_name}_rank')
        target_rank = df_rank[df_rank['target_id'] == target_id] if df_rank is not None else None

        for i in range(1, NUM_PREDICTIONS_PER_MODEL + 1):
            candidate_key = f'{model_name}_{i}'
            if f'plddt_{i}' not in target_confs.columns or candidate_key not in all_candidate_coords:
                continue
            
            features = {'target_id': target_id, 'model_source_str': model_name, 'prediction_index': i}
            
            # Base Features
            features['sequence_length'] = len(sequence)
            for nuc in NUCLEOTIDES: features[f'percent_{nuc}'] = sequence.count(nuc) / len(sequence)
            plddt_scores = target_confs[f'plddt_{i}'].values
            features['std_plddt'] = np.std(plddt_scores)
            if target_rank is not None and not target_rank.empty:
                features.update({'ptm': target_rank.iloc[0].get(f'ptm_{i}', 0), 'ranking_score': target_rank.iloc[0].get(f'ranking_score_{i}', 0)})
            else:
                features.update({'ptm': 0, 'ranking_score': 0})

            # Structural, Ensemble, and Engineered Features
            candidate_coords = all_candidate_coords[candidate_key]
            if candidate_coords is None or candidate_coords.shape[0] == 0: continue
            
            rmsd_to_others = [calculate_rmsd(candidate_coords, other_coords) for key, other_coords in all_candidate_coords.items() if key != candidate_key]
            features['avg_rmsd_to_others'] = np.nanmean(rmsd_to_others)
            info = cluster_info.get(candidate_key, {})
            features['is_outlier'] = info.get('is_outlier', 1)
            features['cluster_size'] = info.get('cluster_size', 0)
            features['num_clusters_total'] = info.get('num_clusters_total', 0)
            if features['sequence_length'] > 0:
                features['rog_normalized'] = calculate_radius_of_gyration(candidate_coords) / np.sqrt(features['sequence_length'])
            else:
                features['rog_normalized'] = 0.0

            # Model-Specific Interaction Features
            base_plddt_std = features['std_plddt']
            base_rog_norm = features['rog_normalized']
            base_percent_low_conf = np.mean(plddt_scores < 70)
            for source in MODEL_CONFIG.keys():
                features[f'std_plddt_x_{source}'] = 0.0
                features[f'rog_norm_x_{source}'] = 0.0
                features[f'low_conf_x_{source}'] = 0.0
            features[f'std_plddt_x_{model_name}'] = base_plddt_std
            features[f'rog_norm_x_{model_name}'] = base_rog_norm
            features[f'low_conf_x_{model_name}'] = base_percent_low_conf

            # Final Energy Features
            try:
                energy_features = extract_energy_features(sequence, candidate_coords, energy_scorer)
                features.update(energy_features)
            except Exception as e:
                null_energy = {'e_vdw': 0.0, 'e_solvation': 0.0, 'e_base_pair': 0.0,
                               'e_balance': 0.0, 'e_attractive_per_res': 0.0, 'e_repulsive_per_res': 0.0}
                features.update(null_energy)
            
            meta_data_rows.append(features)

# --- Step 2: Finalize DataFrame ---
if not meta_data_rows:
    raise ValueError("FATAL ERROR: No feature rows were generated.")

# This is the full dataframe with all generated columns, used for submission assembly and analysis
df_inference = pd.DataFrame(meta_data_rows)
model_mapping = {name: i for i, name in enumerate(MODEL_CONFIG.keys())}
df_inference['model_source'] = df_inference['model_source_str'].map(model_mapping)
df_inference.fillna(df_inference.median(numeric_only=True), inplace=True)

print("\n--- Final Feature Set Generation Complete ---")
print(f"Total features generated: {len(df_inference.columns) - 3}")
display(df_inference.head())

--- Starting Final Feature Generation for Inference ---


Processing Targets:   0%|          | 0/94 [00:00<?, ?it/s]


--- Final Feature Set Generation Complete ---
Total features generated: 29


Unnamed: 0,target_id,model_source_str,prediction_index,sequence_length,percent_A,percent_C,percent_G,percent_U,std_plddt,ptm,...,std_plddt_x_Ribonanza,rog_norm_x_Ribonanza,low_conf_x_Ribonanza,e_vdw,e_solvation,e_base_pair,e_balance,e_attractive_per_res,e_repulsive_per_res,model_source
0,9L5R_2,ProteinX,1,193,0.170984,0.26943,0.217617,0.341969,6.674131,0.500917,...,0.0,0.0,0.0,-12.105778,-164.5,0.0,13.588554,-0.852332,-0.062724,0
1,9L5R_2,ProteinX,2,193,0.170984,0.26943,0.217617,0.341969,6.291851,0.391554,...,0.0,0.0,0.0,-12.341462,-164.166667,0.0,13.302044,-0.850604,-0.063945,0
2,9L5R_2,ProteinX,3,193,0.170984,0.26943,0.217617,0.341969,6.346398,0.425882,...,0.0,0.0,0.0,-11.055151,-164.666667,0.0,14.895019,-0.853195,-0.057281,0
3,9L5R_2,ProteinX,4,193,0.170984,0.26943,0.217617,0.341969,6.726162,0.492399,...,0.0,0.0,0.0,-12.341723,-163.666667,0.0,13.261251,-0.848014,-0.063947,0
4,9L5R_2,ProteinX,5,193,0.170984,0.26943,0.217617,0.341969,6.35663,0.480664,...,0.0,0.0,0.0,15.924505,-164.0,-0.969639,-10.359483,-0.854765,0.08251,0


In [6]:
# --- Step 2: Predict with Model and Select Top 5 (CORRECTED) ---
print("--- Loading models and making predictions... ---")

# Load the fitted scaler and model
scaler = joblib.load(SCALER_SAVE_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MetaLearnerNN(input_features=scaler.n_features_in_).to(device)
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.eval()

try:
    X_inference = df_inference[scaler.feature_names_in_]
except KeyError as e:
    print(f"ERROR: A feature is missing from the inference data that was present in training: {e}")
    # This is a fallback to help debug if the columns still don't match
    print("\nFeatures in inference data:", df_inference.columns.tolist())
    print("Features expected by scaler:", scaler.feature_names_in_)
    raise e

# Scale the inference data
X_inference_scaled = scaler.transform(X_inference)

# Make predictions
with torch.no_grad():
    X_tensor = torch.tensor(X_inference_scaled, dtype=torch.float32).to(device)
    predicted_scores = model(X_tensor).cpu().numpy().flatten()

# Add predictions to the dataframe
df_inference['predicted_tm'] = predicted_scores

# --- Core Selection Logic ---
# For each target_id, find the 5 candidates with the highest predicted score
df_top5 = df_inference.groupby('target_id').apply(lambda x: x.nlargest(5, 'predicted_tm')).reset_index(drop=True)

print("✅ Top 5 candidates selected for each target.")
print(f"Total selected structures: {len(df_top5)}")
display(df_top5[['target_id', 'model_source_str', 'prediction_index', 'predicted_tm']].head(10))

--- Loading models and making predictions... ---
✅ Top 5 candidates selected for each target.
Total selected structures: 470


  df_top5 = df_inference.groupby('target_id').apply(lambda x: x.nlargest(5, 'predicted_tm')).reset_index(drop=True)


Unnamed: 0,target_id,model_source_str,prediction_index,predicted_tm
0,8K85_A,Ribonanza,4,0.880325
1,8K85_A,Ribonanza,3,0.880277
2,8K85_A,Ribonanza,5,0.879872
3,8K85_A,Ribonanza,2,0.879004
4,8K85_A,Ribonanza,1,0.878701
5,8KEB_A,Ribonanza,4,0.826261
6,8KEB_A,Ribonanza,5,0.825076
7,8KEB_A,ProteinX,5,0.597488
8,8KEB_A,ProteinX,1,0.596196
9,8KEB_A,ProteinX,2,0.583674


In [7]:
# --- Step 3: Assemble the Ensembled Submission File (CORRECTED) ---
print("--- Assembling final submission file with robust merging... ---")

final_submission_rows = []
# Group by target_id to process one RNA at a time
for target_id, group in tqdm(df_top5.groupby('target_id'), desc="Assembling Submission"):
    
    # Get the base information (ID, resname, resid) from any of the original prediction files.
    # Set 'resid' as the index to prepare for clean assignment.
    base_info = data_dfs['ProteinX_preds'][data_dfs['ProteinX_preds']['ID'].str.startswith(f"{target_id}_")]
    if base_info.empty:
        base_info = data_dfs['DrFo2_preds'][data_dfs['DrFo2_preds']['ID'].str.startswith(f"{target_id}_")]
        
    target_submission_df = base_info[['ID', 'resname', 'resid']].copy().set_index('resid')
    
    # Iterate from k=1 to 5 to create the new columns x_1, y_1, z_1 ... x_5, y_5, z_5
    for k, (_, selection_row) in enumerate(group.iterrows(), 1):
        
        source_model = selection_row['model_source_str']
        original_pred_idx = selection_row['prediction_index']
        
        # Get the original coordinate data for this selection
        original_coords_df = data_dfs[f'{source_model}_preds'][data_dfs[f'{source_model}_preds']['ID'].str.startswith(f"{target_id}_")]
        
        # Prepare the coordinates to add, with 'resid' as the index
        coords_to_add = original_coords_df[['resid', f'x_{original_pred_idx}', f'y_{original_pred_idx}', f'z_{original_pred_idx}']].copy()
        coords_to_add = coords_to_add.set_index('resid')
        
        # Assign the new columns. This is safer than merging.
        target_submission_df[f'x_{k}'] = coords_to_add[f'x_{original_pred_idx}']
        target_submission_df[f'y_{k}'] = coords_to_add[f'y_{original_pred_idx}']
        target_submission_df[f'z_{k}'] = coords_to_add[f'z_{original_pred_idx}']
        
    # Reset the index to make 'resid' a column again, matching the submission format
    final_submission_rows.append(target_submission_df.reset_index())

# Concatenate all targets into the final submission dataframe
if final_submission_rows:
    ensembled_submission_df = pd.concat(final_submission_rows)
    ensembled_submission_df.to_csv('ensembled_submission.csv', index=False)
    print("Ensembled submission file 'ensembled_submission.csv' created successfully.")
    display(ensembled_submission_df.head())
else:
    print("ERROR: No data was assembled for the final submission.")

--- Assembling final submission file with robust merging... ---


Assembling Submission:   0%|          | 0/94 [00:00<?, ?it/s]

Ensembled submission file 'ensembled_submission.csv' created successfully.


Unnamed: 0,resid,ID,resname,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1,8K85_A_1,G,-10.031795,21.430502,24.104576,11.359874,19.273932,-24.725073,1.739657,25.807976,-21.325783,-25.030249,-3.009271,22.493553,-11.122301,25.338013,20.354237
1,2,8K85_A_2,G,-12.683558,19.124477,19.230865,11.292404,13.435502,-24.485357,2.243749,20.615433,-21.924036,-22.164127,-6.809379,19.551247,-6.083194,22.807205,19.369934
2,3,8K85_A_3,A,-13.850716,18.713408,14.441497,12.951257,9.367302,-22.20678,4.780207,16.301336,-21.403898,-21.007872,-9.34649,15.168128,-2.446923,19.225378,19.853275
3,4,8K85_A_4,G,-12.531183,18.946608,9.559502,15.312449,6.391326,-18.45376,8.139541,12.631317,-19.601051,-20.43655,-9.683,9.947579,-0.673806,14.22598,20.273897
4,5,8K85_A_5,A,-10.470436,19.03057,4.495241,17.220327,4.177942,-13.849748,11.442865,9.733561,-16.63953,-20.069801,-9.160627,4.501581,0.640619,8.873339,20.48245


In [8]:
# --- Step 4: Evaluate the Final Result ---
print("--- Evaluating the performance of the meta-learner ensemble... ---")

# Load the ground truth labels
solution_df = pd.read_csv(LABELS_PATH)

# The submission dataframe is the one we just created
submission_df = pd.read_csv('ensembled_submission.csv')

# Run the scoring
_, final_score = score_and_report(solution_df, submission_df)

--- Evaluating the performance of the meta-learner ensemble... ---


Scoring Targets:   0%|          | 0/94 [00:00<?, ?it/s]


>>> FINAL mean best-of-5 TM-score = 0.4725 (scored on 94 targets)


In [None]:
# =============================================================================
# ANALYSIS CELL
# =============================================================================

print("\n--- In-Depth Ensemble Performance Analysis ---")

# Step 1: Calculate the true TM-score for every candidate in df_inference.
print("Calculating ground truth TM-scores for analysis (efficiently)...")

# This list will store small dataframes, each with results for one target
all_target_results = []

# Get the list of unique targets from the inference results
unique_targets_to_analyze = df_inference['target_id'].unique()

for target_id in tqdm(unique_targets_to_analyze, desc="Calculating True TM-scores"):
    # Get the slice of predictions for this one target
    inference_slice = df_inference[df_inference['target_id'] == target_id].copy()
    
    # Get the corresponding labels once for this target
    native_df_for_target = df_labels[df_labels['ID'].apply(get_base_target_id) == target_id]
    
    # This list will hold the calculated true TM-scores for this target
    true_scores = []
    
    # Iterate through the rows of the slice (15 rows per target)
    for _, row in inference_slice.iterrows():
        model_name = row['model_source_str']
        pred_idx = row['prediction_index']
        
        # Get the prediction data for the specific model
        pred_df_for_target = data_dfs[f'{model_name}_preds'][data_dfs[f'{model_name}_preds']['ID'].apply(get_base_target_id) == target_id]

        if native_df_for_target.empty or pred_df_for_target.empty:
            tm_score = np.nan
        else:
            tm_score = calculate_ground_truth_tm(pred_df_for_target, pred_idx, native_df_for_target)
            
        true_scores.append(tm_score)
        
    # Add the list of scores as a new column to the slice
    inference_slice['true_tm_score'] = true_scores
    all_target_results.append(inference_slice)

# Concatenate all the small dataframes back into one
df_analysis = pd.concat(all_target_results)

print("✅ Analysis data prepared successfully.")

# --- DEBUGGING CHECK ---
if df_analysis['true_tm_score'].isnull().all():
    print("\n❌ CRITICAL ERROR: Could not calculate any true TM-scores.")
else:
    print("\n✅ True TM-scores calculated and merged successfully.")
    
    # --- Metric 1: "Oracle Score" ---
    oracle_best_scores = df_analysis.groupby('target_id')['true_tm_score'].max()
    oracle_score = oracle_best_scores.mean()
    print(f"1. Oracle Score (Theoretical Max): {oracle_score:.4f}")

    # --- Metric 2: Your Model's Achieved Score ---
    print(f"2. Your Model's Achieved Score:     {final_score:.4f}")
    if oracle_score > 0:
        print(f"   (Your model achieved {final_score/oracle_score:.2%} of the theoretical maximum performance)")

    # --- Metric 3: Recall@5 ---
    recall_hits = 0
    total_targets = df_analysis['target_id'].nunique()
    for target_id, group in df_analysis.groupby('target_id'):
        if group['true_tm_score'].notna().any():
            best_candidate_true_idx = group['true_tm_score'].idxmax()
            top_5_predicted_indices = group.nlargest(5, 'predicted_tm').index
            if best_candidate_true_idx in top_5_predicted_indices:
                recall_hits += 1

    if total_targets > 0:
        recall_at_5 = recall_hits / total_targets
        print(f"\n3. Recall@5 (Found the single best candidate): {recall_at_5:.2%} ({recall_hits}/{total_targets} targets)")

    # --- Metric 4: Distribution of selected models ---
    print("\n4. Distribution of Models in Your Final Top 5 Selection:")
    model_distribution = df_top5['model_source_str'].value_counts(normalize=True)
    print(model_distribution)

    # --- Metric 5: Average Predicted vs. True TM-score for each model ---
    print("\n5. Model Performance Analysis (Avg Predicted vs. True TM-score):")
    analysis_summary = df_analysis.groupby('model_source_str').agg(
        avg_predicted_tm=('predicted_tm', 'mean'),
        avg_true_tm=('true_tm_score', 'mean'),
        count=('target_id', 'size')
    ).sort_values(by='avg_true_tm', ascending=False)
    print(analysis_summary)


--- In-Depth Ensemble Performance Analysis ---
Calculating ground truth TM-scores for analysis (efficiently)...


Calculating True TM-scores:   0%|          | 0/94 [00:00<?, ?it/s]

✅ Analysis data prepared successfully.

✅ True TM-scores calculated and merged successfully.
1. Oracle Score (Theoretical Max): 0.5021
2. Your Model's Achieved Score:     0.4725
   (Your model achieved 94.11% of the theoretical maximum performance)

3. Recall@5 (Found the single best candidate): 54.26% (51/94 targets)

4. Distribution of Models in Your Final Top 5 Selection:
model_source_str
Ribonanza    0.538298
ProteinX     0.259574
DrFo2        0.202128
Name: proportion, dtype: float64

5. Model Performance Analysis (Avg Predicted vs. True TM-score):
                  avg_predicted_tm  avg_true_tm  count
model_source_str                                      
ProteinX                  0.396947     0.403984    470
DrFo2                     0.386938     0.353896    470
Ribonanza                 0.426319     0.319214    470
