In [1]:
import shutil
import os

# Copy the USalign binary into the working directory and set mode 0o755
# (rwxr-xr-x) so that it can be executed from there.
usalign_src = "/kaggle/input/usalign/USalign"
usalign_dst = "/kaggle/working/USalign"

shutil.copy2(usalign_src, usalign_dst)
os.chmod(usalign_dst, 0o755)

print("USalign copied to /kaggle/working/ and made executable")

USalign copied to /kaggle/working/ and made executable


In [2]:
import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in ``output`` as a float.

    Args:
        output: Text captured from the alignment program's stdout.

    Returns:
        The number following the second ``TM-score=`` occurrence.

    Raises:
        IndexError: if ``output`` contains fewer than two ``TM-score=`` values.
    """
    # NOTE(review): presumably the second value is the score normalized by the
    # second structure on the command line -- confirm against USalign's output.
    tm_pattern = re.compile(r'TM-score=\s+([\d.]+)')
    tm_values = tm_pattern.findall(output)
    second_value = tm_values[1]
    return float(second_value)

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-width PDB ``ATOM`` record, terminated by a newline.

    Args:
        atom_name: Atom name, left-justified in a 5-character field.
        atom_serial: Integer atom serial number (5-character field).
        residue_name: Residue name, left-justified in a 3-character field.
        chain_id: Accepted for interface compatibility; it is NOT written to
            the record (the chain column is left blank).
        residue_num: Integer residue sequence number (4-character field).
        x_coord, y_coord, z_coord: Coordinates, each written as ``8.3f``.
        occupancy: Written as ``6.2f``.
        b_factor: Written as ``6.2f``.
        atom_type: Element symbol appended at the end of the record.

    Returns:
        The formatted record as a string.
    """
    # The residue number uses a single 4-wide field directly after the residue
    # name. For numbers up to 999 this yields exactly the same text as a space
    # followed by a 3-wide field, but numbers 1000-9999 no longer widen the
    # line and push the coordinate fields out of their fixed columns.
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the ``xyz_id``-th coordinate set of ``df`` to a PDB file.

    One C1' ``ATOM`` record is written per row whose ``x_{xyz_id}``,
    ``y_{xyz_id}`` and ``z_{xyz_id}`` values are all greater than -1e17; rows
    failing that check (including NaN coordinates, which compare false) are
    skipped.

    Args:
        df: Rows providing ``resid``, ``resname`` and the coordinate columns.
        xyz_id: Suffix selecting which coordinate columns to read.
        target_path: File to create or overwrite.

    Returns:
        Number of records written.
    """
    x_col, y_col, z_col = (f'{axis}_{xyz_id}' for axis in 'xyz')
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            coords = (record[x_col], record[y_col], record[z_col])
            # NOTE(review): -1e17 looks like a cutoff for a large negative
            # "missing coordinate" sentinel -- confirm against the data source.
            if not all(value > -1e17 for value in coords):
                continue
            written += 1
            resid = int(record['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=resid,
                residue_name=record['resname'], chain_id='0',
                residue_num=resid,
                x_coord=coords[0], y_coord=coords[1], z_coord=coords[2],
                atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Drop the last underscore-delimited field from ``long_id``.

    The argument is converted with ``str`` first. Everything before the final
    underscore is returned; an ID without any underscore yields ``""``.
    """
    prefix, _separator, _last_field = str(long_id).rpartition("_")
    return prefix

def score_and_compare_models(solution: pd.DataFrame, submission1: pd.DataFrame, 
                           submission2: pd.DataFrame, submission3: pd.DataFrame,
                           model_names: list = None):
    """
    Compare 3 models and select the best prediction for each target.

    For every target present in the solution and in all three submissions,
    each of the 5 predictions of each model is aligned with USalign against
    every native coordinate set found in ``solution`` (columns ``x_<n>``).
    A prediction's score is its best TM-score over the native sets; the
    winner for a target is the first prediction (in model order, then
    prediction order) with the strictly highest score.

    Side effects:
        * A ``target_id`` column is added to (or overwritten on) ``solution``
          and each submission frame IN PLACE.
        * ``native.pdb`` and ``predicted.pdb`` are written to the current
          working directory and left there.
        * Per-target details and the overall mean are printed.

    Args:
        solution: Ground truth dataframe (``ID``, ``resid``, ``resname``,
            ``x_<n>``/``y_<n>``/``z_<n>`` columns are read).
        submission1, submission2, submission3: Model prediction dataframes
            with ``ID``, ``resid``, ``resname`` and ``x_1..x_5``,
            ``y_1..y_5``, ``z_1..z_5`` columns.
        model_names: List of model names (default: ['Model1', 'Model2', 'Model3'])

    Returns:
        results_df: DataFrame with one row per target holding the chosen
            model, chosen prediction index, its TM-score and the winning
            coordinates (stored as lists in ``x_<k>``/``y_<k>``/``z_<k>``).
        mean_tm: Mean of the per-target best TM-scores (0.0 if no targets).

    Raises:
        IndexError: propagated from ``parse_tmscore_output`` when USalign's
            output does not contain two TM-score values.
    """
    if model_names is None:
        model_names = ['Model1', 'Model2', 'Model3']
    
    submissions = [submission1, submission2, submission3]
    
    # Add target_id to all dataframes (mutates the caller's frames in place;
    # kept because code elsewhere in the notebook may rely on the column).
    solution['target_id'] = solution['ID'].apply(get_base_target_id)
    for sub in submissions:
        sub['target_id'] = sub['ID'].apply(get_base_target_id)
    
    # Indices of the native coordinate sets, taken from the x_<n> column names
    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))
    
    usalign = "/kaggle/working/USalign"
    # The command never changes between alignments, so build it once.
    usalign_cmd = f'{usalign} predicted.pdb native.pdb -atom " C1\'"'
    
    # Find common targets across all submissions
    common_targets = set(solution['target_id'].unique())
    for sub in submissions:
        common_targets &= set(sub['target_id'].unique())
    common_targets = sorted(common_targets)
    
    print(f"Scoring {len(common_targets)} common targets across {len(submissions)} models...")
    
    results_list = []
    all_best_tm_scores = []
    
    for tid in common_targets:
        grp_nat = solution[solution['target_id'] == tid]
        grp_preds = [sub[sub['target_id'] == tid] for sub in submissions]
        
        best_overall_tm = 0.0
        best_model_idx = 0
        best_pred_idx = 1
        best_model_scores = []
        
        # Test all models and all predictions for this target
        for model_idx, grp_pred in enumerate(grp_preds):
            model_best_scores = []
            
            for pred_cnt in range(1, 6):
                best_for_this_pred = 0.0
                # The predicted structure does not depend on the native index,
                # so write it once per prediction instead of once per native.
                n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat > 0 and n_pred > 0:
                        # Close the pipe deterministically rather than leaking
                        # one open handle per alignment.
                        with os.popen(usalign_cmd) as pipe:
                            out = pipe.read()
                        best_for_this_pred = max(best_for_this_pred,
                                               parse_tmscore_output(out))
                model_best_scores.append(best_for_this_pred)
                
                # Check if this is the best overall prediction
                if best_for_this_pred > best_overall_tm:
                    best_overall_tm = best_for_this_pred
                    best_model_idx = model_idx
                    best_pred_idx = pred_cnt
            
            best_model_scores.append(model_best_scores)
        
        # Get the winning prediction data
        winning_submission = grp_preds[best_model_idx]
        winning_row = winning_submission.iloc[0]  # Get first row for metadata
        
        # Extract sequence info. A missing value read from CSV is a float NaN,
        # which is truthy and has no len(), so only trust non-empty strings and
        # otherwise fall back to the number of native rows.
        sequence = winning_row.get('sequence', '')
        if isinstance(sequence, str) and sequence:
            sequence_length = len(sequence)
        else:
            sequence_length = len(grp_nat)
        
        # Store results
        result_entry = {
            'ID': tid,
            'sequence': sequence,
            'sequence_length': sequence_length,
            'chosen_model': model_names[best_model_idx],
            'chosen_prediction': best_pred_idx,
            'tm_score': best_overall_tm
        }
        
        # Add the winning coordinates
        for coord in ['x', 'y', 'z']:
            result_entry[f'{coord}_{best_pred_idx}'] = winning_submission[f'{coord}_{best_pred_idx}'].tolist()
        
        results_list.append(result_entry)
        all_best_tm_scores.append(best_overall_tm)
        
        # Print detailed results
        print(f"{tid}:")
        for i, model_scores in enumerate(best_model_scores):
            print(f"  {model_names[i]}: TM-scores = {[f'{s:.4f}' for s in model_scores]}, best = {max(model_scores):.4f}")
        print(f"  Winner: {model_names[best_model_idx]} (prediction {best_pred_idx}) with TM-score = {best_overall_tm:.4f}")
        print()
    
    # Create results dataframe
    results_df = pd.DataFrame(results_list)
    
    # Calculate mean TM score
    mean_tm = np.mean(all_best_tm_scores) if all_best_tm_scores else 0.0
    
    print(f"Overall Mean TM-score: {mean_tm:.4f}")
    
    return results_df, mean_tm

# Load the ground-truth labels and the three model prediction tables
solution = pd.read_csv("/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv")
submission1, submission2, submission3 = (
    pd.read_csv("/kaggle/input/predictions/drfold2_submission_with_confidence.csv"),
    pd.read_csv("/kaggle/input/predictions/protenix_submission_with_confidence.csv"),
    pd.read_csv("/kaggle/input/predictions/ribonanzanet2_submission_with_confidence.csv"),
)

# Score every prediction and keep the best one per target, labelling the
# models with their real names instead of the Model1..3 defaults
model_names = ['DrFold2', 'Protenix', 'Ribonanzanet2']
results_df, mean_tm = score_and_compare_models(
    solution=solution,
    submission1=submission1,
    submission2=submission2,
    submission3=submission3,
    model_names=model_names,
)

# Persist the per-target selection and report the summary score
results_df.to_csv("/kaggle/working/best_predictions_comparison.csv", index=False)
print("Results saved to best_predictions_comparison.csv")
print(f"Final mean TM-score across all best predictions: {mean_tm:.4f}")

Scoring 94 common targets across 3 models...
8K85_A:
  DrFold2: TM-scores = ['0.1928', '0.1353', '0.1789', '0.1151', '0.1648'], best = 0.1928
  Protenix: TM-scores = ['0.2433', '0.2315', '0.2004', '0.2158', '0.2557'], best = 0.2557
  Ribonanzanet2: TM-scores = ['0.2595', '0.2500', '0.2671', '0.2666', '0.2631'], best = 0.2671
  Winner: Ribonanzanet2 (prediction 3) with TM-score = 0.2671

8KEB_A:
  DrFold2: TM-scores = ['0.5013', '0.4555', '0.5033', '0.5064', '0.4871'], best = 0.5064
  Protenix: TM-scores = ['0.5655', '0.6077', '0.5995', '0.6291', '0.5555'], best = 0.6291
  Ribonanzanet2: TM-scores = ['0.1686', '0.2007', '0.1595', '0.4732', '0.4652'], best = 0.4732
  Winner: Protenix (prediction 4) with TM-score = 0.6291

8KHH_A:
  DrFold2: TM-scores = ['0.5209', '0.4610', '0.4841', '0.5290', '0.4848'], best = 0.5290
  Protenix: TM-scores = ['0.6210', '0.6174', '0.5512', '0.6226', '0.5862'], best = 0.6226
  Ribonanzanet2: TM-scores = ['0.4789', '0.1586', '0.1694', '0.1653', '0.4767'], be

In [5]:
# Write the selection results again, this time as model_selection_df.csv
model_selection_path = "/kaggle/working/model_selection_df.csv"
results_df.to_csv(model_selection_path, index=False)
print("Final submission saved to /kaggle/working/")

Final submission saved to /kaggle/working/


In [8]:
# Confidence statistics

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the three model prediction tables for the confidence statistics.
# compute_average reads the confidence_1..confidence_5 and target_id columns
# from each of these frames.
drfold2_df = pd.read_csv("/kaggle/input/predictions/drfold2_submission_with_confidence.csv")
protenix_df = pd.read_csv("/kaggle/input/predictions/protenix_submission_with_confidence.csv")
ribonanza_df = pd.read_csv("/kaggle/input/predictions/ribonanzanet2_submission_with_confidence.csv")

def compute_average(df):
    """Print min-max-normalized confidence statistics for one prediction table.

    All five ``confidence_<k>`` columns are rescaled with a single global
    minimum and maximum (taken over every value in those columns), so the
    columns stay comparable with each other. The function then prints:

    1. the raw ``[min, max]`` range of each confidence column,
    2. the normalized ``[min, max]`` range of each confidence column,
    3. the overall average: mean over columns per row, then mean per
       ``target_id``, then mean over targets.

    Missing (NaN) confidence values are ignored when determining the global
    range; otherwise a single NaN would turn every normalized value, and the
    final average, into NaN. If all values are identical the range is zero
    and every normalized value is reported as 0.0 instead of dividing by zero.

    Args:
        df: DataFrame with ``confidence_1`` .. ``confidence_5`` and
            ``target_id`` columns. It is not modified.

    Returns:
        None. Results are printed only.
    """
    confidence_cols = ['confidence_1', 'confidence_2', 'confidence_3', 'confidence_4', 'confidence_5']
    
    df_normalized = df.copy()
    
    all_confidence_values = df[confidence_cols].values.flatten()
    # NaN-aware reductions, consistent with the pandas min()/max() calls
    # below, which also skip missing values.
    global_min = np.nanmin(all_confidence_values)
    global_max = np.nanmax(all_confidence_values)
    
    value_range = global_max - global_min
    if not value_range > 0:
        # Zero (or undefined) range: avoid 0/0 and map constant data to 0.0.
        value_range = 1.0
    
    for col in confidence_cols:
        df_normalized[col] = (df[col] - global_min) / value_range
    
    # Raw per-column ranges
    for col in confidence_cols:
        print(f"  {col}: [{df[col].min():.4f}, {df[col].max():.4f}]")
    
    # Normalized per-column ranges
    for col in confidence_cols:
        print(f"  {col}: [{df_normalized[col].min():.4f}, {df_normalized[col].max():.4f}]")
    
    df_normalized['avg_confidence_per_residue'] = df_normalized[confidence_cols].mean(axis=1)
    
    # Average per target first so that long sequences do not dominate
    avg_confidence_per_sequence = df_normalized.groupby('target_id')['avg_confidence_per_residue'].mean()
    
    overall_avg_confidence = avg_confidence_per_sequence.mean()
    
    print(f"Overall average confidence score (normalized): {overall_avg_confidence:.4f}")

# Print the confidence statistics for each model's table, in a fixed order:
# DrFold2, Protenix, RibonanzaNet2
for confidence_table in (drfold2_df, protenix_df, ribonanza_df):
    compute_average(confidence_table)

  confidence_1: [0.0303, 0.0752]
  confidence_2: [0.0330, 0.0755]
  confidence_3: [0.0384, 0.0746]
  confidence_4: [0.0324, 0.0741]
  confidence_5: [0.0336, 0.0775]
  confidence_1: [0.0000, 0.9512]
  confidence_2: [0.0579, 0.9569]
  confidence_3: [0.1707, 0.9374]
  confidence_4: [0.0444, 0.9274]
  confidence_5: [0.0695, 1.0000]
Overall average confidence score (normalized): 0.5150
  confidence_1: [-6.3415, 1.0973]
  confidence_2: [-6.3525, 1.0678]
  confidence_3: [-6.3451, 1.0904]
  confidence_4: [-6.4334, 1.0702]
  confidence_5: [-6.3419, 1.0601]
  confidence_1: [0.0122, 1.0000]
  confidence_2: [0.0107, 0.9961]
  confidence_3: [0.0117, 0.9991]
  confidence_4: [0.0000, 0.9964]
  confidence_5: [0.0121, 0.9951]
Overall average confidence score (normalized): 0.5881
  confidence_1: [0.0222, 0.0551]
  confidence_2: [0.0242, 0.0553]
  confidence_3: [0.0281, 0.0546]
  confidence_4: [0.0229, 0.0543]
  confidence_5: [0.0245, 0.0571]
  confidence_1: [0.0000, 0.9414]
  confidence_2: [0.0567, 0.94