In [1]:
import stat
import pandas as pd
import numpy as np
import subprocess, tempfile, os, re, shutil, warnings
from typing import Dict, List, Tuple
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

In [3]:
class RNAEnsemble:
    def __init__(self):
        self.models = {}
        self.labels = None
        print("✓ RNA Ensemble initialized")
    
    def load_data(self, file_paths):
        for name, path in file_paths.items():
            df = pd.read_csv(path)
            if name == 'labels':
                self.labels = df
            else:
                self.models[name] = df
        print(f"Loaded: {list(self.models.keys())} + labels")
    
    def create_ensemble(self, method='best_confidence'):
        all_ids = set()
        for df in self.models.values():
            all_ids.update(df['ID'].values)
        
        ensemble_rows = []
        
        for nuc_id in sorted(all_ids):
            model_rows = {}
            for name, df in self.models.items():
                row = df[df['ID'] == nuc_id]
                if not row.empty:
                    model_rows[name] = row.iloc[0]
            
            if len(model_rows) < len(self.models):
                continue
            
            ensemble_row = {
                'ID': nuc_id,
                'resname': list(model_rows.values())[0]['resname'],
                'resid': list(model_rows.values())[0]['resid']
            }
            
            for i in range(1, 6):
                if method == 'best_confidence':
                    # Pick model with highest confidence
                    best_conf = -1
                    best_coords = None
                    
                    for name, row in model_rows.items():
                        if name == 'drfold2':
                            conf = row.get(f'drfold_confidence_{i}', 0.5)
                        elif name == 'ribonanzanet2':
                            conf = row.get(f'ribo_confidence_{i}', 0.5)
                        else:
                            conf = row.get(f'confidence_{i}', 0.5)
                        
                        if conf > best_conf:
                            best_conf = conf
                            best_coords = [row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']]
                    
                    coords = best_coords

                elif method == 'naive':
                    coords_dict = {}
                    for name, row in model_rows.items():
                        coords_dict[name] = [row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']]
                    if i in [1, 2, 3]:
                        coords = [model_rows['protenix'][f'x_{i}'],
                                  model_rows['protenix'][f'y_{i}'],
                                  model_rows['protenix'][f'z_{i}']]
                    else:
                        coords = [model_rows['drfold2'][f'x_{i-3}'],
                                  model_rows['drfold2'][f'y_{i-3}'],
                                  model_rows['drfold2'][f'z_{i-3}']]
                
                elif method == 'weighted_avg':
                    # Confidence-weighted average
                    coords_list, weights = [], []
                    for name, row in model_rows.items():
                        coords_list.append([row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']])
                        if name == 'drfold2':
                            conf = row.get(f'drfold_confidence_{i}', 0.5)
                        elif name == 'ribonanzanet2':
                            conf = row.get(f'ribo_confidence_{i}', 0.5)
                        else:
                            conf = row.get(f'confidence_{i}', 0.5)
                        weights.append(conf)
                    
                    coords_array = np.array(coords_list)
                    weights = np.array(weights)
                    weights = weights / np.sum(weights) if np.sum(weights) > 0 else np.ones_like(weights) / len(weights)
                    coords = np.sum(coords_array * weights.reshape(-1, 1), axis=0)
                
                elif method == 'simple_avg':
                    # Simple average
                    coords_list = []
                    for name, row in model_rows.items():
                        coords_list.append([row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']])
                    coords = np.mean(coords_list, axis=0)
                
                elif method == 'adaptive_weighted':
                    # Square confidence weights to emphasize differences
                    coords_list, weights = [], []
                    for name, row in model_rows.items():
                        coords_list.append([row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']])
                        if name == 'drfold2':
                            conf = row.get(f'drfold_confidence_{i}', 0.5)
                        elif name == 'ribonanzanet2':
                            conf = row.get(f'ribo_confidence_{i}', 0.5)
                        else:
                            conf = row.get(f'confidence_{i}', 0.5)
                        weights.append(conf ** 2)  # Square to emphasize differences
                    
                    coords_array = np.array(coords_list)
                    weights = np.array(weights)
                    weights = weights / np.sum(weights) if np.sum(weights) > 0 else np.ones_like(weights) / len(weights)
                    coords = np.sum(coords_array * weights.reshape(-1, 1), axis=0)
                
                elif method == 'confidence_threshold':
                    # Only use predictions above threshold
                    threshold = 0.6
                    coords_list, weights = [], []
                    
                    for name, row in model_rows.items():
                        if name == 'drfold2':
                            conf = row.get(f'drfold_confidence_{i}', 0.5)
                        elif name == 'ribonanzanet2':
                            conf = row.get(f'ribo_confidence_{i}', 0.5)
                        else:
                            conf = row.get(f'confidence_{i}', 0.5)
                        
                        if conf >= threshold:
                            coords_list.append([row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']])
                            weights.append(conf)
                    
                    if coords_list:
                        coords_array = np.array(coords_list)
                        weights = np.array(weights)
                        weights = weights / np.sum(weights)
                        coords = np.sum(coords_array * weights.reshape(-1, 1), axis=0)
                    else:
                        # Fallback to drfold2 if no predictions meet threshold
                        coords = [model_rows['protenix'][f'x_{i}'], 
                                model_rows['protenix'][f'y_{i}'], 
                                model_rows['protenix'][f'z_{i}']]
                
                elif method == 'dynamic':
                    # Dynamically select strategy based on confidence patterns
                    confidences = {}
                    coords_dict = {}
                    
                    for name, row in model_rows.items():
                        if name == 'drfold2':
                            conf = row.get(f'drfold_confidence_{i}', 0.5)
                        elif name == 'ribonanzanet2':
                            conf = row.get(f'ribo_confidence_{i}', 0.5)
                        else:
                            conf = row.get(f'confidence_{i}', 0.5)
                        
                        confidences[name] = conf
                        coords_dict[name] = [row[f'x_{i}'], row[f'y_{i}'], row[f'z_{i}']]
                    
                    max_conf = max(confidences.values())
                    conf_std = np.std(list(confidences.values()))
                    
                    if max_conf > 0.7:
                        # High confidence: use best model
                        best_model = max(confidences.items(), key=lambda x: x[1])[0]
                        coords = coords_dict[best_model]
                    elif conf_std < 0.1:
                        # Similar confidences: use weighted average
                        weights = np.array(list(confidences.values()))
                        weights = weights / np.sum(weights)
                        coords_array = np.array(list(coords_dict.values()))
                        coords = np.sum(coords_array * weights.reshape(-1, 1), axis=0)
                    else:
                        # Default: use best overall model
                        coords = coords_dict['protenix']
                        
                
                ensemble_row[f'x_{i}'] = coords[0]
                ensemble_row[f'y_{i}'] = coords[1]
                ensemble_row[f'z_{i}'] = coords[2]
            
            ensemble_rows.append(ensemble_row)
        
        return pd.DataFrame(ensemble_rows)
    
    def evaluate_all(self):
        results = {}

        print("\nEvaluating ensemble methods...")
        # Evaluate ensembles
        ensemble_methods = ['naive',
            'best_confidence', 'weighted_avg', 'simple_avg', 
            'adaptive_weighted', 'confidence_threshold', 'dynamic'
        ]
        
        for method in ensemble_methods:
            print(f"Creating ensemble_{method}...")
            ensemble_df = self.create_ensemble(method)
            ensemble_df.to_csv(f'/kaggle/working/{method}.csv', index=False)
        return results


In [4]:
# === RNA Ensemble Analysis ===
ensemble_analyzer = RNAEnsemble()
file_paths = {
    'drfold2': '/kaggle/input/predictions/drfold2_submission_with_confidence.csv',
    'ribonanzanet2': '/kaggle/input/predictions/ribonanzanet2_submission_with_confidence.csv',
    'protenix': '/kaggle/input/predictions/protenix_submission_with_confidence.csv',
    'labels': '/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv'
}

ensemble_analyzer.load_data(file_paths)
ensemble_analyzer.evaluate_all()

✓ RNA Ensemble initialized
Loaded: ['drfold2', 'ribonanzanet2', 'protenix'] + labels

Evaluating ensemble methods...
Creating ensemble_naive...


{}