In [None]:
# MAE-Optimized Training with Isotonic Calibration
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

print("✅ MAE-optimized training with isotonic calibration loaded!")


In [None]:
# Optimized Featurization Functions
def smiles_to_mol(s):
    try:
        return Chem.MolFromSmiles(s)
    except:
        return None

# Optimized descriptor functions
RD_DESC_FUNCS = [
    Descriptors.MolWt, Descriptors.MolLogP, Descriptors.TPSA,
    Descriptors.HeavyAtomCount, Descriptors.NumHAcceptors,
    Descriptors.NumHDonors, Descriptors.NumValenceElectrons,
    Descriptors.FractionCSP3, Descriptors.RingCount,
    Descriptors.BertzCT, Descriptors.BalabanJ,
]

def morgan_bits_optimized(mol, n_bits=1024, radius=2):
    """Optimized Morgan fingerprint generation using modern API"""
    try:
        generator = GetMorganGenerator(radius=radius, fpSize=n_bits)
        fp = generator.GetFingerprint(mol)
        arr = np.zeros((n_bits,), dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr
    except:
        return np.zeros(n_bits, dtype=np.uint8)

def smiles_stats_optimized(smiles):
    """Optimized SMILES statistics calculation"""
    s = smiles
    counts = {
        'len': len(s),
        'paren_open': s.count('('),
        'paren_close': s.count(')'),
        'aromatic_c': s.count('c'),
        'aliphatic_C': s.count('C'),
        'num_brackets': s.count('['),
        'num_equals': s.count('='),
        'num_hash': s.count('#'),
    }
    for tok in ['O','N','F','Cl','Br','S','P']:
        counts[f'atom_{tok}'] = s.count(tok)
    for d in '123456':
        counts[f'ring_{d}']= s.count(d)
    counts['aromatic_ratio'] = counts['aromatic_c'] / max(1, counts['aliphatic_C']+counts['aromatic_c'])
    return counts

def featurize_optimized(df, smiles_col='SMILES', n_bits=1024):
    """Optimized featurization with reduced feature count and modern RDKit API"""
    rows = []
    desc_cols = [f.__name__ for f in RD_DESC_FUNCS]
    maccs_cols = [f"MACCS_{i}" for i in range(167)]
    ecfp_cols = [f"ECFP_{i}" for i in range(n_bits)]
    stat_cols = None

    maccs_generator = GetMorganGenerator(radius=1, fpSize=167)
    
    for i, s in enumerate(df[smiles_col].values):
        if i % 1000 == 0:
            print(f"  Processing {i}/{len(df)} molecules...")
            
        mol = smiles_to_mol(s)
        if mol is None:
            rows.append([np.nan] * (len(desc_cols) + 167 + n_bits + 15))
            continue
            
        try:
            # RDKit descriptors
            desc = [f(mol) for f in RD_DESC_FUNCS]
            
            # MACCS-like features using Morgan
            maccs_fp = maccs_generator.GetFingerprint(mol)
            maccs_arr = np.zeros((167,), dtype=np.uint8)
            DataStructs.ConvertToNumpyArray(maccs_fp, maccs_arr)
            
            # ECFP
            ecfp = morgan_bits_optimized(mol, n_bits=n_bits, radius=2)
            
            # SMILES stats
            st = smiles_stats_optimized(s)
            if stat_cols is None:
                stat_cols = list(st.keys())
            
            rows.append(np.concatenate([desc, maccs_arr, ecfp, np.array([st[k] for k in stat_cols])], axis=0))
            
        except Exception as e:
            rows.append([np.nan] * (len(desc_cols) + 167 + n_bits + 15))
    
    X = np.vstack(rows)
    columns = desc_cols + maccs_cols + ecfp_cols + stat_cols
    return pd.DataFrame(X, columns=columns)

print("✅ Optimized featurization functions loaded!")


In [None]:
# Isotonic Calibration Functions
class IsotonicCalibrator:
    """
    Isotonic regression calibrator for regression tasks
    Corrects systematic biases in model predictions
    """
    def __init__(self, target_name):
        self.target_name = target_name
        self.calibrator = IsotonicRegression(out_of_bounds='clip')
        self.is_fitted = False
        
    def fit(self, y_true, y_pred):
        """
        Fit isotonic regression to calibrate predictions
        
        Args:
            y_true: True target values
            y_pred: Model predictions
        """
        print(f"🔧 Fitting isotonic calibrator for {self.target_name}...")
        
        # Remove any NaN values
        valid_mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[valid_mask]
        y_pred_clean = y_pred[valid_mask]
        
        if len(y_true_clean) == 0:
            print(f"⚠️ No valid data for {self.target_name} calibration")
            return
        
        # Fit isotonic regression
        self.calibrator.fit(y_pred_clean, y_true_clean)
        self.is_fitted = True
        
        # Calculate calibration improvement
        original_mae = mean_absolute_error(y_true_clean, y_pred_clean)
        calibrated_pred = self.calibrator.predict(y_pred_clean)
        calibrated_mae = mean_absolute_error(y_true_clean, calibrated_pred)
        
        improvement = original_mae - calibrated_mae
        print(f"   Original MAE: {original_mae:.4f}")
        print(f"   Calibrated MAE: {calibrated_mae:.4f}")
        print(f"   Improvement: {improvement:.4f} ({improvement/original_mae*100:.2f}%)")
        
    def predict(self, y_pred):
        """
        Apply calibration to predictions
        
        Args:
            y_pred: Model predictions to calibrate
            
        Returns:
            Calibrated predictions
        """
        if not self.is_fitted:
            print(f"⚠️ Calibrator for {self.target_name} not fitted, returning original predictions")
            return y_pred
        
        return self.calibrator.predict(y_pred)
    
    def get_calibration_info(self):
        """
        Get information about the calibration
        """
        if not self.is_fitted:
            return f"{self.target_name}: Not fitted"
        
        return f"{self.target_name}: Fitted with {len(self.calibrator.X_thresholds_)} knots"

def create_calibration_ensemble(predictions_dict, calibrators_dict):
    """
    Create calibrated ensemble predictions
    
    Args:
        predictions_dict: Dict of {target: predictions}
        calibrators_dict: Dict of {target: IsotonicCalibrator}
        
    Returns:
        Dict of {target: calibrated_predictions}
    """
    calibrated_predictions = {}
    
    for target, predictions in predictions_dict.items():
        if target in calibrators_dict:
            calibrated_pred = calibrators_dict[target].predict(predictions)
            calibrated_predictions[target] = calibrated_pred
            print(f"✅ {target}: Applied calibration")
        else:
            calibrated_predictions[target] = predictions
            print(f"⚠️ {target}: No calibrator found, using original predictions")
    
    return calibrated_predictions

print("✅ Isotonic calibration functions loaded!")


In [None]:
# MAE-Optimized Training with Calibration
def train_mae_calibrated_cv(train_df, test_df, target, n_folds=3, n_clusters=15):
    """
    Training with MAE-optimized objectives + isotonic calibration
    """
    print(f"\n🎯 MAE-OPTIMIZED + CALIBRATED Training for {target}...")
    
    # Prepare data
    train_subset = train_df[train_df[target].notna()].copy()
    train_smiles = train_subset['SMILES'].tolist()
    test_smiles = test_df['SMILES'].tolist()
    
    print(f"Training samples: {len(train_subset)}")
    print(f"Test samples: {len(test_df)}")
    
    # Check target distribution
    y = train_subset[target].values
    print(f"Target range: {y.min():.2f} to {y.max():.2f}")
    print(f"Target mean: {y.mean():.2f}, std: {y.std():.2f}")
    
    # Featurize data
    print("🔧 Featurizing data...")
    try:
        train_features = featurize_optimized(train_subset, smiles_col='SMILES', n_bits=512)
        test_features = featurize_optimized(test_df, smiles_col='SMILES', n_bits=512)
    except Exception as e:
        print(f"❌ Featurization failed: {e}")
        return None, None, None, float('inf'), 0.5
    
    # Handle missing values and infinite values
    train_features = train_features.fillna(0)
    test_features = test_features.fillna(0)
    train_features = train_features.replace([np.inf, -np.inf], 0)
    test_features = test_features.replace([np.inf, -np.inf], 0)
    
    print(f"Feature matrix shape: {train_features.shape}")
    
    # Remove constant features
    constant_features = train_features.columns[train_features.nunique() <= 1]
    if len(constant_features) > 0:
        print(f"Removing {len(constant_features)} constant features")
        train_features = train_features.drop(columns=constant_features)
        test_features = test_features.drop(columns=constant_features)
    
    # Adversarial validation
    try:
        print("🔍 Adversarial validation...")
        train_labels = np.zeros(len(train_features))
        test_labels = np.ones(len(test_features))
        
        X_adv = np.vstack([train_features.values, test_features.values])
        y_adv = np.hstack([train_labels, test_labels])
        
        adv_model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42, n_jobs=-1)
        adv_model.fit(X_adv, y_adv)
        
        train_probs = adv_model.predict_proba(train_features.values)[:, 1]
        adv_auc = roc_auc_score(y_adv, adv_model.predict_proba(X_adv)[:, 1])
        
        # Create bounded sample weights
        sample_weights = 1.0 / (train_probs + 0.1)
        sample_weights = np.clip(sample_weights, 0.5, 2.0)
        sample_weights = sample_weights / np.mean(sample_weights)
        
        print(f"Adversarial AUC: {adv_auc:.4f}")
        print(f"Sample weights range: {sample_weights.min():.3f} to {sample_weights.max():.3f}")
        
    except Exception as e:
        print(f"⚠️ Adversarial validation failed: {e}")
        sample_weights = np.ones(len(train_subset))
        adv_auc = 0.5
    
    # Create ECFP clusters for GroupKFold
    try:
        print("🔗 Creating ECFP clusters...")
        ecfp_features = []
        valid_indices = []
        
        for i, smiles in enumerate(train_smiles):
            if i % 500 == 0:
                print(f"  Processing {i}/{len(train_smiles)} molecules for clustering...")
                
            mol = smiles_to_mol(smiles)
            if mol is not None:
                ecfp = morgan_bits_optimized(mol, n_bits=256, radius=2)
                ecfp_features.append(ecfp)
                valid_indices.append(i)
        
        if len(ecfp_features) > 0:
            ecfp_features = np.array(ecfp_features)
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=5)
            cluster_labels = kmeans.fit_predict(ecfp_features)
            
            clusters = np.zeros(len(train_smiles), dtype=int)
            clusters[valid_indices] = cluster_labels
            print(f"Cluster distribution: {np.bincount(clusters)}")
        else:
            clusters = np.zeros(len(train_smiles), dtype=int)
            
    except Exception as e:
        print(f"⚠️ Clustering failed: {e}")
        clusters = np.zeros(len(train_smiles), dtype=int)
    
    # GroupKFold
    group_kfold = GroupKFold(n_splits=n_folds)
    
    # Store predictions
    oof_predictions = np.zeros(len(train_subset))
    test_predictions = np.zeros(len(test_df))
    
    # MAE-optimized model parameters
    if target == "Tg":
        xgb_params = {
            'objective': 'reg:absoluteerror',
            'eval_metric': 'mae',
            'n_estimators': 1000,
            'learning_rate': 0.05,
            'max_depth': 4,
            'reg_lambda': 1.0,
            'reg_alpha': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 4,
            'n_jobs': -1
        }
        lgb_params = {
            'objective': 'mae',
            'metric': 'mae',
            'n_estimators': 800,
            'learning_rate': 0.05,
            'max_depth': 4,
            'num_leaves': 31,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'random_state': 42,
            'n_jobs': -1,
            'verbosity': -1
        }
        cat_params = {
            'loss_function': 'MAE',
            'eval_metric': 'MAE',
            'iterations': 800,
            'learning_rate': 0.05,
            'depth': 4,
            'l2_leaf_reg': 1.0,
            'random_strength': 0.1,
            'random_seed': 42,
            'verbose': False,
            'task_type': 'CPU'
        }
    else:
        # Conservative parameters for other targets
        xgb_params = {
            'objective': 'reg:absoluteerror',
            'eval_metric': 'mae',
            'n_estimators': 600,
            'learning_rate': 0.05,
            'max_depth': 3,
            'reg_lambda': 1.0,
            'random_state': 4,
            'n_jobs': -1
        }
        lgb_params = {
            'objective': 'mae',
            'metric': 'mae',
            'n_estimators': 500,
            'learning_rate': 0.05,
            'max_depth': 3,
            'random_state': 42,
            'n_jobs': -1,
            'verbosity': -1
        }
        cat_params = {
            'loss_function': 'MAE',
            'eval_metric': 'MAE',
            'iterations': 500,
            'learning_rate': 0.05,
            'depth': 3,
            'random_seed': 42,
            'verbose': False,
            'task_type': 'CPU'
        }
    
    # Cross-validation with MAE-optimized ensemble
    fold_scores = []
    
    try:
        for fold, (train_idx, val_idx) in enumerate(group_kfold.split(train_features, y, groups=clusters)):
            print(f"  Fold {fold + 1}/{n_folds}")
            
            # Split data
            X_train_fold = train_features.iloc[train_idx]
            X_val_fold = train_features.iloc[val_idx]
            y_train_fold = y[train_idx]
            y_val_fold = y[val_idx]
            weights_fold = sample_weights[train_idx]
            
            # Train XGBoost with MAE objective
            xgb_model = XGBRegressor(**xgb_params)
            xgb_model.fit(
                X_train_fold, y_train_fold,
                sample_weight=weights_fold,
                eval_set=[(X_val_fold, y_val_fold)],
                early_stopping_rounds=30,
                verbose=False
            )
            
            # Train LightGBM with MAE objective
            lgb_model = lgb.LGBMRegressor(**lgb_params)
            lgb_model.fit(
                X_train_fold, y_train_fold,
                sample_weight=weights_fold,
                eval_set=[(X_val_fold, y_val_fold)],
                callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=False)]
            )
            
            # Train CatBoost with MAE objective
            cat_model = CatBoostRegressor(**cat_params)
            cat_model.fit(
                X_train_fold, y_train_fold,
                sample_weight=weights_fold,
                eval_set=(X_val_fold, y_val_fold),
                early_stopping_rounds=30,
                verbose=False
            )
            
            # Ensemble predictions
            val_pred_xgb = xgb_model.predict(X_val_fold)
            val_pred_lgb = lgb_model.predict(X_val_fold)
            val_pred_cat = cat_model.predict(X_val_fold)
            val_pred_ensemble = (val_pred_xgb + val_pred_lgb + val_pred_cat) / 3
            
            test_pred_xgb = xgb_model.predict(test_features)
            test_pred_lgb = lgb_model.predict(test_features)
            test_pred_cat = cat_model.predict(test_features)
            test_pred_ensemble = (test_pred_xgb + test_pred_lgb + test_pred_cat) / 3
            
            # Store predictions
            oof_predictions[val_idx] = val_pred_ensemble
            test_predictions += test_pred_ensemble / n_folds
            
            # Calculate fold score
            fold_mae = mean_absolute_error(y_val_fold, val_pred_ensemble)
            fold_scores.append(fold_mae)
            print(f"    Fold {fold + 1} MAE: {fold_mae:.4f}")
    
    except Exception as e:
        print(f"❌ Cross-validation failed: {e}")
        return None, None, None, float('inf'), adv_auc
    
    # Overall performance before calibration
    overall_mae = mean_absolute_error(y, oof_predictions)
    print(f"  Overall CV MAE (before calibration): {overall_mae:.4f}")
    print(f"  CV std: {np.std(fold_scores):.4f}")
    
    # Create and fit isotonic calibrator
    calibrator = IsotonicCalibrator(target)
    calibrator.fit(y, oof_predictions)
    
    # Apply calibration to OOF predictions
    calibrated_oof = calibrator.predict(oof_predictions)
    calibrated_mae = mean_absolute_error(y, calibrated_oof)
    
    print(f"  Overall CV MAE (after calibration): {calibrated_mae:.4f}")
    print(f"  Calibration improvement: {overall_mae - calibrated_mae:.4f}")
    
    return oof_predictions, test_predictions, calibrator, calibrated_mae, adv_auc

print("✅ MAE-optimized training with calibration loaded!")


In [None]:
# Complete Training Pipeline with Calibration
def train_all_targets_with_calibration(train_df, test_df, targets=['Tg', 'FFV', 'Tc', 'Density', 'Rg']):
    """
    Train all targets with MAE optimization and isotonic calibration
    """
    print("🚀 COMPLETE TRAINING PIPELINE WITH CALIBRATION")
    print("=" * 60)
    
    results = {}
    calibrators = {}
    test_predictions = {}
    
    for target in targets:
        print(f"\n{'='*20} {target} {'='*20}")
        
        try:
            oof_preds, test_preds, calibrator, mae, adv_auc = train_mae_calibrated_cv(
                train_df, test_df, target, n_folds=3, n_clusters=15
            )
            
            if oof_preds is not None:
                results[target] = {
                    'mae': mae,
                    'adv_auc': adv_auc,
                    'oof_predictions': oof_preds
                }
                calibrators[target] = calibrator
                test_predictions[target] = test_preds
                
                print(f"✅ {target} completed - MAE: {mae:.4f}, Adversarial AUC: {adv_auc:.4f}")
            else:
                print(f"❌ {target} failed")
                
        except Exception as e:
            print(f"❌ Error training {target}: {e}")
    
    # Apply calibration to test predictions
    print("\n🔧 Applying calibration to test predictions...")
    calibrated_test_predictions = {}
    
    for target, predictions in test_predictions.items():
        if target in calibrators:
            calibrated_pred = calibrators[target].predict(predictions)
            calibrated_test_predictions[target] = calibrated_pred
            print(f"✅ {target}: Applied calibration")
        else:
            calibrated_test_predictions[target] = predictions
            print(f"⚠️ {target}: No calibrator available")
    
    # Create submission dataframe
    submission_df = pd.DataFrame({'id': test_df['id']})
    for target in targets:
        if target in calibrated_test_predictions:
            submission_df[target] = calibrated_test_predictions[target]
        else:
            submission_df[target] = 0.0
    
    # Summary
    print("\n" + "="*60)
    print("📊 CALIBRATION RESULTS SUMMARY")
    print("="*60)
    
    for target in targets:
        if target in results:
            mae = results[target]['mae']
            adv_auc = results[target]['adv_auc']
            print(f"{target:8s}: MAE = {mae:.4f}, Adversarial AUC = {adv_auc:.4f}")
        else:
            print(f"{target:8s}: Failed")
    
    avg_mae = np.mean([results[t]['mae'] for t in targets if t in results])
    print(f"\n📈 Average MAE: {avg_mae:.4f}")
    
    # Calibration info
    print("\n🔧 Calibration Information:")
    for target, calibrator in calibrators.items():
        print(f"  {calibrator.get_calibration_info()}")
    
    return results, calibrators, submission_df

print("✅ Complete training pipeline with calibration loaded!")


In [None]:
# Test the Complete Pipeline
print("🧪 TESTING COMPLETE PIPELINE WITH CALIBRATION")
print("=" * 60)

# Create sample data for testing
print("📊 Creating sample data...")

sample_train = pd.DataFrame({
    'SMILES': [
        'CCO', 'CC(C)O', 'CC(C)(C)O', 'C1=CC=CC=C1', 'C1=CC=CC=C1O',
        'CCCC', 'CC(C)CC', 'C1=CC=CC=C1C', 'CC(C)(C)CC', 'C1=CC=CC=C1CC'
    ],
    'Tg': [100.0, 120.0, 140.0, 80.0, 90.0, 110.0, 130.0, 85.0, 150.0, 95.0],
    'FFV': [0.1, 0.15, 0.2, 0.05, 0.08, 0.12, 0.18, 0.06, 0.22, 0.09],
    'Tc': [200.0, 220.0, 240.0, 180.0, 190.0, 210.0, 230.0, 185.0, 250.0, 195.0],
    'Density': [1.0, 1.1, 1.2, 0.9, 0.95, 1.05, 1.15, 0.92, 1.25, 0.98],
    'Rg': [5.0, 6.0, 7.0, 4.0, 4.5, 5.5, 6.5, 4.2, 7.5, 4.8]
})

sample_test = pd.DataFrame({
    'SMILES': ['CCCCC', 'CC(C)CCC', 'C1=CC=CC=C1CC'],
    'id': [1, 2, 3]
})

print(f"Train shape: {sample_train.shape}")
print(f"Test shape: {sample_test.shape}")

# Test the complete pipeline
try:
    results, calibrators, submission_df = train_all_targets_with_calibration(
        sample_train, sample_test, targets=['Tg', 'FFV']  # Test with 2 targets first
    )
    
    print("\n🎉 Pipeline test completed successfully!")
    print(f"\n📋 Submission shape: {submission_df.shape}")
    print("\n📊 Submission preview:")
    print(submission_df.head())
    
except Exception as e:
    print(f"❌ Pipeline test failed: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*60)
print("📋 Next steps:")
print("1. Load your actual train/test data")
print("2. Run train_all_targets_with_calibration() with all 5 targets")
print("3. Compare MAE scores before/after calibration")
print("4. Expected improvement: +0.010-0.025 wMAE reduction")
print("5. Save submission_df.to_csv('calibrated_submission.csv', index=False)")
