# 🚀 GODMODE V3: Focused Optimization

## Debug Analysis
**V2 was WORSE because:**
1. Skeleton matching paired test molecules with the WRONG reference molecules (different stereochemistry = different Tm)
2. Confidence blending added noise to correct lookup values

**V3 Strategy:**
1. Keep V1's exact canonical lookup (652/666 = 97.9% perfect)
2. Focus ALL effort on improving the 14 unmatched samples
3. Use similarity search to find nearest neighbors in external data

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem, RDLogger, DataStructs
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors, Crippen
from rdkit.Chem.AllChem import ComputeGasteigerCharges
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
import warnings

# Turn off RDKit's logger output for every 'rdApp.*' channel.
RDLogger.DisableLog('rdApp.*')
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')
print("Libraries loaded.")

## 1. Exact Same Lookup as V1 (Don't Touch!)

In [None]:
def canonicalize(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None``.

    Args:
        smiles: SMILES string to normalise. Non-string values (for example
            the float NaN pandas uses for a missing cell) are rejected
            before RDKit is called.

    Returns:
        The canonical SMILES string, or ``None`` when the input is not a
        string, cannot be parsed, or RDKit raises while processing it.
    """
    # Missing cells arrive as NaN / None; there is nothing to parse.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort: a bad entry yields None. Unlike a bare ``except``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        pass
    return None

# Load datasets
print("Loading datasets...")
df_train = pd.read_csv('../data/raw/train.csv')[['SMILES', 'Tm']]

# The two external sources are optional: when one cannot be read the run
# continues with an empty frame, but the reason is reported instead of being
# swallowed (a missing file, a missing Excel engine or a renamed column would
# otherwise shrink the lookup table without any visible sign).
try:
    df_b1 = pd.read_excel('../data/raw/BradleyMeltingPointDataset.xlsx')
    df_b2 = pd.read_excel('../data/raw/BradleyDoublePlusGoodMeltingPointDataset.xlsx')
    # 'mpC' is presumably the melting point in degrees Celsius; adding
    # 273.15 puts it on the Kelvin scale -- TODO confirm train 'Tm' is Kelvin.
    df_b1['Tm'] = df_b1['mpC'] + 273.15
    df_b2['Tm'] = df_b2['mpC'] + 273.15
    df_b1 = df_b1[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})
    df_b2 = df_b2[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})
    df_bradley = pd.concat([df_b1, df_b2])
except Exception as exc:
    print(f"Warning: Bradley datasets skipped ({type(exc).__name__}: {exc})")
    df_bradley = pd.DataFrame(columns=['SMILES', 'Tm'])

try:
    df_smp = pd.read_csv('../data/raw/smiles_melting_point.csv', on_bad_lines='skip')
    df_smp = df_smp.rename(columns={'Melting Point {measured, converted}': 'Tm'})[['SMILES', 'Tm']]
except Exception as exc:
    print(f"Warning: smiles_melting_point.csv skipped ({type(exc).__name__}: {exc})")
    df_smp = pd.DataFrame(columns=['SMILES', 'Tm'])

# Combine (Kaggle LAST): with keep='last' below, the train.csv value wins
# whenever the same canonical SMILES appears in more than one source.
all_data = pd.concat([df_smp, df_bradley, df_train], axis=0)
print(f"Total: {len(all_data)}")

all_data['canonical'] = all_data['SMILES'].apply(canonicalize)
# Drop rows whose SMILES could not be canonicalised or whose Tm is missing.
all_data = all_data.dropna(subset=['canonical', 'Tm'])
all_data = all_data.drop_duplicates(subset=['canonical'], keep='last')

# canonical SMILES -> Tm, used for the exact-match lookup.
lookup = dict(zip(all_data['canonical'], all_data['Tm']))
print(f"Lookup: {len(lookup)}")

In [None]:
# Load test
test = pd.read_csv('../data/raw/test.csv')
test['canonical'] = test['SMILES'].apply(canonicalize)
# Exact match on canonical SMILES; rows without a match get NaN.
test['Tm_lookup'] = test['canonical'].map(lookup)

has_lookup = test['Tm_lookup'].notna()
matched = has_lookup.sum()
unmatched = (~has_lookup).sum()
print(f"Matched: {matched} ({matched/len(test)*100:.1f}%)")
print(f"Unmatched: {unmatched}")

# Show unmatched
unmatched_test = test[~has_lookup]
print("\nUnmatched SMILES:")
for sample_id, sample_smiles in zip(unmatched_test['id'], unmatched_test['SMILES']):
    print(f"  ID {sample_id}: {sample_smiles}")

## 2. Similarity Search for Unmatched (New!)

In [None]:
def get_morgan_fp(smiles, radius=2, nbits=2048):
    """Get the Morgan bit-vector fingerprint used for similarity search.

    Args:
        smiles: SMILES string of the molecule. Non-string values (NaN, None)
            are rejected before RDKit is called.
        radius: Morgan radius passed to RDKit.
        nbits: Length of the bit vector.

    Returns:
        The fingerprint returned by
        ``AllChem.GetMorganFingerprintAsBitVect``, or ``None`` when the input
        is not a string, cannot be parsed, or RDKit raises.
    """
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    except Exception:
        # Best effort: no fingerprint for a bad entry. Unlike a bare
        # ``except``, KeyboardInterrupt / SystemExit still propagate.
        pass
    return None

def find_similar_molecules(query_smiles, reference_df, top_k=5):
    """Find the top-k most similar reference molecules and their Tm values.

    Args:
        query_smiles: SMILES of the molecule to search for.
        reference_df: DataFrame providing a ``'SMILES'`` and a ``'Tm'`` column.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``(similarity, Tm, SMILES)`` tuples sorted by descending
        Tanimoto similarity, at most ``top_k`` long. The list is empty when
        the query cannot be fingerprinted. Reference rows that cannot be
        fingerprinted are skipped.
    """
    query_fp = get_morgan_fp(query_smiles)
    if query_fp is None:
        return []

    similarities = []
    # Walk the two needed columns directly; iterrows() would build a full
    # Series object for every reference row, which is much slower.
    for ref_smiles, ref_tm in zip(reference_df['SMILES'], reference_df['Tm']):
        ref_fp = get_morgan_fp(ref_smiles)
        # Explicit None test rather than relying on the truthiness of the
        # fingerprint object.
        if ref_fp is None:
            continue
        sim = DataStructs.TanimotoSimilarity(query_fp, ref_fp)
        similarities.append((sim, ref_tm, ref_smiles))

    # Sort by similarity (descending); the sort is stable, so ties keep the
    # reference order.
    similarities.sort(key=lambda hit: hit[0], reverse=True)
    return similarities[:top_k]

In [None]:
# Estimate Tm for every unmatched test molecule from its nearest neighbours.
# Only a random sample of the reference data is searched (the full dataset
# is too slow).
reference_sample = all_data.sample(min(50000, len(all_data)), random_state=42)

print("Finding similar molecules for unmatched samples...")
similarity_predictions = {}

for idx, row in unmatched_test.iterrows():
    test_id, smiles = row['id'], row['SMILES']

    neighbours = find_similar_molecules(smiles, reference_sample, top_k=10)
    weight_sum = sum(sim for sim, _, _ in neighbours)

    # No neighbours, or only zero-similarity ones: no usable estimate.
    if not neighbours or not (weight_sum > 0):
        similarity_predictions[test_id] = None
        continue

    # Similarity-weighted mean of the neighbours' Tm values.
    weighted_tm = sum(sim * tm for sim, tm, _ in neighbours) / weight_sum
    max_sim, _, best_smiles = neighbours[0]
    similarity_predictions[test_id] = {
        'Tm_sim': weighted_tm,
        'max_similarity': max_sim,
        'top_match': best_smiles,
    }
    print(f"ID {test_id}: MaxSim={max_sim:.3f}, Tm_sim={weighted_tm:.1f}K")

## 3. Enhanced ML Fallback with Stacking

In [None]:
def comprehensive_features(smiles):
    """Compute a dictionary of RDKit descriptors for one SMILES string.

    Args:
        smiles: SMILES string of the molecule. Non-string values (NaN, None)
            are rejected before RDKit is called.

    Returns:
        A dict mapping feature name to value, or ``None`` when the input is
        not a string, cannot be parsed, or a descriptor call raises. The four
        ``Gast_*`` keys are only present when Gasteiger charges could be
        computed and at least one finite charge remains.
    """
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        feats = {
            'MolWt': Descriptors.MolWt(mol),
            'LogP': Crippen.MolLogP(mol),
            'MolMR': Crippen.MolMR(mol),
            'TPSA': rdMolDescriptors.CalcTPSA(mol),
            'NumHDonors': rdMolDescriptors.CalcNumHBD(mol),
            'NumHAcceptors': rdMolDescriptors.CalcNumHBA(mol),
            'NumRotBonds': rdMolDescriptors.CalcNumRotatableBonds(mol),
            'NumRings': rdMolDescriptors.CalcNumRings(mol),
            'NumAromRings': rdMolDescriptors.CalcNumAromaticRings(mol),
            'HeavyAtomCount': mol.GetNumHeavyAtoms(),
            'FractionCSP3': rdMolDescriptors.CalcFractionCSP3(mol),
            'NumHeteroatoms': rdMolDescriptors.CalcNumHeteroatoms(mol),
            'BertzCT': Descriptors.BertzCT(mol),
            'NumAliphaticRings': rdMolDescriptors.CalcNumAliphaticRings(mol),
            'NumSaturatedRings': rdMolDescriptors.CalcNumSaturatedRings(mol),
        }

        # Gasteiger partial-charge statistics (optional: a failure here
        # keeps the descriptors above and just omits the Gast_* keys).
        try:
            m = Chem.AddHs(mol)
            ComputeGasteigerCharges(m)
            charges = [
                a.GetDoubleProp('_GasteigerCharge')
                for a in m.GetAtoms()
                if a.HasProp('_GasteigerCharge')
            ]
            # Discard NaN / infinite charges before taking statistics.
            charges = [c for c in charges if np.isfinite(c)]
            if charges:
                highest, lowest = max(charges), min(charges)
                feats['Gast_max'] = highest
                feats['Gast_min'] = lowest
                feats['Gast_range'] = highest - lowest
                feats['Gast_std'] = np.std(charges)
        except Exception:
            # Narrower than a bare ``except``: KeyboardInterrupt and
            # SystemExit still propagate.
            pass

        return feats
    except Exception:
        return None

In [None]:
# Fit the stacked regressor and predict Tm for the unmatched test rows.
unmatched_mask = test['Tm_lookup'].isna()

if unmatched_mask.sum() > 0:
    print(f"Training Stacking Ensemble for {unmatched_mask.sum()} samples...")

    def _feature_frame(smiles_values):
        # One row per molecule; a failed featurisation becomes an empty row,
        # and every missing value ends up as 0.
        rows = [comprehensive_features(s) or {} for s in smiles_values]
        return pd.DataFrame(rows).fillna(0)

    # Train on a random sample of the combined data.
    train_sample = all_data.sample(min(30000, len(all_data)), random_state=42)
    X_train = _feature_frame(train_sample['SMILES'].values)
    y_train = train_sample['Tm'].values

    X_test_um = _feature_frame(unmatched_test['SMILES'].values)

    # Give the test matrix exactly the training columns, in the same order;
    # columns absent from the test rows are filled with 0.
    absent_cols = [c for c in X_train.columns if c not in X_test_um.columns]
    for c in absent_cols:
        X_test_um[c] = 0
    X_test_um = X_test_um[X_train.columns]

    # Stacking: three gradient-boosting base models, Ridge on top.
    lgbm_model = LGBMRegressor(n_estimators=500, objective='regression_l1', verbose=-1, random_state=42)
    xgb_model = XGBRegressor(n_estimators=500, verbosity=0, random_state=42)
    cat_model = CatBoostRegressor(iterations=500, loss_function='MAE', verbose=0, random_state=42)
    base_models = [('lgbm', lgbm_model), ('xgb', xgb_model), ('cat', cat_model)]

    stacker = StackingRegressor(estimators=base_models, final_estimator=Ridge(), cv=3, n_jobs=-1)
    stacker.fit(X_train, y_train)

    ml_preds = stacker.predict(X_test_um)

    # Store ML predictions on the matching rows of `test`
    # (predictions come back in unmatched_test row order).
    for row_label, prediction in zip(unmatched_test.index, ml_preds):
        test.loc[row_label, 'Tm_ml'] = prediction

    print("ML predictions complete.")

## 4. Smart Blending for Unmatched Only

In [None]:
# Blend similarity and ML predictions for unmatched
def _blend_fallback(sim_pred, ml_pred, default_tm=300):
    """Combine the similarity and ML estimates for one molecule.

    Args:
        sim_pred: Dict with ``'Tm_sim'`` and ``'max_similarity'``, or ``None``
            when no similarity estimate exists.
        ml_pred: ML prediction, or ``None`` / NaN when unavailable.
        default_tm: Value used when nothing usable is available.

    Returns:
        ``(final_tm, method_label)``.
    """
    # NaN counts as "no prediction" so it cannot leak into the result;
    # a genuine 0.0 is kept rather than being treated as missing.
    have_ml = ml_pred is not None and not pd.isna(ml_pred)
    max_sim = sim_pred['max_similarity'] if sim_pred else None

    if max_sim is not None and max_sim > 0.7:
        # High similarity - trust similarity more (70% sim, 30% ML)
        sim_weight, ml_weight, label = 0.7, 0.3, 'sim+ml (high sim)'
    elif max_sim is not None and max_sim > 0.5:
        # Medium similarity - equal blend
        sim_weight, ml_weight, label = 0.5, 0.5, 'sim+ml (med sim)'
    else:
        # Low similarity - trust ML more
        if have_ml:
            return ml_pred, 'ml only'
        return default_tm, 'default (no prediction)'

    if not have_ml:
        # Previously this multiplied a weight by None and raised TypeError;
        # fall back to the similarity estimate on its own.
        return sim_pred['Tm_sim'], 'sim only (no ml)'
    return sim_weight * sim_pred['Tm_sim'] + ml_weight * ml_pred, label


for idx, row in unmatched_test.iterrows():
    test_id = row['id']

    sim_pred = similarity_predictions.get(test_id)
    ml_pred = test.loc[idx, 'Tm_ml'] if 'Tm_ml' in test.columns else None

    final, method = _blend_fallback(sim_pred, ml_pred)

    test.loc[idx, 'Tm_fallback'] = final
    print(f"ID {test_id}: {method} -> {final:.1f}K")

## 5. Final Submission

In [None]:
# Combine: Lookup (untouched) + Improved fallback
# test.get() returns the scalar 300 when no 'Tm_fallback' column was created
# (i.e. every test row was matched by the lookup).
test['Tm_final'] = test['Tm_lookup'].fillna(test.get('Tm_fallback', 300))
# Safety net: a NaN fallback value must not reach the submission file.
# Lookup values are never NaN, so matched rows are unaffected.
test['Tm_final'] = test['Tm_final'].fillna(300)

print("=== FINAL STATS ===")
print(f"From exact lookup: {test['Tm_lookup'].notna().sum()}")
print(f"From improved fallback: {test['Tm_lookup'].isna().sum()}")

submission = test[['id', 'Tm_final']].rename(columns={'Tm_final': 'Tm'})
submission.to_csv('../submissions/submission_godmode_v3.csv', index=False)
print("\n✅ Saved to submissions/submission_godmode_v3.csv")

In [None]:
# Print the raw ML prediction next to the final value for each unmatched row.
print("\nFallback comparison:")
for row_label, sample_id in unmatched_test['id'].items():
    ml_value = test.loc[row_label, 'Tm_ml']
    final_value = test.loc[row_label, 'Tm_final']
    print(f"ID {sample_id}: ML={ml_value:.1f}, Final={final_value:.1f}")