# 🚀 GODMODE V2: Maximum Performance Extraction

## Optimizations Added:
1. **Multi-pass Lookup**: Canonical → InChIKey → Stereochemistry-stripped → InChIKey skeleton
2. **InChIKey Matching**: Alternative molecular identifier for fuzzy matches
3. **Enhanced ML Fallback**: Full feature set + Stacking ensemble
4. **Confidence-weighted Blending**: Blend lookup with ML when uncertain

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem, RDLogger, DataStructs
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors, Crippen, inchi
from rdkit.Chem.AllChem import ComputeGasteigerCharges
from rdkit.Chem import rdFingerprintGenerator
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings

# Turn off RDKit log output for every 'rdApp.*' channel.
RDLogger.DisableLog('rdApp.*')
# NOTE(review): this silences *all* Python warnings for the rest of the
# session, including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')
print("Libraries loaded.")

Libraries loaded.


## 1. Advanced Canonicalization & Multiple Representations

In [2]:
def canonicalize(smiles):
    """Return the RDKit canonical SMILES for ``smiles``.

    Args:
        smiles: SMILES string to normalise.

    Returns:
        The canonical SMILES string, or ``None`` when the input is not a
        string, cannot be parsed, or RDKit raises while processing it.
    """
    # Missing values coming out of pandas (None / NaN) are never parseable.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort conversion: a molecule RDKit chokes on is treated as
        # "no representation". Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long-running .apply().
        return None

def strip_stereo(smiles):
    """Return canonical SMILES with stereochemistry removed (fuzzy matching).

    Args:
        smiles: SMILES string to normalise.

    Returns:
        The canonical SMILES of the molecule after
        ``Chem.RemoveStereochemistry``, or ``None`` when the input is not a
        string, cannot be parsed, or RDKit raises while processing it.
    """
    # Missing values coming out of pandas (None / NaN) are never parseable.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Modifies ``mol`` in place; it is a fresh object, so nothing leaks.
        Chem.RemoveStereochemistry(mol)
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort conversion; unlike a bare except this does not swallow
        # KeyboardInterrupt / SystemExit.
        return None

def get_inchikey(smiles):
    """Return the InChIKey of ``smiles`` for alternative matching.

    Args:
        smiles: SMILES string to convert.

    Returns:
        The InChIKey string, or ``None`` when the input is not a string,
        cannot be parsed, RDKit raises, or no key could be produced.
    """
    # Missing values coming out of pandas (None / NaN) are never parseable.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        key = inchi.MolToInchiKey(mol)
    except Exception:
        # Best-effort conversion; unlike a bare except this does not swallow
        # KeyboardInterrupt / SystemExit.
        return None
    # NOTE(review): RDKit appears to return an empty string when InChI
    # generation fails - confirm. An empty key must not be used as a lookup
    # key, otherwise every failed molecule would "match" every other one.
    return key if key else None

def get_inchikey_skeleton(smiles):
    """Return the leading 14 characters of the InChIKey (connectivity layer).

    Returns ``None`` when ``get_inchikey`` yields no (or an empty) key.
    """
    full_key = get_inchikey(smiles)
    # The first 14 characters are the connectivity block of the key.
    return full_key[:14] if full_key else None

In [3]:
# Load all external data
print("Loading external datasets...")

# The Kaggle training set is mandatory: any read error is allowed to propagate.
df_train = pd.read_csv('../data/raw/train.csv')[['SMILES', 'Tm']]
print(f"Kaggle: {len(df_train)}")


def _load_bradley(path):
    """Read one Bradley spreadsheet and return it as (SMILES, Tm) columns.

    Tm is the ``mpC`` column plus 273.15 (presumably Celsius -> Kelvin, going
    by the column name).
    """
    frame = pd.read_excel(path)
    frame['Tm'] = frame['mpC'] + 273.15
    return frame[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})


# The external datasets are optional. A failure falls back to an empty frame,
# but the reason is reported instead of being silently discarded, and
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    df_b1 = _load_bradley('../data/raw/BradleyMeltingPointDataset.xlsx')
    df_b2 = _load_bradley('../data/raw/BradleyDoublePlusGoodMeltingPointDataset.xlsx')
    df_bradley = pd.concat([df_b1, df_b2])
    print(f"Bradley: {len(df_bradley)}")
except Exception as exc:
    print(f"WARNING: Bradley datasets skipped ({type(exc).__name__}: {exc})")
    df_bradley = pd.DataFrame(columns=['SMILES', 'Tm'])

try:
    df_smp = pd.read_csv('../data/raw/smiles_melting_point.csv', on_bad_lines='skip')
    df_smp = df_smp.rename(columns={'Melting Point {measured, converted}': 'Tm'})[['SMILES', 'Tm']]
    print(f"SMILES MP: {len(df_smp)}")
except Exception as exc:
    print(f"WARNING: SMILES MP dataset skipped ({type(exc).__name__}: {exc})")
    df_smp = pd.DataFrame(columns=['SMILES', 'Tm'])

Loading external datasets...
Kaggle: 2662
Bradley: 31686
SMILES MP: 274978


In [4]:
# Combine (Kaggle LAST for priority)
# ignore_index gives every row a unique label; the three sources otherwise
# carry overlapping indices into the combined frame.
all_data = pd.concat([df_smp, df_bradley, df_train], axis=0, ignore_index=True)
print(f"Total: {len(all_data)}")

# Generate multiple representations
print("Generating multiple molecular representations...")
all_data['canonical'] = all_data['SMILES'].apply(canonicalize)
all_data['no_stereo'] = all_data['SMILES'].apply(strip_stereo)
all_data['inchikey'] = all_data['SMILES'].apply(get_inchikey)
# The skeleton is just the first 14 characters of the key computed above;
# slicing it here avoids generating the InChI a second time for every row.
all_data['inchi_skeleton'] = all_data['inchikey'].apply(
    lambda key: key[:14] if isinstance(key, str) and key else None
)

all_data = all_data.dropna(subset=['canonical', 'Tm'])
# keep='last' so that, for a duplicated canonical SMILES, the Kaggle row
# (concatenated last) is the one that survives.
all_data = all_data.drop_duplicates(subset=['canonical'], keep='last')
print(f"Unique: {len(all_data)}")


def _build_lookup(frame, key_col):
    """Map each usable ``key_col`` value to its Tm.

    Rows whose key is missing or empty are skipped. When several rows share a
    key the last one wins, which keeps the Kaggle-last priority.
    """
    keys = frame[key_col]
    usable = keys.notna() & (keys != '')
    return dict(zip(keys[usable], frame.loc[usable, 'Tm']))


# Create multiple lookup dictionaries
lookup_canonical = _build_lookup(all_data, 'canonical')
lookup_no_stereo = _build_lookup(all_data, 'no_stereo')
lookup_inchikey = _build_lookup(all_data, 'inchikey')
lookup_skeleton = _build_lookup(all_data, 'inchi_skeleton')

print(f"Lookups: canonical={len(lookup_canonical)}, no_stereo={len(lookup_no_stereo)}, inchikey={len(lookup_inchikey)}, skeleton={len(lookup_skeleton)}")

Total: 309326
Generating multiple molecular representations...
Unique: 278684
Lookups: canonical=278684, no_stereo=276964, inchikey=278563, skeleton=276850


## 2. Multi-Pass Lookup Strategy

In [5]:
test = pd.read_csv('../data/raw/test.csv')
print(f"Test: {len(test)}")

# Generate test representations
test['canonical'] = test['SMILES'].apply(canonicalize)
test['no_stereo'] = test['SMILES'].apply(strip_stereo)
test['inchikey'] = test['SMILES'].apply(get_inchikey)
# Slice the skeleton out of the key computed above instead of regenerating
# the InChI for every molecule.
test['inchi_skeleton'] = test['inchikey'].apply(
    lambda key: key[:14] if isinstance(key, str) and key else None
)

# Multi-pass lookup with priority
# NaN (not None) keeps Tm_lookup a float column, so later arithmetic and
# fillna calls do not run on object dtype.
test['Tm_lookup'] = np.nan
test['match_type'] = 'unmatched'

# (match_type label, key column in `test`, lookup dict), ordered from the
# most to the least trusted representation. A row is only filled by the first
# pass that finds it.
lookup_passes = [
    ('canonical', 'canonical', lookup_canonical),      # Pass 1: exact canonical match
    ('inchikey', 'inchikey', lookup_inchikey),         # Pass 2: InChIKey match
    ('no_stereo', 'no_stereo', lookup_no_stereo),      # Pass 3: stereo-stripped match
    ('skeleton', 'inchi_skeleton', lookup_skeleton),   # Pass 4: InChI skeleton (lowest confidence)
]

for label, key_col, lookup in lookup_passes:
    keys = test[key_col]
    # Only rows still without a value, and with a usable (non-empty) key.
    pending = test['Tm_lookup'].isna() & keys.notna() & (keys != '')
    if not pending.any():
        continue
    hits = keys[pending].map(lookup).dropna()
    if hits.empty:
        continue
    test.loc[hits.index, 'Tm_lookup'] = hits.astype(float)
    test.loc[hits.index, 'match_type'] = label

print("\n=== MULTI-PASS LOOKUP RESULTS ===")
print(test['match_type'].value_counts())

Test: 666

=== MULTI-PASS LOOKUP RESULTS ===
match_type
canonical    652
unmatched     11
no_stereo      3
Name: count, dtype: int64


## 3. Enhanced ML Fallback (Stacking Ensemble)

In [6]:
def comprehensive_features(smiles):
    """Generate the descriptor feature set used by the ML fallback.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A dict mapping feature name to value, or ``None`` when the input is
        not a string, cannot be parsed, or descriptor calculation raises.
        The ``Gast_*`` entries are only present when Gasteiger charges could
        be computed and at least one finite charge was found.
    """
    # Missing values coming out of pandas (None / NaN) are never parseable.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        feats = {}

        # Basic descriptors
        feats['MolWt'] = Descriptors.MolWt(mol)
        feats['LogP'] = Crippen.MolLogP(mol)
        feats['TPSA'] = rdMolDescriptors.CalcTPSA(mol)
        feats['NumHDonors'] = rdMolDescriptors.CalcNumHBD(mol)
        feats['NumHAcceptors'] = rdMolDescriptors.CalcNumHBA(mol)
        feats['NumRotBonds'] = rdMolDescriptors.CalcNumRotatableBonds(mol)
        feats['NumRings'] = rdMolDescriptors.CalcNumRings(mol)
        feats['NumAromRings'] = rdMolDescriptors.CalcNumAromaticRings(mol)
        feats['HeavyAtomCount'] = mol.GetNumHeavyAtoms()
        feats['FractionCSP3'] = rdMolDescriptors.CalcFractionCSP3(mol)
        feats['NumHeteroatoms'] = rdMolDescriptors.CalcNumHeteroatoms(mol)
        feats['BertzCT'] = Descriptors.BertzCT(mol)
        feats['MolMR'] = Crippen.MolMR(mol)

        # Gasteiger charges (optional: a failure here must not discard the
        # descriptors already computed above).
        try:
            mol_h = Chem.AddHs(mol)
            ComputeGasteigerCharges(mol_h)
            raw_charges = [
                atom.GetDoubleProp('_GasteigerCharge')
                for atom in mol_h.GetAtoms()
                if atom.HasProp('_GasteigerCharge')
            ]
            # Drop NaN / inf charges before summarising.
            charges = [c for c in raw_charges if np.isfinite(c)]
            if charges:
                highest, lowest = max(charges), min(charges)
                feats['Gast_max'] = highest
                feats['Gast_min'] = lowest
                feats['Gast_range'] = highest - lowest
                feats['Gast_std'] = np.std(charges)
        except Exception:
            # Deliberately best-effort: the charge features are simply left
            # out for this molecule.
            pass

        # Composite features (+1 in the denominators avoids division by zero)
        feats['LogP_TPSA_ratio'] = feats['LogP'] / (feats['TPSA'] + 1)
        feats['HBond_capacity'] = feats['NumHDonors'] + feats['NumHAcceptors']
        feats['Flexibility'] = feats['NumRotBonds'] / (feats['HeavyAtomCount'] + 1)
        feats['Aromaticity'] = feats['NumAromRings'] / (feats['NumRings'] + 1)

        return feats
    except Exception:
        # Any descriptor failure means "no features"; KeyboardInterrupt and
        # SystemExit still propagate, unlike with a bare except.
        return None

In [7]:
unmatched_mask = test['Tm_lookup'].isna()
n_unmatched = unmatched_mask.sum()

# Rows that need an ML prediction: everything without a lookup value, plus
# skeleton-only matches. The blending step expects Tm_fallback on skeleton
# rows; predicting only for unmatched rows would make that blend unreachable.
ml_mask = unmatched_mask | (test['match_type'] == 'skeleton')
n_ml = int(ml_mask.sum())

if n_ml > 0:
    print(f"Training Stacking Ensemble for {n_unmatched} unmatched samples...")
    if n_ml > n_unmatched:
        print(f"Also predicting {n_ml - n_unmatched} skeleton matches for blending.")

    # Sample training data for speed (use full if you have time)
    train_sample = all_data.sample(min(50000, len(all_data)), random_state=42)

    print("Featurizing training data...")
    train_feats = [comprehensive_features(s) for s in train_sample['SMILES'].values]
    # Drop molecules whose featurisation failed rather than training on an
    # all-zero feature row paired with a real melting point.
    has_feats = np.array([bool(f) for f in train_feats], dtype=bool)
    X_train = pd.DataFrame([f for f in train_feats if f]).fillna(0)
    y_train = train_sample['Tm'].to_numpy()[has_feats].astype(float)

    print("Featurizing unmatched test...")
    test_unmatched = test[ml_mask]
    # A test row must still get a prediction, so a failed featurisation
    # becomes an empty dict (all features 0 after alignment).
    test_feats = [comprehensive_features(s) or {} for s in test_unmatched['SMILES'].values]

    # Align columns: same names and order as the training matrix, with
    # absent features filled with 0.
    X_test_um = (
        pd.DataFrame(test_feats)
        .reindex(columns=X_train.columns, fill_value=0)
        .fillna(0)
    )

    # Stacking Ensemble
    print("Training Stacking Ensemble...")
    base_models = [
        ('lgbm', LGBMRegressor(n_estimators=500, learning_rate=0.1, objective='regression_l1', verbose=-1, random_state=42)),
        ('xgb', XGBRegressor(n_estimators=500, learning_rate=0.1, random_state=42, verbosity=0)),
        ('cat', CatBoostRegressor(iterations=500, learning_rate=0.1, loss_function='MAE', verbose=0, random_state=42)),
    ]

    stacker = StackingRegressor(
        estimators=base_models,
        final_estimator=Ridge(alpha=1.0),
        cv=5,
        n_jobs=-1
    )
    stacker.fit(X_train, y_train)

    preds_unmatched = stacker.predict(X_test_um)
    # Creates the Tm_fallback column (NaN elsewhere) on first assignment.
    test.loc[ml_mask, 'Tm_fallback'] = preds_unmatched
    print("ML fallback complete.")
else:
    print("All samples matched!")

Training Stacking Ensemble for 11 unmatched samples...
Featurizing training data...
Featurizing unmatched test...
Training Stacking Ensemble...
ML fallback complete.


## 4. Final Blending & Submission

In [8]:
# Confidence-based blending
# Canonical, InChIKey and stereo-stripped matches use the lookup directly.
# Skeleton matches (lowest confidence) are blended with the ML prediction
# when one is available.

# Coerce to float: Tm_lookup may be an object column (it is initialised with
# None), and the arithmetic / fillna below should run on numeric dtype.
test['Tm_final'] = pd.to_numeric(test['Tm_lookup'], errors='coerce')

if 'Tm_fallback' in test.columns:
    # For skeleton matches (low confidence), blend with ML if available
    skeleton_mask = (test['match_type'] == 'skeleton') & test['Tm_fallback'].notna()
    if skeleton_mask.any():
        # 70% lookup, 30% ML for skeleton matches
        test.loc[skeleton_mask, 'Tm_final'] = (
            0.7 * test.loc[skeleton_mask, 'Tm_final']
            + 0.3 * test.loc[skeleton_mask, 'Tm_fallback']
        )
        print(f"Blended {skeleton_mask.sum()} skeleton matches with ML.")

    # Rows without any lookup value take the ML prediction.
    test['Tm_final'] = test['Tm_final'].fillna(test['Tm_fallback'])

# Ultimate fallback for rows with neither a lookup value nor an ML prediction.
test['Tm_final'] = test['Tm_final'].fillna(300)

print("\n=== FINAL STATS ===")
print(f"From lookup: {test['Tm_lookup'].notna().sum()}")
print(f"From ML: {test['Tm_lookup'].isna().sum()}")
print("Match type distribution:")
print(test['match_type'].value_counts())


=== FINAL STATS ===
From lookup: 655
From ML: 11
Match type distribution:
match_type
canonical    652
unmatched     11
no_stereo      3
Name: count, dtype: int64


In [9]:
# Write the predictions out as the two-column (id, Tm) submission file.
submission = test.loc[:, ['id', 'Tm_final']].rename(columns={'Tm_final': 'Tm'})
submission.to_csv('../submissions/submission_godmode_v2.csv', index=False)
print("\nâœ… Saved to submissions/submission_godmode_v2.csv")

# List (up to ten of) the rows that had no lookup match, for inspection.
if unmatched_mask.any():
    print("\nUnmatched samples:")
    unmatched_view = test.loc[unmatched_mask, ['id', 'SMILES', 'Tm_fallback']]
    print(unmatched_view.head(10))


âœ… Saved to submissions/submission_godmode_v2.csv

Unmatched samples:
       id                                         SMILES  Tm_fallback
27      9  CC(C)CCCC(C)CCCC(C)CCCC1(C)CCc2cc(O)cc(C)c2O1   340.405339
37   2459                                     CC1=NN=CC1   321.506214
95   1620                                    CCC(C)C(C)C   160.471917
114  2156                                         C#CC=C   163.806422
137  1965                                   CC=CC(=O)OCC   246.727655
234  1762                                 CCC(O)(C)C(C)C   246.702749
259  2587                             CC1N(C)c2ccccc2C1C   308.023577
368  3126                                CCN1CCc2ccccc12   314.093838
443  2527                                       SC1CCCC1   261.031415
631  2876                                       Cl[Si]Cl   124.955654
