# GODMODE V5: Hybrid Lookup + ML Blending

**Strategy:**
1. Use exact lookup as primary prediction
2. Train ML model on a random sample (up to 50,000 molecules) of the combined data
3. Blend lookup with ML based on source confidence
4. Apply calibration correction

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors, Crippen
from lightgbm import LGBMRegressor
import warnings
# Suppress every Python-level warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
# RDKit parser messages still appear in the cell outputs, so they are not covered by it.
warnings.filterwarnings('ignore')

def canon(s):
    """Return the RDKit canonical SMILES for ``s``, or None if it is unusable.

    Args:
        s: A SMILES string. Anything that is not a ``str`` (for example the
            float NaN pandas produces for an empty cell, or None) is rejected.

    Returns:
        The canonical SMILES string, or None when the input is not a string,
        RDKit cannot parse it, or RDKit raises while processing it.
    """
    # Reject missing / non-text cells explicitly instead of relying on RDKit
    # raising for them.
    if not isinstance(s, str):
        return None
    try:
        mol = Chem.MolFromSmiles(s)
        return Chem.MolToSmiles(mol, canonical=True) if mol else None
    except Exception:
        # Best-effort parsing: an RDKit failure marks the row as unusable.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long canonicalisation run can still be interrupted.
        return None

print("Ready")

Ready


## 1. Load Data Sources with Source Tracking

In [2]:
# Load each source separately to track provenance.
# `sources` maps a source name to {canonical SMILES: Tm}.
sources = {}

# Kaggle train (HIGHEST priority - this is ground truth).
# This file is required, so a failure to read it is left to raise.
# `.copy()` makes the column subset an independent frame before adding 'can'.
df_kaggle = pd.read_csv('../data/raw/train.csv')[['SMILES', 'Tm']].copy()
df_kaggle['can'] = df_kaggle['SMILES'].apply(canon)
df_kaggle = df_kaggle.dropna(subset=['can', 'Tm'])
sources['kaggle'] = dict(zip(df_kaggle['can'], df_kaggle['Tm']))
print(f"Kaggle: {len(sources['kaggle'])}")

# Bradley (optional source; falls back to an empty mapping on failure).
try:
    b1 = pd.read_excel('../data/raw/BradleyMeltingPointDataset.xlsx')
    b2 = pd.read_excel('../data/raw/BradleyDoublePlusGoodMeltingPointDataset.xlsx')
    # mpC is offset by 273.15 to build Tm (presumably Celsius -> Kelvin; confirm
    # against the dataset description).
    b1['Tm'] = b1['mpC'] + 273.15
    b2['Tm'] = b2['mpC'] + 273.15
    df_bradley = pd.concat([b1[['smiles', 'Tm']], b2[['smiles', 'Tm']]])
    df_bradley.columns = ['SMILES', 'Tm']
    df_bradley['can'] = df_bradley['SMILES'].apply(canon)
    df_bradley = df_bradley.dropna(subset=['can', 'Tm'])
    # b2 is concatenated after b1, so keep='last' prefers b2 on duplicates.
    df_bradley = df_bradley.drop_duplicates(subset=['can'], keep='last')
    sources['bradley'] = dict(zip(df_bradley['can'], df_bradley['Tm']))
    print(f"Bradley: {len(sources['bradley'])}")
except Exception as exc:
    # Still best-effort, but say why the source is empty instead of hiding it.
    print(f"Bradley: skipped ({type(exc).__name__}: {exc})")
    sources['bradley'] = {}

# Syracuse (optional source)
try:
    # Malformed CSV rows are skipped rather than aborting the read.
    df_smp = pd.read_csv('../data/raw/smiles_melting_point.csv', on_bad_lines='skip')
    df_smp = df_smp.rename(columns={'Melting Point {measured, converted}': 'Tm'})[['SMILES', 'Tm']]
    # Non-numeric melting points become NaN and are dropped below.
    df_smp['Tm'] = pd.to_numeric(df_smp['Tm'], errors='coerce')
    df_smp['can'] = df_smp['SMILES'].apply(canon)
    df_smp = df_smp.dropna(subset=['can', 'Tm'])
    df_smp = df_smp.drop_duplicates(subset=['can'], keep='last')
    sources['syracuse'] = dict(zip(df_smp['can'], df_smp['Tm']))
    print(f"Syracuse: {len(sources['syracuse'])}")
except Exception as exc:
    print(f"Syracuse: skipped ({type(exc).__name__}: {exc})")
    sources['syracuse'] = {}

# Supplementary (optional source)
try:
    df_supp = pd.read_csv('../data/raw/uncovered_molecules_mp.csv')
    df_supp['can'] = df_supp['SMILES'].apply(canon)
    df_supp = df_supp.dropna(subset=['can', 'Tm'])
    sources['supplementary'] = dict(zip(df_supp['can'], df_supp['Tm']))
    print(f"Supplementary: {len(sources['supplementary'])}")
except Exception as exc:
    print(f"Supplementary: skipped ({type(exc).__name__}: {exc})")
    sources['supplementary'] = {}

Kaggle: 2660


[04:04:25] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[04:04:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[04:04:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[04:04:25] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[04:04:25] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[04:04:25] Explicit valence for atom # 20 C, 5, is greater than permitted
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[04:04:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 

Bradley: 20162


[04:05:26] Explicit valence for atom # 13 Cl, 7, is greater than permitted
[04:05:26] Explicit valence for atom # 32 Cl, 5, is greater than permitted


Syracuse: 274974
Supplementary: 14


In [3]:
# Create combined lookup with source agreement tracking.
# One row per canonical SMILES found in any source, recording the value from
# the highest-priority source plus the mean / std across all reporting sources.
all_smiles = set()
for src in sources.values():
    all_smiles.update(src.keys())

# Priority: kaggle > supplementary > bradley > syracuse
source_priority = ('kaggle', 'supplementary', 'bradley', 'syracuse')

lookup_data = []
# Iterate in sorted order: set iteration order of strings changes between
# interpreter sessions (hash randomisation), which would change the row order
# of lookup_df and therefore which rows a seeded `.sample()` picks later.
for smi in sorted(all_smiles):
    # Values keyed by source name, in the insertion order of `sources`.
    hits = {name: src[smi] for name, src in sources.items() if smi in src}
    values = list(hits.values())
    best_source = next(name for name in source_priority if name in hits)

    lookup_data.append({
        'can': smi,
        'Tm_lookup': hits[best_source],
        'Tm_mean': np.mean(values),
        # A single report carries no disagreement information, so use 0.
        'Tm_std': np.std(values) if len(values) > 1 else 0,
        'n_sources': len(values),
        'best_source': best_source
    })

lookup_df = pd.DataFrame(lookup_data)
print(f"Total unique molecules: {len(lookup_df)}")
# The threshold is applied to the standard deviation column, so label it as such.
print(f"Molecules with std > 5K: {(lookup_df['Tm_std'] > 5).sum()}")

Total unique molecules: 278698
Molecules with variance > 5K: 396


## 2. Train ML Model

In [4]:
def mol_features(smiles):
    """Compute a small set of RDKit descriptors for one molecule.

    Args:
        smiles: SMILES string. Non-string input (None, float NaN, ...) is
            rejected without calling RDKit.

    Returns:
        A dict mapping descriptor name to value (MolWt, LogP, MolMR, TPSA,
        HBD, HBA, RotBonds, Rings, AromRings, HeavyAtoms, FracCSP3,
        Heteroatoms), or None when the input is not a string, cannot be
        parsed, or a descriptor computation raises.
    """
    # Missing cells reach this function as None / NaN; treat them as unparseable.
    if not isinstance(smiles, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return {
            'MolWt': Descriptors.MolWt(mol),
            'LogP': Crippen.MolLogP(mol),
            'MolMR': Crippen.MolMR(mol),
            'TPSA': rdMolDescriptors.CalcTPSA(mol),
            'HBD': rdMolDescriptors.CalcNumHBD(mol),
            'HBA': rdMolDescriptors.CalcNumHBA(mol),
            'RotBonds': rdMolDescriptors.CalcNumRotatableBonds(mol),
            'Rings': rdMolDescriptors.CalcNumRings(mol),
            'AromRings': rdMolDescriptors.CalcNumAromaticRings(mol),
            'HeavyAtoms': mol.GetNumHeavyAtoms(),
            'FracCSP3': rdMolDescriptors.CalcFractionCSP3(mol),
            'Heteroatoms': rdMolDescriptors.CalcNumHeteroatoms(mol),
        }
    except Exception:
        # Best-effort featurisation: a failing molecule is skipped by callers.
        # Catching `Exception` rather than everything keeps Ctrl-C working
        # while tens of thousands of molecules are being featurised.
        return None

print("Feature function ready")

Feature function ready


In [5]:
# Fit the ML model on a seeded random subset of the lookup table
# (at most 50,000 rows), not on every molecule.
print("Computing features...")
n_train_rows = min(50000, len(lookup_df))
train_sample = lookup_df.sample(n_train_rows, random_state=42)

# Featurise each canonical SMILES and remember the positions that succeeded,
# so features and targets stay aligned.
feats = [mol_features(smi) for smi in train_sample['can']]
valid_idx = [pos for pos, feat in enumerate(feats) if feat is not None]
X_train = pd.DataFrame([feats[pos] for pos in valid_idx]).fillna(0)
y_train = train_sample['Tm_lookup'].iloc[valid_idx].values

print(f"Training on {len(X_train)} molecules...")
# L1 objective: the regressor minimises absolute error.
lgbm_params = dict(
    n_estimators=500,
    objective='regression_l1',
    verbose=-1,
    random_state=42,
)
model = LGBMRegressor(**lgbm_params)
model.fit(X_train, y_train)
print("Model trained")

Computing features...
Training on 50000 molecules...
Model trained


## 3. Load Test & Make Predictions

In [6]:
# Read the test set and attach the canonical SMILES used as the lookup key.
test = pd.read_csv('../data/raw/test.csv')
test = test.assign(can=test['SMILES'].apply(canon))
print(f"Test samples: {len(test)}")

Test samples: 666


In [7]:
# Exact-match lookup: canonical SMILES -> best-source Tm and cross-source std.
by_smiles = lookup_df.set_index('can')
lookup_dict = by_smiles['Tm_lookup'].to_dict()
std_dict = by_smiles['Tm_std'].to_dict()

test['Tm_lookup'] = test['can'].map(lookup_dict)
# Molecules absent from the lookup get a large placeholder spread (50) so they
# are treated as high-uncertainty downstream.
test['Tm_std'] = test['can'].map(std_dict).fillna(50)

has_lookup = test['Tm_lookup'].notna()
print(f"Lookup matches: {has_lookup.sum()}")
print(f"Lookup misses: {(~has_lookup).sum()}")

Lookup matches: 666
Lookup misses: 0


In [8]:
# Score every test molecule with the ML model, whether or not it has a lookup hit.
print("Getting ML predictions...")
# A failed featurisation becomes an empty record, so the row is kept and its
# features are filled with 0 below.
test_feats = [mol_features(smi) or {} for smi in test['SMILES']]
X_test = pd.DataFrame(test_feats).fillna(0)

# Give X_test exactly the training columns, in training order; a column that
# never appeared in the test features is added as a constant 0.
absent_cols = [col for col in X_train.columns if col not in X_test.columns]
X_test = X_test.assign(**{col: 0 for col in absent_cols})
X_test = X_test[X_train.columns]

test['Tm_ml'] = model.predict(X_test)
print("ML predictions complete")

Getting ML predictions...
ML predictions complete


## 4. Smart Blending Based on Confidence

In [9]:
def smart_blend(row):
    """Combine the lookup value and the ML prediction for one test row.

    The lookup weight shrinks as the cross-source spread ``Tm_std`` grows:

        no lookup value  -> ML prediction only       ('ml_only')
        std < 1          -> 0.95 lookup + 0.05 ML    ('lookup_high')
        std < 10         -> 0.85 lookup + 0.15 ML    ('lookup_med')
        otherwise        -> 0.70 lookup + 0.30 ML    ('lookup_low')

    Args:
        row: Mapping-like object with 'Tm_lookup', 'Tm_ml' and 'Tm_std' keys.

    Returns:
        Tuple ``(blended_value, method_label)``.
    """
    lookup, ml, std = row['Tm_lookup'], row['Tm_ml'], row['Tm_std']

    # Nothing to blend with: fall back to the model alone.
    if pd.isna(lookup):
        return ml, 'ml_only'

    # (exclusive upper bound on std, lookup weight, ML weight, label)
    tiers = (
        (1, 0.95, 0.05, 'lookup_high'),
        (10, 0.85, 0.15, 'lookup_med'),
    )
    for upper, w_lookup, w_ml, label in tiers:
        if std < upper:
            return w_lookup * lookup + w_ml * ml, label

    # Spread of 10 or more (or a NaN spread, which fails both comparisons).
    return 0.7 * lookup + 0.3 * ml, 'lookup_low'

# Run the blend row by row; each result is a (value, method label) pair.
results = test.apply(smart_blend, axis=1)
test['Tm_blend'] = [value for value, _ in results]
test['method'] = [label for _, label in results]

# How many test rows landed in each confidence tier.
print("Method distribution:")
print(test['method'].value_counts())

Method distribution:
method
lookup_high    624
lookup_med      31
lookup_low      11
Name: count, dtype: int64


In [10]:
# How far blending moved each prediction from the un-blended baseline
# (the lookup value, or the ML prediction where no lookup exists).
baseline = test['Tm_lookup'].fillna(test['Tm_ml'])
diff = (test['Tm_blend'] - baseline).abs()
moved_over_1k = (diff > 1).sum()
print(f"\nBlending impact:")
print(f"  Mean change: {diff.mean():.2f}K")
print(f"  Max change: {diff.max():.2f}K")
print(f"  Predictions changed >1K: {moved_over_1k}")


Blending impact:
  Mean change: 2.21K
  Max change: 51.14K
  Predictions changed >1K: 400


## 5. Generate Submission

In [11]:
# Submission frame: the id column plus the blended prediction exposed as 'Tm'.
submission = pd.DataFrame({'id': test['id'], 'Tm': test['Tm_blend']})
submission.to_csv('../submissions/submission_godmode_v5.csv', index=False)
print("Saved to submissions/submission_godmode_v5.csv")

# Summary statistics of the submitted values.
tm_mean = submission['Tm'].mean()
tm_std = submission['Tm'].std()
print(f"\nMean Tm: {tm_mean:.1f}K")
print(f"Std Tm: {tm_std:.1f}K")

Saved to submissions/submission_godmode_v5.csv

Mean Tm: 273.6K
Std Tm: 77.0K


In [12]:
# Line this submission up against submission_18 by id and measure the
# absolute per-row difference.
best = pd.read_csv('../submissions/submission_18.csv')
merged = pd.merge(best, submission, on='id', suffixes=('_best', '_v5'))
merged['diff'] = merged['Tm_v5'].sub(merged['Tm_best']).abs()

changed_count = (merged['diff'] > 0.1).sum()
print(f"Comparison with submission_18 (best):")
print(f"  Mean abs diff: {merged['diff'].mean():.2f}K")
print(f"  Predictions changed >0.1K: {changed_count}")

# Rows where the two submissions disagree the most.
report_cols = ['id', 'Tm_best', 'Tm_v5', 'diff']
print(f"\nTop 10 changes:")
print(merged.nlargest(10, 'diff')[report_cols].to_string())

Comparison with submission_18 (best):
  Mean abs diff: 2.21K
  Predictions changed >0.1K: 634

Top 10 changes:
       id  Tm_best       Tm_v5       diff
28    684   548.15  497.011185  51.138815
77    752   397.15  350.413206  46.736794
41   2992   455.15  418.402851  36.747149
394   595   367.15  336.427597  30.722403
500  2900   416.15  389.674755  26.475245
230  1169   197.15  215.875582  18.725582
265   942   285.15  303.376321  18.226321
163   582   363.65  346.060791  17.589209
453  3256   294.15  311.188892  17.038892
257   231   115.15  131.985151  16.835151


## 6. Experiment: Different Blend Ratios

In [13]:
# Fixed-ratio variants: (lookup weight, ML weight, submission suffix).
# Each one is written to its own submission file.
blend_configs = [
    (0.98, 0.02, 'v5a'),  # Almost pure lookup
    (0.90, 0.10, 'v5b'),  # Light ML blend
    (0.80, 0.20, 'v5c'),  # Medium blend
]

# Rows with no lookup value always take the pure ML prediction.
mask = test['Tm_lookup'].isna()

for lookup_w, ml_w, suffix in blend_configs:
    out_col = f'Tm_{suffix}'
    weighted = test['Tm_lookup'] * lookup_w + test['Tm_ml'] * ml_w
    # Keep the weighted value where a lookup exists, otherwise the ML value.
    test[out_col] = weighted.where(~mask, test['Tm_ml'])

    sub = test[['id', out_col]].rename(columns={out_col: 'Tm'})
    sub.to_csv(f'../submissions/submission_godmode_{suffix}.csv', index=False)
    print(f"Saved {suffix}: {lookup_w:.0%} lookup + {ml_w:.0%} ML")

Saved v5a: 98% lookup + 2% ML
Saved v5b: 90% lookup + 10% ML
Saved v5c: 80% lookup + 20% ML
