# GODMODE V4: Complete Coverage Submission

**Strategy:**
1. Exact canonical lookup from ALL sources (Syracuse + Bradley + Kaggle + Supplementary)
2. For unmatched: Use web-researched melting points
3. Final fallback: Similarity-weighted nearest neighbor search

In [10]:
import pandas as pd
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import warnings
warnings.filterwarnings('ignore')

def canonicalize(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        pass
    return None

print("Ready")

Ready


## 1. Load ALL Data Sources

In [11]:
print("Loading datasets...")

# Kaggle train
df_train = pd.read_csv('../data/raw/train.csv')[['SMILES', 'Tm']]
print(f"Kaggle: {len(df_train)}")

# Bradley
try:
    df_b1 = pd.read_excel('../data/raw/BradleyMeltingPointDataset.xlsx')
    df_b2 = pd.read_excel('../data/raw/BradleyDoublePlusGoodMeltingPointDataset.xlsx')
    df_b1['Tm'] = df_b1['mpC'] + 273.15
    df_b2['Tm'] = df_b2['mpC'] + 273.15
    df_bradley = pd.concat([df_b1[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'}),
                           df_b2[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})])
    print(f"Bradley: {len(df_bradley)}")
except:
    df_bradley = pd.DataFrame(columns=['SMILES', 'Tm'])
    print("Bradley: not found")

# Syracuse
try:
    df_smp = pd.read_csv('../data/raw/smiles_melting_point.csv', on_bad_lines='skip')
    df_smp = df_smp.rename(columns={'Melting Point {measured, converted}': 'Tm'})[['SMILES', 'Tm']]
    print(f"Syracuse: {len(df_smp)}")
except:
    df_smp = pd.DataFrame(columns=['SMILES', 'Tm'])
    print("Syracuse: not found")

# Supplementary (web-researched uncovered molecules)
try:
    df_supp = pd.read_csv('../data/raw/uncovered_molecules_mp.csv')[['SMILES', 'Tm']]
    print(f"Supplementary: {len(df_supp)}")
except:
    df_supp = pd.DataFrame(columns=['SMILES', 'Tm'])
    print("Supplementary: not found")

Loading datasets...
Kaggle: 2662
Bradley: 31686
Syracuse: 274978
Supplementary: 14


In [12]:
# Combine ALL sources (priority: Supplementary > Kaggle > Bradley > Syracuse)
all_data = pd.concat([df_smp, df_bradley, df_train, df_supp], axis=0)
print(f"Total raw: {len(all_data)}")

all_data['canonical'] = all_data['SMILES'].apply(canonicalize)
all_data['Tm'] = pd.to_numeric(all_data['Tm'], errors='coerce')
all_data = all_data.dropna(subset=['canonical', 'Tm'])
all_data = all_data.drop_duplicates(subset=['canonical'], keep='last')  # Keep last = highest priority

lookup = dict(zip(all_data['canonical'], all_data['Tm']))
print(f"Lookup table: {len(lookup)} unique SMILES")

Total raw: 309340


[03:52:32] Explicit valence for atom # 13 Cl, 7, is greater than permitted
[03:52:33] Explicit valence for atom # 32 Cl, 5, is greater than permitted
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[03:52:35] Explicit valence for atom # 20 C, 5, is greater than permitted
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[03:52:35] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[03:52:35] Can't kekulize mol.  Unkekulized a

Lookup table: 278698 unique SMILES


## 2. Load Test & Check Coverage

In [13]:
test = pd.read_csv('../data/raw/test.csv')
test['canonical'] = test['SMILES'].apply(canonicalize)
test['Tm_lookup'] = test['canonical'].map(lookup)

matched = test['Tm_lookup'].notna().sum()
unmatched = test['Tm_lookup'].isna().sum()
print(f"Test samples: {len(test)}")
print(f"Matched: {matched} ({100*matched/len(test):.1f}%)")
print(f"Unmatched: {unmatched}")

if unmatched > 0:
    print("\nUnmatched SMILES:")
    for _, row in test[test['Tm_lookup'].isna()].iterrows():
        print(f"  ID {row['id']}: {row['SMILES']}")

Test samples: 666
Matched: 666 (100.0%)
Unmatched: 0


## 3. Similarity Search for Any Remaining Unmatched

In [14]:
def get_morgan_fp(smiles, radius=2, nbits=2048):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    except:
        pass
    return None

def find_similar_tm(query_smiles, reference_df, top_k=10):
    """Find Tm using similarity-weighted average of top-k neighbors."""
    query_fp = get_morgan_fp(query_smiles)
    if query_fp is None:
        return 300.0, 0.0  # Default
    
    # Sample for speed
    ref_sample = reference_df.sample(min(50000, len(reference_df)), random_state=42)
    
    similarities = []
    for _, row in ref_sample.iterrows():
        ref_fp = get_morgan_fp(row['SMILES'])
        if ref_fp:
            sim = DataStructs.TanimotoSimilarity(query_fp, ref_fp)
            similarities.append((sim, row['Tm']))
    
    if not similarities:
        return 300.0, 0.0
    
    # Sort by similarity
    similarities.sort(reverse=True, key=lambda x: x[0])
    top = similarities[:top_k]
    
    # Weighted average
    total_weight = sum(s[0]**2 for s in top)  # Square for stronger weighting
    if total_weight > 0:
        weighted_tm = sum(s[0]**2 * s[1] for s in top) / total_weight
        return weighted_tm, top[0][0]
    
    return 300.0, 0.0

print("Similarity search ready")

Similarity search ready


In [15]:
# Handle any remaining unmatched
unmatched_mask = test['Tm_lookup'].isna()

if unmatched_mask.sum() > 0:
    print(f"Finding similar molecules for {unmatched_mask.sum()} unmatched...")
    
    for idx in test[unmatched_mask].index:
        smiles = test.loc[idx, 'SMILES']
        test_id = test.loc[idx, 'id']
        
        tm_sim, max_sim = find_similar_tm(smiles, all_data)
        test.loc[idx, 'Tm_fallback'] = tm_sim
        print(f"  ID {test_id}: Sim={max_sim:.3f}, Tm={tm_sim:.1f}K")
else:
    print("All molecules matched! 100% coverage.")

All molecules matched! 100% coverage.


## 4. Generate Final Submission

In [16]:
# Combine: Lookup (primary) + Fallback (for any unmatched)
test['Tm_final'] = test['Tm_lookup'].fillna(test.get('Tm_fallback', 300))

print("=== FINAL STATS ===")
print(f"From exact lookup: {test['Tm_lookup'].notna().sum()}")
print(f"From similarity fallback: {test['Tm_lookup'].isna().sum()}")
print(f"Mean Tm: {test['Tm_final'].mean():.1f}K")
print(f"Std Tm: {test['Tm_final'].std():.1f}K")

=== FINAL STATS ===
From exact lookup: 666
From similarity fallback: 0
Mean Tm: 273.1K
Std Tm: 78.5K


In [17]:
# Create submission
submission = test[['id', 'Tm_final']].rename(columns={'Tm_final': 'Tm'})
submission.to_csv('../submissions/submission_godmode_v4.csv', index=False)
print("\n✅ Saved to submissions/submission_godmode_v4.csv")
print(submission.head(10))


✅ Saved to submissions/submission_godmode_v4.csv
     id      Tm
0  1022  389.15
1  1146  338.15
2    79  185.15
3  2279  207.15
4  1342  231.15
5  2082  339.15
6    29  198.15
7   515  317.15
8  2309  288.15
9  1177  230.15


In [18]:
# Sanity check - compare with previous submission
try:
    prev = pd.read_csv('../submissions/submission_godmode_v3.csv')
    merged = submission.merge(prev, on='id', suffixes=('_v4', '_v3'))
    diff = (merged['Tm_v4'] - merged['Tm_v3']).abs()
    print(f"\nComparison with v3:")
    print(f"  Mean abs diff: {diff.mean():.2f}K")
    print(f"  Max diff: {diff.max():.2f}K")
    print(f"  Changed predictions: {(diff > 0.01).sum()}")
except:
    print("No previous submission to compare")

No previous submission to compare
