In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# --- 1. Data Loading & Chemical Engineering ---
# Locations of the two input tables read by this script.
YIELDS_CSV = '/kaggle/input/catechol-benchmark-hackathon/catechol_full_data_yields.csv'
DESCRIPTORS_CSV = '/kaggle/input/catechol-benchmark-hackathon/acs_pca_descriptors_lookup.csv'

main_df = pd.read_csv(YIELDS_CSV)
desc_df = pd.read_csv(DESCRIPTORS_CSV)

# First column of the descriptor table; it is used as the merge key.
# Kept at module level because later code in this file refers to it by name.
d_col = desc_df.columns[0]

def chemical_engineering(df, descriptors):
    """Build the engineered feature table from raw rows and a descriptor lookup.

    Works on a copy of ``df``; the caller's frame is left untouched.

    Steps:
      1. Coerce 'SOLVENT A Ratio' / 'SOLVENT B Ratio' to numbers
         (unparseable or missing values become 0).
      2. Strip and upper-case 'SOLVENT A NAME' / 'SOLVENT B NAME'.
      3. Left-merge the descriptor lookup once per solvent; the lookup
         columns end up suffixed ``_A`` and ``_B``.
      4. Add 'SMILES_Len', 'Double_Bonds' (derived from 'SM SMILES') and
         'Kinetics' = log1p(Temperature * Residence Time).

    Args:
        df: Raw experiment rows. Must contain the columns named above plus
            'Temperature' and 'Residence Time'.
        descriptors: Lookup table whose FIRST column holds the solvent name
            used as the merge key; the remaining columns are merged in.

    Returns:
        A new DataFrame with one row per input row, in the input order.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # Never mutate the caller's frame.
    df = df.copy()

    # Derive the key from the argument rather than relying on a module global.
    key_col = descriptors.columns[0]

    # Sanitize Solvent Ratios
    for col in ['SOLVENT A Ratio', 'SOLVENT B Ratio']:
        raw = df[col]
        # Values that already parse as numbers are kept verbatim. Stripping
        # characters first would corrupt e.g. 1e-05 into 105.
        parsed = pd.to_numeric(raw, errors='coerce')
        # Fallback for decorated strings such as "50%": keep digits and dots.
        stripped = pd.to_numeric(
            raw.astype(str).str.replace(r'[^0-9.]', '', regex=True),
            errors='coerce',
        )
        df[col] = parsed.fillna(stripped).fillna(0)

    # Normalize Solvent names for merging
    for col in ['SOLVENT A NAME', 'SOLVENT B NAME']:
        df[col] = df[col].astype(str).str.strip().str.upper()

    # Standardize Descriptor lookup
    lookup = descriptors.copy()
    lookup[key_col] = lookup[key_col].astype(str).str.strip().str.upper()
    # Keys that collide after normalization would duplicate rows in the left
    # merges below; keep the first occurrence so row count is preserved.
    lookup = lookup.drop_duplicates(subset=key_col, keep='first')

    # Merge PCA physical descriptors. The second merge sees the lookup
    # columns on both sides, so they receive the _A / _B suffixes.
    df = df.merge(lookup, left_on='SOLVENT A NAME', right_on=key_col, how='left')
    df = df.merge(
        lookup,
        left_on='SOLVENT B NAME',
        right_on=key_col,
        how='left',
        suffixes=('_A', '_B'),
    )

    # Structural features (SMILES) and Kinetics
    smiles = df['SM SMILES'].astype(str)
    df['SMILES_Len'] = smiles.str.len()
    df['Double_Bonds'] = smiles.str.count('=')
    df['Kinetics'] = np.log1p(df['Temperature'] * df['Residence Time'])
    return df

# Run the feature-engineering step on the raw tables.
df_final = chemical_engineering(main_df, desc_df)

# Feature Selection: columns that must never be used as model inputs
# (identifiers, SMILES text, the descriptor key, and the three targets).
exclude = [
    'EXP NUM',
    'SM',
    'Product 2',
    'Product 3',
    'SM SMILES',
    'Product 2 SMILES',
    'Product 3 SMILES',
    'SOLVENT A SMILES',
    'SOLVENT B SMILES',
    'RAMP NUM',
    d_col,
]

# Keep numeric columns only, drop whichever excluded names are present,
# and zero-fill the remaining gaps.
numeric_part = df_final.select_dtypes(include=[np.number])
present_excluded = [name for name in exclude if name in df_final.columns]
X = numeric_part.drop(columns=present_excluded, errors='ignore').fillna(0)

# Targets: the three yield columns.
y = df_final[['SM', 'Product 2', 'Product 3']]

# --- 2. Ensemble Architecture ---
# Base learners; each is wrapped in MultiOutputRegressor below so it can be
# fit against the multi-column target.
xgb_base = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=8,
    random_state=42,
)
rf_base = RandomForestRegressor(
    n_estimators=500,
    max_depth=12,
    random_state=42,
)

xgb = MultiOutputRegressor(xgb_base)
rf = MultiOutputRegressor(rf_base)

# --- 3. Cross-Validation & Prediction with ID Correction ---
# 10-fold shuffled split; every row lands in exactly one validation fold, so
# the loop below yields one out-of-fold prediction per row.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
submission_rows = []

print("🚀 Training Final Ensemble with fix for NameError...")

for train_idx, val_idx in kf.split(X):
    # kf.split yields positional indices, hence .iloc.
    X_t, X_v = X.iloc[train_idx], X.iloc[val_idx]
    y_t, y_v = y.iloc[train_idx], y.iloc[val_idx]

    xgb.fit(X_t, y_t)
    rf.fit(X_t, y_t)

    # Ensemble Blend (75% XGB / 25% RF)
    preds = (0.75 * xgb.predict(X_v)) + (0.25 * rf.predict(X_v))

    # Chemical Post-processing: clamp to [0, 1], then rescale each row so the
    # three outputs sum to 1.
    preds = np.clip(preds, 0, 1)
    row_sums = preds.sum(axis=1, keepdims=True)
    # A row whose predictions all clip to 0 would otherwise divide by zero
    # and write NaN into the submission; leave such rows as zeros instead.
    row_sums[row_sums == 0] = 1.0
    preds = preds / row_sums

    # 'id' is the positional row index of the validation sample.
    submission_rows.extend(
        {'id': idx, 'SM': row[0], 'Product 2': row[1], 'Product 3': row[2]}
        for idx, row in zip(val_idx, preds)
    )

# --- 4. Final Submission Export ---
# Order rows by id and guard against any id appearing twice before writing.
submission_df = (
    pd.DataFrame(submission_rows)
    .sort_values('id')
    .drop_duplicates('id')
)
submission_df.to_csv('submission.csv', index=False)

print(f"✅ Success! Generated 'submission.csv' with {len(submission_df)} rows.")

🚀 Training Final Ensemble with fix for NameError...
✅ Success! Generated 'submission.csv' with 1227 rows.
