In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tc-smiles/Tc_SMILES.csv
/kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv
/kaggle/input/smiles-extra-data/data_dnst1.xlsx
/kaggle/input/smiles-extra-data/data_tg3.xlsx
/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv
/kaggle/input/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv


In [2]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3


# Data Loading and Preprocessing Pipeline

In [3]:
# === Imports ===
import pandas as pd
import numpy as np
from rdkit import Chem

# === Config ===
BASE_PATH = '/kaggle/input/neurips-open-polymer-prediction-2025/'
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
BAD_PATTERNS = ['[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]',
                "[R']", '[R"]', 'R1', 'R2', 'R3', 'R4', 'R5',
                '([R])', '([R1])', '([R2])']

# === SMILES Cleaner ===
def clean_and_validate_smiles(smiles):
    if not isinstance(smiles, str) or not smiles:
        return None
    for pattern in BAD_PATTERNS:
        if pattern in smiles:
            return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None
    return None

# === Load Train/Test ===
train = pd.read_csv(BASE_PATH + 'train.csv')
test = pd.read_csv(BASE_PATH + 'test.csv')

train['SMILES'] = train['SMILES'].apply(clean_and_validate_smiles)
test['SMILES'] = test['SMILES'].apply(clean_and_validate_smiles)

train.dropna(subset=['SMILES'], inplace=True)
test.dropna(subset=['SMILES'], inplace=True)

# === Load External Datasets (excluding dataset2) ===
external_datasets = []

def load_external(path, target, rename_map=None):
    try:
        df = pd.read_csv(path)
        if rename_map:
            df = df.rename(columns=rename_map)
        if 'SMILES' in df.columns and target in df.columns:
            df = df[['SMILES', target]].dropna()
            external_datasets.append((target, df))
            print(f"✅ Loaded {path} ({len(df)} entries for {target})")
        else:
            print(f"⚠️ Skipped {path}: required columns missing")
    except Exception as e:
        print(f"⚠️ Failed to load {path}: {e}")

load_external(BASE_PATH + 'train_supplement/dataset1.csv', 'Tc', rename_map={'TC_mean': 'Tc'})
load_external(BASE_PATH + 'train_supplement/dataset3.csv', 'Tg')
load_external(BASE_PATH + 'train_supplement/dataset4.csv', 'FFV')

# === Load Additional External Datasets ===
try:
    extra_data_tg3 = pd.read_excel("/kaggle/input/smiles-extra-data/data_tg3.xlsx")
    extra_data_dnst1 = pd.read_excel("/kaggle/input/smiles-extra-data/data_dnst1.xlsx")
    jcim_sup_bigsmiles = pd.read_csv("/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv")
    tc_smiles_df = pd.read_csv("/kaggle/input/tc-smiles/Tc_SMILES.csv")
except Exception as e:
    print(f"⚠️ Error loading extra data: {e}")

# Helper to standardize and append
def process_and_append_external(df, target, source_name):
    if 'SMILES' in df.columns and target in df.columns:
        df = df[['SMILES', target]].copy()
        df['SMILES'] = df['SMILES'].apply(clean_and_validate_smiles)
        df = df.dropna(subset=['SMILES'])

        # Ensure the target column is numeric
        df[target] = pd.to_numeric(df[target], errors='coerce')
        df = df.dropna(subset=[target])

        df = df.groupby('SMILES', as_index=False)[target].mean()
        external_datasets.append((target, df))
        print(f"✅ Integrated {source_name}: {len(df)} entries for {target}")
    else:
        print(f"⚠️ Skipped {source_name}: missing columns")

# Process each extra dataset (with correct column names)
process_and_append_external(extra_data_tg3.rename(columns={"Tg [K]": "Tg"}), "Tg", "data_tg3.xlsx")
process_and_append_external(extra_data_dnst1.rename(columns={"density(g/cm3)": "Density"}), "Density", "data_dnst1.xlsx")
process_and_append_external(tc_smiles_df.rename(columns={"TC_mean": "Tc"}), "Tc", "Tc_SMILES.csv")

# JCIM SMILES only (for future feature engineering)
jcim_smiles_only = jcim_sup_bigsmiles[['SMILES']].dropna()
jcim_smiles_only['SMILES'] = jcim_smiles_only['SMILES'].apply(clean_and_validate_smiles)
jcim_smiles_only = jcim_smiles_only.dropna().drop_duplicates()
print(f"✅ Loaded JCIM SMILES-only dataset: {len(jcim_smiles_only)} unique SMILES (no targets)")

# === Merge External Data ===
def merge_external(train_df, ext_df, target):
    ext_df['SMILES'] = ext_df['SMILES'].apply(clean_and_validate_smiles)
    ext_df = ext_df.dropna(subset=['SMILES', target])
    ext_df = ext_df.groupby('SMILES', as_index=False)[target].mean()

    # Fill missing target values in existing rows
    existing_smiles = set(train_df['SMILES'])
    to_fill = ext_df[ext_df['SMILES'].isin(existing_smiles)]
    for _, row in to_fill.iterrows():
        mask = (train_df['SMILES'] == row['SMILES']) & (train_df[target].isna())
        train_df.loc[mask, target] = row[target]

    # Add new rows
    new_smiles = set(ext_df['SMILES']) - existing_smiles
    new_rows = ext_df[ext_df['SMILES'].isin(new_smiles)].copy()
    for col in TARGETS:
        if col not in new_rows.columns:
            new_rows[col] = np.nan
    return pd.concat([train_df, new_rows[['SMILES'] + TARGETS]], ignore_index=True)

# === Apply Merges ===
train_extended = train[['SMILES'] + TARGETS].copy()
for target, ext in external_datasets:
    train_extended = merge_external(train_extended, ext, target)

# === Final Clean-Up ===
train_extended = train_extended.replace([np.inf, -np.inf], np.nan)
train_extended = train_extended.dropna(subset=TARGETS, how='all')
train_extended = train_extended.drop_duplicates(subset=['SMILES']).reset_index(drop=True)

# === Summary ===
print("\n📊 Final Summary:")
print(f"Train: {len(train)} | Extended: {len(train_extended)}")
for t in TARGETS:
    base = train[t].notna().sum()
    ext = train_extended[t].notna().sum()
    print(f"• {t:<8}: {ext} total ({ext - base:+} from supplements)")

print("\n✅ Data loading and preprocessing complete.")

✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv (874 entries for Tc)
✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv (46 entries for Tg)
✅ Loaded /kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv (862 entries for FFV)
✅ Integrated data_tg3.xlsx: 499 entries for Tg
✅ Integrated data_dnst1.xlsx: 778 entries for Density
✅ Integrated Tc_SMILES.csv: 866 entries for Tc
✅ Loaded JCIM SMILES-only dataset: 662 unique SMILES (no targets)

📊 Final Summary:
Train: 7973 | Extended: 9990
• Tg      : 1056 total (+545 from supplements)
• FFV     : 7892 total (+862 from supplements)
• Tc      : 866 total (+129 from supplements)
• Density : 1247 total (+634 from supplements)
• Rg      : 614 total (+0 from supplements)

✅ Data loading and preprocessing complete.


In [4]:
smiles_list = train_extended['SMILES'].tolist()
# Clean SMILES column robustly
train_extended['SMILES'] = train_extended['SMILES'].apply(clean_and_validate_smiles)
# === Final Clean-Up ===
train_extended = train_extended.replace([np.inf, -np.inf], np.nan)
train_extended = train_extended.dropna(subset=TARGETS, how='all')
train_extended = train_extended.drop_duplicates(subset=['SMILES']).reset_index(drop=True)

# === Drop constant columns ===
constant_cols = [col for col in train_extended.columns if train_extended[col].nunique() == 1]
train_extended.drop(columns=constant_cols, inplace=True)
print(f"Dropped {len(constant_cols)} constant columns from train_extended")


train_extended.shape
train_extended

Dropped 0 constant columns from train_extended


Unnamed: 0,SMILES,Tg,FFV,Tc,Density,Rg
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,1.05,
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...
9985,c1ccc(-c2ccccn2)nc1,,,,1.31,
9986,c1ccc(-c2nc3cc4ncoc4cc3o2)cc1,,,,1.43,
9987,c1ccc2oc(-c3ccc4ncoc4c3)nc2c1,,,,1.43,
9988,c1ccsc1,,,,1.51,


# Feature Engineering Pipeline

In [5]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import networkx as nx
from tqdm import tqdm

# === Canonicalize SMILES ===
def canonicalize_smiles(smiles_list):
    canonical = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            canonical.append(Chem.MolToSmiles(mol, canonical=True))
        else:
            canonical.append(None)
    return canonical

# === All RDKit Descriptors ===
def compute_rdkit_descriptors(mol):
    descs = {}
    for name, func in Descriptors.descList:
        try:
            descs[name] = func(mol)
        except:
            descs[name] = np.nan
    return descs

# === Explicit 5 Key Descriptors ===
def compute_key_descriptors(mol):
    return {
        'key_MolWt': Descriptors.MolWt(mol),
        'key_LogP': Descriptors.MolLogP(mol),
        'key_RotBonds': Descriptors.NumRotatableBonds(mol),
        'key_HDonors': Descriptors.NumHDonors(mol),
        'key_HAcceptors': Descriptors.NumHAcceptors(mol),
    }

# === Graph Features ===
def compute_graph_descriptors(mol):
    descriptors = {}
    g = nx.Graph()
    g.add_edges_from([(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()])

    try:
        descriptors['graph_diameter'] = nx.diameter(g) if nx.is_connected(g) else 0
        descriptors['avg_shortest_path'] = nx.average_shortest_path_length(g) if nx.is_connected(g) else 0
    except:
        descriptors['graph_diameter'] = 0
        descriptors['avg_shortest_path'] = 0

    descriptors['num_cycles'] = len(nx.cycle_basis(g))

    try:
        descriptors['betweenness_mean'] = np.mean(list(nx.betweenness_centrality(g).values()))
        descriptors['betweenness_std'] = np.std(list(nx.betweenness_centrality(g).values()))
        descriptors['closeness_mean'] = np.mean(list(nx.closeness_centrality(g).values()))
        descriptors['max_degree'] = max(dict(g.degree()).values())
    except:
        descriptors['betweenness_mean'] = np.nan
        descriptors['betweenness_std'] = np.nan
        descriptors['closeness_mean'] = np.nan
        descriptors['max_degree'] = np.nan

    try:
        ec = nx.eigenvector_centrality_numpy(g)
        descriptors['eigenvector_mean'] = np.mean(list(ec.values()))
    except:
        descriptors['eigenvector_mean'] = np.nan

    try:
        ring_info = mol.GetRingInfo().AtomRings()
        descriptors['ring_4'] = sum(1 for r in ring_info if len(r) == 4)
    except:
        descriptors['ring_4'] = 0

    try:
        descriptors['heteroatom_ratio'] = sum(1 for a in mol.GetAtoms() if a.GetAtomicNum() not in [1, 6]) / mol.GetNumAtoms()
    except:
        descriptors['heteroatom_ratio'] = np.nan

    return descriptors

# === Final Combined Feature Computation ===
def compute_all_features(smiles_list, verbose=True):
    smiles_list = canonicalize_smiles(smiles_list)

    feature_dict = {}
    valid_idx = []
    failed_idx = []

    for idx, smi in enumerate(tqdm(smiles_list, desc="Computing Features")):
        if smi is None:
            failed_idx.append(idx)
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            failed_idx.append(idx)
            continue

        valid_idx.append(idx)
        feats = {}
        
        # Compute all descriptors from RDKit
        feats.update(compute_rdkit_descriptors(mol))
        # Compute graph descriptors
        feats.update(compute_graph_descriptors(mol))
        # Add the 5 explicit key descriptors (overwrites if duplicated keys)
        feats.update(compute_key_descriptors(mol))

        for k, v in feats.items():
            if k not in feature_dict:
                feature_dict[k] = []
            feature_dict[k].append(v)

    total = len(smiles_list)
    for k in feature_dict:
        if len(feature_dict[k]) < total:
            feature_dict[k].extend([None] * (total - len(feature_dict[k])))

    if verbose:
        print("\n--- Feature Engineering Summary ---")
        print(f"Total SMILES: {total}")
        print(f"Valid molecules: {len(valid_idx)}")
        print(f"Invalid molecules: {len(failed_idx)}")
        print(f"Number of computed features: {len(feature_dict)}")
        sample_key = next(iter(feature_dict))
        print(f"Feature vector length per molecule: {len(feature_dict[sample_key])}")
        print("-----------------------------------")

    return feature_dict, valid_idx

In [6]:
from rdkit import RDLogger
import pandas as pd

# Suppress RDKit warnings
RDLogger.DisableLog('rdApp.*')

# List of columns to drop ,Source: from various Notebooks through out the competition
useless_cols = [   
    'MaxPartialCharge', 
    'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
    'NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'fr_barbitur',
    'fr_benzodiazepine', 'fr_dihydropyridine', 'fr_epoxide', 'fr_isothiocyan',
    'fr_lactam', 'fr_nitroso', 'fr_prisulfonamd', 'fr_thiocyan',
    'MaxEStateIndex', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons',
    'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Kappa1',
    'LabuteASA', 'HeavyAtomCount', 'MolMR', 'Chi3n', 'BertzCT', 'Chi2v',
    'Chi4n', 'HallKierAlpha', 'Chi3v', 'Chi4v', 'MinAbsPartialCharge',
    'MinPartialCharge', 'MaxAbsPartialCharge', 'FpDensityMorgan2',
    'FpDensityMorgan3', 'Phi', 'Kappa3', 'fr_nitrile', 'SlogP_VSA6',
    'NumAromaticCarbocycles', 'NumAromaticRings', 'fr_benzene', 'VSA_EState6',
    'NOCount', 'fr_C_O', 'fr_C_O_noCOO', 'NumHDonors', 'fr_amide',
    'fr_Nhpyrrole', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_COO2',
    'fr_halogen', 'fr_diazo', 'fr_nitro_arom', 'fr_phos_ester'
]

# === Compute Train Features ===
feature_dict_train, valid_idx_train = compute_all_features(train_extended["SMILES"], verbose=True)
features_train = pd.DataFrame(feature_dict_train).reset_index(drop=True)
features_train = features_train.drop(columns=[col for col in useless_cols if col in features_train.columns])

# === Compute Test Features ===
feature_dict_test, valid_idx_test = compute_all_features(test["SMILES"], verbose=True)
features_test = pd.DataFrame(feature_dict_test).reset_index(drop=True)
features_test = features_test.drop(columns=[col for col in useless_cols if col in features_test.columns])

# === Output Summary ===
print("Train features shape:", features_train.shape)
print("Test features shape:", features_test.shape)
print("Training dataframe Shape:", train_extended.shape)
print("Test dataframe Shape:", test.shape)

Computing Features: 100%|██████████| 9990/9990 [05:47<00:00, 28.74it/s]



--- Feature Engineering Summary ---
Total SMILES: 9990
Valid molecules: 9990
Invalid molecules: 0
Number of computed features: 232
Feature vector length per molecule: 9990
-----------------------------------


Computing Features: 100%|██████████| 3/3 [00:00<00:00, 21.84it/s]


--- Feature Engineering Summary ---
Total SMILES: 3
Valid molecules: 3
Invalid molecules: 0
Number of computed features: 232
Feature vector length per molecule: 3
-----------------------------------
Train features shape: (9990, 163)
Test features shape: (3, 163)
Training dataframe Shape: (9990, 6)
Test dataframe Shape: (3, 2)





# Prerprocessing features

In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# === Preprocessing Utilities ===

def preprocess_features(df, df_name=""):
    print(f"\n📦 Preprocessing {df_name} features")

    # 1. Replace inf/-inf with NaN
    inf_count = np.isinf(df.values).sum()
    print(f"  - Replacing {inf_count} ±inf values with NaN...")
    df = df.replace([np.inf, -np.inf], np.nan)

    # 2. Drop columns that are entirely NaN
    all_nan_cols = df.columns[df.isna().all()].tolist()
    print(f"  - Dropping {len(all_nan_cols)} all-NaN columns...")
    df = df.dropna(axis=1, how='all')

    # 3. Fill remaining NaNs with column means
    nan_count = df.isna().sum().sum()
    print(f"  - Filling {nan_count} remaining NaNs with column means...")
    df = df.fillna(df.mean())

    return df


def detect_outliers(df, threshold=1e10):
    max_vals = df.max()
    min_vals = df.min()
    too_large = max_vals[max_vals > threshold]
    too_small = min_vals[min_vals < -threshold]
    return too_large, too_small


def remove_low_variance(df, threshold=1e-5):
    print(f"\n🧹 Applying VarianceThreshold (threshold={threshold})...")
    selector = VarianceThreshold(threshold=threshold)
    reduced = selector.fit_transform(df)
    kept_cols = df.columns[selector.get_support()]
    removed_count = df.shape[1] - len(kept_cols)
    print(f"  - Removed {removed_count} low-variance features.")
    return pd.DataFrame(reduced, columns=kept_cols)


def clip_outliers(df, lower=-1e6, upper=1e6):
    print(f"\n🧯 Clipping outliers to range [{lower}, {upper}]...")
    too_large, too_small = detect_outliers(df)
    if not too_large.empty or not too_small.empty:
        print(f"  - Clipping {len(too_large)} overly large and {len(too_small)} overly small features.")
        df = df.clip(lower, upper)
    else:
        print("  - No extreme outliers found.")
    return df


# === Apply Preprocessing ===

# Make sure your features_train and features_test already exist
features_train_clean = preprocess_features(features_train, df_name="Train")
features_test_clean = preprocess_features(features_test, df_name="Test")

# Align both datasets
common_cols = features_train_clean.columns.intersection(features_test_clean.columns)
features_train_clean = features_train_clean[common_cols].copy()
features_test_clean = features_test_clean[common_cols].copy()

# Remove near-zero variance features
features_train_clean = remove_low_variance(features_train_clean)
features_test_clean = features_test_clean[features_train_clean.columns]  # Align

# Clip extreme outliers
features_train_clean = clip_outliers(features_train_clean)
features_test_clean = clip_outliers(features_test_clean)

# === Summary ===
print("\n✅ Final Preprocessing Summary:")
print(f"  - Train shape: {features_train_clean.shape}")
print(f"  - Test shape:  {features_test_clean.shape}")
print(f"  - Common features retained: {len(features_train_clean.columns)}")


📦 Preprocessing Train features
  - Replacing 0 ±inf values with NaN...
  - Dropping 0 all-NaN columns...
  - Filling 23 remaining NaNs with column means...

📦 Preprocessing Test features
  - Replacing 0 ±inf values with NaN...
  - Dropping 0 all-NaN columns...
  - Filling 0 remaining NaNs with column means...

🧹 Applying VarianceThreshold (threshold=1e-05)...
  - Removed 0 low-variance features.

🧯 Clipping outliers to range [-1000000.0, 1000000.0]...
  - Clipping 1 overly large and 0 overly small features.

🧯 Clipping outliers to range [-1000000.0, 1000000.0]...
  - Clipping 1 overly large and 0 overly small features.

✅ Final Preprocessing Summary:
  - Train shape: (9990, 163)
  - Test shape:  (3, 163)
  - Common features retained: 163


# ExtraTrees Pipeline

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# === Configuration ===
target_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
n_splits = 5
random_seed = 42

# === Prepare DataFrames to hold predictions ===
oof_preds_1 = pd.DataFrame(index=features_train_clean.index, columns=target_cols)
test_preds_1 = pd.DataFrame(index=features_test_clean.index, columns=target_cols)

oof_preds_2 = pd.DataFrame(index=features_train_clean.index, columns=target_cols)
test_preds_2 = pd.DataFrame(index=features_test_clean.index, columns=target_cols)

kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

# === Train models and predict twice for ensembling ===
for round_num, (oof_df, test_df) in enumerate([(oof_preds_1, test_preds_1), (oof_preds_2, test_preds_2)], start=1):
    print(f"\n🚀 Training Round {round_num}")

    for target in target_cols:
        print(f"\n🎯 Target: {target}")

        mask = ~train_extended[target].isna()
        X = features_train_clean.loc[mask].copy()
        y = train_extended.loc[mask, target].copy()

        # Replace inf/-inf with NaN, then impute with mean
        X.replace([np.inf, -np.inf], np.nan, inplace=True)
        for col in X.columns:
            if X[col].isnull().any():
                mean_val = X[col].mean()
                X[col].fillna(mean_val, inplace=True)
                features_test_clean[col].fillna(mean_val, inplace=True)

        oof_pred = np.zeros(len(X))
        test_fold_preds = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            print(f"  🧪 Fold {fold + 1}/{n_splits}")

            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = lgb.LGBMRegressor(
                objective="mae",
                metric="mae",
                n_estimators=1000,
                learning_rate=0.05,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                bagging_freq=1,
                lambda_l1=0.1,
                lambda_l2=0.1,
                num_leaves=31,
                verbose=-1,
                n_jobs=-1,
                random_state=random_seed + fold
            )

            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric="mae",
                callbacks=[lgb.early_stopping(100, verbose=False)]
            )

            val_pred = model.predict(X_val)
            test_pred = model.predict(features_test_clean)

            oof_pred[val_idx] = val_pred
            test_fold_preds.append(test_pred)

            fold_mae = mean_absolute_error(y_val, val_pred)
            print(f"     🔍 Fold MAE: {fold_mae:.5f}")

        oof_df.loc[mask, target] = oof_pred
        test_df[target] = np.mean(test_fold_preds, axis=0)

# === Final submission: average test predictions ===
test_preds_avg = (test_preds_1 + test_preds_2) / 2
submission = test[['id']].copy()
submission = pd.concat([submission, test_preds_avg], axis=1)
submission.to_csv("submission.csv", index=False)
print("\n📁 Submission saved as submission.csv")


🚀 Training Round 1

🎯 Target: Tg
  🧪 Fold 1/5
     🔍 Fold MAE: 39.99535
  🧪 Fold 2/5
     🔍 Fold MAE: 37.04403
  🧪 Fold 3/5
     🔍 Fold MAE: 36.91343
  🧪 Fold 4/5
     🔍 Fold MAE: 35.19417
  🧪 Fold 5/5
     🔍 Fold MAE: 33.59358

🎯 Target: FFV
  🧪 Fold 1/5
     🔍 Fold MAE: 0.00610
  🧪 Fold 2/5
     🔍 Fold MAE: 0.00638
  🧪 Fold 3/5
     🔍 Fold MAE: 0.00650
  🧪 Fold 4/5
     🔍 Fold MAE: 0.00632
  🧪 Fold 5/5
     🔍 Fold MAE: 0.00651

🎯 Target: Tc
  🧪 Fold 1/5
     🔍 Fold MAE: 0.02951
  🧪 Fold 2/5
     🔍 Fold MAE: 0.02730
  🧪 Fold 3/5
     🔍 Fold MAE: 0.04174
  🧪 Fold 4/5
     🔍 Fold MAE: 0.02415
  🧪 Fold 5/5
     🔍 Fold MAE: 0.02816

🎯 Target: Density
  🧪 Fold 1/5
     🔍 Fold MAE: 0.05168
  🧪 Fold 2/5
     🔍 Fold MAE: 0.05013
  🧪 Fold 3/5
     🔍 Fold MAE: 0.05455
  🧪 Fold 4/5
     🔍 Fold MAE: 0.04935
  🧪 Fold 5/5
     🔍 Fold MAE: 0.04945

🎯 Target: Rg
  🧪 Fold 1/5
     🔍 Fold MAE: 1.98123
  🧪 Fold 2/5
     🔍 Fold MAE: 1.83445
  🧪 Fold 3/5
     🔍 Fold MAE: 1.96693
  🧪 Fold 4/5
     🔍 Fold 

In [9]:
submission

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,136.500149,0.371902,0.189768,1.206412,20.823329
1,1422188626,148.729134,0.375412,0.234676,1.114005,20.398437
2,2032016830,107.948971,0.350409,0.249736,1.119443,20.704481
