In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Read the RDKit feature table that was computed ahead of time for the test set.
test_features_path = "/kaggle/input/precomp/test_with_rdkit_features.csv"
test_rdkit = pd.read_csv(test_features_path)

In [3]:
# ----- STEP 1: Align features with training columns -----
# Only the column names of the training table are needed here, so read just the
# header row (nrows=0) instead of parsing the whole file. `train_df` is therefore
# an empty frame that carries the training column layout and no data rows.
train_df = pd.read_csv(
    "/kaggle/input/precomputed-rdkit-features/train_with_rdkit_features.csv",
    nrows=0,
)

# Every column that is neither the row id nor one of the five prediction targets
# is treated as a model input feature; training column order is preserved.
non_feature_columns = {'id', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'}
train_features = [col for col in train_df.columns if col not in non_feature_columns]

In [4]:
# Build the model input with exactly the training feature columns, in training order.
# reindex() creates any feature column that is absent from the test table and fills
# it with 0, without mutating `test_rdkit` (no column-by-column insertion into the
# loaded frame). fillna(0) then zeroes the missing values in the columns that exist.
X_test = test_rdkit.reindex(columns=train_features, fill_value=0).fillna(0)


In [5]:
# ----- STEP 2: Predict using trained models -----
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Start from the id column so every prediction stays attached to its test row.
submission = test_rdkit[['id']].copy()

# One saved model per target; each one fills the column named after its target.
# NOTE(review): joblib.load unpickles the file, so the model files must come
# from a trusted source.
for target in targets:
    model_path = f"/kaggle/input/precomputed-rdkit-features/model_{target}.pkl"
    fitted_model = joblib.load(model_path)
    submission[target] = fitted_model.predict(X_test)


In [6]:
# ----- STEP 3: Final validation before saving -----
# Each check below raises on failure, so submission.csv is only written when
# every one of them passes.
expected_columns = ['id', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']
missing_cols = [col for col in expected_columns if col not in submission.columns]
if missing_cols:
    raise ValueError(f"❌ Missing columns: {missing_cols}")

if submission.isnull().any().any():
    raise ValueError("❌ Submission contains NaN values.")

# isnull() does not flag +/-inf, so the prediction columns (everything after
# 'id') are checked for finiteness separately.
prediction_values = submission[expected_columns[1:]].to_numpy(dtype=float)
if not np.isfinite(prediction_values).all():
    raise ValueError("❌ Submission contains non-finite (inf) values.")

if not np.issubdtype(submission['id'].dtype, np.integer):
    raise TypeError("❌ 'id' column must be of integer type.")

print(f"✅ Submission shape: {submission.shape}")

# Save only if all checks pass
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved successfully.")

✅ Submission shape: (3, 6)
✅ submission.csv saved successfully.
