In [6]:
# pip uninstall pandas

In [7]:
!pip install catboost



In [8]:
# === 📦 Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

In [9]:
# === 📁 Load Data ===
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [10]:
import pandas as pd
import numpy as np

def engineer_features(df):
    df = df.copy()

    component_cols = [f'C{i}_' for i in range(1, 6)]
    property_names = ['Property_1', 'Property_2', 'Property_3', 'Property_4', 'Property_5',
                      'Property_6', 'Property_7', 'Property_8', 'Property_9', 'Property_10']

    # 1. Weighted average of each property
    for prop in property_names:
        weighted_sum = sum(df[f'C{i}_wt%'] * df[f'C{i}_{prop}'] for i in range(1, 6))
        df[f'Weighted_{prop}'] = weighted_sum

    # 2. Component-wise mean & std of properties
    for i in range(1, 6):
        props = [df[f'C{i}_{p}'] for p in property_names]
        df[f'C{i}_mean'] = np.mean(props, axis=0)
        df[f'C{i}_std'] = np.std(props, axis=0)

    # 3. Total component weight (should be 100 but for safety)
    df['Total_wt'] = df[[f'C{i}_wt%' for i in range(1, 6)]].sum(axis=1)

    # 4. Normalized weights
    for i in range(1, 6):
        df[f'C{i}_wt_norm'] = df[f'C{i}_wt%'] / df['Total_wt']

    # 5. Weighted mean & std of each property across components
    for prop in property_names:
        values = np.stack([df[f'C{i}_{prop}'] for i in range(1, 6)], axis=1)
        weights = np.stack([df[f'C{i}_wt_norm'] for i in range(1, 6)], axis=1)
        df[f'WeightedMean_{prop}'] = np.sum(values * weights, axis=1)
        df[f'WeightedStd_{prop}'] = np.std(values * weights, axis=1)

    # 6. Property Interaction (pairwise product of important properties)
    important_props = ['Property_1', 'Property_4', 'Property_6', 'Property_9']
    for i in range(len(important_props)):
        for j in range(i + 1, len(important_props)):
            prop1, prop2 = important_props[i], important_props[j]
            df[f'Interact_{prop1}_{prop2}'] = df[f'Weighted_{prop1}'] * df[f'Weighted_{prop2}']

    # 7. Deviation of each component from blend-weighted property mean
    for prop in property_names:
        weighted = df[f'Weighted_{prop}']
        for i in range(1, 6):
            df[f'C{i}_{prop}_dev'] = df[f'C{i}_{prop}'] - weighted

    return df

In [11]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

# Define target columns
target_cols = [col for col in train.columns if col.startswith('target_')]

# Apply feature engineering to full dataset
X = engineer_features(train.drop(columns=target_cols))
y = train[target_cols]
X_test_fe = engineer_features(test)

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

val_scores = []
test_preds = np.zeros((len(test), len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(LGBMRegressor(
        n_estimators=1000, 
        learning_rate=0.05,
        random_state=fold,
        n_jobs=-1
    ))
    
    model.fit(X_tr, y_tr)
    val_pred = model.predict(X_val)
    fold_score = mean_absolute_percentage_error(y_val, val_pred)
    val_scores.append(fold_score)

    print(f"Fold {fold+1} MAPE: {fold_score:.4f}")

    test_preds += model.predict(X_test_fe) / kf.n_splits

# Show average validation score
print(f"\nAverage CV MAPE: {np.mean(val_scores):.4f}")

KeyError: 'C1_wt%'

In [None]:
# === 📁 Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Remove ID if exists
test_ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
train_df.drop(columns=["ID"], inplace=True, errors='ignore')
test_df.drop(columns=["ID"], inplace=True, errors='ignore')

# 🎯 Target Columns
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

In [None]:
# Weighted averages of component properties
for i in range(1, 11):
    train_df[f"WeightedAvg_Property{i}"] = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    test_df[f"WeightedAvg_Property{i}"] = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )

In [None]:
# Extended SHAP-based Top 15 Features
top_features = [
    "Component5_fraction", "WeightedAvg_Property1", "Component2_fraction", 
    "Component4_fraction", "Component3_fraction", "Component1_fraction",
    "Component3_Property1", "Component2_Property1", "Component4_Property1",
    "WeightedAvg_Property2", "Component1_Property1", "Component5_Property1",
    "WeightedAvg_Property3", "Component2_Property2", "WeightedAvg_Property4"
]

X = train_df[top_features]
y = train_df[target_cols]
X_test = test_df[top_features]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros((X_test.shape[0], len(target_cols)))
lgb_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n📂 Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # 🐈 CatBoost (custom tuned)
    cat_model = MultiOutputRegressor(
        CatBoostRegressor(
            iterations=1500,
            learning_rate=0.015,   # Reduced LR for stability
            depth=9,
            l2_leaf_reg=4,
            loss_function="MAE",
            verbose=0,
            random_seed=42,
            early_stopping_rounds=75
        )
    )
    cat_model.fit(X_train, y_train)
    cat_preds += cat_model.predict(X_test)

    # 🌱 LightGBM (custom tuned)
    lgb_model = MultiOutputRegressor(
        LGBMRegressor(
            n_estimators=1500,
            learning_rate=0.01,
            max_depth=7,
            num_leaves=60,
            subsample=0.75,
            colsample_bytree=0.7,
            reg_lambda=1.5,
            random_state=42,
            n_jobs=-1
        )
    )
    lgb_model.fit(X_train, y_train)
    lgb_preds += lgb_model.predict(X_test)

cat_preds /= kf.get_n_splits()
lgb_preds /= kf.get_n_splits()

final_preds = 100.85 * cat_preds + 0.25 * lgb_preds

submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission.to_csv("submission_cat_lgb25_extended.csv", index=False)

print("\n✅ submission_cat75_lgb25_extended.csv created successfully!")

In [None]:
# The highest code

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

# === Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# === Handle ID Column ===
test_ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
train_df.drop(columns=["ID"], inplace=True, errors='ignore')
test_df.drop(columns=["ID"], inplace=True, errors='ignore')

# === Separate Features and Targets ===
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
X = train_df.drop(columns=target_cols)
y = train_df[target_cols]
X_test = test_df.copy()

# === Feature Engineering: Weighted Average Properties ===
for i in range(1, 11):  # Property1 to Property10
    weighted_train = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    weighted_test = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    X[f"WeightedAvg_Property{i}"] = weighted_train
    X_test[f"WeightedAvg_Property{i}"] = weighted_test

# === Setup K-Fold ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgb_preds = np.zeros((X_test.shape[0], y.shape[1]))
cat_preds = np.zeros((X_test.shape[0], y.shape[1]))

# === Cross-Validation Loop ===
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # === LightGBM Model ===
    lgb_model = MultiOutputRegressor(
        LGBMRegressor(
            n_estimators=500,
            learning_rate=0.03,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        )
    )
    lgb_model.fit(X_tr, y_tr)
    lgb_preds += lgb_model.predict(X_test)

    # === CatBoost Model ===
    cat_model = MultiOutputRegressor(
        CatBoostRegressor(
            iterations=500,
            learning_rate=0.03,
            depth=8,
            loss_function="MAE",
            verbose=0,
            random_seed=42
        )
    )
    cat_model.fit(X_tr, y_tr)
    cat_preds += cat_model.predict(X_test)

# === Average Over Folds ===
lgb_preds /= kf.get_n_splits()
cat_preds /= kf.get_n_splits()

# === Weighted Blending ===
final_preds = 0.9 * lgb_preds + 0.3 * cat_preds

# === Create Submission ===
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission.to_csv("submission_lgb_cat_blend1.csv", index=False)
print("\n✅ submission_lgb_cat_blend1.csv created successfully!")

In [None]:
# !pip install -U scikit-learn

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

# === HGBR Pipeline ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
hgbr_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n🌲 Fold {fold + 1} - HistGradientBoostingRegressor")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    hgbr_model = MultiOutputRegressor(
        HistGradientBoostingRegressor(
            loss="absolute_error",
            learning_rate=0.05,
            max_iter=800,
            max_depth=6,
            l2_regularization=0.1,
            random_state=42
        )
    )

    hgbr_model.fit(X_train, y_train)
    hgbr_preds += hgbr_model.predict(X_test)

hgbr_preds /= kf.get_n_splits()

# Save predictions
submission_hgbr = pd.DataFrame(hgbr_preds, columns=target_cols)
submission_hgbr.insert(0, "ID", test_ids)
submission_hgbr.to_csv("submission_hgbr.csv", index=False)

print("✅ submission_hgbr.csv saved")

# === 📦 Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

# === 📁 Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# === 🧹 Remove ID if exists ===
test_ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
train_df.drop(columns=["ID"], inplace=True, errors='ignore')
test_df.drop(columns=["ID"], inplace=True, errors='ignore')

# === 🎯 Target Columns ===
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

# === 🧠 Feature Engineering: Weighted Averages ===
for i in range(1, 11):
    train_df[f"WeightedAvg_Property{i}"] = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    test_df[f"WeightedAvg_Property{i}"] = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )

# === ✅ Top SHAP-based Features Only ===
top_features = [
    "Component5_fraction", "WeightedAvg_Property1", "Component2_fraction", 
    "Component4_fraction", "Component3_fraction", "Component1_fraction",
    "Component3_Property1", "Component2_Property1", "Component4_Property1"
]

X = train_df[top_features]
y = train_df[target_cols]
X_test = test_df[top_features]

# === 🔁 Cross Validation Setup ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros((X_test.shape[0], len(target_cols)))
hgb_preds = np.zeros((X_test.shape[0], len(target_cols)))

# === 🔁 Train per Fold ===
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n📂 Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # 🐈 CatBoost
    cat_model = MultiOutputRegressor(
        CatBoostRegressor(
            iterations=1500,
            learning_rate=0.015,
            depth=9,
            l2_leaf_reg=4,
            loss_function="MAE",
            verbose=0,
            random_seed=42,
            early_stopping_rounds=75
        )
    )
    cat_model.fit(X_train, y_train)
    cat_preds += cat_model.predict(X_test)

    # 🌄 HistGradientBoostingRegressor
    hgb_model = MultiOutputRegressor(
        HistGradientBoostingRegressor(
            max_iter=1000,
            learning_rate=0.02,
            max_depth=8,
            l2_regularization=1.0,
            early_stopping=True,
            random_state=42
        )
    )
    hgb_model.fit(X_train, y_train)
    hgb_preds += hgb_model.predict(X_test)

# === 📊 Average Predictions Over Folds ===
cat_preds /= kf.get_n_splits()
hgb_preds /= kf.get_n_splits()

# === ⚖️ Final Weighted Blend (CatBoost 70%, HGBR 30%)
final_preds = 0.7 * cat_preds + 0.3 * hgb_preds

In [None]:
# === 💾 Submission ===
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission.to_csv("submission_cat70_hgbr30.csv", index=False)

print("\n✅ submission_cat70_hgbr30.csv created successfully!")

# === 📦 Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

# === 📁 Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# === 🎯 Targets & Features ===
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
test_ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
train_df.drop(columns=["ID"], inplace=True, errors="ignore")
test_df.drop(columns=["ID"], inplace=True, errors="ignore")

# === 🧠 Feature Engineering: Weighted Averages ===
for i in range(1, 11):
    train_df[f"WeightedAvg_Property{i}"] = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 5+1)
    )
    test_df[f"WeightedAvg_Property{i}"] = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 5+1)
    )

# === ✅ SHAP-based Top Features ===
top_features = [
    "Component5_fraction", "WeightedAvg_Property1", "Component2_fraction", 
    "Component4_fraction", "Component3_fraction", "Component1_fraction",
    "Component3_Property1", "Component2_Property1", "Component4_Property1"
]

X = train_df[top_features]
y = train_df[target_cols]
X_test = test_df[top_features]

# === ⚖️ Standardize Features for MLP ===
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# === 🔁 K-Fold CV ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mlp_preds = np.zeros((X_test.shape[0], len(target_cols)))
cat_preds = np.zeros((X_test.shape[0], len(target_cols)))

# === 🔁 Training Loop ===
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n📂 Fold {fold+1}")
    
    X_train_scaled, X_val_scaled = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # === 🤖 MLP Model
    mlp_model = MultiOutputRegressor(
        MLPRegressor(
            hidden_layer_sizes=(100, 80),
            activation='relu',
            solver='adam',
            alpha=0.001,
            learning_rate='adaptive',
            max_iter=1000,
            early_stopping=True,
            random_state=42
        )
    )
    mlp_model.fit(X_train_scaled, y_train)
    mlp_preds += mlp_model.predict(X_test_scaled)

    # === 🐈 CatBoost (for comparison/blending)
    cat_model = MultiOutputRegressor(
        CatBoostRegressor(
            iterations=1500,
            learning_rate=0.015,
            depth=9,
            l2_leaf_reg=4,
            loss_function="MAE",
            verbose=0,
            random_seed=42,
            early_stopping_rounds=75
        )
    )
    cat_model.fit(X.iloc[train_idx], y.iloc[train_idx])
    cat_preds += cat_model.predict(X_test)

# === 📊 Average Predictions ===
mlp_preds /= kf.get_n_splits()
cat_preds /= kf.get_n_splits()

In [None]:
# === 🧪 Blend CatBoost + MLP (optional)
final_preds = 0.9 * cat_preds + 0.0 * mlp_preds

# === 💾 Save Submission ===
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission.to_csv("submission_cat80_mlp20.csv", index=False)

print("\n✅ submission_cat80_mlp20.csv created successfully!")

from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
import numpy as np
import pandas as pd

# === Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

# Feature Engineering
for i in range(1, 11):
    train_df[f"WeightedAvg_Property{i}"] = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    test_df[f"WeightedAvg_Property{i}"] = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )

top_features = [
    "Component5_fraction", "WeightedAvg_Property1", "Component2_fraction", 
    "Component4_fraction", "Component3_fraction", "Component1_fraction",
    "Component3_Property1", "Component2_Property1", "Component4_Property1"
]

X = train_df[top_features]
y = train_df[target_cols]
X_test = test_df[top_features]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n📂 Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    cat_model = MultiOutputRegressor(
        CatBoostRegressor(
            iterations=1500,
            learning_rate=0.015,
            depth=9,
            l2_leaf_reg=4,
            loss_function="MAE",
            verbose=0,
            random_seed=42,
            early_stopping_rounds=75
        )
    )
    cat_model.fit(X_train, y_train)
    cat_preds += cat_model.predict(X_test)

# Average over folds
cat_preds /= kf.get_n_splits()

# Save submission
submission = pd.DataFrame(cat_preds, columns=target_cols)
submission.insert(0, "ID", test_df["ID"])
submission.to_csv("submission_catboost_only_recovery.csv", index=False)

print("\n✅ submission_catboost_only_recovery.csv saved")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# === 1. Load the submissions ===
path_best = r"C:\Users\USER\Desktop\shell.ai\submission_lgb_cat_blend1.csv"
path_bad = r"C:\Users\USER\Desktop\shell.ai\submission_catboost_only_optimized.csv"      # 10 score

# Replace paths with actual ones if different
best_df = pd.read_csv(path_best)
bad_df = pd.read_csv(path_bad)

# === 2. Ensure matching ID and shape ===
assert best_df.shape == bad_df.shape, "Submissions have different shapes!"
assert (best_df['ID'] == bad_df['ID']).all(), "Mismatch in IDs!"

# === 3. Compute absolute and relative differences ===
diff_df = best_df.copy()
target_cols = [col for col in best_df.columns if col != "ID"]

for col in target_cols:
    diff_df[col] = np.abs(best_df[col] - bad_df[col])

# === 4. Show a few example rows of differences ===
print("🔍 Sample of absolute differences between submissions:")
display(diff_df.head())

# === 5. Mean Difference per Column ===
mean_diffs = diff_df[target_cols].mean().sort_values(ascending=False)
print("\n📊 Mean absolute difference per target column:")
display(mean_diffs)

# === 6. Plot the differences per blend property ===
plt.figure(figsize=(10, 5))
mean_diffs.plot(kind="bar", color="skyblue", edgecolor="black")
plt.title("Mean Absolute Difference per Blend Property")
plt.ylabel("Mean Absolute Difference")
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# === 📦 Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")

# === 📁 Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# === 🧹 Remove ID if present ===
test_ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
train_df.drop(columns=["ID"], inplace=True, errors='ignore')
test_df.drop(columns=["ID"], inplace=True, errors='ignore')

# === 🎯 Target Columns ===
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

# === ⚙️ Weighted Average Features ===
for i in range(1, 11):
    train_df[f"WeightedAvg_Property{i}"] = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    test_df[f"WeightedAvg_Property{i}"] = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )

# === ✅ Top SHAP Features (Based on best run) ===
top_features = [
    "Component5_fraction", "WeightedAvg_Property1", "Component2_fraction", 
    "Component4_fraction", "Component3_fraction", "Component1_fraction",
    "Component3_Property1", "Component2_Property1", "Component4_Property1"
]

X = train_df[top_features]
y = train_df[target_cols]
X_test = test_df[top_features]

# === 🔀 K-Fold CV ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros((X_test.shape[0], len(target_cols)))
lgb_preds = np.zeros((X_test.shape[0], len(target_cols)))

# === 🔁 Train Loop ===
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n📂 Fold {fold + 1}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # === 🐈 CatBoost
    cat_model = MultiOutputRegressor(
        CatBoostRegressor(
            iterations=1600,
            learning_rate=0.015,
            depth=9,
            l2_leaf_reg=4.5,
            loss_function="MAE",
            verbose=0,
            random_seed=42,
            early_stopping_rounds=75
        )
    )
    cat_model.fit(X_train, y_train)
    cat_preds += cat_model.predict(X_test)

    # === 🌱 LightGBM
    lgb_model = MultiOutputRegressor(
        LGBMRegressor(
            n_estimators=1200,
            learning_rate=0.012,
            max_depth=7,
            num_leaves=60,
            subsample=0.75,
            colsample_bytree=0.7,
            reg_lambda=1.3,
            random_state=42,
            n_jobs=-1
        )
    )
    lgb_model.fit(X_train, y_train)
    lgb_preds += lgb_model.predict(X_test)

# === 📊 Average Fold Predictions ===
cat_preds /= kf.get_n_splits()
lgb_preds /= kf.get_n_splits()

# === ⚖️ Final Blend: CatBoost 80%, LGBM 20%
final_preds = 0.80 * cat_preds + 0.20 * lgb_preds

# === 💾 Submission
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission.to_csv("submission_cat80_lgb20_final.csv", index=False)

print("\n✅ submission_cat80_lgb20_final.csv created!")

# === 📦 Imports ===
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

# === 📁 Load Data ===
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# === 🧹 Preprocessing ===
test_ids = test_df["ID"] if "ID" in test_df.columns else test_df.index
train_df.drop(columns=["ID"], inplace=True, errors='ignore')
test_df.drop(columns=["ID"], inplace=True, errors='ignore')

target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

# === 🧠 Feature Engineering ===
for i in range(1, 11):
    train_df[f"WeightedAvg_Property{i}"] = sum(
        train_df[f"Component{j}_fraction"] * train_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )
    test_df[f"WeightedAvg_Property{i}"] = sum(
        test_df[f"Component{j}_fraction"] * test_df[f"Component{j}_Property{i}"] for j in range(1, 6)
    )

top_features = [
    "Component5_fraction", "WeightedAvg_Property1", "Component2_fraction", 
    "Component4_fraction", "Component3_fraction", "Component1_fraction",
    "Component3_Property1", "Component2_Property1", "Component4_Property1"
]

X = train_df[top_features]
y = train_df[target_cols]
X_test = test_df[top_features]

# === 🧠 Models ===
cat_model = MultiOutputRegressor(CatBoostRegressor(
    iterations=1000,
    learning_rate=0.02,
    depth=8,
    loss_function="MAE",
    verbose=0,
    random_seed=42
))

lgb_model = MultiOutputRegressor(LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=7,
    subsample=0.75,
    colsample_bytree=0.7,
    random_state=42
))

ridge_model = MultiOutputRegressor(RidgeCV(alphas=[0.1, 1.0, 10.0]))

# === 🔁 KFold CV ===
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros((X_test.shape[0], len(target_cols)))
lgb_preds = np.zeros((X_test.shape[0], len(target_cols)))
ridge_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n📂 Fold {fold + 1}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    
    cat_model.fit(X_train, y_train)
    lgb_model.fit(X_train, y_train)
    ridge_model.fit(X_train, y_train)
    
    cat_preds += cat_model.predict(X_test)
    lgb_preds += lgb_model.predict(X_test)
    ridge_preds += ridge_model.predict(X_test)

# === 📊 Average over folds ===
cat_preds /= kf.get_n_splits()
lgb_preds /= kf.get_n_splits()
ridge_preds /= kf.get_n_splits()

In [None]:
# === ⚖️ Weighted Blending ===
alpha, beta, gamma = 0.6, 0.09, 1.0  # catboost, lgbm, ridge
final_preds = alpha * cat_preds + beta * lgb_preds + gamma * ridge_preds

# === 💾 Save Submission ===
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test_ids)
submission.to_csv("submission_blended_cat_lgb_ridge.csv", index=False)

print("\n✅ Blended submission saved: submission_blended_cat_lgb_ridge.csv")

import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Extract features and target
target_cols = [col for col in train.columns if "BlendProperty" in col]
X = train.drop(columns=["ID"] + target_cols)
y = train[target_cols]
X_test = test.drop(columns=["ID"])

# Fill missing values and ensure numeric
X = X.fillna(0).astype(float)
X_test = X_test.fillna(0).astype(float)
y = y.fillna(0).astype(float)

# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
svr_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    X_train, y_train = X_scaled[train_idx], y.iloc[train_idx]

    svr = MultiOutputRegressor(SVR(kernel='rbf', C=1.0, epsilon=0.1))
    svr.fit(X_train, y_train)
    svr_preds += svr.predict(X_test_scaled)

# Average predictions
svr_preds /= kf.get_n_splits()

# Prepare submission
submission = pd.DataFrame(svr_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"])
submission_path = "/mnt/data/submission_svr_only_fixed.csv"
submission.to_csv(submission_path, index=False)

submission_path

import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Extract targets
target_cols = [col for col in train.columns if "BlendProperty" in col]

# Handle absence of ID in train
drop_cols = target_cols.copy()
if "ID" in train.columns:
    drop_cols.insert(0, "ID")
X = train.drop(columns=drop_cols)
y = train[target_cols]

# Test features
X_test = test.drop(columns=["ID"]) if "ID" in test.columns else test.copy()

# Fill missing values and ensure float
X = X.fillna(0).astype(float)
X_test = X_test.fillna(0).astype(float)
y = y.fillna(0).astype(float)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# K-Fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
svr_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"Fold {fold+1}")
    X_train, y_train = X_scaled[train_idx], y.iloc[train_idx]

    svr = MultiOutputRegressor(SVR(kernel='rbf', C=1.0, epsilon=0.1))
    svr.fit(X_train, y_train)
    svr_preds += svr.predict(X_test_scaled)

# Average predictions
svr_preds /= kf.get_n_splits()

# Save submission
submission = pd.DataFrame(svr_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"] if "ID" in test.columns else np.arange(1, len(test)+1))
submission_path = "C:\\Users\\USER\\Desktop\\shell.ai\\submission_svr.csv"
submission.to_csv(submission_path, index=False)

submission_path

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Extract targets
target_cols = [col for col in train.columns if "BlendProperty" in col]

# Drop unused columns
drop_cols = target_cols.copy()
if "ID" in train.columns:
    drop_cols.insert(0, "ID")
X = train.drop(columns=drop_cols)
y = train[target_cols]

X_test = test.drop(columns=["ID"]) if "ID" in test.columns else test.copy()

# Preprocessing
X = X.fillna(0).astype(float)
X_test = X_test.fillna(0).astype(float)
y = y.fillna(0).astype(float)

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lr_preds = np.zeros((X_test.shape[0], len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"Fold {fold+1}")
    X_train, y_train = X_scaled[train_idx], y.iloc[train_idx]

    model = MultiOutputRegressor(LinearRegression())
    model.fit(X_train, y_train)
    lr_preds += model.predict(X_test_scaled)

lr_preds /= kf.get_n_splits()

# Prepare submission
submission = pd.DataFrame(lr_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"] if "ID" in test.columns else np.arange(1, len(test)+1))
submission_path = "C:\\Users\\USER\\Desktop\\shell.ai\\submission_linear_regression.csv"
submission.to_csv(submission_path, index=False)

submission_path


import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Extract target columns and features
target_cols = [col for col in train.columns if "BlendProperty" in col]
X = train.drop(columns=target_cols)
y = train[target_cols]
X_test = test.copy()

# Define model
base_model = XGBRegressor(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist"
)
model = MultiOutputRegressor(base_model)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_preds = np.zeros((len(X_test), len(target_cols)))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}...")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    y_preds += model.predict(X_test) / kf.n_splits

# Save submission
submission = pd.DataFrame(y_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"] if "ID" in test.columns else range(1, len(test)+1))
submission.to_csv("submission_xgboost.csv", index=False)
print("Saved: submission_xgboost.csv")

In [None]:
# pip install xgboost

import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

# Load data
train = pd.read_csv("train.csv")  # Update path if needed
test = pd.read_csv("test.csv")

# Define target columns
target_cols = [col for col in train.columns if "BlendProperty" in col]

# Features and target
X = train.drop(columns=["ID"] + target_cols, errors="ignore")
y = train[target_cols]
X_test = test.drop(columns=["ID"], errors="ignore")

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Prepare predictions container
et_preds = np.zeros((X_test.shape[0], len(target_cols)))

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"🌿 Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(
        ExtraTreesRegressor(
            n_estimators=500,
            max_depth=20,
            min_samples_split=4,
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
    )

    model.fit(X_train, y_train)
    et_preds += model.predict(X_test)

# Average over folds
et_preds /= kf.get_n_splits()

# Prepare submission
submission = pd.DataFrame(et_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"])
submission.to_csv("submission_extratrees.csv", index=False)
print("✅ Submission saved as 'submission_extratrees.csv'")


import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_solution.csv")

# Drop ID if present
X = train.drop(columns=["ID"] + [f'BlendProperty{i}' for i in range(1, 11)], errors='ignore')
y = train[[f'BlendProperty{i}' for i in range(1, 11)]]
X_test = test.drop(columns=["ID"], errors='ignore')

# Optional: Feature engineering (add more if needed)
for i in range(1, 11):
    X[f'Property{i}_mean'] = X[[f'Component{j}_Property{i}' for j in range(1, 6)]].mean(axis=1)
    X[f'Property{i}_std'] = X[[f'Component{j}_Property{i}' for j in range(1, 6)]].std(axis=1)
    X_test[f'Property{i}_mean'] = X_test[[f'Component{j}_Property{i}' for j in range(1, 6)]].mean(axis=1)
    X_test[f'Property{i}_std'] = X_test[[f'Component{j}_Property{i}' for j in range(1, 6)]].std(axis=1)

# CatBoost parameters
catboost_params = {
    'loss_function': 'MAPE',
    'verbose': 0,
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 1000,
    'early_stopping_rounds': 50,
    'random_seed': 42
}

# KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_preds = np.zeros((X_test.shape[0], y.shape[1]))
oof_preds = np.zeros_like(y)

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training Fold {fold + 1}...")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(CatBoostRegressor(**catboost_params))
    model.fit(X_train, y_train)

    # Validation prediction for internal score
    oof_preds[val_idx] = model.predict(X_val)

    # Predict on test set
    y_preds += model.predict(X_test) / kf.n_splits

# Evaluate internal CV score (optional)
cv_score = mean_absolute_percentage_error(y, oof_preds)
print(f"CV MAPE Score: {cv_score:.5f}")

# Save submission
submission = sample_submission.copy()
submission.iloc[:, 1:] = y_preds
submission.to_csv("submission_catboost_mape.csv", index=False)
print("Submission saved to 'submission_catboost_mape.csv'")

import pandas as pd
import numpy as np
import shap
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_solution.csv")

# Drop ID if exists
X = train.drop(columns=["ID"] + [f'BlendProperty{i}' for i in range(1, 11)], errors="ignore")
y = train[[f'BlendProperty{i}' for i in range(1, 11)]]
X_test = test.drop(columns=["ID"], errors="ignore")

# -------- FEATURE ENGINEERING --------
def add_engineered_features(X_df):
    X = X_df.copy()

    # 1. Mean & Std of each property across 5 components
    for i in range(1, 11):
        cols = [f'Component{j}_Property{i}' for j in range(1, 6)]
        X[f'Property{i}_mean'] = X[cols].mean(axis=1)
        X[f'Property{i}_std'] = X[cols].std(axis=1)

    # 2. Weighted Average (blend-weighted mean for each property)
    for i in range(1, 11):
        blend_weighted = sum(
            X[f'Component{j}_fraction'] * X[f'Component{j}_Property{i}'] for j in range(1, 6)
        )
        X[f'Property{i}_blend_weighted'] = blend_weighted

    # 3. Interaction Features (non-linear)
    X['frac1_frac2'] = X['Component1_fraction'] * X['Component2_fraction']
    X['frac3_frac5'] = X['Component3_fraction'] * X['Component5_fraction']
    X['prop1_mean_x_prop2_mean'] = X['Property1_mean'] * X['Property2_mean']
    X['std1_x_std2'] = X['Property1_std'] * X['Property2_std']

    return X

X = add_engineered_features(X)
X_test = add_engineered_features(X_test)

# -------- LIGHTGBM MODEL SETUP --------
lgbm_params = {
    'learning_rate': 0.03,
    'n_estimators': 1000,
    'max_depth': 7,
    'num_leaves': 31,
    'objective': 'mape',
    'random_state': 42,
    'n_jobs': -1
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_preds = np.zeros((X_test.shape[0], y.shape[1]))
oof_preds = np.zeros_like(y)

print("Training with 5-fold CV...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}...")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(lgb.LGBMRegressor(**lgbm_params))
    model.fit(X_train, y_train)

    oof_preds[val_idx] = model.predict(X_val)
    y_preds += model.predict(X_test) / kf.n_splits

# -------- EVALUATION --------
cv_score = mean_absolute_percentage_error(y, oof_preds)
print(f"CV MAPE Score: {cv_score:.5f}")

# -------- SHAP FEATURE IMPORTANCE (on 1 target model) --------
explainer = shap.TreeExplainer(model.estimators_[0])
shap_values = explainer.shap_values(X)

shap.summary_plot(shap_values, X, plot_type="bar", max_display=20)
plt.show()

# -------- DROP WEAK FEATURES (Optional: SHAP threshold < 0.005) --------
shap_importances = np.abs(shap_values).mean(axis=0)
important_features = X.columns[shap_importances > 0.005]
X = X[important_features]
X_test = X_test[important_features]

# -------- RETRAIN ON SELECTED FEATURES --------
print("Retraining after SHAP feature selection...")
y_preds = np.zeros((X_test.shape[0], y.shape[1]))
oof_preds = np.zeros_like(y)

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(lgb.LGBMRegressor(**lgbm_params))
    model.fit(X_train, y_train)

    oof_preds[val_idx] = model.predict(X_val)
    y_preds += model.predict(X_test) / kf.n_splits

# Final CV score after SHAP filtering
cv_score_shap = mean_absolute_percentage_error(y, oof_preds)
print(f"CV MAPE after SHAP selection: {cv_score_shap:.5f}")

# -------- SUBMISSION --------
submission = sample_submission.copy()
submission.iloc[:, 1:] = y_preds
submission.to_csv("submission_lgbm_features_shap.csv", index=False)
print("Submission saved as 'submission_lgbm_features_shap.csv'")

In [None]:
# !pip install --user numpy optuna tensorflow

In [None]:
import optuna
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

def objective(trial):
    params = {
        "loss_function": "MAE",  # MAPE is unstable; CatBoost recommends MAE for robustness
        "iterations": trial.suggest_int("iterations", 300, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1.0),
        "early_stopping_rounds": 50,
        "verbose": 0
    }

    # KFold on first blend target only for speed
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    X = X_reduced
    y = train[[col for col in train.columns if "BlendProperty" in col]].iloc[:, 0]

    scores = []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**params)
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

        preds = model.predict(X_val)
        score = mean_absolute_percentage_error(y_val, preds)
        scores.append(score)

    return np.mean(scores)

# Start tuning
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

# Best parameters
print("Best params:", study.best_params)

In [None]:
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Use SHAP-reduced features
X = X_reduced.copy()
y = train[[col for col in train.columns if "BlendProperty" in col]].iloc[:, 0]  # First target only

def objective(trial):
    params = {
        "loss_function": "MAE",
        "iterations": trial.suggest_int("iterations", 300, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1.0),
        "early_stopping_rounds": 50,
        "verbose": 0
    }

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
        preds = model.predict(X_val)
        score = mean_absolute_percentage_error(y_val, preds)
        scores.append(score)

    return np.mean(scores)

# Run Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

# Best params
print("Best MAPE:", study.best_value)
print("Best params:", study.best_params)

In [None]:
# pip install tensorflow scikit-learn pandas numpy

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import warnings
warnings.filterwarnings("ignore")

# Load datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Drop ID column if present
X = train.drop(columns=["ID"] + [f'BlendProperty{i}' for i in range(1, 11)], errors='ignore')
y = train[[f'BlendProperty{i}' for i in range(1, 11)]]
X_test = test.drop(columns=["ID"], errors='ignore')

# -------- FEATURE ENGINEERING FUNCTION --------
def add_engineered_features(X_df):
    X = X_df.copy()

    # Mean and Std per property
    for i in range(1, 11):
        prop_cols = [f'Component{j}_Property{i}' for j in range(1, 6)]
        X[f'Property{i}_mean'] = X[prop_cols].mean(axis=1)
        X[f'Property{i}_std'] = X[prop_cols].std(axis=1)

    # Blend-weighted features
    for i in range(1, 11):
        weighted_sum = sum(X[f'Component{j}_fraction'] * X[f'Component{j}_Property{i}'] for j in range(1, 6))
        X[f'Property{i}_blend_weighted'] = weighted_sum

    # Non-linear interaction features
    X['frac1_frac2'] = X['Component1_fraction'] * X['Component2_fraction']
    X['frac3_frac5'] = X['Component3_fraction'] * X['Component5_fraction']
    X['prop1_mean_x_prop2_mean'] = X['Property1_mean'] * X['Property2_mean']
    X['std1_x_std2'] = X['Property1_std'] * X['Property2_std']

    return X

# Apply feature engineering
X = add_engineered_features(X)
X_test = add_engineered_features(X_test)

# -------- Keras MAPE Loss --------
def mape_loss(y_true, y_pred):
    return K.mean(K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None)))

# -------- Model Architecture --------
def build_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(output_dim))  # Linear output layer

    model.compile(optimizer=Adam(learning_rate=0.001), loss=mape_loss)
    return model

# -------- 5-Fold Cross Validation --------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_preds = np.zeros((X_test.shape[0], y.shape[1]))
oof_preds = np.zeros_like(y)

print("Training ANN with 5-Fold CV...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}...")
    X_train, X_val = X.iloc[train_idx].values, X.iloc[val_idx].values
    y_train, y_val = y.iloc[train_idx].values, y.iloc[val_idx].values

    model = build_model(input_dim=X.shape[1], output_dim=y.shape[1])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=150, batch_size=32, verbose=0)

    oof_preds[val_idx] = model.predict(X_val)
    y_preds += model.predict(X_test.values) / kf.n_splits

# -------- Evaluation --------
cv_score = mean_absolute_percentage_error(y, oof_preds)
print(f"CV MAPE Score (ANN): {cv_score:.5f}")

# -------- Submission --------
submission = sample_submission.copy()
submission.iloc[:, 1:] = y_preds
submission.to_csv("submission_ann_engineered.csv", index=False)
print("Saved: submission_ann_engineered.csv")