In [2]:
# === 📁 Load Data ===
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
import pandas as pd
import numpy as np

def engineer_features(df):
    df = df.copy()
    
    # Create weighted average features for each property
    for prop in range(1, 11):  # 10 properties
        weighted_sum = 0
        for comp in range(1, 6):  # 5 components
            frac_col = f'Component{comp}_fraction'
            prop_col = f'Component{comp}_Property{prop}'
            weighted_sum += df[frac_col] * df[prop_col]
        
        df[f'Weighted_Property{prop}'] = weighted_sum

    return df

In [4]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

# Define target columns
target_cols = [col for col in train.columns if col.startswith('BlendProperty')]

# Apply feature engineering
X = engineer_features(train.drop(columns=target_cols))
y = train[target_cols]
X_test_fe = engineer_features(test)

# Train LightGBM using 5-fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
oof_preds = np.zeros_like(y)

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(LGBMRegressor(n_estimators=1000, learning_rate=0.03, random_state=fold))
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_val)
    oof_preds[val_idx] = y_pred

    fold_score = mean_absolute_percentage_error(y_val, y_pred)
    print(f"Fold {fold + 1} MAPE: {fold_score:.4f}")

    models.append(model)

# Overall score
final_score = mean_absolute_percentage_error(y, oof_preds)
print(f"\nOverall CV MAPE: {final_score:.5f}")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15531
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15531
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.004643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15531
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start t

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15532
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.014248
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15532
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.005260
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15532
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start t

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15532
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score 0.020359
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15532
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 65
[LightGBM] [Info] Start training from score -0.008625
Fold 5 MAPE: 0.7875

Overall CV MAPE: 1.00955


In [1]:
#pip install shap

In [2]:
import shap
import matplotlib.pyplot as plt

# Train a LightGBM model on all data
model = MultiOutputRegressor(LGBMRegressor(n_estimators=1000, learning_rate=0.03, random_state=42))
model.fit(X, y)

# Compute SHAP values for one target (e.g. BlendProperty1) for demo
explainer = shap.TreeExplainer(model.estimators_[0])
shap_values = explainer.shap_values(X)

# Plot SHAP summary to visualize feature importance
shap.summary_plot(shap_values, X)

# Get mean absolute SHAP value per feature
importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": np.abs(shap_values).mean(axis=0)
}).sort_values(by="importance", ascending=False)

print("Top 10 features by SHAP:")
print(importance_df.head(10))

# Drop low-importance features (e.g., bottom 20%)
threshold = importance_df['importance'].quantile(0.2)
selected_features = importance_df[importance_df['importance'] > threshold]['feature'].tolist()

# Filter X and test set
X_selected = X[selected_features]
X_test_selected = X_test_fe[selected_features]


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\site-packages\t

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\USER\anaconda3\New folder\New folder\Anaconda3\lib\site-packages\t

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
# 5-fold CV with selected features
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models_pruned = []
oof_preds_pruned = np.zeros_like(y)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_selected)):
    X_tr, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = MultiOutputRegressor(LGBMRegressor(n_estimators=1000, learning_rate=0.03, random_state=fold))
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_val)
    oof_preds_pruned[val_idx] = y_pred

    fold_score = mean_absolute_percentage_error(y_val, y_pred)
    print(f"Fold {fold + 1} MAPE (pruned): {fold_score:.4f}")

    models_pruned.append(model)

# Final MAPE
final_pruned_score = mean_absolute_percentage_error(y, oof_preds_pruned)
print(f"\nOverall CV MAPE with SHAP-pruned features: {final_pruned_score:.5f}")
