#### Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
import seaborn as sns
from scipy.stats import skew
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

from pytorch_tabular import TabularModel
from pytorch_tabular.models.category_embedding.config import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from sklearn.model_selection import KFold, cross_val_predict
from lightgbm import LGBMRegressor

#### Data Engineering and preprocessing

In [None]:
# All competition CSVs live in one directory; keeping that location in a single
# variable means only this line has to change when the notebook moves machines.
DATA_DIR = r"C:\Users\Hemant Pathak\Downloads\e25764c65bca11f0\dataset"

# Training table (later split into feature columns and target columns).
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
# Test table (carries an 'ID' column that is dropped before modelling).
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
# Example of the expected submission layout.
sample_solution_df = pd.read_csv(os.path.join(DATA_DIR, "sample_solution.csv"))

In [None]:
# def add_engineered_features(df):
#     df = df.copy()  # keep original intact

#     new_features = {}  # collect all new columns here

#     # 1. Weighted Averages
#     for prop in range(1, 11):
#         new_features[f'WeightedAvg_Property{prop}'] = sum(
#             df[f'Component{i}_fraction'] * df[f'Component{i}_Property{prop}'] for i in range(1, 6)
#         )

#     # 2. Min, Max, Mean, Std per Property
#     for prop in range(1, 11):
#         cols = [f'Component{i}_Property{prop}' for i in range(1, 6)]
#         new_features[f'Property{prop}_min'] = df[cols].min(axis=1)
#         new_features[f'Property{prop}_max'] = df[cols].max(axis=1)
#         new_features[f'Property{prop}_mean'] = df[cols].mean(axis=1)
#         new_features[f'Property{prop}_std'] = df[cols].std(axis=1)

#     # 3. Interaction Features (fraction × property)
#     for i in range(1, 6):
#         for prop in range(1, 11):
#             new_features[f'C{i}_FracProp{prop}'] = df[f'Component{i}_fraction'] * df[f'Component{i}_Property{prop}']

#     # 4. Total Fraction
#     frac_cols = [f'Component{i}_fraction' for i in range(1, 6)]
#     new_features['Total_fraction'] = df[frac_cols].sum(axis=1)

#     # 5. Dominant Component Index
#     dominant_comp = df[frac_cols].idxmax(axis=1).apply(lambda x: int(x.split('Component')[1].split('_')[0]))
#     new_features['Dominant_component'] = dominant_comp

#     # 🔁 Add all new columns at once
#     new_feature_df = pd.DataFrame(new_features, index=df.index)
#     df = pd.concat([df, new_feature_df], axis=1)

#     return df

In [None]:
import pandas as pd
import numpy as np

def add_engineered_features(df, n_components=5, n_properties=10):
    """
    Return a copy of ``df`` extended with general and synergy-oriented features.

    The input must contain the columns ``Component{i}_fraction`` and
    ``Component{i}_Property{p}`` for every component ``i`` in
    ``1..n_components`` and every property ``p`` in ``1..n_properties``.
    The input frame itself is never modified.

    General features:
    1. Weighted averages: fraction-weighted sum of each property.
    2. Min, max, mean and std of each property across components.
    3. Interaction features: fraction x property for every component/property.
    4. Total fraction: sum of all component fractions (sanity check).
    5. Dominant component: 1-based index of the largest fraction
       (the first one wins on ties).

    Synergy features:
    1. Pairwise fraction products for every unordered pair of components.
    2. Deviation from the linear expectation: dominant component's property,
       max property and min property, each minus the weighted average.
    3. Heterogeneity interaction: composition entropy multiplied by the
       per-property standard deviation.

    Parameters:
    - df (pd.DataFrame): raw component fractions and properties.
    - n_components (int): number of components per blend (default 5).
    - n_properties (int): number of properties per component (default 10).

    Returns:
    - pd.DataFrame: the original columns followed by the engineered columns,
      with +/-inf replaced by NaN and every NaN (in any column) filled with 0.

    Raises:
    - ValueError: if ``n_components`` or ``n_properties`` is smaller than 1.
    - KeyError: if an expected column is missing (raised by pandas).
    """
    if n_components < 1 or n_properties < 1:
        raise ValueError("n_components and n_properties must both be >= 1")

    components = range(1, n_components + 1)
    properties = range(1, n_properties + 1)
    frac_cols = [f'Component{i}_fraction' for i in components]

    # --- Shared pre-calculations (each quantity is computed exactly once) ---
    frac_values = df[frac_cols].to_numpy()
    dominant_idx = frac_values.argmax(axis=1)  # 0-based position of the largest fraction
    row_idx = np.arange(len(df))

    weighted_avgs = {}
    prop_mins = {}
    prop_maxs = {}
    prop_means = {}
    prop_stds = {}
    dominant_props = {}
    for prop in properties:
        prop_frame = df[[f'Component{i}_Property{prop}' for i in components]]
        prop_values = prop_frame.to_numpy()
        weighted_avgs[prop] = np.sum(frac_values * prop_values, axis=1)
        prop_mins[prop] = prop_frame.min(axis=1)
        prop_maxs[prop] = prop_frame.max(axis=1)
        prop_means[prop] = prop_frame.mean(axis=1)
        prop_stds[prop] = prop_frame.std(axis=1)
        # Property value of whichever component dominates each row.
        dominant_props[prop] = prop_values[row_idx, dominant_idx]

    # Columns are collected in a dict and attached in one concat at the end;
    # insertion order below defines the output column order.
    new_features = {}

    # --- General Feature Engineering ---

    # 1. Weighted averages
    for prop in properties:
        new_features[f'WeightedAvg_Property{prop}'] = weighted_avgs[prop]

    # 2. Min, Max, Mean, Std per property
    for prop in properties:
        new_features[f'Property{prop}_min'] = prop_mins[prop]
        new_features[f'Property{prop}_max'] = prop_maxs[prop]
        new_features[f'Property{prop}_mean'] = prop_means[prop]
        new_features[f'Property{prop}_std'] = prop_stds[prop]

    # 3. Interaction features (fraction x property)
    for i in components:
        for prop in properties:
            new_features[f'C{i}_FracProp{prop}'] = (
                df[f'Component{i}_fraction'] * df[f'Component{i}_Property{prop}']
            )

    # 4. Total fraction
    new_features['Total_fraction'] = df[frac_cols].sum(axis=1)

    # 5. Dominant component, converted to a 1-based component number
    new_features['Dominant_component'] = dominant_idx + 1

    # --- Synergistic Feature Engineering ---

    # 1. Pairwise fraction products; i < j avoids duplicates (C1*C2 == C2*C1)
    for i in components:
        for j in components:
            if i < j:
                new_features[f'Frac_Interaction_{i}x{j}'] = (
                    df[f'Component{i}_fraction'] * df[f'Component{j}_fraction']
                )

    # 2. Deviation from the linear (weighted-average) expectation
    for prop in properties:
        baseline = weighted_avgs[prop]
        new_features[f'Prop{prop}_Dom_vs_WeightedAvg'] = dominant_props[prop] - baseline
        new_features[f'Prop{prop}_Max_vs_WeightedAvg'] = prop_maxs[prop].to_numpy() - baseline
        new_features[f'Prop{prop}_Min_vs_WeightedAvg'] = prop_mins[prop].to_numpy() - baseline

    # 3. Heterogeneity interaction: composition entropy x property spread
    safe_fracs = df[frac_cols] + 1e-9  # small epsilon avoids log(0)
    composition_entropy = -np.sum(safe_fracs * np.log(safe_fracs), axis=1)
    for prop in properties:
        new_features[f'Prop{prop}_Entropy_x_StdDev'] = composition_entropy * prop_stds[prop]

    # Attach all new columns at once (avoids repeated frame fragmentation).
    new_feature_df = pd.DataFrame(new_features, index=df.index)
    result = pd.concat([df, new_feature_df], axis=1)

    # Neutralize inf/-inf/NaN (e.g. std is NaN when there is a single component).
    result = result.replace([np.inf, -np.inf], np.nan)
    return result.fillna(0)

In [None]:
# Feature matrix: the first 55 columns of the training table.
X_train_df = train_df.iloc[:, :55]
# Targets: the last 10 columns of the training table.
y_train_df = train_df.iloc[:, -10:]
# Test features: every test column except the row identifier.
X_test_df = test_df.drop('ID', axis=1)

In [None]:
# Shapes of the raw splits (a bare expression; it has no effect unless it is
# the last statement of a notebook cell).
(X_train_df.shape, y_train_df.shape, X_test_df.shape)

# Run the same feature engineering on train and test so both end up with
# identical engineered columns.
X_train_fe, X_test_fe = (
    add_engineered_features(frame) for frame in (X_train_df, X_test_df)
)

In [None]:
# Display the shapes after feature engineering (train features, test features, targets).
(X_train_fe.shape, X_test_fe.shape, y_train_df.shape)

((2000, 207), (500, 207), (2000, 10))

#### Submission Creation Function

In [None]:
def create_submission(y_pred, test_ids=None, transforms=None):
    """
    Format predictions into the submission layout: an integer ``ID`` column
    followed by ``BlendProperty1`` .. ``BlendProperty10`` as floats.

    Parameters:
    - y_pred (array-like or pd.DataFrame): shape (n_samples, 10). A DataFrame
      is copied and must already use the ``BlendProperty{i}`` column names and
      must not contain an ``ID`` column; any other array-like (ndarray, nested
      list, ...) is wrapped in a new DataFrame with those column names.
    - test_ids (array-like or None): IDs to use; if None, 1..n_samples is used.
    - transforms (dict or None): maps a target column to the transform that was
      applied to it before training, e.g. {'BlendProperty5': 'log1p'}. Only
      'log1p' is inverted (with expm1); any other value leaves the column as is.

    Returns:
    - pd.DataFrame ready for submission
    """
    # Column names
    target_cols = [f'BlendProperty{i}' for i in range(1, 11)]

    # Convert to DataFrame (never modify the caller's object)
    if isinstance(y_pred, pd.DataFrame):
        df_pred = y_pred.copy()
    else:
        df_pred = pd.DataFrame(np.asarray(y_pred), columns=target_cols)

    # Apply inverse transformations if any
    if transforms:
        for col in target_cols:
            if transforms.get(col, 'none') == 'log1p':
                df_pred[col] = np.expm1(df_pred[col])
            # You can extend here with more inverse methods

    # Add ID column as the first column
    if test_ids is not None:
        ids = pd.Series(test_ids).astype(int).values
    else:
        ids = np.arange(1, len(df_pred) + 1).astype(int)
    df_pred.insert(0, 'ID', ids)

    # Ensure correct dtypes
    df_pred[target_cols] = df_pred[target_cols].astype(float)
    df_pred['ID'] = df_pred['ID'].astype(int)

    return df_pred

#### KFold Cross validation on RandomForestRegressor 58.08075

In [None]:
# 5-fold shuffled split with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Model: MultiOutputRegressor fits one independent random forest per target column
base_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(base_model)

# To store MAPE scores for each blend property (one list of 10 values per fold)
mape_per_fold = []

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # Train on the training part of the fold ONLY. Fitting on the whole
    # training set here would let the model see the validation rows and
    # make the reported MAPE meaningless.
    model.fit(X_tr, y_tr)

    # Predict on the held-out part of the fold
    y_pred = model.predict(X_val)

    # MAPE for each blend property
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Convert to DataFrame for summary
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds:")
print(mape_df.mean().round(4))

# Refit on all training rows so that `model` leaves this cell as a full-data
# model for any later prediction on the test set.
model.fit(X_train_fe, y_train_df)

✅ Fold 1 MAPE per target: [4.4519 0.4672 0.4941 0.5221 0.0201 0.5986 0.3525 1.1371 0.5189 0.2935]
✅ Fold 2 MAPE per target: [0.7888 0.9413 0.3164 0.5407 0.0295 0.3193 0.5504 0.4133 0.6074 0.6938]
✅ Fold 3 MAPE per target: [0.3254 0.6707 0.7359 0.5708 0.0148 0.3856 3.4235 0.7815 0.3344 0.2851]
✅ Fold 4 MAPE per target: [0.625  0.5277 0.8033 1.0307 0.0145 0.2656 0.3528 0.4088 1.5415 0.3111]
✅ Fold 5 MAPE per target: [0.6742 0.2963 0.4393 0.402  0.0266 0.2678 0.7989 0.3713 0.6184 0.8   ]

📊 Average MAPE per BlendProperty over 5 folds:
BlendProperty1     1.3731
BlendProperty2     0.5806
BlendProperty3     0.5578
BlendProperty4     0.6133
BlendProperty5     0.0211
BlendProperty6     0.3674
BlendProperty7     1.0956
BlendProperty8     0.6224
BlendProperty9     0.7241
BlendProperty10    0.4767
dtype: float64


In [None]:
# Predict the test set with `model` as left fitted by the preceding cell.
y_rf_cv_test=model.predict(X_test_fe)

# Wrap the predictions in the submission layout (IDs default to 1..n) and save them.
y_rf_cv_test_df=create_submission(y_rf_cv_test)
y_rf_cv_test_df.to_csv('y_rf_kfcv_test.csv', index=False)
# Preview the first rows.
y_rf_cv_test_df.head()

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,-0.312973,0.233421,0.565485,0.436828,0.347658,0.434165,0.596674,0.477129,-0.190072,0.372541
1,2,-0.507242,-0.456344,-1.169968,-0.032656,-0.72735,-0.016713,-1.155338,-1.286557,-0.251075,0.283245
2,3,1.212688,0.78338,0.671814,0.793792,2.452283,1.449138,0.652892,1.389193,0.142986,1.956964
3,4,-0.121796,0.469584,0.616132,-0.209198,1.928104,-0.256726,0.634065,1.613433,0.847717,-0.6934
4,5,0.179493,-0.930156,1.111534,0.230324,2.395025,0.027578,1.05514,-0.480542,-0.672602,0.743564


#### RF with Recursive Feature Elimination 53.14206

In [None]:
# RFE is otherwise first imported much later in the file; importing it here
# lets this cell run in a top-to-bottom execution of the notebook.
from sklearn.feature_selection import RFE

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Model base (used as the ranking estimator inside RFE)
base_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# MAPE results (one list of per-target scores per fold)
mape_per_fold = []

# Loop through folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # RFE for feature selection (you can adjust n_features_to_select).
    # It is fitted on the training part of the fold only, so the selection
    # never sees the validation rows.
    selector = RFE(estimator=base_model, n_features_to_select=50, step=10)
    selector.fit(X_tr, y_tr)

    # Keep only the selected columns
    X_tr_sel = selector.transform(X_tr)
    X_val_sel = selector.transform(X_val)

    # Multi-output model: one random forest per target on the reduced features
    model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
    model.fit(X_tr_sel, y_tr)

    # Predict
    y_pred = model.predict(X_val_sel)

    # MAPE for each blend property
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Summary
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds (with RFE):")
print(mape_df.mean().round(4))

✅ Fold 1 MAPE per target: [33.5911  1.2386  1.5032  1.5759  0.0858  1.3813  1.0047  2.9507  2.0086
  0.9005]
✅ Fold 2 MAPE per target: [1.812  2.4732 0.9792 1.0314 0.0762 0.7887 1.2712 1.0696 1.7661 1.9443]
✅ Fold 3 MAPE per target: [0.8314 1.423  2.3764 1.9925 0.0663 1.013  7.1911 2.5558 0.7837 0.7825]
✅ Fold 4 MAPE per target: [1.6089 1.2425 2.6979 3.1852 0.0624 0.7873 1.0533 1.1808 1.4046 0.8647]
✅ Fold 5 MAPE per target: [2.1849 0.7744 1.3806 0.7948 0.1805 0.6938 2.4275 1.0493 1.1888 2.0417]

📊 Average MAPE per BlendProperty over 5 folds (with RFE):
BlendProperty1     8.0057
BlendProperty2     1.4303
BlendProperty3     1.7875
BlendProperty4     1.7160
BlendProperty5     0.0942
BlendProperty6     0.9328
BlendProperty7     2.5896
BlendProperty8     1.7613
BlendProperty9     1.4304
BlendProperty10    1.3067
dtype: float64


In [None]:
# RFE is otherwise first imported much later in the file; importing it here
# keeps this cell runnable in a top-to-bottom execution of the notebook.
from sklearn.feature_selection import RFE

# Final RFE fit on full training data (`base_model` is the random forest
# defined in the cross-validation cell)
final_selector = RFE(estimator=base_model, n_features_to_select=50, step=10)
final_selector.fit(X_train_fe, y_train_df)

# Transform both train and test sets with the same fitted selector
X_train_selected = final_selector.transform(X_train_fe)
X_test_selected = final_selector.transform(X_test_fe)

# Final model training on all training data
final_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
final_model.fit(X_train_selected, y_train_df)

# Prediction on test data, formatted and saved as a submission file
y_test_rf_rfe_pred = final_model.predict(X_test_selected)
y_test_pred_rf_RFE_df=create_submission(y_test_rf_rfe_pred)
y_test_pred_rf_RFE_df.to_csv('y_test_pred_rf_RFE.csv', index=False)

#### KFold CrossValidation on Catboost 74.53801

In [None]:
# Reproducible 5-fold shuffled split
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# CatBoost configuration: trained and evaluated directly on MAPE
catboost_settings = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    silent=True,
    loss_function='MAPE',
    eval_metric='MAPE',
)
base_model = CatBoostRegressor(**catboost_settings)

# One CatBoost regressor per target column
model = MultiOutputRegressor(base_model)

# Per-fold list of per-target MAPE scores
mape_per_fold = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr = X_train_fe.iloc[train_idx]
    X_val = X_train_fe.iloc[val_idx]
    y_tr = y_train_df.iloc[train_idx]
    y_val = y_train_df.iloc[val_idx]

    # Fit on the training part of the fold, score on the held-out part
    y_pred = model.fit(X_tr, y_tr).predict(X_val)

    fold_mape = []
    for col in range(y_val.shape[1]):
        fold_mape.append(
            mean_absolute_percentage_error(y_val.iloc[:, col], y_pred[:, col])
        )
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Average the per-target scores across folds
target_names = [f'BlendProperty{n}' for n in range(1, y_train_df.shape[1] + 1)]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds:")
print(mape_df.mean().round(4))

# NOTE: at this point `model` is the estimator fitted on the last fold's training split.
y_test_predict = model.predict(X_test_fe)

✅ Fold 1 MAPE per target: [3.388  0.7023 1.0741 0.5811 0.4127 0.6642 0.805  1.9113 0.8911 0.36  ]
✅ Fold 2 MAPE per target: [0.7788 0.7802 0.6961 0.5844 0.3957 0.2612 1.0622 0.5346 1.2473 0.7654]
✅ Fold 3 MAPE per target: [0.4424 0.7147 1.926  0.4915 0.2018 0.7396 6.5647 1.2067 0.6546 0.6569]
✅ Fold 4 MAPE per target: [0.4689 0.6342 1.9948 0.7343 0.6003 0.3509 0.7924 0.7875 2.19   0.3658]
✅ Fold 5 MAPE per target: [1.1667 0.3746 0.8595 0.3875 0.3766 0.2967 1.1598 0.5596 0.9986 0.6739]

📊 Average MAPE per BlendProperty over 5 folds:
BlendProperty1     1.2490
BlendProperty2     0.6412
BlendProperty3     1.3101
BlendProperty4     0.5557
BlendProperty5     0.3974
BlendProperty6     0.4625
BlendProperty7     2.0768
BlendProperty8     0.9999
BlendProperty9     1.1963
BlendProperty10    0.5644
dtype: float64


In [None]:
# Final CatBoost model: same hyper-parameters as in cross-validation,
# this time fitted on every training row.
final_catboost_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'random_seed': 42,
    'silent': True,
    'loss_function': 'MAPE',
    'eval_metric': 'MAPE',
}
final_model = MultiOutputRegressor(CatBoostRegressor(**final_catboost_params))
final_model.fit(X_train_fe, y_train_df)

# Predict the test set and write the submission file
y_test_pred_catboost = final_model.predict(X_test_fe)
y_test_pred_catboost_df = create_submission(y_test_pred_catboost)
y_test_pred_catboost_df.to_csv('y_test_pred_catboost.csv', index=False)

#### Dimensionality reduction with PCA on CatBoost 49.76943

In [None]:
# K-Fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_per_fold = []

# Base model
base_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    silent=True,
    loss_function='MAPE',
    eval_metric='MAPE'
)

model = MultiOutputRegressor(base_model)

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr_raw, X_val_raw = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # Fit the PCA (keeping 98% of the variance) on the training part of the
    # fold only: a projection learned from all rows would leak information
    # about the validation rows into training.
    fold_pca = PCA(n_components=0.98, svd_solver='full')
    X_tr = fold_pca.fit_transform(X_tr_raw)
    X_val = fold_pca.transform(X_val_raw)

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)

    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_train_df.shape[1])
    ]
    mape_per_fold.append(fold_mape)
    print(f"✅ Fold {fold + 1} MAPE:", np.round(fold_mape, 4))

# MAPE Summary
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty (PCA):")
print(mape_df.mean().round(4))

# PCA retaining 98% variance, fitted on the full training set: this is the
# projection used when the final model is trained and the test set is transformed.
pca = PCA(n_components=0.98, svd_solver='full')
X_train_pca = pca.fit_transform(X_train_fe)

✅ Fold 1 MAPE: [31.5508  1.7642  2.2902  1.4865  2.561   2.8952  1.8238  4.1646  3.435
  2.4246]
✅ Fold 2 MAPE: [2.5874 2.1217 1.4908 1.9674 2.1066 1.7883 4.0235 1.442  1.8478 5.5824]
✅ Fold 3 MAPE: [1.9288 1.8264 9.0962 2.8411 1.7082 3.8171 2.7643 3.0685 1.3964 2.5724]
✅ Fold 4 MAPE: [3.8103 1.8248 2.5483 5.8015 2.5867 2.4636 1.4581 1.5398 8.7848 1.7218]
✅ Fold 5 MAPE: [1.9784 1.6002 1.5601 1.4905 2.808  1.6265 2.7082 1.8262 3.5132 2.9306]

📊 Average MAPE per BlendProperty (PCA):
BlendProperty1     8.3712
BlendProperty2     1.8275
BlendProperty3     3.3971
BlendProperty4     2.7174
BlendProperty5     2.3541
BlendProperty6     2.5181
BlendProperty7     2.5556
BlendProperty8     2.4082
BlendProperty9     3.7954
BlendProperty10    3.0464
dtype: float64


In [None]:
# Project the full training set with the already-fitted PCA and train the
# final multi-output model on it.
X_train_pca_full = pca.transform(X_train_fe)
final_model = MultiOutputRegressor(base_model).fit(X_train_pca_full, y_train_df)

# The test set goes through the same PCA before prediction.
X_test_pca = pca.transform(X_test_fe)
y_test_pred_pca_catboost = final_model.predict(X_test_pca)

# Format and save the submission file.
y_test_pred_pca_catboost_df = create_submission(y_test_pred_pca_catboost)
y_test_pred_pca_catboost_df.to_csv('y_test_pred_pca_catboost.csv',index=False)

# Preview the predictions.
print("📦 Final Test Predictions (PCA + CatBoost):")
y_test_pred_pca_catboost_df.head()

📦 Final Test Predictions (PCA + CatBoost):


Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,-0.075866,0.311543,0.056914,0.387271,0.377431,0.638274,0.159812,0.318842,0.072865,0.387713
1,2,-0.399934,0.010924,-0.18403,0.713392,-0.369043,-0.082335,-0.455055,0.033464,-0.315798,-0.166916
2,3,0.365351,0.144721,0.372369,-0.254354,-0.178939,0.861335,0.419558,0.839511,0.513411,0.57994
3,4,-0.438175,-0.151623,0.026294,-0.863989,0.427597,-0.72559,0.118822,0.639856,0.084956,-0.263122
4,5,0.598211,-0.639487,0.427344,0.682397,0.398799,-0.114883,0.28459,-0.321307,-0.047907,0.691382


#### Recursive Feature Elimination after Random Forest 75.81276

In [None]:
from sklearn.feature_selection import RFE
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
import numpy as np
import pandas as pd

# Number of features RFE keeps for each target
n_features_to_select = 40  # 🔁 Try 30 first, tune as needed

# selected_features_per_target[k] holds the feature names kept for the k-th target
selected_features_per_target = []

for target_name in y_train_df.columns:
    print(f"🔍 Running RFE for target: {target_name}")

    # Ranking estimator: a fresh CatBoost model with default loss for every target
    cat_model = CatBoostRegressor(
        iterations=300, learning_rate=0.05, depth=6, random_seed=42, silent=True
    )

    # Eliminate 10 features per round until n_features_to_select remain
    rfe = RFE(estimator=cat_model, n_features_to_select=n_features_to_select, step=10)
    rfe.fit(X_train_fe, y_train_df[target_name])

    # Translate the boolean support mask back into column names
    selected_features = X_train_fe.columns[rfe.get_support()].tolist()
    selected_features_per_target.append(selected_features)
    print(f"✅ Top {n_features_to_select} features for {target_name}:", selected_features)

# Union of every feature that was selected for at least one target, sorted by name
selected_union = sorted(
    {feature for per_target in selected_features_per_target for feature in per_target}
)
print(f"\n📦 Total selected features (union across targets): {len(selected_union)}")


🔍 Running RFE for target: BlendProperty1
✅ Top 40 features for BlendProperty1: ['Component1_fraction', 'Component2_fraction', 'Component3_fraction', 'Component4_fraction', 'Component5_fraction', 'Component1_Property1', 'Component2_Property1', 'Component3_Property1', 'Component4_Property1', 'Component5_Property1', 'Component1_Property10', 'WeightedAvg_Property1', 'Property1_min', 'Property1_max', 'Property1_mean', 'Property1_std', 'C1_FracProp1', 'C1_FracProp2', 'C1_FracProp6', 'C1_FracProp8', 'C2_FracProp1', 'C2_FracProp2', 'C2_FracProp3', 'C2_FracProp6', 'C3_FracProp1', 'C3_FracProp2', 'C3_FracProp6', 'C3_FracProp7', 'C3_FracProp8', 'C4_FracProp1', 'C5_FracProp1', 'C5_FracProp2', 'C5_FracProp3', 'C5_FracProp4', 'C5_FracProp5', 'C5_FracProp6', 'C5_FracProp7', 'C5_FracProp8', 'C5_FracProp9', 'Dominant_component']
🔍 Running RFE for target: BlendProperty2
✅ Top 40 features for BlendProperty2: ['Component1_fraction', 'Component2_fraction', 'Component3_fraction', 'Component4_fraction', 'Com

In [None]:
# Restrict the training features to the RFE-selected union.
# `selected_union` must already exist (it is built by the RFE cell).
# NOTE(review): that selection was fitted on all training rows, so the
# validation folds below are not fully independent of it.
X_rfe = X_train_fe[selected_union]

# Reproducible 5-fold shuffled split
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# CatBoost trained and evaluated on MAPE, wrapped so one model is fitted per target
base_model = CatBoostRegressor(
    iterations=500, learning_rate=0.05, depth=6, random_seed=42,
    silent=True, loss_function='MAPE', eval_metric='MAPE',
)
model = MultiOutputRegressor(base_model)

# Per-fold list of per-target MAPE scores
rfe_mape_per_fold = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_rfe)):
    X_tr = X_rfe.iloc[train_idx]
    X_val = X_rfe.iloc[val_idx]
    y_tr = y_train_df.iloc[train_idx]
    y_val = y_train_df.iloc[val_idx]

    # Fit on the training part of the fold, predict the held-out part
    y_pred = model.fit(X_tr, y_tr).predict(X_val)

    # One MAPE value per target column
    fold_mape = []
    for col in range(y_val.shape[1]):
        fold_mape.append(
            mean_absolute_percentage_error(y_val.iloc[:, col], y_pred[:, col])
        )
    rfe_mape_per_fold.append(fold_mape)

    print(f"📘 RFE Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Average the per-target scores across folds
target_names = [f'BlendProperty{n}' for n in range(1, y_train_df.shape[1] + 1)]
rfe_mape_df = pd.DataFrame(rfe_mape_per_fold, columns=target_names)

print("\n📊 Average MAPE per BlendProperty (RFE) over 5 folds:")
print(rfe_mape_df.mean().round(4))

📘 RFE Fold 1 MAPE per target: [0.5064 0.7915 1.285  0.5572 0.3627 0.953  0.8189 2.1024 0.8767 0.3466]
📘 RFE Fold 2 MAPE per target: [0.7938 0.6637 0.6611 0.6132 0.2955 0.3087 0.8072 0.4429 1.2253 0.7729]
📘 RFE Fold 3 MAPE per target: [0.4716 1.0003 1.2559 0.433  0.2232 0.699  4.8543 1.0841 0.6235 0.6075]
📘 RFE Fold 4 MAPE per target: [0.6145 0.5664 2.0464 0.9235 0.4442 0.3487 0.748  0.7193 1.7937 0.3441]
📘 RFE Fold 5 MAPE per target: [1.0786 0.402  0.8043 0.3443 0.3678 0.2493 1.5724 0.4798 1.0982 0.8656]

📊 Average MAPE per BlendProperty (RFE) over 5 folds:
BlendProperty1     0.6930
BlendProperty2     0.6848
BlendProperty3     1.2105
BlendProperty4     0.5743
BlendProperty5     0.3387
BlendProperty6     0.5117
BlendProperty7     1.7602
BlendProperty8     0.9657
BlendProperty9     1.1235
BlendProperty10    0.5874
dtype: float64


In [None]:
# Make sure to subset test features using the same RFE-selected feature names
X_test_rfe = X_test_fe[selected_union]

# `model` comes out of the cross-validation cell fitted on the last fold's
# training split only; refit it on every training row (same selected features)
# so the submission uses all available data.
model.fit(X_train_fe[selected_union], y_train_df)

# Predict using the refitted model
y_rf_cv_RFE_test = model.predict(X_test_rfe)

# Convert predictions to submission format
y_rf_cv_RFE_test_df = create_submission(y_rf_cv_RFE_test)

# Save submission
y_rf_cv_RFE_test_df.to_csv('y_rf_cv_RFE_test_df.csv', index=False)

# Show preview
y_rf_cv_RFE_test_df.head()

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,0.145908,0.250934,0.674859,0.697537,0.329973,0.680459,0.73365,0.478645,-0.190351,0.395681
1,2,-0.733855,-0.68253,-0.976764,0.059218,-0.737951,-0.045879,-0.958305,-1.254422,-0.634679,-0.003795
2,3,1.496749,1.095546,1.01214,0.93282,1.759482,1.738107,0.975835,1.773924,0.149237,2.0931
3,4,-0.367533,0.394439,0.783049,-0.470365,1.899325,-0.342644,0.730684,1.295433,0.437075,-0.912612
4,5,0.213762,-0.941663,1.127862,0.46672,1.863232,0.172817,1.043453,-0.021684,-0.528438,0.902628


#### KFold cross validation on LinearRegressor 10

In [None]:
# Reproducible 5-fold shuffled split
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scaler is (re)fitted inside the loop on each fold's training rows only,
# which keeps validation statistics out of the standardization.
scaler = StandardScaler()

# One ordinary least-squares model per target column
base_model = LinearRegression()
model = MultiOutputRegressor(base_model)

# Per-fold list of per-target MAPE scores
mape_per_fold = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr = X_train_fe.iloc[train_idx]
    X_val = X_train_fe.iloc[val_idx]
    y_tr = y_train_df.iloc[train_idx]
    y_val = y_train_df.iloc[val_idx]

    # Standardize: statistics from the training rows, applied to both parts
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # Fit on the scaled training rows and predict the scaled validation rows
    y_pred = model.fit(X_tr_scaled, y_tr).predict(X_val_scaled)

    # One MAPE value per target column
    fold_mape = []
    for col in range(y_val.shape[1]):
        fold_mape.append(
            mean_absolute_percentage_error(y_val.iloc[:, col], y_pred[:, col])
        )
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Average the per-target scores across folds
target_names = [f'BlendProperty{n}' for n in range(1, y_train_df.shape[1] + 1)]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds:")
print(mape_df.mean().round(4))

✅ Fold 1 MAPE per target: [5.4000e-03 2.8710e-01 1.0913e+00 2.6120e-01 2.8789e+00 6.0000e-04
 8.6490e-01 1.8380e+00 2.6585e+00 3.3800e-02]
✅ Fold 2 MAPE per target: [1.0000e-04 5.3050e-01 4.7840e-01 3.0280e-01 2.3152e+00 2.0000e-04
 7.8390e-01 3.5470e-01 9.2240e-01 1.2270e-01]
✅ Fold 3 MAPE per target: [1.0000e-04 2.5090e-01 1.6824e+00 1.8850e-01 2.0531e+00 3.0000e-04
 4.7393e+00 8.9780e-01 7.3870e-01 3.2800e-02]
✅ Fold 4 MAPE per target: [3.0000e-04 1.1650e-01 7.4020e-01 8.3620e-01 4.6806e+00 2.0000e-04
 5.5820e-01 3.8750e-01 3.3438e+00 4.2300e-02]
✅ Fold 5 MAPE per target: [2.0000e-04 1.0520e-01 6.9910e-01 1.8370e-01 4.0341e+00 1.0000e-04
 1.4849e+00 4.0320e-01 1.3581e+00 1.0570e-01]

📊 Average MAPE per BlendProperty over 5 folds:
BlendProperty1     0.0012
BlendProperty2     0.2581
BlendProperty3     0.9383
BlendProperty4     0.3545
BlendProperty5     3.1924
BlendProperty6     0.0003
BlendProperty7     1.6862
BlendProperty8     0.7762
BlendProperty9     1.8043
BlendProperty10    0.06

In [None]:
# `scaler` and `model` come from the cross-validation cell, where the model is
# trained on standardized features. Refit both on every training row so the
# submission uses all available data.
X_train_scaled_all = scaler.fit_transform(X_train_fe)
model.fit(X_train_scaled_all, y_train_df)

# The test matrix must pass through the same scaler: feeding raw features to a
# model trained on standardized ones produces wildly out-of-range predictions.
X_test_scaled = scaler.transform(X_test_fe)
y_linreg_cv_test = model.predict(X_test_scaled)

# Format, save and preview the submission
y_linreg_cv_test_df = create_submission(y_linreg_cv_test)
y_linreg_cv_test_df.to_csv('y_linreg_kfcv_test.csv', index=False)
y_linreg_cv_test_df.head()



Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,-63251830.0,39673920000.0,-530683900000.0,151178200000.0,-1429493000000.0,11064310.0,-595514600000.0,288168100000.0,-650399300000.0,29381630000.0
1,2,-291626700.0,105090300000.0,-2058456000000.0,770598400000.0,-7541797000000.0,288907300.0,-2398113000000.0,1256635000000.0,-3009127000000.0,127795400000.0
2,3,387500900.0,-150221100000.0,2787979000000.0,-1013933000000.0,9891908000000.0,-351542900.0,3233760000000.0,-1679549000000.0,3996980000000.0,-170851800000.0
3,4,640963400.0,-266052000000.0,4699252000000.0,-1660529000000.0,16147440000000.0,-527774300.0,5427416000000.0,-2794385000000.0,6609034000000.0,-284337000000.0
4,5,-24093420.0,9211738000.0,-172705600000.0,63164210000.0,-616613300000.0,22250380.0,-200490000000.0,104309500000.0,-248535100000.0,10610290000.0


#### Linear Regression select top k 44.62569

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Per-target linear regression restricted to the features with the largest
# standardized coefficients. For every blend property: rank all features by
# |coefficient| of a full-feature fit, keep the top ones, refit on those only,
# and predict the test set.

# Store test predictions per target
y_linreg_cv_top20_test_dict = {}

# Number of highest-|coefficient| features kept for each target's final model
n_top_features = 20

# Standardize the full training matrix once. This step does not depend on the
# target, so it is fitted a single time instead of once per loop iteration.
scaler_full = StandardScaler()
X_train_scaled_full = scaler_full.fit_transform(X_train_fe)

# Loop through each target column (BlendProperty1 to BlendProperty10)
for i in range(1, 11):
    target_col = f"BlendProperty{i}"
    print(f"🔄 Processing {target_col}...")

    # Fit on all standardized features; coefficients are comparable across
    # features because every column has been scaled to unit variance.
    linreg_full = LinearRegression().fit(X_train_scaled_full, y_train_df[target_col])

    # Indices of the largest coefficient magnitudes, in descending order
    coefs_abs = np.abs(linreg_full.coef_)
    top_idx = np.argsort(coefs_abs)[-n_top_features:][::-1]
    top_feat_names = X_train_fe.columns[top_idx].tolist()

    # Re-standardize using only the selected columns: the scaler is fitted on
    # the training rows and then applied unchanged to the test rows.
    scaler_top20 = StandardScaler()
    X_train_top20 = scaler_top20.fit_transform(X_train_fe[top_feat_names])
    X_test_top20 = scaler_top20.transform(X_test_fe[top_feat_names])

    # Final model for this target, trained on the selected features only
    linreg_top20 = LinearRegression().fit(X_train_top20, y_train_df[target_col])

    # Predict on test data and keep the column under the target's name
    y_pred_test = linreg_top20.predict(X_test_top20)
    y_linreg_cv_top20_test_dict[target_col] = y_pred_test

# Combine predictions into a DataFrame (one column per blend property)
y_linreg_cv_top20_test_df = pd.DataFrame(y_linreg_cv_top20_test_dict)

# Add a 1-based ID column in front for submission
y_linreg_cv_top20_test_df.insert(0, 'ID', range(1, len(y_linreg_cv_top20_test_df) + 1))

# Save to CSV
y_linreg_cv_top20_test_df.to_csv('y_linreg_kfcv_top20_test.csv', index=False)
print("✅ Saved predictions to 'y_linreg_kfcv_top20_test.csv'")
y_linreg_cv_top20_test_df.head()

🔄 Processing BlendProperty1...
🔄 Processing BlendProperty2...
🔄 Processing BlendProperty3...
🔄 Processing BlendProperty4...
🔄 Processing BlendProperty5...
🔄 Processing BlendProperty6...
🔄 Processing BlendProperty7...
🔄 Processing BlendProperty8...
🔄 Processing BlendProperty9...
🔄 Processing BlendProperty10...
✅ Saved predictions to 'y_linreg_kfcv_top20_test.csv'


Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,-0.07695,-0.454714,0.161857,0.018933,0.16941,0.562839,0.155218,0.141527,0.132717,0.001013
1,2,0.102695,-0.820603,0.017482,0.179872,-0.070767,0.305459,0.014749,-0.089248,-0.806906,0.108986
2,3,0.307918,0.072625,0.224709,0.003977,-0.035078,0.201593,0.220394,0.445908,-0.216277,0.102575
3,4,-0.484366,-0.191203,0.277542,-0.004137,0.115469,-0.511027,0.280731,1.189059,-0.103841,0.028341
4,5,0.343155,-0.600232,-0.167539,-0.019971,0.04274,-0.361047,-0.160022,-0.678221,0.023763,0.225239


#### RF after PCA 23.44412

In [None]:
# 5-fold CV of: RFE (random forest) -> standard scaling -> PCA -> one random
# forest per blend property. The selector, scaler, PCA and model of the final
# fold stay in scope after the loop and are reused by the test-prediction cell.

# RFE is used below; imported here so the cell runs on its own.
from sklearn.feature_selection import RFE

# Fraction of variance the PCA step must retain. Kept in one place so the code,
# the comments and the printed summary cannot disagree.
pca_variance = 0.99

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Estimator handed to RFE for ranking features
base_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Store MAPE for each fold (one list of per-target values per fold)
mape_per_fold = []

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    # Split into train and validation sets
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # ----- Step 1: Feature Selection with RFE -----
    # Keeps 50 features, discarding 10 per elimination round; fitted on the
    # training split only.
    selector = RFE(estimator=base_model, n_features_to_select=50, step=10)
    selector.fit(X_tr, y_tr)
    X_tr_sel = selector.transform(X_tr)
    X_val_sel = selector.transform(X_val)

    # ----- Step 2: Apply Standard Scaling -----
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr_sel)
    X_val_scaled = scaler.transform(X_val_sel)

    # ----- Step 3: Apply PCA (retain `pca_variance` of the variance) -----
    pca = PCA(n_components=pca_variance, svd_solver='full')
    X_tr_pca = pca.fit_transform(X_tr_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    # ----- Step 4: Train MultiOutput Regressor -----
    model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
    model.fit(X_tr_pca, y_tr)

    # ----- Step 5: Predict on Validation Set -----
    y_pred = model.predict(X_val_pca)

    # ----- Step 6: Evaluate using MAPE (one value per target column) -----
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)

    print(f"Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Final Summary
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print(f"\n📊 Average MAPE per BlendProperty over 5 folds (with RFE + PCA {pca_variance:.0%}):")
print(mape_df.mean().round(4))

Fold 1 MAPE per target: [11.587   2.2     2.4945  2.1558  1.6199  4.6144  1.846   4.269   3.966
  1.1072]
Fold 2 MAPE per target: [1.3021 5.0163 1.125  2.004  1.5415 1.2393 2.9244 1.3362 1.5458 1.749 ]
Fold 3 MAPE per target: [1.4748 2.3761 3.1895 2.0623 1.2874 1.76   3.1949 3.7072 1.1658 1.3524]
Fold 4 MAPE per target: [2.7668 1.864  1.4042 2.8772 1.6735 1.3702 1.2133 1.6097 5.7128 1.2476]
Fold 5 MAPE per target: [2.3099 1.2322 1.5538 1.4676 2.7125 1.0727 2.6254 1.5069 2.2134 3.0065]

📊 Average MAPE per BlendProperty over 5 folds (with RFE + PCA 95%):
BlendProperty1     3.8881
BlendProperty2     2.5377
BlendProperty3     1.9534
BlendProperty4     2.1134
BlendProperty5     1.7670
BlendProperty6     2.0113
BlendProperty7     2.3608
BlendProperty8     2.4858
BlendProperty9     2.9208
BlendProperty10    1.6925
dtype: float64


In [None]:
# Test-set prediction for the RFE -> scaler -> PCA -> random-forest pipeline.
# `selector`, `scaler`, `pca` and `model` are the objects left in scope by the
# cross-validation loop, i.e. the ones fitted during its final fold on that
# fold's training split only (not on the full training set).

# Step 1: Apply RFE (from last fold)
X_test_rfe = selector.transform(X_test_fe)
# Step 2: Standardize
X_test_scaled = scaler.transform(X_test_rfe)
# Step 3: Apply PCA
X_test_pca = pca.transform(X_test_scaled)
# Step 4: Predict
y_rf_kfcv_pca_test = model.predict(X_test_pca)
# Step 5: Convert to DataFrame
y_rf_kfcv_pca_test_df = pd.DataFrame(y_rf_kfcv_pca_test, columns=target_names)
# Step 6: Add ID column (if needed) — 1-based, inserted as the first column
y_rf_kfcv_pca_test_df.insert(0, 'ID', np.arange(1, len(y_rf_kfcv_pca_test_df)+1))
# Step 7: Save
y_rf_kfcv_pca_test_df.to_csv('y_rf_kfcv_pca_test.csv', index=False)
# Step 8: Preview
y_rf_kfcv_pca_test_df.head()

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,0.447276,0.390818,0.58347,0.913602,0.542055,0.365721,0.58538,0.655399,0.021679,-0.091201
1,2,-0.330697,-0.518482,-0.839323,0.538828,-0.654125,-0.120159,-0.842336,-0.674847,-0.542518,0.415465
2,3,0.995505,0.530691,0.582739,0.378718,0.779229,0.822851,0.653049,1.250729,0.097211,1.075577
3,4,-0.491358,-0.588653,0.637178,-0.575448,0.462782,-0.565344,0.578145,1.49557,-0.538383,-0.663417
4,5,0.187697,-0.391973,0.553814,0.586672,0.970784,-0.583765,0.615371,-0.338342,-0.293962,0.190813


#### Light GBM model 79.47185

In [None]:
# 5-fold cross-validation of one LightGBM regressor per blend property,
# followed by a test-set prediction from the estimator left by the last fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# A single scaler object, re-fitted on every fold's training split so the
# validation rows never influence the scaling statistics.
scaler = StandardScaler()

# MultiOutputRegressor trains an independent LightGBM model for each target.
base_model = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(base_model)

# One entry per fold; each entry holds the per-target MAPE values.
mape_per_fold = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    # Row-wise split of features and targets for this fold
    X_tr = X_train_fe.iloc[train_idx]
    y_tr = y_train_df.iloc[train_idx]
    X_val = X_train_fe.iloc[val_idx]
    y_val = y_train_df.iloc[val_idx]

    # Fit the scaler and the model on the training split
    X_tr_scaled = scaler.fit_transform(X_tr)
    model.fit(X_tr_scaled, y_tr)

    # Score the held-out split with the same scaler
    X_val_scaled = scaler.transform(X_val)
    y_pred = model.predict(X_val_scaled)

    # MAPE computed separately for every target column
    n_targets = y_val.shape[1]
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, col], y_pred[:, col])
        for col in range(n_targets)
    ]
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Fold-by-target table of MAPE values, averaged over folds for the summary
target_names = [f'BlendProperty{num}' for num in range(1, y_train_df.shape[1] + 1)]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds:")
print(mape_df.mean().round(4))

# Test-set prediction with the scaler and model as fitted in the final fold
X_test_scaled = scaler.transform(X_test_fe)
y_test_predict_lgbm = model.predict(X_test_scaled)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38485
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 157
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38485
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 157
[LightGBM] [Info] Start training from score -0.004643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38485
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 157
[LightGBM] [Info] Star

In [None]:
# --- Test Set Prediction Code ---
# Builds and saves the LightGBM submission. `scaler` and `model` are whatever
# the cross-validation cell left in scope, so they reflect its final fold.

# Scale the test set using the last fitted scaler (from last fold)
X_test_scaled = scaler.transform(X_test_fe)
# Predict using the model trained on the last fold
y_test_pred = model.predict(X_test_scaled)

# Convert predictions to DataFrame
# Columns are named BlendProperty1..N, N being the number of predicted outputs.
y_test_pred_df_lgbm = pd.DataFrame(
    y_test_pred,
    columns=[f'BlendProperty{i+1}' for i in range(y_test_pred.shape[1])]
)

# Optionally, add an ID column
# IDs are 1-based and inserted as the first column.
y_test_pred_df_lgbm.insert(0, 'ID', range(1, len(y_test_pred_df_lgbm)+1))
# Save to CSV
y_test_pred_df_lgbm.to_csv('new_y_lgbm_kfcv_test_predictions.csv', index=False)

# Display the first few rows
print("\n📤 Sample predictions on test set:")
y_test_pred_df_lgbm.head()


📤 Sample predictions on test set:


Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,-0.155296,0.319216,0.566609,0.503049,0.349836,0.547548,0.666762,0.419596,-0.269709,0.41042
1,2,-0.847334,-0.783397,-1.126921,0.078659,-0.730802,-0.171003,-1.101913,-1.185381,-0.737151,0.296757
2,3,1.659156,0.99412,1.006556,1.077454,2.458332,1.723992,0.964488,1.729912,0.338633,2.163435
3,4,-0.454312,0.375003,0.700882,-0.596616,1.897748,-0.464389,0.683653,1.615664,0.699133,-0.87163
4,5,0.204602,-1.352812,1.133363,0.33794,2.392285,0.16359,1.109816,-0.089048,-0.543755,1.099203


#### LightGBM + top-k feature selection based on the feature importances 67.31835

In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Rank features by their LightGBM importance averaged over all blend
# properties, then keep the names of the k highest-ranked ones.

# Target column names: BlendProperty1 .. BlendPropertyN
target_names = [f'BlendProperty{num}' for num in range(1, y_train_df.shape[1] + 1)]

# Standardize features (scaler fitted on train, applied to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

# Importances are positional, so keep the column names to label them
feature_names = X_train_fe.columns.tolist()
importances_dict = {}

# One model per target; only its feature importances are kept
for target in target_names:
    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    )
    model.fit(X_train_scaled, y_train_df[target])
    importances = model.feature_importances_
    importances_dict[target] = pd.Series(importances, index=feature_names)

# Mean importance per feature across targets, highest first
avg_importance = (
    pd.DataFrame(importances_dict)
    .mean(axis=1)
    .sort_values(ascending=False)
)

# Names of the k most important features
k = 35  # choose desired value of k
top_k_features = avg_importance.head(k).index.tolist()


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38490
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 157
[LightGBM] [Info] Start training from score -0.016879
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38490
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 157
[LightGBM] [Info] Start training from score -0.002076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38490
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 157
[LightGBM] [Info] Star

In [None]:
# Retrain one LightGBM model per target on the top-k features only and write
# the resulting test predictions as a submission file.

# Restrict both frames to the selected columns
X_train_topk = X_train_fe[top_k_features]
X_test_topk = X_test_fe[top_k_features]

# Fresh scaler for the reduced feature set (fitted on train only)
scaler_topk = StandardScaler()
X_train_topk_scaled = scaler_topk.fit_transform(X_train_topk)
X_test_topk_scaled = scaler_topk.transform(X_test_topk)

# Per-target test predictions, collected in target order
y_preds = []

for target in target_names:
    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    )
    model.fit(X_train_topk_scaled, y_train_df[target])
    y_pred = model.predict(X_test_topk_scaled)
    y_preds.append(y_pred)

# Put each target's predictions in its own column: (n_samples, n_targets)
y_preds_array = np.stack(y_preds, axis=1)
y_preds_lightgbm_top_k_df = create_submission(y_preds_array)
y_preds_lightgbm_top_k_df.to_csv('y_preds_lightgbm_top_k_df.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000784 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7638
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 35
[LightGBM] [Info] Start training from score -0.016879
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7638
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 35
[LightGBM] [Info] Start training from score -0.002076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7638
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 35
[LightGBM] [Info] Start trai

#### LightGBM MultiOutputRegressor RFE to take k features 67.31835

In [None]:
def predict_on_test_with_rfe(X_train_fe, y_train_df, X_test_fe, n_splits=5,
                             n_features_to_select=75, step=10, random_state=42):
    """Predict the test set with fold-averaged LightGBM models after RFE.

    For every K-fold training split the features are standardized, RFE is run
    once per target column, the union of the selected feature indices is kept,
    and a multi-output LightGBM model trained on that union predicts the test
    set. The per-fold test predictions are averaged. The held-out part of each
    split is not used.

    Args:
        X_train_fe: Training features (DataFrame; rows are selected by
            position with ``.iloc``).
        y_train_df: Training targets, one column per target (DataFrame).
        X_test_fe: Test features with the same columns as ``X_train_fe``.
        n_splits: Number of K-fold splits, i.e. number of averaged models.
        n_features_to_select: Features RFE keeps for each individual target.
        step: Features RFE removes per elimination round.
        random_state: Seed used for the fold shuffling and every LightGBM
            estimator.

    Returns:
        Array of shape (n_test_rows, n_targets) holding the mean of the
        per-fold test predictions.
    """
    import numpy as np
    from lightgbm import LGBMRegressor
    from sklearn.feature_selection import RFE
    from sklearn.model_selection import KFold
    from sklearn.multioutput import MultiOutputRegressor
    from sklearn.preprocessing import StandardScaler

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    test_preds_all_folds = []

    for train_idx, _ in kf.split(X_train_fe):
        X_tr, y_tr = X_train_fe.iloc[train_idx], y_train_df.iloc[train_idx]

        # Scaling statistics come from this fold's training rows only.
        scaler = StandardScaler()
        X_tr_scaled = scaler.fit_transform(X_tr)
        X_test_scaled = scaler.transform(X_test_fe)

        # Union of the column indices RFE keeps for any of the targets.
        selected_indices = set()
        for i in range(y_tr.shape[1]):
            ranker = LGBMRegressor(n_estimators=100, random_state=random_state, n_jobs=-1)
            rfe = RFE(estimator=ranker, n_features_to_select=n_features_to_select, step=step)
            rfe.fit(X_tr_scaled, y_tr.iloc[:, i])
            selected_indices.update(np.flatnonzero(rfe.support_))

        # Sorted so that the column order is deterministic.
        selected_columns = sorted(selected_indices)
        X_tr_selected = X_tr_scaled[:, selected_columns]
        X_test_selected = X_test_scaled[:, selected_columns]

        # Final model for this fold: one LightGBM regressor per target.
        base_model = LGBMRegressor(n_estimators=100, random_state=random_state, n_jobs=-1)
        model = MultiOutputRegressor(base_model)
        model.fit(X_tr_selected, y_tr)

        test_preds_all_folds.append(model.predict(X_test_selected))

    # Average test predictions across all folds
    return np.mean(test_preds_all_folds, axis=0)

In [None]:
# Fold-averaged test predictions using the function's default settings
# (5 splits, 75 features kept by RFE per target).
y_test_predict_rfe = predict_on_test_with_rfe(X_train_fe, y_train_df, X_test_fe)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38485
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 157
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36187
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 147
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 33637
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 137
[LightGBM] [Info] Star

In [None]:
# Wrap the fold-averaged RFE + LightGBM predictions into the submission
# layout, save them, and preview the first rows as the cell output.
y_test_predict_lgbm_rfe_df=create_submission(y_test_predict_rfe)
y_test_predict_lgbm_rfe_df.to_csv('y_test_predict_lgbm_rfe.csv', index=False)
y_test_predict_lgbm_rfe_df.head()

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,-0.115759,0.325742,0.623514,0.429629,0.355806,0.574898,0.627195,0.458375,-0.208951,0.395471
1,2,-0.728667,-0.719342,-1.162019,0.034546,-0.729501,-0.132483,-1.119001,-1.203103,-0.522356,0.169094
2,3,1.534206,0.908336,0.946667,1.000498,2.459832,1.679056,0.944786,1.577894,0.412654,2.163382
3,4,-0.404485,0.285883,0.717732,-0.454879,1.923751,-0.417399,0.65925,1.517497,0.770358,-0.857904
4,5,0.173269,-1.204868,1.138781,0.315727,2.53658,0.179969,1.097766,-0.273438,-0.545113,1.039648


#### Removing just one feature after each step in LGBM Regressor

In [None]:
# 5-fold CV of LightGBM with per-target RFE that removes one feature per
# elimination round. Each fold also predicts the test set; those predictions
# are averaged at the end.

# RFE is used below; imported here so the cell runs on its own.
from sklearn.feature_selection import RFE

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mape_per_fold = []
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
test_preds_all_folds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # Standardize features (statistics from the training split only)
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # Union of the column indices RFE keeps for any target
    selected_features_all_targets = set()

    # RFE for each target with step=1
    for i in range(y_tr.shape[1]):
        # Not named `lgb`: that name is the `lightgbm` module alias used by
        # other cells, and rebinding it here would break them on re-run.
        rfe_estimator = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        rfe = RFE(estimator=rfe_estimator, n_features_to_select=75, step=1)
        rfe.fit(X_tr_scaled, y_tr.iloc[:, i])
        selected_features = np.where(rfe.support_)[0]
        selected_features_all_targets.update(selected_features)

    # Sorted list so the column order is deterministic
    selected_features_all_targets = sorted(selected_features_all_targets)
    X_tr_selected = X_tr_scaled[:, selected_features_all_targets]
    X_val_selected = X_val_scaled[:, selected_features_all_targets]

    # Train multi-output regressor on selected features
    base_model = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model = MultiOutputRegressor(base_model)
    model.fit(X_tr_selected, y_tr)

    # Per-target MAPE on the held-out split
    y_pred = model.predict(X_val_selected)
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)
    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

    # Predict on test set using selected features
    X_test_scaled = scaler.transform(X_test_fe)
    X_test_selected = X_test_scaled[:, selected_features_all_targets]
    y_test_pred = model.predict(X_test_selected)
    test_preds_all_folds.append(y_test_pred)

# Average predictions across all folds
y_test_predict_lgbm_rfe = np.mean(test_preds_all_folds, axis=0)

# View average MAPE per blend property
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
mape_df_mean = mape_df.mean().round(4)
print("\n📊 Mean MAPE per Blend Property after RFE:\n", mape_df_mean)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38485
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 157
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38482
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 156
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38227
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 155
[LightGBM] [Info] Star

#### Tuning the above lightGBM model 74.27700

In [None]:
# 5-fold CV of a tuned, GPU-trained LightGBM model. Features are chosen by RFE
# run on the first three targets only; the union of those selections feeds a
# multi-output model. Each fold also predicts the test set and the predictions
# are averaged at the end.

# RFE is used below; imported here so the cell runs on its own.
from sklearn.feature_selection import RFE

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mape_per_fold = []
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
test_preds_all_folds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    print(f"\n🚀 Fold {fold + 1}")

    # Split data
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # Standardize (statistics from the training split only)
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # Union of the column indices RFE keeps for any of the ranked targets
    selected_features_all_targets = set()

    # Apply RFE for a few representative targets only (e.g., first 3)
    for i in range(3):
        # Not named `lgb`: that name is the `lightgbm` module alias used by
        # other cells, and rebinding it here would break them on re-run.
        # NOTE(review): in LightGBM, `subsample` is only applied when
        # `subsample_freq` is > 0 (its default is 0); left as-is so the
        # recorded results stay reproducible — confirm whether bagging was
        # intended.
        rfe_estimator = LGBMRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=7,
            num_leaves=48,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            device='gpu'
        )
        rfe = RFE(estimator=rfe_estimator, n_features_to_select=75, step=5)
        rfe.fit(X_tr_scaled, y_tr.iloc[:, i])
        selected_features = np.where(rfe.support_)[0]
        selected_features_all_targets.update(selected_features)

    # Sorted list so the column order is deterministic
    selected_features_all_targets = sorted(selected_features_all_targets)
    X_tr_selected = X_tr_scaled[:, selected_features_all_targets]
    X_val_selected = X_val_scaled[:, selected_features_all_targets]

    # Train MultiOutput model (one LightGBM regressor per target)
    base_model = LGBMRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=8,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        device='gpu'
    )
    model = MultiOutputRegressor(base_model)
    model.fit(X_tr_selected, y_tr)

    # Predict & evaluate: per-target MAPE on the held-out split
    y_pred = model.predict(X_val_selected)
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)
    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

    # Predict on test set
    X_test_scaled = scaler.transform(X_test_fe)
    X_test_selected = X_test_scaled[:, selected_features_all_targets]
    y_test_pred = model.predict(X_test_selected)
    test_preds_all_folds.append(y_test_pred)

# Average predictions across all folds
y_test_predict_lgbm_rfe = np.mean(test_preds_all_folds, axis=0)

# Show average MAPE per property
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
mape_df_mean = mape_df.mean().round(4)
print("\n📊 Mean MAPE per Blend Property after RFE:\n", mape_df_mean)


🚀 Fold 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 38485
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 157
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 156 dense feature groups (0.24 MB) transferred to GPU in 0.012013 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 37462
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 152
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin e

In [None]:
# Build and save the submission from the fold-averaged tuned-LightGBM
# predictions.
y_test_pred_lgbm_df = create_submission(y_test_predict_lgbm_rfe)
y_test_pred_lgbm_df.to_csv('y_test_pred_lgbmtuned.csv',index=False)
# Preview goes last: a notebook cell only displays the value of its final
# expression, so a `.head()` placed before `to_csv` was silently discarded.
y_test_pred_lgbm_df.head()

#### Lasso with multioutput and RFE 62.08373

In [None]:
from sklearn.linear_model import Lasso

# 5-fold CV of a multi-output Lasso on features chosen by Lasso-driven RFE.
# RFE is run on the first three targets only and the union of their selected
# columns is used for all targets. Test predictions from every fold are
# averaged at the end.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
target_names = [f'BlendProperty{num}' for num in range(1, y_train_df.shape[1] + 1)]
lasso_mape_all = []
lasso_test_preds_all = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr = X_train_fe.iloc[train_idx]
    X_val = X_train_fe.iloc[val_idx]
    y_tr = y_train_df.iloc[train_idx]
    y_val = y_train_df.iloc[val_idx]

    # Scaling statistics come from the training split; validation and test
    # rows are transformed with them.
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test_fe)

    # Union of the 75 columns RFE keeps for each of the first three targets
    selected_feats = set()
    for i in range(3):
        rfe = RFE(
            estimator=Lasso(alpha=0.001, max_iter=5000),
            n_features_to_select=75,
            step=1,
        )
        rfe.fit(X_tr_scaled, y_tr.iloc[:, i])
        selected_feats.update(np.flatnonzero(rfe.support_))
    selected_feats = sorted(selected_feats)

    X_tr_sel = X_tr_scaled[:, selected_feats]
    X_val_sel = X_val_scaled[:, selected_feats]
    X_test_sel = X_test_scaled[:, selected_feats]

    # One Lasso per target on the shared feature subset
    model = MultiOutputRegressor(Lasso(alpha=0.001, max_iter=5000))
    model.fit(X_tr_sel, y_tr)

    # Per-target MAPE on the held-out split
    y_pred = model.predict(X_val_sel)
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    lasso_mape_all.append(fold_mape)

    # This fold's contribution to the averaged test prediction
    y_test_pred = model.predict(X_test_sel)
    lasso_test_preds_all.append(y_test_pred)

# Average test predictions
y_test_lasso_rfe = np.mean(lasso_test_preds_all, axis=0)

# Display mean MAPE
lasso_mape_df = pd.DataFrame(lasso_mape_all, columns=target_names)
print("\n📊 Mean MAPE per Blend Property (Lasso):\n", lasso_mape_df.mean().round(4))


📊 Mean MAPE per Blend Property (Lasso):
 BlendProperty1     0.0676
BlendProperty2     0.2415
BlendProperty3     0.9599
BlendProperty4     0.4243
BlendProperty5     3.4176
BlendProperty6     0.5716
BlendProperty7     1.6533
BlendProperty8     0.9092
BlendProperty9     1.6616
BlendProperty10    0.0614
dtype: float64


##### Train Lasso with RFE 62.08373

In [None]:
from sklearn.linear_model import Lasso

# Final Lasso submission: same recipe as the cross-validated version, but
# fitted once on the complete training set.

# --- Step 1: Standardize (scaler fitted on train, applied to test) ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

# --- Step 2: RFE on the first three blend properties; keep the union ---
selected_features_set = set()

for i in range(3):
    rfe = RFE(
        estimator=Lasso(alpha=0.001, max_iter=5000),
        n_features_to_select=75,
        step=1,
    )
    rfe.fit(X_train_scaled, y_train_df.iloc[:, i])
    selected_features_set.update(np.flatnonzero(rfe.support_))

# Sorted so the column order is deterministic
selected_features = sorted(selected_features_set)

# --- Step 3: Keep only the selected columns ---
X_train_selected = X_train_scaled[:, selected_features]
X_test_selected = X_test_scaled[:, selected_features]

# --- Step 4: One Lasso per target, trained on all training rows ---
model = MultiOutputRegressor(Lasso(alpha=0.001, max_iter=5000))
model.fit(X_train_selected, y_train_df)

# --- Step 5: Predict the test set and write the submission file ---
y_test_pred_lasso = model.predict(X_test_selected)
y_test_pred_lasso_df = create_submission(y_test_pred_lasso)
y_test_pred_lasso_df.to_csv('y_test_pred_lasso_df.csv', index=False)

#### ElasticNet Ridge multioutput and RFE  35.86159

In [None]:
# ElasticNet submission: RFE per target driven by ElasticNet, union of the
# selected columns, then one ElasticNet per target on the full training set.

# ElasticNet and RFE are used below; imported here so the cell runs on its own.
from sklearn.feature_selection import RFE
from sklearn.linear_model import ElasticNet

# Hyperparameters shared by the RFE estimator and the final model, defined
# once so the two cannot drift apart.
enet_params = dict(alpha=0.1, l1_ratio=0.5, max_iter=1000, random_state=42)

# --- Step 1: Standardize full train and test sets ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

# --- Step 2: RFE feature selection using ElasticNet ---
# Union of the 75 columns RFE keeps for each individual target
selected_features_all_targets = set()

for i in range(y_train_df.shape[1]):
    enet = ElasticNet(**enet_params)
    rfe = RFE(estimator=enet, n_features_to_select=75, step=1)
    rfe.fit(X_train_scaled, y_train_df.iloc[:, i])
    selected_features = np.where(rfe.support_)[0]
    selected_features_all_targets.update(selected_features)

# Sorted list so the column order is deterministic
selected_features_all_targets = sorted(selected_features_all_targets)

# --- Step 3: Filter for selected features ---
X_train_selected = X_train_scaled[:, selected_features_all_targets]
X_test_selected = X_test_scaled[:, selected_features_all_targets]

# --- Step 4: Train on full train data and predict on test data ---
model = MultiOutputRegressor(ElasticNet(**enet_params))
model.fit(X_train_selected, y_train_df)

y_test_pred_final = model.predict(X_test_selected)

In [None]:
# Wrap the ElasticNet predictions into the submission layout and save them.
y_test_elastic_rfe = create_submission(y_test_pred_final)
y_test_elastic_rfe.to_csv('y_test_predict_elastic_rfe.csv', index=False)
# Last expression of the cell: shows (rows, columns) of the submission frame.
y_test_elastic_rfe.shape 

(500, 11)

##### XGBOOST

In [None]:
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import pandas as pd

# Shuffled 5-fold split; the fixed seed keeps fold membership reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list of per-target MAPE values is appended per fold
mape_per_fold = []
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]

# Standardize features.
# NOTE: the scaler is fit on all training rows before the CV split, so the
# validation rows of every fold contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)

# Test-set predictions of each fold's model, averaged after the loop
test_preds_all_folds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\n🚀 Fold {fold + 1}")

    X_tr, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # 'gpu_hist' is deprecated (it triggered the repeated warnings this cell used
    # to print); the supported spelling is tree_method='hist' plus device='cuda'.
    base_model = XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
        device='cuda',  # use 'cpu' when no GPU is available
    )
    # One independent XGBoost regressor is fit per target column
    model = MultiOutputRegressor(base_model)

    # Train
    model.fit(X_tr, y_tr)

    # Validate: MAPE is computed separately for every target column
    y_pred = model.predict(X_val)
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)
    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

    # Predict on test set with this fold's model
    y_test_pred = model.predict(X_test_scaled)
    test_preds_all_folds.append(y_test_pred)

# Average the five fold models' test predictions element-wise
y_test_predict_xgb = np.mean(test_preds_all_folds, axis=0)

# Average MAPE per target across folds
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per Blend Property over 5 folds:")
print(mape_df.mean().round(4))


🚀 Fold 1



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return booster.num_

✅ Fold 1 MAPE per target: [8.6978 1.0653 1.0706 0.9014 0.1621 0.7618 0.7302 2.716  1.0914 0.5528]

🚀 Fold 2



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return booster.num_

✅ Fold 2 MAPE per target: [1.2134 1.5208 0.7037 0.8599 0.3566 0.5129 0.9922 0.731  1.2041 1.1181]

🚀 Fold 3



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return booster.num_

✅ Fold 3 MAPE per target: [0.6085 0.8569 1.2457 0.6707 0.095  0.8691 2.9693 1.1893 0.6405 0.5213]

🚀 Fold 4



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return booster.num_

✅ Fold 4 MAPE per target: [0.7746 0.6243 2.0902 1.4707 0.1271 0.5052 0.7304 0.7934 2.4074 0.4331]

🚀 Fold 5



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  return booster.num_

✅ Fold 5 MAPE per target: [1.2453 0.4673 0.773  0.5636 0.1768 0.4534 1.5632 0.5554 1.163  1.0157]

📊 Average MAPE per Blend Property over 5 folds:
BlendProperty1     2.5079
BlendProperty2     0.9069
BlendProperty3     1.1766
BlendProperty4     0.8933
BlendProperty5     0.1835
BlendProperty6     0.6205
BlendProperty7     1.3971
BlendProperty8     1.1970
BlendProperty9     1.3013
BlendProperty10    0.7282
dtype: float64


#### BaggingRegressor with XGBOOST using K FOLD cv 74.70199

In [None]:
# Shuffled 5-fold split with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# XGBoost template shared by RFE (feature ranking) and the bagged multi-output model.
# 'gpu_hist' is deprecated; tree_method='hist' with device='cuda' is the supported form.
base_xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device='cuda',  # set to 'cpu' if not using GPU
)

# One list of per-target MAPE values is appended per fold
mape_per_fold = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # RFE is refit inside every fold on that fold's training rows only;
    # step=10 removes ten features per round until 50 remain.
    selector = RFE(estimator=base_xgb, n_features_to_select=50, step=10)
    selector.fit(X_tr, y_tr)

    # Reduce both splits to the selected columns
    X_tr_sel = selector.transform(X_tr)
    X_val_sel = selector.transform(X_val)

    # Bag five multi-output XGBoost models, each on a bootstrap sample of 80% of the rows.
    # random_state=fold gives every fold a different bootstrap draw.
    multi_xgb = MultiOutputRegressor(base_xgb)
    model = BaggingRegressor(
        estimator=multi_xgb,
        n_estimators=5,
        max_samples=0.8,
        bootstrap=True,
        random_state=fold,
        n_jobs=-1
    )
    model.fit(X_tr_sel, y_tr)

    # Predict on the held-out rows
    y_pred = model.predict(X_val_sel)

    # MAPE for each blend property
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, i], y_pred[:, i])
        for i in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Summary. The keyword is `columns`; the former `coalumns` typo raised a TypeError here.
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds (with RFE + Bagging + XGBoost):")
print(mape_df.mean().round(4))


✅ Fold 1 MAPE per target: [4.1087 0.7681 1.0628 0.441  2.8436 0.8315 0.8082 1.419  1.2995 0.4158]
✅ Fold 2 MAPE per target: [0.762  0.8981 0.5268 0.8545 3.6596 0.2801 1.175  0.4349 0.7892 1.1296]
✅ Fold 3 MAPE per target: [0.3784 0.5789 2.534  0.4865 1.8735 0.5087 4.1781 1.4785 0.7978 0.3897]
✅ Fold 4 MAPE per target: [1.1415 0.4094 1.182  1.0317 4.0919 0.3553 0.6712 0.4849 1.568  0.4778]
✅ Fold 5 MAPE per target: [0.4359 0.3604 0.8089 0.4251 4.2194 0.2693 1.5328 0.6053 1.403  1.1142]

📊 Average MAPE per BlendProperty over 5 folds (with RFE + Bagging + Ridge):
BlendProperty1     1.3653
BlendProperty2     0.6030
BlendProperty3     1.2229
BlendProperty4     0.6477
BlendProperty5     3.3376
BlendProperty6     0.4490
BlendProperty7     1.6731
BlendProperty8     0.8845
BlendProperty9     1.1715
BlendProperty10    0.7054
dtype: float64


In [None]:
# Reduce the test features with `selector`, i.e. whatever RFE object is currently
# bound to that name (when cells run top to bottom: the one fit in the last CV fold).
X_test_sel = selector.transform(X_test_fe)
# Predict with `model`, likewise the last-assigned bagging model (last fold only,
# not a refit on the full training data).
y_pred_XGBOOST_Bagging = model.predict(X_test_sel)

# Format via create_submission (defined elsewhere in the notebook) and save to CSV.
y_pred_XGBOOST_Bagging_df=create_submission(y_pred_XGBOOST_Bagging)
y_pred_XGBOOST_Bagging_df.to_csv('y_pred_XGBOOST_Bagging_df.csv', index=False)

#### Bagging Ridge Regression 57.26071

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_percentage_error # Make sure this is imported or defined

# Inputs expected from earlier cells: X_train_fe and y_train_df (both indexed with .iloc below).

# 5-fold CV with a fixed shuffle so fold membership is reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Single Ridge template (alpha = L2 regularization strength), reused for RFE and for bagging
base_ridge = Ridge(alpha=1.0, random_state=42)

# One list of per-target MAPE values is appended per fold
mape_per_fold = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    X_tr = X_train_fe.iloc[train_idx]
    X_val = X_train_fe.iloc[val_idx]
    y_tr = y_train_df.iloc[train_idx]
    y_val = y_train_df.iloc[val_idx]

    # Feature selection is refit in every fold, on that fold's training rows only;
    # ten features are dropped per RFE round until 50 remain.
    selector = RFE(estimator=base_ridge, n_features_to_select=50, step=10)
    selector.fit(X_tr, y_tr)
    X_tr_sel, X_val_sel = selector.transform(X_tr), selector.transform(X_val)

    # Bag five multi-output Ridge models, each fit on a bootstrap sample of 80% of the rows.
    # Ridge accepts 2-D targets natively; the MultiOutputRegressor wrapper simply fits
    # one independent Ridge per target column.
    multi_ridge = MultiOutputRegressor(base_ridge)
    model = BaggingRegressor(
        estimator=multi_ridge,
        n_estimators=5,
        max_samples=0.8,
        bootstrap=True,
        random_state=fold,  # different bootstrap draw in every fold
        n_jobs=-1,
    )
    model.fit(X_tr_sel, y_tr)

    # Score the held-out rows, one MAPE value per blend property
    y_pred = model.predict(X_val_sel)
    fold_mape = []
    for target_pos in range(y_val.shape[1]):
        fold_mape.append(
            mean_absolute_percentage_error(y_val.iloc[:, target_pos], y_pred[:, target_pos])
        )
    mape_per_fold.append(fold_mape)

    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

# Fold-averaged MAPE per target
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
print("\n📊 Average MAPE per BlendProperty over 5 folds (with RFE + Bagging + Ridge):")
print(mape_df.mean().round(4))

✅ Fold 1 MAPE per target: [4.1087 0.7681 1.0628 0.441  2.8436 0.8315 0.8082 1.419  1.2995 0.4158]
✅ Fold 2 MAPE per target: [0.762  0.8981 0.5268 0.8545 3.6596 0.2801 1.175  0.4349 0.7892 1.1296]
✅ Fold 3 MAPE per target: [0.3784 0.5789 2.534  0.4865 1.8735 0.5087 4.1781 1.4785 0.7978 0.3897]
✅ Fold 4 MAPE per target: [1.1415 0.4094 1.182  1.0317 4.0919 0.3553 0.6712 0.4849 1.568  0.4778]
✅ Fold 5 MAPE per target: [0.4359 0.3604 0.8089 0.4251 4.2194 0.2693 1.5328 0.6053 1.403  1.1142]

📊 Average MAPE per BlendProperty over 5 folds (with RFE + Bagging + Ridge):
BlendProperty1     1.3653
BlendProperty2     0.6030
BlendProperty3     1.2229
BlendProperty4     0.6477
BlendProperty5     3.3376
BlendProperty6     0.4490
BlendProperty7     1.6731
BlendProperty8     0.8845
BlendProperty9     1.1715
BlendProperty10    0.7054
dtype: float64


In [None]:
# Reduce the test features with `selector`, the RFE object left over from the
# CV loop (the last fold's fit when cells run in order).
# NOTE(review): this is a shortcut. The selector and model have seen only one
# fold's training rows; refitting both on the entire training set before
# predicting would use all available data.
X_test_sel = selector.transform(X_test_fe)

# Predict with `model`, the bagging ensemble left over from the same last fold.
y_pred_RIDGE_Bagging = model.predict(X_test_sel)

# create_submission is defined elsewhere in the notebook; presumably it wraps the
# prediction array into the submission DataFrame layout - confirm.
y_pred_RIDGE_Bagging_df = create_submission(y_pred_RIDGE_Bagging)

# Save the predictions to a CSV file (no index column)
y_pred_RIDGE_Bagging_df.to_csv('y_pred_RIDGE_Bagging_df.csv', index=False)

#### LightGBM inside a BaggingRegressor

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import BaggingRegressor # Import BaggingRegressor
from sklearn.metrics import mean_absolute_percentage_error 

# Inputs expected from earlier cells: X_train_fe, y_train_df, X_test_fe (DataFrames)
# and the create_submission() helper.

kf = KFold(n_splits=5, shuffle=True, random_state=42)
target_names = [f'BlendProperty{i+1}' for i in range(y_train_df.shape[1])]
mape_per_fold = []          # one list of per-target MAPE values per fold
test_preds_all_folds = []   # one test-set prediction array per fold

# Settings common to the RFE ranking model and the final LightGBM model.
# NOTE(review): LightGBM only subsamples rows when subsample_freq > 0; with the
# default left in place, subsample=0.8 is probably inert - confirm.
lgbm_shared_params = dict(
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    device='gpu',
)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_fe)):
    print(f"\n🚀 Fold {fold + 1}")

    # Fold split
    X_tr, X_val = X_train_fe.iloc[train_idx], X_train_fe.iloc[val_idx]
    y_tr, y_val = y_train_df.iloc[train_idx], y_train_df.iloc[val_idx]

    # Scaler statistics come from this fold's training rows only
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)

    # RFE is run for the first three targets only; the union of their selected
    # column indices is used for all ten targets.
    selected_features_all_targets = set()
    for i in range(3):
        lgb_rfe_estimator = LGBMRegressor(
            n_estimators=200,
            max_depth=7,
            num_leaves=48,
            **lgbm_shared_params,
        )
        rfe = RFE(estimator=lgb_rfe_estimator, n_features_to_select=75, step=5)
        rfe.fit(X_tr_scaled, y_tr.iloc[:, i])
        selected_features = np.flatnonzero(rfe.support_)
        selected_features_all_targets |= set(selected_features)

    selected_features_all_targets = sorted(selected_features_all_targets)
    X_tr_selected = X_tr_scaled[:, selected_features_all_targets]
    X_val_selected = X_val_scaled[:, selected_features_all_targets]

    # Final model: LightGBM -> one regressor per target -> bagged five times
    base_lgbm_model = LGBMRegressor(
        n_estimators=600,
        max_depth=8,
        num_leaves=64,
        **lgbm_shared_params,
    )
    multi_output_lgbm = MultiOutputRegressor(base_lgbm_model)
    model = BaggingRegressor(
        estimator=multi_output_lgbm,
        n_estimators=5,      # five bagged copies
        max_samples=0.8,     # each fit on 80% of the fold's training rows
        bootstrap=True,      # drawn with replacement
        random_state=fold,   # different draw in every fold
        n_jobs=-1,
    )

    model.fit(X_tr_selected, y_tr)

    # Held-out evaluation, one MAPE value per blend property
    y_pred = model.predict(X_val_selected)
    fold_mape = [
        mean_absolute_percentage_error(y_val.iloc[:, col], y_pred[:, col])
        for col in range(y_val.shape[1])
    ]
    mape_per_fold.append(fold_mape)
    print(f"✅ Fold {fold + 1} MAPE per target:", np.round(fold_mape, 4))

    # Test-set prediction using this fold's scaler, feature subset and model
    X_test_scaled = scaler.transform(X_test_fe)
    X_test_selected = X_test_scaled[:, selected_features_all_targets]
    y_test_pred = model.predict(X_test_selected)
    test_preds_all_folds.append(y_test_pred)

# Element-wise mean of the five folds' test predictions
y_test_predict_lgbm_rfe_bagging = np.mean(test_preds_all_folds, axis=0)

# Fold-averaged MAPE per target
mape_df = pd.DataFrame(mape_per_fold, columns=target_names)
mape_df_mean = mape_df.mean().round(4)
print("\n📊 Mean MAPE per Blend Property after RFE + Bagging:\n", mape_df_mean)

# Format the averaged predictions with create_submission and write them to CSV
y_pred_LGBM_Bagging_df = create_submission(y_test_predict_lgbm_rfe_bagging)
y_pred_LGBM_Bagging_df.to_csv('y_pred_LGBM_Bagging_df.csv', index=False)

NameError: name 'test_preds_all_folds' is not defined

In [None]:
import pandas as pd

#### Solutions

In [None]:
import pandas as pd
import os

# Directory holding every model's saved test-prediction CSV (machine-specific path)
base_path = r"C:/Users/Hemant Pathak/kaggle_competitions/Fuel_blend_challenge"

# Short model label -> CSV file name inside base_path.
# The labels are the keys used later to pick a model per BlendProperty column.
file_map = {
    "RF_RFE": "y_rf_cv_RFE_test_df.csv",
    "CatBoost": "Catboost_best_82.24153.csv",
    "LightGBM_Tuned": "y_test_pred_lgbmtuned.csv",
    "LightGBM": "y_lgbm_kfcv_test_predictions.csv",
    "XGBoost_Bagging": "y_pred_XGBOOST_Bagging_df.csv",
    "Ridge_Bagging": "y_pred_RIDGE_Bagging_df.csv",
    "Lasso_RFE": "y_test_pred_lasso_df.csv",
    "LinearRegression": "Feature_engineered_LinearRegression.csv"
}


In [None]:
# Read every CSV listed in file_map into memory, keyed by its model label.
# A missing file raises from pd.read_csv and stops the loop at that entry.
model_preds = {}
for model_name, file_name in file_map.items():
    file_path = os.path.join(base_path, file_name)
    model_preds[model_name] = pd.read_csv(file_path)
    print(f"✅ Loaded {model_name} from {file_name}")

✅ Loaded RF_RFE from y_rf_cv_RFE_test_df.csv
✅ Loaded CatBoost from Catboost_best_82.24153.csv
✅ Loaded LightGBM_Tuned from y_test_pred_lgbmtuned.csv
✅ Loaded LightGBM from y_lgbm_kfcv_test_predictions.csv
✅ Loaded XGBoost_Bagging from y_pred_XGBOOST_Bagging_df.csv
✅ Loaded Ridge_Bagging from y_pred_RIDGE_Bagging_df.csv
✅ Loaded Lasso_RFE from y_test_pred_lasso_df.csv
✅ Loaded LinearRegression from Feature_engineered_LinearRegression.csv


In [None]:

# Seed the output frame with the ID column of one prediction file.
# Assumes every file lists the same IDs in the same row order - the later
# column assignments align on the row index, not on ID.
final_predictions = model_preds["RF_RFE"][["ID"]].copy()

# Target column names: BlendProperty1 .. BlendProperty10
blend_cols = ["BlendProperty%d" % number for number in range(1, 11)]

In [None]:

# Manual selection: for each BlendProperty column, the label (a key of
# model_preds / file_map) of the model whose prediction should be used.
# Example: "BlendProperty1": "LightGBM_Tuned"
selected_models_per_column = {
    "BlendProperty1": "LinearRegression",
    "BlendProperty2": "LinearRegression",
    "BlendProperty3": "RF_RFE",
    "BlendProperty4": "LinearRegression",
    "BlendProperty5": "RF_RFE",
    "BlendProperty6": "LinearRegression",
    "BlendProperty7": "RF_RFE",
    "BlendProperty8": "CatBoost",
    "BlendProperty9": "CatBoost",
    "BlendProperty10": "LinearRegression"
}

In [None]:
# Copy each target column from the model chosen for it in selected_models_per_column.
# The assignment aligns on the DataFrame row index, so all prediction files must
# share the same row order.
for col in blend_cols:
    model_name = selected_models_per_column[col]
    final_predictions[col] = model_preds[model_name][col]
    print(f"✔️ {col} selected from {model_name}")

# Write the assembled submission next to the input prediction files
output_path = os.path.join(base_path, "best_blendproperty_predictions.csv")
final_predictions.to_csv(output_path, index=False)
print(f"\n🎯 Final blended prediction saved to: {output_path}")

✔️ BlendProperty1 selected from LinearRegression
✔️ BlendProperty2 selected from LinearRegression
✔️ BlendProperty3 selected from RF_RFE
✔️ BlendProperty4 selected from LinearRegression
✔️ BlendProperty5 selected from RF_RFE
✔️ BlendProperty6 selected from LinearRegression
✔️ BlendProperty7 selected from RF_RFE
✔️ BlendProperty8 selected from CatBoost
✔️ BlendProperty9 selected from CatBoost
✔️ BlendProperty10 selected from LinearRegression

🎯 Final blended prediction saved to: C:/Users/Hemant Pathak/kaggle_competitions/Fuel_blend_challenge\best_blendproperty_predictions.csv


In [None]:
final_predictions.head()

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,0.154944,0.163594,0.674859,0.604427,0.329973,0.713289,0.73365,0.313081,-0.261998,0.319214
1,2,-0.81022,-0.594426,-0.976764,0.135045,-0.737951,-0.103835,-0.958305,-1.122425,-0.61109,-0.004453
2,3,1.76925,1.167058,1.01214,1.067736,1.759482,1.861225,0.975835,1.985521,0.517269,2.258532
3,4,-0.453337,0.3133,0.783049,-0.660791,1.899325,-0.43887,0.730684,1.157836,0.681985,-0.952376
4,5,0.155401,-1.188939,1.127862,0.442967,1.863232,0.237333,1.043453,0.08862,-0.615686,1.063872


In [None]:
import os
import pandas as pd

# Directory that holds every prediction CSV (machine-specific path)
base_path = r"C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge"

# Reference submission that every variation starts from
base_file = "final_submission_combining_linreg_xgboost.csv"
base_df = pd.read_csv(os.path.join(base_path, base_file))

# Destination folder for the generated variations (created if missing)
output_folder = os.path.join(base_path, "PATHAK")
os.makedirs(output_folder, exist_ok=True)

# Model label -> prediction CSV file name
file_map = {
    "RF_RFE": "y_rf_cv_RFE_test_df.csv",
    "CatBoost": "Catboost_best_82.24153.csv",
    "LightGBM_Tuned": "y_test_pred_lgbmtuned.csv",
    "LightGBM": "y_lgbm_kfcv_test_predictions.csv",
    "XGBoost_Bagging": "y_pred_XGBOOST_Bagging_df.csv",
    "Ridge_Bagging": "y_pred_RIDGE_Bagging_df.csv",
    "Lasso_RFE": "y_test_pred_lasso_df.csv",
    "LinearRegression": "Feature_engineered_LinearRegression.csv",
}

# Read each model's predictions once, up front
model_preds = {}
for model_name, file_name in file_map.items():
    full_path = os.path.join(base_path, file_name)
    model_preds[model_name] = pd.read_csv(full_path)
    print(f"✅ Loaded: {model_name} → {file_name}")

# Target columns: BlendProperty1 .. BlendProperty10
blend_cols = ["BlendProperty%d" % number for number in range(1, 11)]

# For every (target column, model) pair write one file: the reference submission
# with just that column taken from the model. Columns are aligned on the row index.
for prop in blend_cols:
    for model_name, pred_df in model_preds.items():
        # assign() returns a modified copy, so base_df itself is never altered
        modified_df = base_df.assign(**{prop: pred_df[prop]})

        output_filename = "replace_" + prop + "_with_" + model_name + ".csv"
        output_path = os.path.join(output_folder, output_filename)

        modified_df.to_csv(output_path, index=False)
        print(f"📝 Saved: {output_filename}")

print("\n🎉 All variations saved in:", output_folder)


✅ Loaded: RF_RFE → y_rf_cv_RFE_test_df.csv
✅ Loaded: CatBoost → Catboost_best_82.24153.csv
✅ Loaded: LightGBM_Tuned → y_test_pred_lgbmtuned.csv
✅ Loaded: LightGBM → y_lgbm_kfcv_test_predictions.csv
✅ Loaded: XGBoost_Bagging → y_pred_XGBOOST_Bagging_df.csv
✅ Loaded: Ridge_Bagging → y_pred_RIDGE_Bagging_df.csv
✅ Loaded: Lasso_RFE → y_test_pred_lasso_df.csv
✅ Loaded: LinearRegression → Feature_engineered_LinearRegression.csv
📝 Saved: replace_BlendProperty1_with_RF_RFE.csv
📝 Saved: replace_BlendProperty1_with_CatBoost.csv
📝 Saved: replace_BlendProperty1_with_LightGBM_Tuned.csv
📝 Saved: replace_BlendProperty1_with_LightGBM.csv
📝 Saved: replace_BlendProperty1_with_XGBoost_Bagging.csv
📝 Saved: replace_BlendProperty1_with_Ridge_Bagging.csv
📝 Saved: replace_BlendProperty1_with_Lasso_RFE.csv
📝 Saved: replace_BlendProperty1_with_LinearRegression.csv
📝 Saved: replace_BlendProperty2_with_RF_RFE.csv
📝 Saved: replace_BlendProperty2_with_CatBoost.csv
📝 Saved: replace_BlendProperty2_with_LightGBM_Tuned

In [None]:
X_train_fe.shape, X_test_fe.shape, y_train_df.shape

((2000, 157), (500, 157), (2000, 10))

#### Light GBM + Catboost blended 80.322

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Requires X_train_fe, y_train_df and X_test_fe from the feature-engineering cells.

# --- Equal-weight blend of a LightGBM and a CatBoost multi-output model ---
print("\n🚀 Starting final blend: LGBM + CatBoost...")

# 1. Standardize once; both models train on the same scaled matrices
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_fe)
X_test_scaled = scaler.transform(X_test_fe)
print("✅ Data scaled.")

# 2. LightGBM: one regressor per target, fit on all training rows
print("Training LightGBM model...")
lgbm_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=40,
    random_state=123,
    n_jobs=-1,
)
lgbm_model = MultiOutputRegressor(LGBMRegressor(**lgbm_params))
lgbm_model.fit(X_train_scaled, y_train_df)
y_predict_lgbm = lgbm_model.predict(X_test_scaled)
print("✅ LightGBM training complete.")

# 3. CatBoost: library defaults apart from the seed, silent logging and RMSE loss
print("\nTraining CatBoost model...")
cat_base = CatBoostRegressor(random_state=42, verbose=0, loss_function='RMSE')
cat_model = MultiOutputRegressor(cat_base)
cat_model.fit(X_train_scaled, y_train_df)
y_predict_cat = cat_model.predict(X_test_scaled)
print("✅ CatBoost training complete.")

# 4. Blend: each model contributes half of every predicted value
print("\nBlending predictions from both models...")
lgbm_weight = cat_weight = 0.5
final_blend_predictions = lgbm_weight * y_predict_lgbm + cat_weight * y_predict_cat

print("\n🏆 FINAL BLEND COMPLETE! Your predictions are in the 'final_blend_predictions' variable.")


🚀 Starting final blend: LGBM + CatBoost...
✅ Data scaled.
Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51231
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 207
[LightGBM] [Info] Start training from score -0.016879
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51231
[LightGBM] [Info] Number of data points in the train set: 2000, number of used features: 207
[LightGBM] [Info] Start training from score -0.002076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51231
[LightGBM] [Info] Number of d

In [None]:
# Format the LGBM + CatBoost blend with create_submission (defined elsewhere in
# the notebook), save it to CSV, and preview the first rows as the cell output.
final_blend_predictions_df=create_submission(final_blend_predictions)
final_blend_predictions_df.to_csv('newest_new_final_blend_predictions_catboost_LGBM.csv', index=False)
final_blend_predictions_df.head()

Unnamed: 0,ID,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,1,0.108825,0.179385,0.653785,0.610549,0.342654,0.78109,0.557287,0.43235,-0.283381,0.418971
1,2,-0.598917,-0.487973,-1.130006,0.074433,-0.744005,-0.0899,-1.115723,-0.980922,-0.574322,0.043556
2,3,1.676594,1.060246,1.077423,1.087847,2.32705,1.774935,1.045778,1.779945,0.393725,2.184837
3,4,-0.452265,0.393069,0.711294,-0.544975,1.923856,-0.369009,0.691257,1.600745,0.753637,-0.872441
4,5,0.117379,-1.290506,1.06194,0.395255,2.346518,0.21498,1.053961,-0.186333,-0.442972,0.989389


#### Other possibilities

In [None]:
import pandas as pd
import numpy as np
import os

print("🚀 Starting last-minute submission generation...")

# --- 1. DEFINE FILE PATHS AND OUTPUT FOLDER ---

# Machine-specific locations; adjust before running elsewhere.
base_path = r"C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge"
output_dir = os.path.join(base_path, "PATHAK", "last_try")

# Input prediction files: the current best submission and the CatBoost submission
path_best = os.path.join(base_path, "PATHAK", "replace_BlendProperty5_with_LightGBM_88.57817.csv")
path_cat = os.path.join(base_path, "Catboost_best_82.24153.csv")

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
print(f"✅ Output folder ready: '{output_dir}'")


# --- 2. LOAD THE PREDICTION FILES ---

try:
    # 'ID' becomes the index so the arithmetic and column swaps below align rows by ID
    df_best = pd.read_csv(path_best, index_col='ID')
    df_cat = pd.read_csv(path_cat, index_col='ID')
except FileNotFoundError as e:
    print(f"❌ ERROR: File not found. Please check your paths. Details: {e}")
    # Re-raise instead of exit(): exit() terminates the whole interpreter/kernel
    # session, while raising stops just this cell and keeps the traceback.
    raise
else:
    print("✅ Successfully loaded prediction files.")

# --- 3. STRATEGY 1: WEIGHTED BLENDING ---
# Blend the best submission with the CatBoost submission

print("\n🔥 Generating Strategy 1: Weighted Blends...")

# Weight given to `df_best`; the remainder (1 - weight) goes to `df_cat`.
weights = [0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.60, 0.50]
file_count = 0

for w in weights:
    # Element-wise weighted mean, aligned on the ID index and column names
    blend_df = df_best * w + df_cat * (1 - w)

    # Percent labels for the filename. round() rather than int(): float error makes
    # (1 - 0.9) * 100 == 9.999999999999998, which int() would truncate to 9.
    weight_best_str = str(round(w * 100))
    weight_cat_str = str(round((1 - w) * 100))
    filename = f"1_blend_best({weight_best_str})_cat({weight_cat_str}).csv"
    filepath = os.path.join(output_dir, filename)

    # The ID index is written as the first CSV column
    blend_df.to_csv(filepath)
    file_count += 1

blend_file_count = file_count
print(f"✅ Generated {file_count} weighted blend files.")


# --- 4. STRATEGY 2: CHERRY-PICKING COLUMNS ---
# Use the best file as a base, but replace one column at a time with CatBoost's prediction

print("\n🔥 Generating Strategy 2: Cherry-Picking Columns...")

# Columns to swap one at a time. Per the original note these are the ones where
# the best submission used Linear Regression (not verifiable from this cell).
cols_to_swap = [
    "BlendProperty1", "BlendProperty2", "BlendProperty3", "BlendProperty4",
    "BlendProperty6", "BlendProperty7", "BlendProperty8", "BlendProperty10"
]
file_count = 0

for col in cols_to_swap:
    # Fresh copy so each output differs from the best submission in one column only
    cherry_pick_df = df_best.copy()

    # Replace the column with the one from the CatBoost file (aligned on ID)
    cherry_pick_df[col] = df_cat[col]

    filename = f"2_cherrypick_best_with_cat_on_{col}.csv"
    filepath = os.path.join(output_dir, filename)

    cherry_pick_df.to_csv(filepath)
    file_count += 1

print(f"✅ Generated {file_count} cherry-picked files.")

# Report the real total instead of a hard-coded number
total_file_count = blend_file_count + file_count
print(f"\n\n🏆 All done! You have {total_file_count} new files in the 'last_try' folder. Good luck!")

🚀 Starting last-minute submission generation...
✅ Output folder ready: 'C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge\PATHAK\last_try'
✅ Successfully loaded prediction files.

🔥 Generating Strategy 1: Weighted Blends...
✅ Generated 8 weighted blend files.

🔥 Generating Strategy 2: Cherry-Picking Columns...
✅ Generated 8 cherry-picked files.


🏆 All done! You have 16 new files in the 'last_try' folder. Good luck!


#### Trying out changing one column for each output, and blending the results

In [None]:
import pandas as pd
import numpy as np
import os

# === Inputs and output location ===
# Two existing submission files are blended in this cell. Note that the file
# bound to `catboost_path` is, going by its name, a cherry-picked submission
# rather than a raw CatBoost prediction file.
linreg_path = r"C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge\Feature_engineered_LinearRegression.csv"
catboost_path = r"C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge\PATHAK\last_try\88.76420_2_cherrypick_best_with_cat_on_BlendProperty3_.csv"
output_folder = r"C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge\half_hour_to_go"

# Make sure the destination exists; an already existing folder is reused.
os.makedirs(output_folder, exist_ok=True)

# === Load both prediction tables ===
df_linreg, df_cat = (
    pd.read_csv(csv_path) for csv_path in (linreg_path, catboost_path)
)

# === Split identifier from predictions ===
# Column 0 is carried through as the identifier and every remaining column is
# treated as a prediction. The identifier comes from the LinReg file only, so
# this assumes both files list the same rows in the same order — TODO confirm.
ID_col, pred_linreg = df_linreg.iloc[:, 0], df_linreg.iloc[:, 1:]
pred_cat = df_cat.iloc[:, 1:]

# === Save helper ===
def save_ensemble(pred_df, name):
    """Write one blended prediction table into ``output_folder``.

    The module-level ``ID_col`` is placed in front of ``pred_df`` with a
    column-wise concat (rows are matched on the index) and the combined table
    is written as CSV without the index.

    Args:
        pred_df: DataFrame holding the blended predictions.
        name: File name (not a full path) of the CSV to create.
    """
    destination = os.path.join(output_folder, name)
    with_ids = pd.concat([ID_col, pred_df], axis=1)
    with_ids.to_csv(destination, index=False)

# === 1. Simple average ===
# Equal-weight mean of the two prediction tables.
equal_mix = (pred_linreg + pred_cat) / 2
save_ensemble(equal_mix, "01_avg.csv")

# === 2–10. Weighted averages (0.1 to 0.9) ===
# `alpha` is the LinReg share and the second table receives the remainder.
# Files are numbered 02..10 in order of increasing LinReg weight, and both
# weights are embedded in the file name.
lin_shares = np.linspace(0.1, 0.9, 9)
for i, alpha in enumerate(lin_shares, start=2):
    out_name = f"{i:02}_wavg_{alpha:.1f}_{1-alpha:.1f}.csv"
    blended = pred_linreg * alpha + pred_cat * (1 - alpha)
    save_ensemble(blended, out_name)

# === 11–15. Ratio blends w:1 (heavier linreg weight) ===
# File names keep the original "expavg" label. LinReg counts `w` times and the
# second table once; w=1 therefore reproduces the simple average.
for i, w in enumerate((1, 2, 3, 5, 8), start=11):
    weighted_sum = pred_linreg * w + pred_cat
    blended = weighted_sum / (w + 1)
    save_ensemble(blended, f"{i:02}_expavg_lin{w}_cat1.csv")

# === 16. Rank average ===
# Average the per-column ranks of the two models, then map the averaged rank
# back onto the prediction scale. Saving the raw averaged ranks (values in
# 1..n_rows) would put numbers in the file that are unrelated to the magnitude
# of the predicted targets.
rank_linreg = pred_linreg.rank()
rank_cat = pred_cat.rank()
rank_avg = (rank_linreg + rank_cat) / 2

# Reference value for each integer rank position: the mean of the two models'
# sorted predictions for that column. Fractional average ranks are linearly
# interpolated between neighbouring positions.
# Assumes both tables have the same columns and row count — TODO confirm.
_rank_positions = np.arange(1, len(rank_avg) + 1)
rank_blend = pd.DataFrame(
    {
        col: np.interp(
            rank_avg[col].to_numpy(),
            _rank_positions,
            (
                np.sort(pred_linreg[col].to_numpy())
                + np.sort(pred_cat[col].to_numpy())
            ) / 2,
        )
        for col in rank_avg.columns
    },
    index=rank_avg.index,
)
save_ensemble(rank_blend, "16_rank_avg.csv")

# === 17. Log average (geometric mean) ===
# A geometric mean only exists where both (epsilon-shifted) predictions are
# positive. Taking the log of a non-positive value yields NaN (plus a
# RuntimeWarning), which would end up as empty cells in the saved file.
# Cells where the log is defined are computed exactly as before; the
# remaining cells fall back to the plain arithmetic mean.
_eps = 1e-6  # same shift the original formula applied before the log
_shifted_lin = pred_linreg + _eps
_shifted_cat = pred_cat + _eps
_log_ok = (_shifted_lin > 0) & (_shifted_cat > 0)

# Invalid cells are replaced by 1.0 before the log purely to avoid the
# warning; their result is discarded by the final `where`.
_geo_mean = np.exp(
    (
        np.log(_shifted_lin.where(_log_ok, 1.0))
        + np.log(_shifted_cat.where(_log_ok, 1.0))
    ) / 2
)
log_avg = _geo_mean.where(_log_ok, (pred_linreg + pred_cat) / 2)
save_ensemble(log_avg, "17_log_avg.csv")

# === 18–19. Min / max prediction ===
# Element-wise lower and upper envelope of the two prediction tables.
for fname, pick in (("18_min.csv", np.minimum), ("19_max.csv", np.maximum)):
    save_ensemble(pick(pred_linreg, pred_cat), fname)

# === 20. Safety blend: 95% CatBoost + 5% LinReg ===
# Stays close to the second table, with only a small LinReg contribution.
lin_part = pred_linreg * 0.05
cat_part = pred_cat * 0.95
safe_blend = lin_part + cat_part
save_ensemble(safe_blend, "20_safe_catboost_95.csv")

print(f"✅ All 20 ensemble CSVs saved to:\n{output_folder}")

✅ All 20 ensemble CSVs saved to:
C:\Users\Hemant Pathak\kaggle_competitions\Fuel_blend_challenge\half_hour_to_go


  result = func(self.values, **kwargs)
