In [62]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans, AffinityPropagation
from sklearn.mixture import GaussianMixture

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder, OrdinalEncoder, FunctionTransformer, KBinsDiscretizer, TargetEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

test_df = pd.read_csv(r'..//data//test.csv')
train_df = pd.read_csv(r'..//data//train.csv')
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)

target = 'price'

def prepare_data(df: pd.DataFrame, is_train: bool = True):
    """
    Prepares the dataset for training or testing by renaming columns, handling missing values,
    converting categorical and numerical features, and creating new features.
    
    Args:
        df (pd.DataFrame): The input dataframe (train or test).
        is_train (bool): Indicates if the dataframe is training data (default is True).
        
    Returns:
        pd.DataFrame: The processed dataframe.
    """
    
    # Define the column names
    columns = [
        'id', 'brand', 'material', 'size', 'compartments', 
        'laptop_compartment', 'is_waterproof', 'style', 'color', 
        'weight_capacity'
    ]
    
    if is_train:
        columns.append('price')
    
    df.columns = columns
    df = df.drop(columns='id')
    
    # Define the mapping for Size conversion
    size_mapping = {"Small": 1, "Medium": 2, "Large": 3}
    df["size_int"] = df["size"].map(size_mapping).fillna(0).astype(int)
    
    # Handle weight capacity
    df['weight_capacity'] = df['weight_capacity'].fillna(0)
    df['weight_capacity_int'] = df['weight_capacity'].astype(int)
    df['weight_capacity_size'] = df['weight_capacity'] * df['size_int']
    
    # Convert categorical columns
    df['compartments'] = df['compartments'].astype('category')
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    df[cat_cols] = df[cat_cols].astype('category')
    
    # Convert boolean columns to integer type
    df['laptop_compartment'] = df['laptop_compartment'].cat.codes.fillna(-1).astype(int)
    df['is_waterproof'] = df['is_waterproof'].cat.codes.fillna(-1).astype(int)
    
    return df

# Apply function to train and test datasets
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)

In [24]:
baseline_features = ['weight_capacity', 'color', 'compartments', 'brand', 'material', 'is_waterproof']

In [64]:
len(train_df['weight_capacity'].unique())

1920346

In [85]:
def preprocess_weight_capacity(train_df, test_df, n_bins=5):
    """
    Function to bin 'weight_capacity' and apply Target Encoding based on the target column.
    
    Parameters:
    train_df (pd.DataFrame): Training dataframe containing 'weight_capacity'.
    test_df (pd.DataFrame): Test dataframe containing 'weight_capacity'.
    target_column (str): Target variable for encoding.
    n_bins (int): Number of bins for discretization.
    
    Returns:
    pd.DataFrame, pd.DataFrame: Transformed train and test DataFrames.
    """
    # Apply KBinsDiscretizer to bin 'weight_capacity'
    bins_discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    train_df['binned_weight_capacity'] = bins_discretizer.fit_transform(train_df[['weight_capacity']])
    test_df['binned_weight_capacity'] = bins_discretizer.transform(test_df[['weight_capacity']])

    # Apply TargetEncoder to encode the binned values based on the target_column
    target_encoder = TargetEncoder(target_type="continuous")
    train_df['encoded_weight_capacity'] = target_encoder.fit_transform(train_df[['binned_weight_capacity']], train_df[target])
    test_df['encoded_weight_capacity'] = target_encoder.transform(test_df[['binned_weight_capacity']])
    
    return train_df, test_df

In [89]:
df = train_df.copy()
bins_discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
df['binned_weight_capacity'] = bins_discretizer.fit_transform(df[['weight_capacity']])

target_encoder = TargetEncoder()
df['encoded_weight_capacity'] = target_encoder.fit_transform(df[['binned_weight_capacity']], df[target])
df

Unnamed: 0,brand,material,size,compartments,laptop_compartment,is_waterproof,style,color,weight_capacity,price,size_int,weight_capacity_int,weight_capacity_size,binned_weight_capacity,encoded_weight_capacity
0,Jansport,Leather,Medium,7.0,1,0,Tote,Black,11.611723,112.15875,2,11,23.223446,1.0,81.477060
1,Jansport,Canvas,Small,10.0,1,1,Messenger,Green,27.078537,68.88056,1,27,27.078537,4.0,81.878728
2,Under Armour,Leather,Small,2.0,1,0,Messenger,Red,16.643760,39.17320,1,16,16.643760,2.0,82.182396
3,Nike,Nylon,Small,8.0,1,0,Messenger,Green,12.937220,80.60793,1,12,12.937220,1.0,81.477060
4,Adidas,Canvas,Medium,1.0,1,1,Messenger,Green,17.749338,86.02312,2,17,35.498677,2.0,82.189217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3994313,Nike,Canvas,,3.0,1,1,Messenger,Blue,28.098120,104.74460,0,28,0.000000,4.0,81.840607
3994314,Puma,Leather,Small,10.0,1,1,Tote,Blue,17.379531,122.39043,1,17,17.379531,2.0,82.182396
3994315,Jansport,Canvas,Large,10.0,0,0,Backpack,Red,17.037708,148.18470,3,17,51.113124,2.0,82.189217
3994316,Puma,Canvas,,2.0,0,0,Backpack,Gray,28.783339,22.32269,0,28,0.000000,4.0,81.863574


In [90]:
df.groupby('binned_weight_capacity').mean('weight_capacity')

Unnamed: 0_level_0,laptop_compartment,is_waterproof,weight_capacity,price,size_int,weight_capacity_int,weight_capacity_size,encoded_weight_capacity
binned_weight_capacity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,0.466075,0.468488,8.157747,79.577652,1.980653,7.626354,16.154446,79.577629
1.0,0.468188,0.467242,13.354291,81.498674,1.969255,12.803882,26.293018,81.498701
2.0,0.474832,0.471856,18.11874,82.197817,1.972133,17.546724,35.740828,82.197782
3.0,0.472611,0.471414,22.859926,81.683447,1.971058,22.264032,45.05755,81.683414
4.0,0.46463,0.467935,27.547885,81.857677,1.97794,26.970671,54.49138,81.857672


In [87]:
def cross_validate_features(models, X, y, kf):
    model_scores = {name: [] for name in models.keys()}
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbose': -1,
        'force_row_wise': True
    }

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        print(f"Starting Fold {fold}...")

        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        X_train, X_valid = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_valid)
        
        for name, model_features in models.items():
            
            train_data = lgb.Dataset(X_train[model_features], label=y_train)
            valid_data = lgb.Dataset(X_valid[model_features], label=y_valid, reference=train_data)
            fit_model = lgb.train(params, train_data, num_boost_round=500, valid_sets=[valid_data])
            y_pred = fit_model.predict(X_valid[model_features], num_iteration=fit_model.best_iteration)
            
            rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
            model_scores[name].append(rmse)
            
    return pd.DataFrame(model_scores)

In [88]:
models = {
    "baseline": baseline_features,
    "weight_capacity": ["weight_capacity"],
    "binned_weight_capacity": ["binned_weight_capacity"],
    "encoded_weight_capacity": ["encoded_weight_capacity"],
    "bin / encode weight_capacity": ["binned_weight_capacity", "encoded_weight_capacity"],
    # "color": ["color"],
    # "brand": ["brand"],
    # "compartments": ["compartments"],
    # "material": ["material"],
    # "is_waterproof": ["is_waterproof"],
}

X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_features(models, X, y, kf)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

Starting Fold 1...
Starting Fold 2...
Starting Fold 3...
Starting Fold 4...
Starting Fold 5...
Starting Fold 6...
Starting Fold 7...
Starting Fold 8...
Starting Fold 9...
Starting Fold 10...


Unnamed: 0,Mean RMSE,Std RMSE
baseline,38.890352,0.040699
weight_capacity,38.913651,0.041688
binned_weight_capacity,38.927774,0.042079
encoded_weight_capacity,38.927784,0.042071
bin / encode weight_capacity,38.927784,0.042071
is_waterproof,38.936586,0.042869


In [103]:
from itertools import combinations

def target_encoding(
    train_df: pd.DataFrame,
    cat_cols: list,
    target: str,
    test_df: pd.DataFrame = None,
    interactions: bool = True
):
    # Make copies to avoid mutating original data
    train_df = train_df.copy()
    test_df = test_df.copy() if test_df is not None else None
    
    encoded_cols = []

    # --- Encode each individual categorical column with TargetEncoder ---
    for col in cat_cols:
        # Initialize a fresh TargetEncoder for each column
        te = TargetEncoder(target_type="continuous")  
        
        # Fit on the training data
        # print(train_df[target])
        train_encoded = te.fit_transform(train_df[[col]], train_df[target])
        train_encoded_col = f"{col}_encoded"
        train_df[train_encoded_col] = train_encoded
        
        # Apply to test data (if provided)
        if test_df is not None:
            test_encoded = te.transform(test_df[[col]])
            test_encoded_col = f"{col}_encoded"
            test_df[test_encoded_col] = test_encoded
            
        encoded_cols.append(train_encoded_col)

    # --- (Optional) Encode interaction columns ---
    if interactions:
        for col1, col2 in combinations(cat_cols, 2):
            # Construct an interaction feature in train
            train_interaction = train_df[col1].astype(str) + "_" + train_df[col2].astype(str)
            
            # We'll store it in a temporary column just for clarity
            train_df["_interaction"] = train_interaction
            
            # Fit a fresh TargetEncoder on this new "interaction" column
            te_inter = TargetEncoder(target_type="continuous")
            train_encoded = te_inter.fit_transform(train_df[["_interaction"]], train_df[target])
            
            # Create a column name for the interaction encoding
            interaction_encoded_col = f"{col1}_{col2}_encoded"
            train_df[interaction_encoded_col] = train_encoded
            
            # Encode the test data (if provided)
            if test_df is not None:
                test_interaction = test_df[col1].astype(str) + "_" + test_df[col2].astype(str)
                test_df["_interaction"] = test_interaction
                test_encoded = te_inter.transform(test_df[["_interaction"]])
                test_df[interaction_encoded_col] = test_encoded
                
                # Drop the temporary interaction column
                test_df.drop(columns="_interaction", inplace=True, errors="ignore")

            # Drop the temporary interaction column from train
            train_df.drop(columns="_interaction", inplace=True, errors="ignore")

            encoded_cols.append(interaction_encoded_col)

    return train_df, test_df, encoded_cols

In [None]:
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist() + ['laptop_compartment', 'is_waterproof', 'weight_capacity_int']
train_df_encoded, _, encoded_cols = target_encoding(
    train_df=train_df, 
    cat_cols=cat_cols, 
    target=target, 
    test_df=train_df
)
train_df_encoded

In [101]:
encoded_cols

['brand_encoded',
 'material_encoded',
 'size_encoded',
 'compartments_encoded',
 'style_encoded',
 'color_encoded',
 'laptop_compartment_encoded',
 'is_waterproof_encoded',
 'weight_capacity_int_encoded',
 'brand_material_encoded',
 'brand_size_encoded',
 'brand_compartments_encoded',
 'brand_style_encoded',
 'brand_color_encoded',
 'brand_laptop_compartment_encoded',
 'brand_is_waterproof_encoded',
 'brand_weight_capacity_int_encoded',
 'material_size_encoded',
 'material_compartments_encoded',
 'material_style_encoded',
 'material_color_encoded',
 'material_laptop_compartment_encoded',
 'material_is_waterproof_encoded',
 'material_weight_capacity_int_encoded',
 'size_compartments_encoded',
 'size_style_encoded',
 'size_color_encoded',
 'size_laptop_compartment_encoded',
 'size_is_waterproof_encoded',
 'size_weight_capacity_int_encoded',
 'compartments_style_encoded',
 'compartments_color_encoded',
 'compartments_laptop_compartment_encoded',
 'compartments_is_waterproof_encoded',
 'c

In [113]:
def cross_validate_encoded_features(models, X, y, kf):
    model_scores = {name: [] for name in models.keys()}
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbose': -1,
        'force_row_wise': True
    }

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        print(f"Starting Fold {fold}...")

        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        X_train, X_valid = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_valid)

        X_train, X_valid, encoded_cols = target_encoding(
            train_df=X_train,
            cat_cols=cat_cols,
            test_df=X_valid, 
            target=y_train.name,
        )
        
        for name, model_features in models.items():
            
            train_data = lgb.Dataset(X_train[model_features], label=y_train)
            valid_data = lgb.Dataset(X_valid[model_features], label=y_valid, reference=train_data)
            fit_model = lgb.train(params, train_data, num_boost_round=500, valid_sets=[valid_data])
            y_pred = fit_model.predict(X_valid[model_features], num_iteration=fit_model.best_iteration)
            
            rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
            model_scores[name].append(rmse)
            
    return pd.DataFrame(model_scores)

In [114]:
models = {
    "baseline": baseline_features,
    "encoded_cols": encoded_cols,
    "encoded_cols + baseline": encoded_cols + baseline_features,
    "weight_capacity": ["weight_capacity"],
    "weight_capacity_int": ["weight_capacity_int"],
}

X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_encoded_features(models, X, y, kf)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

Starting Fold 1...
Starting Fold 2...
Starting Fold 3...
Starting Fold 4...
Starting Fold 5...
Starting Fold 6...
Starting Fold 7...
Starting Fold 8...
Starting Fold 9...
Starting Fold 10...


Unnamed: 0,Mean RMSE,Std RMSE
baseline,38.890352,0.040699
encoded_cols,38.8784,0.04096
encoded_cols + baseline,38.871738,0.041062
weight_capacity,38.913651,0.041688
weight_capacity_int,38.919285,0.041895


In [111]:
import xgboost as xgb

def cross_validate_encoded_features_xgb(models, X, y, kf):
    model_scores = {name: [] for name in models.keys()}
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        # 'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'verbose': -1,
        'device': "cuda",
        'force_row_wise': True,
        'subsample': 0.85,
        'colsample_bylevel': 0.9,
        'colsample_bytree': 0.9,
        'gamma': 0.6,
        'max_depth': 6, 'min_child_weight': 2,
        'reg_alpha': 1.0, 'reg_lambda': 1e-06
    }

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        print(f"Starting Fold {fold}...")

        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        X_train, X_valid = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_valid)

        X_train, X_valid, encoded_cols = target_encoding(
            train_df=X_train,
            cat_cols=cat_cols,
            test_df=X_valid, 
            target=y_train.name,
        )
        
        for name, model_features in models.items():

            dtrain = xgb.DMatrix(X_train[model_features], label=y_train, enable_categorical=True)
            dvalid = xgb.DMatrix(X_valid[model_features], label=y_valid, enable_categorical=True)

            bst = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=1000,
                evals=[(dtrain, "train"), (dvalid, "validation_0")],
                early_stopping_rounds=50,
                verbose_eval=False,
            )
            y_pred = bst.predict(dvalid)
            
            rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
            model_scores[name].append(rmse)
            
    return pd.DataFrame(model_scores)

In [112]:
models = {
    "baseline": baseline_features,
    "encoded_cols": encoded_cols,
    "encoded_cols + baseline": encoded_cols + baseline_features,
    "weight_capacity": ["weight_capacity"],
    "weight_capacity_int": ["weight_capacity_int"],
}

X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_encoded_features_xgb(models, X, y, kf)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

Starting Fold 1...
Starting Fold 2...
Starting Fold 3...
Starting Fold 4...
Starting Fold 5...
Starting Fold 6...
Starting Fold 7...
Starting Fold 8...
Starting Fold 9...
Starting Fold 10...


Unnamed: 0,Mean RMSE,Std RMSE
baseline,38.88941,0.041246
encoded_cols,38.880539,0.040766
encoded_cols + baseline,38.873157,0.040084
weight_capacity,38.911797,0.041774
weight_capacity_int,38.919288,0.041894


In [80]:
train_df['weight_capacity_int'] = train_df['weight_capacity'].astype(int)
train_df['weight_capacity_size'] = train_df['weight_capacity'] * train_df['size_int']

train_df['weight_capacity_binned'] = pd.qcut(train_df['weight_capacity'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# Interaction Terms
train_df['weight_capacity_brand'] = train_df['weight_capacity'] * train_df['brand'].astype('category').cat.codes

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
train_df[['weight_capacity_poly_2']] = poly.fit_transform(train_df[['weight_capacity']])[:, 1:2]  # squared term
train_df[['weight_capacity_poly_3']] = poly.transform(train_df[['weight_capacity']])[:, 2:]

# Exponential
train_df['weight_capacity_exp'] = np.exp(train_df['weight_capacity'])

# Reciprocal Transformations
train_df['weight_capacity_inv'] = 1 / (train_df['weight_capacity'] + 1e-6)  # Avoid division by zero

# Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
train_df['weight_capacity_pca'] = pca.fit_transform(train_df[['weight_capacity']])
# train_df['weight_capacity_pca2'] = pca.fit_transform(train_df[['weight_capacity', 'color', 'compartments', 'brand']])

In [76]:

features_to_try = [
    'weight_capacity_int', 'weight_capacity_size', 
    'weight_capacity_binned', 'weight_capacity_brand', 'weight_capacity_poly_2', 
    'weight_capacity_poly_3','weight_capacity_exp', 'weight_capacity_inv', 
    'weight_capacity_pca'] # , 'weight_capacity_density'

models = {
    "baseline": baseline_features,
    "weight_capacity_int": ['weight_capacity_int'] + baseline_features,
    "weight_capacity_size": ['weight_capacity_size'] + baseline_features,
    "weight_capacity_binned": ['weight_capacity_binned'] + baseline_features,
    "weight_capacity_brand": ['weight_capacity_brand'] + baseline_features,
    "weight_capacity_poly_2": ['weight_capacity_poly_2'] + baseline_features,
    "weight_capacity_poly_3": ['weight_capacity_poly_3'] + baseline_features,
    "weight_capacity_exp": ['weight_capacity_exp'] + baseline_features,
    "weight_capacity_inv": ['weight_capacity_inv'] + baseline_features,
    # "weight_capacity_density": ['weight_capacity_density'] + baseline_features,
    "weight_capacity_pca": ['weight_capacity_pca'] + baseline_features,
    
    # Combination models
    "poly_features": ['weight_capacity_poly_2', 'weight_capacity_poly_3'] + baseline_features,
    "transformed_features": ['weight_capacity_exp', 'weight_capacity_inv'] + baseline_features,
    "interaction_features": ['weight_capacity_brand', 'weight_capacity_size'] + baseline_features,
    # "density_pca_features": ['weight_capacity_density', 'weight_capacity_pca', 'weight_capacity'] + baseline_features,
    "all_features": features_to_try + baseline_features  # Full model
}

In [None]:
X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_features(models, X, y, kf, verbose=False)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

In [20]:
from itertools import combinations

def mean_std_target_encoding(df, cat_cols, target, interactions=True):
    df = df.copy()  # Avoid modifying the original dataframe
    encoded_cols = []  # Store new encoded column names
    
    for col in cat_cols:
        # Compute mean and std of target for each category in the column
        category_stats = df.groupby(col)[target].agg(['mean', 'std'])
        
        # Map the values to the dataframe
        df[f'{col}_mean'] = df[col].map(category_stats['mean']).astype(float)
        df[f'{col}_std'] = df[col].map(category_stats['std']).astype(float)
        
        encoded_cols.extend([f'{col}_mean', f'{col}_std'])
        
        # Fill NaN values (in case some categories are missing)
        df[f'{col}_mean'].fillna(df[f'{col}_mean'].mean(), inplace=True)
        df[f'{col}_std'].fillna(df[f'{col}_std'].mean(), inplace=True)
    
    if interactions:
        # Interaction Target Encoding
        for col1, col2 in combinations(cat_cols, 2):
            interaction_col = df[col1].astype(str) + "_" + df[col2].astype(str)
            
            # Compute mean and std of target for each interaction category
            interaction_stats = df.groupby(interaction_col)[target].agg(['mean', 'std'])
            
            # Map the values to the dataframe
            df[f'{col1}_{col2}_mean'] = interaction_col.map(interaction_stats['mean']).astype(float)
            df[f'{col1}_{col2}_std'] = interaction_col.map(interaction_stats['std']).astype(float)
            
            encoded_cols.extend([f'{col1}_{col2}_mean', f'{col1}_{col2}_std'])
            
            # Fill NaN values (in case some interactions are missing)
            df[f'{col1}_{col2}_mean'].fillna(df[f'{col1}_{col2}_mean'].mean(), inplace=True)
            df[f'{col1}_{col2}_std'].fillna(df[f'{col1}_{col2}_std'].mean(), inplace=True)
    
    return df, encoded_cols

In [50]:
from itertools import combinations
import pandas as pd

def mean_std_target_encoding(
    train_df: pd.DataFrame,
    cat_cols: list,
    target: str,
    test_df: pd.DataFrame = None,
    interactions: bool = True
):
    """
    Computes mean and std of the target for each categorical feature (and optional interactions)
    from train_df, then maps those statistics onto both train_df and test_df if provided.

    Parameters
    ----------
    train_df : pd.DataFrame
        The training dataframe.
    cat_cols : list
        List of categorical columns to encode.
    target : str
        The target column name on which to compute mean and std.
    test_df : pd.DataFrame, optional
        The test dataframe. If provided, the train statistics are applied to it.
    interactions : bool, optional
        Whether to create interaction columns for every pair of `cat_cols`.

    Returns
    -------
    train_df : pd.DataFrame
        The modified training dataframe with new encoding columns.
    test_df : pd.DataFrame or None
        The modified test dataframe if provided; otherwise None.
    encoded_cols : list
        List of newly created encoded column names.
    """
    
    # Make copies to avoid mutating original data
    train_df = train_df.copy()
    test_df = test_df.copy() if test_df is not None else None
    
    encoded_cols = []

    # --- Encode each individual categorical column ---
    for col in cat_cols:
        # 1) Compute stats on train
        category_stats = train_df.groupby(col)[target].agg(['mean', 'std'])
        
        # 2) Map to train
        mean_col_name = f'{col}_mean'
        std_col_name  = f'{col}_std'
        
        train_df[mean_col_name] = train_df[col].map(category_stats['mean']).astype(float)
        train_df[std_col_name]  = train_df[col].map(category_stats['std']).astype(float)
        
        # 3) Compute the train-level means (for filling missing values)
        train_mean_for_mean = train_df[mean_col_name].mean()
        train_mean_for_std  = train_df[std_col_name].mean()
        
        # Fill any missing in train itself
        train_df[mean_col_name].fillna(train_mean_for_mean, inplace=True)
        train_df[std_col_name].fillna(train_mean_for_std, inplace=True)
        
        # 4) If a test_df is provided, apply same mapping
        if test_df is not None:
            test_df[mean_col_name] = test_df[col].map(category_stats['mean']).astype(float)
            test_df[std_col_name]  = test_df[col].map(category_stats['std']).astype(float)
            test_df[mean_col_name].fillna(train_mean_for_mean, inplace=True)
            test_df[std_col_name].fillna(train_mean_for_std, inplace=True)

        encoded_cols.extend([mean_col_name, std_col_name])

    # --- (Optional) Encode interaction columns ---
    if interactions:
        for col1, col2 in combinations(cat_cols, 2):
            # Construct an interaction column in train
            train_interaction = train_df[col1].astype(str) + "_" + train_df[col2].astype(str)
            
            # Compute stats on train using the interaction
            interaction_stats = train_df.groupby(train_interaction)[target].agg(['mean', 'std'])
            
            # Column names for the new features
            mean_col_name = f'{col1}_{col2}_mean'
            std_col_name  = f'{col1}_{col2}_std'
            
            # Map onto the train
            train_df[mean_col_name] = train_interaction.map(interaction_stats['mean']).astype(float)
            train_df[std_col_name]  = train_interaction.map(interaction_stats['std']).astype(float)
            
            # Compute the train-level means for missing values
            train_mean_for_mean = train_df[mean_col_name].mean()
            train_mean_for_std  = train_df[std_col_name].mean()
            
            # Fill missing in train
            train_df[mean_col_name].fillna(train_mean_for_mean, inplace=True)
            train_df[std_col_name].fillna(train_mean_for_std, inplace=True)
            
            # If test_df provided, apply same logic
            if test_df is not None:
                test_interaction = test_df[col1].astype(str) + "_" + test_df[col2].astype(str)
                test_df[mean_col_name] = test_interaction.map(interaction_stats['mean']).astype(float)
                test_df[std_col_name]  = test_interaction.map(interaction_stats['std']).astype(float)
                test_df[mean_col_name].fillna(train_mean_for_mean, inplace=True)
                test_df[std_col_name].fillna(train_mean_for_std, inplace=True)
                

            encoded_cols.extend([mean_col_name, std_col_name])

    return train_df, test_df, encoded_cols


In [51]:
# df = train_df.copy()  # Avoid modifying the original dataframe
# encoded_cols = []  # Store new encoded column names

# for col in cat_cols:
#     # Compute mean and std of target for each category in the column
#     category_stats = df.groupby(col)[target].agg(['mean', 'std'])
    
#     # Map the values to the dataframe
#     df[f'{col}_mean'] = df[col].map(category_stats['mean']).astype(float)
#     df[f'{col}_std'] = df[col].map(category_stats['std']).astype(float)
    
#     encoded_cols.extend([f'{col}_mean', f'{col}_std'])
#     break
    
#     # Fill NaN values (in case some categories are missing)
#     df[f'{col}_mean'].fillna(df[f'{col}_mean'].mean(), inplace=True)
#     df[f'{col}_std'].fillna(df[f'{col}_std'].mean(), inplace=True)

# df[encoded_cols].info()

In [52]:
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist() + ['laptop_compartment', 'is_waterproof']
train_df_encoded, _, encoded_cols = mean_std_target_encoding(
    train_df, 
    cat_cols=cat_cols, 
    target=target, 
    test_df=train_df
)

In [40]:
X.select_dtypes(include=['object', 'category']).columns.tolist() + ['laptop_compartment', 'is_waterproof']

['brand',
 'material',
 'size',
 'compartments',
 'style',
 'color',
 'laptop_compartment',
 'is_waterproof']

In [58]:
def cross_validate_features(models, X, y, kf, verbose=True):
    model_scores = {name: [] for name in models.keys()}
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        # 'num_leaves': 310,
        # 'learning_rate': 0.1,
        # 'feature_fraction': 0.9,
        'verbose': -1,
        'force_row_wise': True
    }

    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist() + ['laptop_compartment', 'is_waterproof']

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        # if verbose:
        print(f"Starting Fold {fold}...")
        fold_start_time = time.time()

        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        X_train, X_valid, encoded_cols = mean_std_target_encoding(
            train_df=pd.concat([X_train, y_train], axis=1),
            cat_cols=cat_cols,
            test_df=pd.concat([X_valid, y_valid], axis=1), 
            target=y_train.name,
        )
        
        for name, model_features in models.items():
            model_start_time = time.time()
            
            train_data = lgb.Dataset(X_train[model_features], label=y_train)
            valid_data = lgb.Dataset(X_valid[model_features], label=y_valid, reference=train_data)
            fit_model = lgb.train(params, train_data, num_boost_round=500, valid_sets=[valid_data])
            y_pred = fit_model.predict(X_valid[model_features], num_iteration=fit_model.best_iteration)
            
            rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
            model_scores[name].append(rmse)
            
            if verbose:
                print(f"{name} Model - Fold {fold} - Training & Prediction time: {time.time() - model_start_time:.2f} seconds")

        if verbose:
            print(f"Total time for Fold {fold}: {time.time() - fold_start_time:.2f} seconds")
            print("-" * 50)

    return pd.DataFrame(model_scores)

In [59]:
models = {
    "baseline": baseline_features,
    "encoded_cols": encoded_cols,
    "all": encoded_cols + baseline_features,
}

X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_features(models, X, y, kf, verbose=False)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

Starting Fold 1...
Starting Fold 2...
Starting Fold 3...
Starting Fold 4...
Starting Fold 5...
Starting Fold 6...
Starting Fold 7...
Starting Fold 8...
Starting Fold 9...
Starting Fold 10...


Unnamed: 0,Mean RMSE,Std RMSE
baseline,39.098039,0.071875
encoded_cols,39.18123,0.062547
all,39.130446,0.066703


In [57]:
models = {
    "baseline": baseline_features,
    "encoded_cols": encoded_cols,
    "all": encoded_cols + baseline_features,
}

X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_features(models, X, y, kf, verbose=False)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

Unnamed: 0,Mean RMSE,Std RMSE
baseline,39.712155,0.078255
encoded_cols,40.583526,0.055446
all,40.058195,0.084779


In [8]:
# def mean_std_target_encoding(df, cat_cols, target_col):
#     df = df.copy()  # Avoid modifying the original dataframe
#     encoded_cols = []  # Store new encoded column names
    
#     for col in cat_cols:
#         # Compute mean and std of target for each category in the column
#         category_stats = df.groupby(col)[target_col].agg(['mean', 'std'])
        
#         # Map the values to the dataframe
#         df[f'{col}_mean'] = df[col].map(category_stats['mean'])
#         df[f'{col}_std'] = df[col].map(category_stats['std'])
        
#         encoded_cols.extend([f'{col}_mean', f'{col}_std'])
        
#         # Fill NaN values (in case some categories are missing)
#         df[f'{col}_mean'].fillna(df[f'{col}_mean'].mean(), inplace=True)
#         df[f'{col}_std'].fillna(df[f'{col}_std'].mean(), inplace=True)
    
#     return df, encoded_cols


In [5]:
cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist() + ['laptop_compartment', 'is_waterproof']

In [6]:
for col1, col2 in combinations(cat_cols, 2):
    print(col1, col2)

brand material
brand size
brand compartments
brand style
brand color
brand laptop_compartment
brand is_waterproof
material size
material compartments
material style
material color
material laptop_compartment
material is_waterproof
size compartments
size style
size color
size laptop_compartment
size is_waterproof
compartments style
compartments color
compartments laptop_compartment
compartments is_waterproof
style color
style laptop_compartment
style is_waterproof
color laptop_compartment
color is_waterproof
laptop_compartment is_waterproof


In [72]:
# Loop over each categorical column and map each category to the mean target value
for col in cat_cols:
    # Compute the mean target for each category in the column
    category_target_mean = train_df.groupby(col)[target].mean()
    train_df[f'{col}_mean'] = train_df[col].map(category_target_mean)

cat_mean_cols = [f'{col}_mean' for col in cat_cols]

In [4]:
train_df.select_dtypes(include=['object', 'category']).columns.tolist() + ['laptop_compartment', 'is_waterproof']

['brand',
 'material',
 'size',
 'compartments',
 'style',
 'color',
 'laptop_compartment',
 'is_waterproof']

In [74]:
cat_mean_cols

['brand_mean',
 'material_mean',
 'size_mean',
 'compartments_mean',
 'style_mean',
 'color_mean',
 'laptop_compartment_mean',
 'is_waterproof_mean']

In [None]:
train_df.columns

Index(['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'Price'],
      dtype='object')

In [81]:
train_df.columns

Index(['brand', 'material', 'size', 'compartments', 'laptop_compartment',
       'is_waterproof', 'style', 'color', 'weight_capacity', 'price',
       'size_int', 'weight_capacity_int', 'weight_capacity_size', 'brand_mean',
       'material_mean', 'size_mean', 'compartments_mean', 'style_mean',
       'color_mean', 'laptop_compartment_mean', 'is_waterproof_mean',
       'weight_capacity_binned', 'weight_capacity_brand',
       'weight_capacity_poly_2', 'weight_capacity_poly_3',
       'weight_capacity_exp', 'weight_capacity_inv', 'weight_capacity_pca'],
      dtype='object')

In [84]:
models = {
    "baseline": baseline_features,
    "cat_mean_cols": cat_mean_cols,
    "cat_mean_cols + ": cat_mean_cols + baseline_features,
    "all_features +":  features_to_try + baseline_features,
    "all_features ++":  cat_mean_cols + features_to_try + baseline_features
}

X = train_df.drop(columns=[target])
y = train_df[target]
kf = KFold(n_splits=10, shuffle=True, random_state=42)
result_df = cross_validate_features(models, X, y, kf, verbose=False)
summary_df = pd.DataFrame({
    "Mean RMSE": result_df.mean(),
    "Std RMSE": result_df.std()
})
display(summary_df)

Unnamed: 0,Mean RMSE,Std RMSE
baseline,39.03768,0.063194
cat_mean_cols,39.057084,0.067168
cat_mean_cols +,39.033518,0.064217
all_features +,39.040334,0.062872
all_features ++,39.036105,0.06511


In [85]:
train_df

Unnamed: 0,brand,material,size,compartments,laptop_compartment,is_waterproof,style,color,weight_capacity,price,...,color_mean,laptop_compartment_mean,is_waterproof_mean,weight_capacity_binned,weight_capacity_brand,weight_capacity_poly_2,weight_capacity_poly_3,weight_capacity_exp,weight_capacity_inv,weight_capacity_pca
0,Jansport,Leather,Medium,7.0,1,0,Tote,Black,11.611723,112.15875,...,80.513439,81.463842,81.572050,Low,11.611723,134.832107,1565.633046,1.103843e+05,0.086120,6.409977
1,Jansport,Canvas,Small,10.0,1,1,Messenger,Green,27.078537,68.88056,...,82.381308,81.463842,81.403489,Very High,27.078537,733.247143,19855.259594,5.755181e+11,0.036930,-9.056836
2,Under Armour,Leather,Small,2.0,1,0,Messenger,Red,16.643760,39.17320,...,81.011644,81.463842,81.572050,Medium,66.575040,277.014745,4610.566922,1.691582e+07,0.060083,1.377940
3,Nike,Nylon,Small,8.0,1,0,Messenger,Green,12.937220,80.60793,...,82.381308,81.463842,81.572050,Medium,25.874441,167.371669,2165.324158,4.154927e+05,0.077296,5.084480
4,Adidas,Canvas,Medium,1.0,1,1,Messenger,Green,17.749338,86.02312,...,82.381308,81.463842,81.403489,Medium,0.000000,315.039016,5591.734125,5.110222e+07,0.056340,0.272362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,Adidas,Leather,Small,9.0,0,0,Tote,Blue,12.730812,129.99749,...,82.006994,81.464702,81.572050,Medium,0.000000,162.073567,2063.328072,3.380036e+05,0.078550,5.290888
299996,Jansport,Leather,Large,6.0,0,1,Tote,Blue,26.633182,19.85819,...,82.006994,81.464702,81.403489,Very High,26.633182,709.326396,18891.619165,3.686753e+11,0.037547,-8.611482
299997,Puma,Canvas,Large,9.0,1,1,Backpack,Pink,11.898250,111.41364,...,81.630864,81.463842,81.403489,Low,35.694749,141.568346,1684.415531,1.470091e+05,0.084046,6.123450
299998,Adidas,Nylon,Small,1.0,0,1,Tote,Pink,6.175738,115.89080,...,81.630864,81.464702,81.403489,Low,0.000000,38.139739,235.541029,4.809378e+02,0.161924,11.845962


In [131]:
def cross_validate_lightgbm_feature_importances(
    params, X, y, kf, num_boost_round=100
):
    # Prepare a DataFrame to store feature importances for each fold
    # feature_importance_df = pd.DataFrame(
    #     np.zeros((X.shape[1], kf.get_n_splits())),
    #     index=X.columns
    # )
    fold_importances = {}
    scores = []

    for fold_idx, (train_index, test_index) in enumerate(kf.split(X), 1):
        print(f"Starting Fold {fold_idx}...")

        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        X_train, X_valid = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_valid)

        X_train, X_valid, encoded_cols = target_encoding(
            train_df=X_train,
            cat_cols=cat_cols,
            test_df=X_valid, 
            target=y_train.name,
        )
        X_train = X_train.drop(columns=[target])

        # Prepare LightGBM datasets
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        # Train the model
        fit_model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,
            valid_sets=[valid_data],
        )

        # Predict
        y_pred = fit_model.predict(X_valid, num_iteration=fit_model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        scores.append(rmse)

        # Store feature importances
        # fold_importances = fit_model.feature_importance(importance_type='gain')
        # feature_importance_df.iloc[:, fold_idx - 1] = fold_importances
        fold_importance_series = pd.Series(
            fit_model.feature_importance(importance_type='gain'),
            index=X_train.columns
        )
        fold_importances[f'fold_{fold_idx}'] = fold_importance_series

    feature_importance_df = pd.DataFrame(fold_importances).fillna(0)

    # Compute the mean importance across folds
    feature_importance_df['mean_importance'] = feature_importance_df.mean(axis=1)
    feature_importance_df.sort_values('mean_importance', ascending=False, inplace=True)
    
    # Compute average feature importance across folds
    # feature_importance_df['mean_importance'] = feature_importance_df.mean(axis=1)
    # feature_importance_df.sort_values('mean_importance', ascending=False, inplace=True)

    return scores, feature_importance_df

In [132]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lightgbm_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.95,
    'verbose': -1,
    'force_row_wise': True
}

# scores, feature_importances = cross_validate_lightgbm_feature_importances(lightgbm_params, X, y, kf)
# feature_importances.sort_values('mean_importance', ascending=False, inplace=True)
# feature_importances.head(20)  # top 20 most important features

In [133]:
X = train_df.drop(columns=[target])
y = train_df[target]

scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, X, y, kf
)
display(feature_importances[['mean_importance']])
print("Average RMSE across folds:", np.mean(scores))

Starting Fold 1...
Starting Fold 2...
Starting Fold 3...
Starting Fold 4...
Starting Fold 5...
Starting Fold 6...
Starting Fold 7...
Starting Fold 8...
Starting Fold 9...
Starting Fold 10...


Unnamed: 0,mean_importance
material_weight_capacity_int_encoded,11037800.0
brand_weight_capacity_int_encoded,10650700.0
color_weight_capacity_int_encoded,8826507.0
size_weight_capacity_int_encoded,7269190.0
is_waterproof_weight_capacity_int_encoded,7231504.0
weight_capacity,7198871.0
material_color_encoded,6622816.0
compartments,6490143.0
size_compartments_encoded,5158973.0
brand_size_encoded,5057936.0


Average RMSE across folds: 38.87266071931904


In [137]:
temp = feature_importances[['mean_importance']]
temp

Unnamed: 0,mean_importance
material_weight_capacity_int_encoded,11037800.0
brand_weight_capacity_int_encoded,10650700.0
color_weight_capacity_int_encoded,8826507.0
size_weight_capacity_int_encoded,7269190.0
is_waterproof_weight_capacity_int_encoded,7231504.0
weight_capacity,7198871.0
material_color_encoded,6622816.0
compartments,6490143.0
size_compartments_encoded,5158973.0
brand_size_encoded,5057936.0


In [135]:
feature_importances.index

Index(['material_weight_capacity_int_encoded',
       'brand_weight_capacity_int_encoded',
       'color_weight_capacity_int_encoded', 'size_weight_capacity_int_encoded',
       'is_waterproof_weight_capacity_int_encoded', 'weight_capacity',
       'material_color_encoded', 'compartments', 'size_compartments_encoded',
       'brand_size_encoded', 'compartments_weight_capacity_int_encoded',
       'brand_color_encoded', 'size_is_waterproof_encoded',
       'laptop_compartment_is_waterproof_encoded',
       'size_laptop_compartment_encoded', 'brand_material_encoded',
       'compartments_is_waterproof_encoded', 'size_color_encoded',
       'material_size_encoded', 'style_is_waterproof_encoded',
       'brand_style_encoded', 'style_laptop_compartment_encoded',
       'compartments_laptop_compartment_encoded', 'compartments_color_encoded',
       'material_compartments_encoded', 'brand_compartments_encoded',
       'material_is_waterproof_encoded', 'compartments_style_encoded',
       'sty

In [54]:
X

Unnamed: 0,brand,material,size,compartments,laptop_compartment,is_waterproof,style,color,weight_capacity,size_int,weight_capacity_int,weight_capacity_size,weight_capacity_binned,weight_capacity_brand,weight_capacity_poly_2,weight_capacity_poly_3,weight_capacity_exp,weight_capacity_inv,weight_capacity_pca
0,Jansport,Leather,Medium,7.0,1,0,Tote,Black,11.611723,2,11,23.223446,Low,11.611723,134.832107,1565.633046,1.103843e+05,0.086120,6.409977
1,Jansport,Canvas,Small,10.0,1,1,Messenger,Green,27.078537,1,27,27.078537,Very High,27.078537,733.247143,19855.259594,5.755181e+11,0.036930,-9.056836
2,Under Armour,Leather,Small,2.0,1,0,Messenger,Red,16.643760,1,16,16.643760,Medium,66.575040,277.014745,4610.566922,1.691582e+07,0.060083,1.377940
3,Nike,Nylon,Small,8.0,1,0,Messenger,Green,12.937220,1,12,12.937220,Medium,25.874441,167.371669,2165.324158,4.154927e+05,0.077296,5.084480
4,Adidas,Canvas,Medium,1.0,1,1,Messenger,Green,17.749338,2,17,35.498677,Medium,0.000000,315.039016,5591.734125,5.110222e+07,0.056340,0.272362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,Adidas,Leather,Small,9.0,0,0,Tote,Blue,12.730812,1,12,12.730812,Medium,0.000000,162.073567,2063.328072,3.380036e+05,0.078550,5.290888
299996,Jansport,Leather,Large,6.0,0,1,Tote,Blue,26.633182,3,26,79.899547,Very High,26.633182,709.326396,18891.619165,3.686753e+11,0.037547,-8.611482
299997,Puma,Canvas,Large,9.0,1,1,Backpack,Pink,11.898250,3,11,35.694749,Low,35.694749,141.568346,1684.415531,1.470091e+05,0.084046,6.123450
299998,Adidas,Nylon,Small,1.0,0,1,Tote,Pink,6.175738,1,6,6.175738,Low,0.000000,38.139739,235.541029,4.809378e+02,0.161924,11.845962


In [10]:
X.columns

Index(['product_id', 'brand', 'material', 'size', 'num_compartments',
       'laptop_compartment', 'is_waterproof', 'style', 'color',
       'weight_capacity_kg'],
      dtype='object')

In [33]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X, 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,47176180.0,47219590.0,46781760.0,47361370.0,48175620.0,47790430.0,47833300.0,45947000.0,46588060.0,46522440.0,...,45848650.0,48196380.0,47203340.0,47564740.0,45504350.0,45253540.0,46719620.0,45206450.0,46536460.0,46833570.0
color,11408440.0,11231510.0,11363850.0,11154320.0,11013980.0,11202750.0,11121980.0,11271230.0,11469270.0,11482960.0,...,11262560.0,11239690.0,11409320.0,11633090.0,11282780.0,11363370.0,11349590.0,11295270.0,11363770.0,11316100.0
num_compartments,9850175.0,9953490.0,10480170.0,10411160.0,9966054.0,9856776.0,9937975.0,10503310.0,10058850.0,10319510.0,...,9777032.0,10982340.0,10096480.0,10037050.0,9804528.0,10580810.0,10785210.0,10230050.0,9706605.0,10159660.0
brand,9751406.0,10220660.0,10381630.0,10201070.0,9409825.0,10079460.0,10182330.0,10905710.0,10192800.0,10095090.0,...,10209750.0,10090050.0,9548218.0,10234040.0,9919946.0,9963720.0,9688279.0,10100950.0,10279830.0,10077380.0
material,10061060.0,9759496.0,9451678.0,10014240.0,9734259.0,9532279.0,9919994.0,9546287.0,9655160.0,9750221.0,...,9695774.0,9768202.0,9669919.0,10036110.0,9613994.0,9413702.0,9627305.0,9469726.0,9694849.0,9692751.0
is_waterproof,5646812.0,5642205.0,5065810.0,5204073.0,4936562.0,4538939.0,5441004.0,5383234.0,5183691.0,4246944.0,...,5418283.0,4870393.0,5550657.0,4927017.0,5267349.0,4972105.0,5453935.0,4874958.0,4646908.0,5093478.0
size,4629158.0,4873048.0,4844941.0,4590188.0,4904792.0,5102831.0,4580255.0,4994526.0,4647928.0,4899191.0,...,4846777.0,4716332.0,5059158.0,4549812.0,4759914.0,4594763.0,4699014.0,4826413.0,4790805.0,4775271.0
noise_norm,3858605.0,3600125.0,3853762.0,3774520.0,4063042.0,3939191.0,4178238.0,3828656.0,3650533.0,3899833.0,...,3955847.0,3535573.0,3967788.0,3691078.0,4240877.0,4265998.0,3779018.0,4290970.0,3903421.0,3899414.0
product_id,3576922.0,3945006.0,4036544.0,4135792.0,3926440.0,4065940.0,3740384.0,3820629.0,3878249.0,4025878.0,...,3799571.0,3884763.0,3940938.0,3903273.0,3716683.0,3707034.0,3992994.0,3881061.0,3758094.0,3867014.0
noise_uniform,4099976.0,3302216.0,3464669.0,3441611.0,3647948.0,3510186.0,3483155.0,3980496.0,3867991.0,3970294.0,...,4091708.0,3842930.0,3900921.0,3460060.0,3725567.0,3670027.0,3698930.0,3737291.0,3672984.0,3719553.0


Average RMSE across folds: 38.885562132662876


In [17]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X, 
    y, kf, verbose=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,49708360.0,50091770.0,48811880.0,49939660.0,50437740.0,50394780.0,49293070.0,48320350.0,49612030.0,50040660.0,...,48891070.0,51121340.0,50358840.0,50486940.0,48529320.0,49055350.0,48902130.0,48480750.0,48298920.0,49495780.0
color,11460100.0,11597770.0,11853240.0,11522320.0,11461490.0,11990620.0,11558200.0,11693600.0,11587950.0,11542710.0,...,11797150.0,11425820.0,11719690.0,11854410.0,11349330.0,11675030.0,11763870.0,11730740.0,11605610.0,11630650.0
num_compartments,10924180.0,10791110.0,10775240.0,10884460.0,11217630.0,10844590.0,11340710.0,11228050.0,11017850.0,11029640.0,...,10671380.0,11705320.0,10506960.0,11070280.0,11087210.0,10830030.0,11670280.0,10916140.0,10844550.0,10999990.0
brand,10157400.0,10148070.0,10738660.0,10372270.0,9817144.0,10160960.0,10741670.0,10660860.0,10279840.0,10691480.0,...,10853540.0,10686380.0,10288630.0,10643580.0,10152500.0,10680220.0,10461950.0,10779280.0,11027900.0,10479320.0
material,10396140.0,10267240.0,10034390.0,10413730.0,10307750.0,9847701.0,10173640.0,10126450.0,10121170.0,10026120.0,...,10086410.0,10397050.0,10495900.0,10419070.0,10494480.0,9845522.0,10062050.0,9915087.0,10014500.0,10165860.0
size,5115050.0,5060329.0,5410241.0,5279154.0,4880583.0,5649601.0,5258076.0,5058771.0,5117460.0,5254743.0,...,5121037.0,4818203.0,5112423.0,5409638.0,5215485.0,5229924.0,5135211.0,5033824.0,5110233.0,5161405.0
is_waterproof,5227879.0,4637209.0,5132789.0,5047952.0,5693232.0,4534978.0,5910206.0,5505472.0,4651608.0,4376571.0,...,5626157.0,4017785.0,5388030.0,4767879.0,4994341.0,5005151.0,5706819.0,4845218.0,5408251.0,5120108.0
product_id,5242075.0,5138895.0,5173410.0,4943708.0,4933860.0,4790186.0,5108704.0,5543346.0,5442714.0,5130544.0,...,5224038.0,4831928.0,5283092.0,5153767.0,4882136.0,4958511.0,4908895.0,5165048.0,5038287.0,5111515.0
laptop_compartment,1486326.0,1479301.0,1761428.0,1736026.0,1442151.0,1653900.0,1227560.0,1901829.0,1453008.0,1672447.0,...,1873241.0,1595976.0,1352292.0,1438032.0,1633936.0,1546898.0,1253184.0,2347272.0,1733061.0,1613927.0
style,1566064.0,1783020.0,1768112.0,1337933.0,1416998.0,1450940.0,1546021.0,1238834.0,1680685.0,1421421.0,...,1515651.0,1531387.0,1520441.0,1378670.0,1510594.0,1764205.0,1693717.0,1606759.0,1408614.0,1526128.0


Average RMSE across folds: 38.88401308882421


In [36]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['noise_uniform']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
noise_uniform,1711164.0


Average RMSE across folds: 38.93977597829933


In [37]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['product_id']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
product_id,2053748.0


Average RMSE across folds: 38.939532938793356


In [39]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['weight_capacity_kg']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
weight_capacity_kg,42998550.0


Average RMSE across folds: 38.9133


In [40]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['weight_capacity_kg', 'color']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
weight_capacity_kg,44704210.0
color,11464840.0


Average RMSE across folds: 38.9087


In [41]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['weight_capacity_kg', 'color', 'num_compartments']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
weight_capacity_kg,46142480.0
color,11935050.0
num_compartments,9031126.0


Average RMSE across folds: 38.9047


In [42]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['weight_capacity_kg', 'color', 'num_compartments', 'brand']], 
    y, kf, verbose=False
)
feature_importances.sort_values('mean_importance', ascending=False, inplace=True)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
weight_capacity_kg,45876540.0
color,12009610.0
brand,9593835.0
num_compartments,8953673.0


Average RMSE across folds: 38.9008


In [43]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['weight_capacity_kg', 'color', 'num_compartments', 'brand', 'material']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
weight_capacity_kg,45499200.0
color,12066800.0
brand,9855395.0
material,9655583.0
num_compartments,8719744.0


Average RMSE across folds: 38.8963


In [44]:
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    lightgbm_params, 
    X[['weight_capacity_kg', 'color', 'num_compartments', 'brand', 'material', 'is_waterproof']], 
    y, kf, verbose=False
)
display(feature_importances[['mean_importance']])
print(f"Average RMSE across folds: {np.mean(scores):.4f}")

Unnamed: 0,mean_importance
weight_capacity_kg,47636980.0
color,12276280.0
material,9968583.0
brand,9720940.0
num_compartments,9358182.0
is_waterproof,5934946.0


Average RMSE across folds: 38.8909
