In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score

import lightgbm as lgb
import optuna

import warnings
warnings.filterwarnings('ignore')

# Read the three CSV inputs from the sibling data directory.
train_df = pd.read_csv(r'..//data//train.csv')
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
test_df = pd.read_csv(r'..//data//test.csv')

# Stack both training files into one frame with a fresh 0..n-1 index.
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)

target = "price"

# Column names are assigned by position; the test frame gets the same names
# without the trailing target column.
feature_names = [
    'product_id',
    'brand',
    'material',
    'size',
    'num_compartments',
    'laptop_compartment',
    'is_waterproof',
    'style',
    'color',
    'weight_capacity_kg',
]
train_df.columns = feature_names + [target]
test_df.columns = list(feature_names)

# Columns that pandas parsed as object/category in the training frame.
cat_cols = list(train_df.select_dtypes(include=['object', 'category']).columns)

# Cast those columns to the pandas 'category' dtype in both frames.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype('category')

# Split the training frame into the target series and the feature matrix.
y = train_df[target]
X = train_df.drop(columns=target)

In [None]:
# Add a synthetic column of Uniform[1, 10000) draws to train_df, using a
# generator seeded with 42 so the draws are repeatable.
# NOTE(review): in file order X is derived from train_df before this cell, so
# this column is not part of X as written -- confirm whether that is intended.
rng = np.random.default_rng(seed=42)
n_rows = len(train_df)
train_df['uniform_noise'] = rng.uniform(low=1, high=10000, size=n_rows)

In [5]:
def cross_validate_lightgbm_feature_importances(
    params, X, y, kf, num_boost_round=100, verbose=True,
    early_stopping_rounds=None,
):
    """
    Cross-validate a LightGBM model and collect per-fold gain importances.

    Parameters:
        params (dict): LightGBM training parameters, passed to ``lgb.train``.
        X (DataFrame): Feature dataset; its columns become the index of the
            returned importance table.
        y (Series): Target variable, positionally aligned with ``X``.
        kf: Cross-validation splitter exposing ``get_n_splits(X, y)`` and
            ``split(X, y)`` (e.g. scikit-learn's ``KFold``). ``y`` is
            forwarded so splitters that need the target also work.
        num_boost_round (int): Number of boosting rounds (an upper bound when
            early stopping is enabled).
        verbose (bool): If True, prints per-fold progress and RMSE, followed
            by a summary of all fold scores.
        early_stopping_rounds (int | None): If given, training on each fold
            stops once the validation metric has not improved for this many
            rounds. ``None`` (default) always trains for ``num_boost_round``
            rounds.

    Returns:
        scores (list): RMSE on the held-out part of each fold.
        feature_importance_df (DataFrame): One row per feature, one column
            per fold (labelled 0..n_splits-1) and a final 'mean_importance'
            column holding the row-wise mean over the fold columns.
    """
    # One zero-initialised column per fold, filled in as the folds complete.
    n_splits = kf.get_n_splits(X, y)
    feature_importance_df = pd.DataFrame(
        np.zeros((X.shape[1], n_splits)),
        index=X.columns
    )

    scores = []

    for fold_idx, (train_index, test_index) in enumerate(kf.split(X, y), 1):
        if verbose:
            print(f"Starting Fold {fold_idx}...")

        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # Prepare LightGBM datasets; the validation set reuses the training
        # set as its reference.
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        # A fresh callback per fold so no early-stopping state is shared
        # between folds. Without it, best_iteration is never set by training.
        callbacks = None
        if early_stopping_rounds is not None:
            callbacks = [
                lgb.early_stopping(early_stopping_rounds, verbose=verbose)
            ]

        # Train the model
        fit_model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,
            valid_sets=[valid_data],
            callbacks=callbacks,
        )

        # Predict on the held-out rows and score with RMSE
        y_pred = fit_model.predict(X_valid, num_iteration=fit_model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        scores.append(rmse)

        # Store this fold's gain importances in its own column
        fold_importances = fit_model.feature_importance(importance_type='gain')
        feature_importance_df.iloc[:, fold_idx - 1] = fold_importances

        if verbose:
            print(f"Fold {fold_idx} RMSE: {rmse:.4f}")
            print("-" * 50)

    # Average over the fold columns only; computed before the new column is
    # attached, so it does not include itself.
    feature_importance_df['mean_importance'] = feature_importance_df.mean(axis=1)

    if verbose:
        print("Cross-validation RMSE per fold:", scores)
        print("Average RMSE across folds:", np.mean(scores))

    return scores, feature_importance_df

In [16]:
# Shared cross-validation setup: 20 shuffled folds with a fixed seed.
kf = KFold(
    n_splits=20,
    shuffle=True,
    random_state=42,
)

# LightGBM parameters reused by every cross-validation run below.
lightgbm_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    feature_fraction=0.95,
    verbose=-1,
    force_row_wise=True,
)

In [25]:
# Display the feature matrix (every training column except the target).
X

Unnamed: 0,product_id,brand,material,size,num_compartments,laptop_compartment,is_waterproof,style,color,weight_capacity_kg
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.643760
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.937220
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338
...,...,...,...,...,...,...,...,...,...,...
3994313,4194313,Nike,Canvas,,3.0,Yes,Yes,Messenger,Blue,28.098120
3994314,4194314,Puma,Leather,Small,10.0,Yes,Yes,Tote,Blue,17.379531
3994315,4194315,Jansport,Canvas,Large,10.0,No,No,Backpack,Red,17.037708
3994316,4194316,Puma,Canvas,,2.0,No,No,Backpack,Gray,28.783339


In [10]:
# List the feature names available for the subset experiments.
X.columns

Index(['product_id', 'brand', 'material', 'size', 'num_compartments',
       'laptop_compartment', 'is_waterproof', 'style', 'color',
       'weight_capacity_kg'],
      dtype='object')

In [17]:
# Baseline: cross-validate on the full feature matrix, then rank the features
# by their mean gain importance across folds.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params, X=X, y=y, kf=kf, verbose=False
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,49708360.0,50091770.0,48811880.0,49939660.0,50437740.0,50394780.0,49293070.0,48320350.0,49612030.0,50040660.0,...,48891070.0,51121340.0,50358840.0,50486940.0,48529320.0,49055350.0,48902130.0,48480750.0,48298920.0,49495780.0
color,11460100.0,11597770.0,11853240.0,11522320.0,11461490.0,11990620.0,11558200.0,11693600.0,11587950.0,11542710.0,...,11797150.0,11425820.0,11719690.0,11854410.0,11349330.0,11675030.0,11763870.0,11730740.0,11605610.0,11630650.0
num_compartments,10924180.0,10791110.0,10775240.0,10884460.0,11217630.0,10844590.0,11340710.0,11228050.0,11017850.0,11029640.0,...,10671380.0,11705320.0,10506960.0,11070280.0,11087210.0,10830030.0,11670280.0,10916140.0,10844550.0,10999990.0
brand,10157400.0,10148070.0,10738660.0,10372270.0,9817144.0,10160960.0,10741670.0,10660860.0,10279840.0,10691480.0,...,10853540.0,10686380.0,10288630.0,10643580.0,10152500.0,10680220.0,10461950.0,10779280.0,11027900.0,10479320.0
material,10396140.0,10267240.0,10034390.0,10413730.0,10307750.0,9847701.0,10173640.0,10126450.0,10121170.0,10026120.0,...,10086410.0,10397050.0,10495900.0,10419070.0,10494480.0,9845522.0,10062050.0,9915087.0,10014500.0,10165860.0
size,5115050.0,5060329.0,5410241.0,5279154.0,4880583.0,5649601.0,5258076.0,5058771.0,5117460.0,5254743.0,...,5121037.0,4818203.0,5112423.0,5409638.0,5215485.0,5229924.0,5135211.0,5033824.0,5110233.0,5161405.0
is_waterproof,5227879.0,4637209.0,5132789.0,5047952.0,5693232.0,4534978.0,5910206.0,5505472.0,4651608.0,4376571.0,...,5626157.0,4017785.0,5388030.0,4767879.0,4994341.0,5005151.0,5706819.0,4845218.0,5408251.0,5120108.0
product_id,5242075.0,5138895.0,5173410.0,4943708.0,4933860.0,4790186.0,5108704.0,5543346.0,5442714.0,5130544.0,...,5224038.0,4831928.0,5283092.0,5153767.0,4882136.0,4958511.0,4908895.0,5165048.0,5038287.0,5111515.0
laptop_compartment,1486326.0,1479301.0,1761428.0,1736026.0,1442151.0,1653900.0,1227560.0,1901829.0,1453008.0,1672447.0,...,1873241.0,1595976.0,1352292.0,1438032.0,1633936.0,1546898.0,1253184.0,2347272.0,1733061.0,1613927.0
style,1566064.0,1783020.0,1768112.0,1337933.0,1416998.0,1450940.0,1546021.0,1238834.0,1680685.0,1421421.0,...,1515651.0,1531387.0,1520441.0,1378670.0,1510594.0,1764205.0,1693717.0,1606759.0,1408614.0,1526128.0


Average RMSE across folds: 38.88401308882421


In [26]:
# Single-feature run: product_id only.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[['product_id']],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
product_id,2022845.0,2062912.0,2095006.0,2090250.0,1948929.0,1963238.0,1987960.0,2064326.0,1975591.0,1947158.0,...,2019436.0,2072437.0,2104565.0,2104749.0,2092013.0,2172963.0,2110488.0,2061219.0,2021961.0,2053748.0


Average RMSE across folds: 38.939532938793356


In [18]:
# Single-feature run: weight_capacity_kg only.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[['weight_capacity_kg']],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,43586470.0,43462310.0,42375000.0,43423950.0,43832690.0,43307920.0,44239860.0,41664100.0,42925220.0,42947540.0,...,42612380.0,44216180.0,43256380.0,43348920.0,41936530.0,43090510.0,42670630.0,41113220.0,42750530.0,42998550.0


Average RMSE across folds: 38.91325282296667


In [19]:
# Two-feature run: weight_capacity_kg plus color.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[['weight_capacity_kg', 'color']],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,45043660.0,45165750.0,44333130.0,45135650.0,45192400.0,45121070.0,45701220.0,43573670.0,44561640.0,44536720.0,...,44605240.0,45770110.0,44881900.0,44984980.0,43773500.0,44770480.0,44763070.0,42929010.0,44232860.0,44704210.0
color,11455050.0,11549790.0,11282830.0,11129030.0,11492710.0,11606030.0,11306740.0,11416700.0,11602660.0,12053630.0,...,11303110.0,11441470.0,11564180.0,11523060.0,11445410.0,11377190.0,11262080.0,11555880.0,11497550.0,11464840.0


Average RMSE across folds: 38.90873135595375


In [20]:
# Three-feature run: adds num_compartments to the previous subset.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[[
        'weight_capacity_kg',
        'color',
        'num_compartments',
    ]],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,46549510.0,46707710.0,45949420.0,46470040.0,47162520.0,46425750.0,46722740.0,44791730.0,46041320.0,45947390.0,...,46093240.0,47230900.0,46262270.0,46433180.0,44967350.0,46193600.0,45944510.0,44347470.0,45900830.0,46142480.0
color,11674530.0,12112140.0,11701870.0,11411660.0,11878900.0,12193740.0,11745740.0,11969830.0,11920530.0,12309760.0,...,11975440.0,12215880.0,12142450.0,11863380.0,11744390.0,11924430.0,12020460.0,11991400.0,12184920.0,11935050.0
num_compartments,9224124.0,8929949.0,8827553.0,9599950.0,8787380.0,8919505.0,9470034.0,9492073.0,9069844.0,9090401.0,...,8456575.0,8882259.0,9069624.0,9106940.0,8979008.0,8774940.0,9222925.0,9397323.0,8484959.0,9031126.0


Average RMSE across folds: 38.90471665955875


In [22]:
# Four-feature run: adds brand to the previous subset.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[[
        'weight_capacity_kg',
        'color',
        'num_compartments',
        'brand',
    ]],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,46193780.0,46387160.0,45282460.0,46366440.0,46501850.0,46219130.0,46872750.0,44920420.0,45468580.0,45270690.0,...,46182990.0,47066970.0,46061200.0,46447550.0,45200180.0,45879930.0,45308190.0,44497710.0,45512640.0,45876540.0
color,12237190.0,12230100.0,12019830.0,11692330.0,11827760.0,12191770.0,11845650.0,11958820.0,11958670.0,12547140.0,...,12018690.0,12011200.0,12167340.0,12052160.0,11959700.0,11797490.0,12359760.0,11434270.0,12074390.0,12009610.0
brand,9614709.0,9535901.0,9919790.0,9579636.0,9359229.0,9607248.0,9724971.0,9683134.0,9613803.0,9625943.0,...,9600937.0,9771362.0,9297077.0,9604064.0,9518306.0,9557709.0,9515451.0,9605187.0,9479817.0,9593835.0
num_compartments,9102016.0,8578423.0,9095052.0,8875533.0,8923442.0,8674047.0,8883804.0,9152889.0,9260701.0,9134165.0,...,8340378.0,8902686.0,9329346.0,8916127.0,8994286.0,8921466.0,9355680.0,9100247.0,8761831.0,8953673.0


Average RMSE across folds: 38.90080103675224


In [23]:
# Five-feature run: adds material to the previous subset.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[[
        'weight_capacity_kg',
        'color',
        'num_compartments',
        'brand',
        'material',
    ]],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,46059520.0,45686650.0,44936730.0,45418260.0,45949530.0,45512750.0,46760340.0,44529120.0,44919500.0,45107880.0,...,45217290.0,47014700.0,46011630.0,45956590.0,44628880.0,45318600.0,45471420.0,43944150.0,45370030.0,45499200.0
color,11735800.0,12042360.0,12242720.0,11814430.0,12021630.0,12305490.0,11763460.0,12010880.0,11999300.0,12552350.0,...,11858780.0,12046700.0,12213340.0,11997290.0,12009080.0,12045890.0,12327070.0,12153450.0,12205390.0,12066800.0
brand,10041650.0,9923154.0,10170340.0,9905123.0,9579599.0,9836569.0,9813437.0,9965219.0,9949131.0,9711045.0,...,10062710.0,9921566.0,9345914.0,9978377.0,9969711.0,9672618.0,9492561.0,9833036.0,9887911.0,9855395.0
material,9668533.0,9709707.0,9508299.0,9880144.0,9674600.0,9621357.0,9559737.0,9684386.0,9752460.0,9490878.0,...,9790500.0,9631387.0,9457368.0,9680228.0,9637653.0,9904205.0,9653716.0,9662172.0,9691542.0,9655583.0
num_compartments,8722004.0,8470793.0,8718335.0,9016568.0,9039732.0,8582812.0,8652962.0,8927898.0,8958168.0,8808564.0,...,8280353.0,8755505.0,8734904.0,8614927.0,8617475.0,8808836.0,8849666.0,9055272.0,8483691.0,8719744.0


Average RMSE across folds: 38.89627594296494


In [24]:
# Six-feature run: adds is_waterproof to the previous subset.
scores, feature_importances = cross_validate_lightgbm_feature_importances(
    params=lightgbm_params,
    X=X[[
        'weight_capacity_kg',
        'color',
        'num_compartments',
        'brand',
        'material',
        'is_waterproof',
    ]],
    y=y,
    kf=kf,
    verbose=False,
)
feature_importances = feature_importances.sort_values(
    by='mean_importance', ascending=False
)
display(feature_importances)
print("Average RMSE across folds:", np.mean(scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,mean_importance
weight_capacity_kg,48139310.0,47782710.0,47281990.0,48291320.0,48567700.0,47747860.0,48089990.0,46525110.0,47342210.0,47389350.0,...,48253480.0,48895740.0,47514260.0,48173030.0,46486300.0,47865490.0,47611290.0,45966280.0,46866860.0,47636980.0
color,12075210.0,12162660.0,12293230.0,12069290.0,12521530.0,12439320.0,12127890.0,12449070.0,12261230.0,12612190.0,...,12145320.0,11996750.0,12464330.0,12246300.0,12227970.0,12539460.0,12219470.0,12154520.0,12210680.0,12276280.0
material,10019260.0,10020250.0,9586424.0,10063090.0,9994700.0,9984176.0,10139030.0,9880960.0,10215190.0,10057150.0,...,9721957.0,10278090.0,9963805.0,10265700.0,9984522.0,9989392.0,9676389.0,9756967.0,10078690.0,9968583.0
brand,9777825.0,9696430.0,9831755.0,9535602.0,9384479.0,9653991.0,9805665.0,9813871.0,9938502.0,9807319.0,...,9650892.0,9759118.0,9711563.0,9905178.0,9730680.0,9675293.0,9564337.0,9837992.0,9818825.0,9720940.0
num_compartments,9530367.0,9141902.0,9470131.0,9769775.0,9658384.0,9259686.0,9159112.0,9322238.0,9222833.0,9502885.0,...,8843960.0,9335489.0,9368370.0,9351869.0,9582395.0,9376308.0,9289103.0,9582548.0,9269840.0,9358182.0
is_waterproof,5850627.0,6223020.0,5957945.0,5760535.0,5793275.0,5680038.0,6344398.0,6156604.0,5890044.0,5243772.0,...,5963197.0,5608480.0,6382343.0,5881119.0,5990013.0,5402177.0,6375253.0,6153380.0,5955328.0,5934946.0


Average RMSE across folds: 38.89092917844328
