In [1]:
from backpack_predictor import prepare_data, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error

import xgboost as xgb
import lightgbm as lgb

from autogluon.tabular import TabularPredictor

import warnings
warnings.filterwarnings('ignore')

# Read the raw CSV files. The extra training rows are kept in their own frame
# and appended to the base training rows with a freshly generated index.
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat(
    [pd.read_csv(r'..//data//train.csv'), train_extra_df],
    ignore_index=True,
)
test_df = pd.read_csv(r'..//data//test.csv')

# Run the project's preparation step on both frames; `is_train` tells
# prepare_data which of the two it is handling.
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Create the regressor (scored with RMSE) first, then train it on the prepared
# training frame using the 'best' preset.
predictor = TabularPredictor(
    eval_metric='root_mean_squared_error',
    problem_type='regression',
    label=target,
)
predictor = predictor.fit(
    train_df,
    presets='best',  # preset reference: https://auto.gluon.ai/dev/tutorials/tabular/tabular-essentials.html#presets
    # presets='medium_quality',
    # time_limit=60 * 60 # 1 hour
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250211_231009"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.10
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6000
CPU Count:          10
Memory Avail:       23.25 GB / 64.00 GB (36.3%)
Disk Space Avail:   439.58 GB / 926.35 GB (47.5%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions

In [3]:
# LightGBM ('GBM') configurations passed to AutoGluon: the library defaults plus
# one hand-tuned variant that gets its own name suffix.
hyperparameters = dict(
    GBM=[
        dict(),  # default LightGBM
        # dict(extra_trees=True, ag_args=dict(name_suffix='XT')),  # LightGBMXT
        dict(
            learning_rate=0.04748156996966733,
            num_leaves=256,
            max_bin=69779,
            lambda_l1=0.0021556304294933115,
            lambda_l2=0.05627364738597358,
            feature_fraction=0.8,
            min_data_in_leaf=24,
            ag_args=dict(
                name_suffix='Custom1',
                priority=100,  # high priority so it's sure to train
            ),
        ),
        # Further variants can be appended here.
    ],
    # Other model families (XGB, CAT, ...) can be configured here as well.
)

# Create the regressor, then train it with the LightGBM configurations defined
# in `hyperparameters`.
predictor = TabularPredictor(
    eval_metric='root_mean_squared_error',
    problem_type='regression',
    label=target,
)
predictor = predictor.fit(
    train_df,
    hyperparameters=hyperparameters,
    presets='medium_quality',  # alternative: 'best_quality'
    # num_stack_levels=2,  # enable multi-layer stacking
    # num_bag_folds=5,     # or 10, the higher the better (usually)
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250212_152135"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.10
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6000
CPU Count:          10
Memory Avail:       21.15 GB / 64.00 GB (33.0%)
Disk Space Avail:   437.33 GB / 926.35 GB (47.2%)
Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "/Users/jordanbarker/Documents/Kaggle/backpack-prediction-challenge/notebooks/AutogluonModels/ag-20250212_152135"
Train Data Rows:    3994318
Train Data Columns: 9
Label Column:       price
Problem Type:       regression
Preprocess

[1000]	valid_set's rmse: 38.8651


	-38.8638	 = Validation score   (-root_mean_squared_error)
	231.63s	 = Training   runtime
	0.45s	 = Validation runtime
Fitting model: LightGBM ...
	-38.9231	 = Validation score   (-root_mean_squared_error)
	21.7s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMCustom1': 1.0}
	-38.8638	 = Validation score   (-root_mean_squared_error)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 257.45s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 88295.1 rows/s (39944 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/Users/jordanbarker/Documents/Kaggle/backpack-prediction-challenge/notebooks/AutogluonModels/ag-20250212_152135")


In [None]:
# te = TargetEncoder(target_type="continuous", smooth=20)
# train_te_all = te.fit_transform(train_fold[candidate_cols], train_fold[target])
# val_te_all = te.transform(val_fold[candidate_cols])

In [8]:
# Display the column labels currently present on train_df.
train_df.columns

Index(['brand', 'material', 'size', 'compartments', 'laptop_compartment',
       'is_waterproof', 'style', 'color', 'weight_capacity', 'price',
       'brand_material_weight_combined', 'brand_size_weight_combined',
       'brand_compartments_weight_combined', 'brand_style_weight_combined',
       'brand_color_weight_combined',
       'brand_laptop_compartment_weight_combined',
       'brand_is_waterproof_weight_combined', 'material_size_weight_combined',
       'material_compartments_weight_combined',
       'material_style_weight_combined', 'material_color_weight_combined',
       'material_laptop_compartment_weight_combined',
       'material_is_waterproof_weight_combined',
       'size_compartments_weight_combined', 'size_style_weight_combined',
       'size_color_weight_combined', 'size_laptop_compartment_weight_combined',
       'size_is_waterproof_weight_combined',
       'compartments_style_weight_combined',
       'compartments_color_weight_combined',
       'compartments_lapto

In [2]:
from itertools import combinations

def add_weight_combined_features(df, categorical_cols, weight_col="weight_capacity"):
    """Add one ``{col1}_{col2}_weight_combined`` column per pair of categorical columns.

    Each new column packs the two category codes and the weight into a single
    number: ``code1 * 10000 + code2 * 100 + weight``. Rows where either
    category is missing (code ``-1``) get ``-1 * weight - 1`` instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place: non-categorical columns in
        ``categorical_cols`` are converted to ``category`` dtype and the new
        feature columns are appended.
    categorical_cols : sequence of str
        Column names to pair up (every 2-combination, in the given order).
    weight_col : str, default "weight_capacity"
        Name of the numeric column mixed into every combined feature.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment/chaining.

    Notes
    -----
    The packing only keeps pairs distinguishable if ``code2`` and the weight
    each stay below 100 -- presumably true for this dataset; adjust the
    multipliers otherwise.
    """
    codes = {}
    for col in categorical_cols:
        if df[col].dtype != 'category':
            df[col] = df[col].astype('category')
        # pandas stores category codes in the smallest signed integer dtype that
        # fits (int8 for low-cardinality columns). Multiplying such a narrow
        # dtype by 10000 either wraps around silently or raises OverflowError,
        # depending on the NumPy version, so widen to int64 before any math.
        codes[col] = df[col].cat.codes.astype('int64')

    weight = df[weight_col]
    for col1, col2 in combinations(categorical_cols, 2):
        code1, code2 = codes[col1], codes[col2]

        # A code of -1 marks a missing category in either column of the pair.
        either_missing = (code1 == -1) | (code2 == -1)

        df[f"{col1}_{col2}_weight_combined"] = np.where(
            either_missing,
            -1 * weight - 1,
            code1 * 10000 + code2 * 100 + weight,
        )
    return df


train_df = add_weight_combined_features(train_df, cat_cols)

In [7]:
# Display the column labels currently present on train_df.
train_df.columns

Index(['brand', 'material', 'size', 'compartments', 'laptop_compartment',
       'is_waterproof', 'style', 'color', 'weight_capacity', 'price',
       'brand_material_weight_combined', 'brand_size_weight_combined',
       'brand_compartments_weight_combined', 'brand_style_weight_combined',
       'brand_color_weight_combined',
       'brand_laptop_compartment_weight_combined',
       'brand_is_waterproof_weight_combined', 'material_size_weight_combined',
       'material_compartments_weight_combined',
       'material_style_weight_combined', 'material_color_weight_combined',
       'material_laptop_compartment_weight_combined',
       'material_is_waterproof_weight_combined',
       'size_compartments_weight_combined', 'size_style_weight_combined',
       'size_color_weight_combined', 'size_laptop_compartment_weight_combined',
       'size_is_waterproof_weight_combined',
       'compartments_style_weight_combined',
       'compartments_color_weight_combined',
       'compartments_lapto



In [3]:
# Create the regressor, then train it on a copy of train_df restricted to the
# raw inputs, the label, and a chosen subset of the weight-combined features.
predictor = TabularPredictor(
    eval_metric='root_mean_squared_error',
    problem_type='regression',
    label=target,
)
predictor = predictor.fit(
    train_df.loc[:, [
        # raw inputs and label
        'brand', 'material', 'size', 'compartments', 'laptop_compartment',
        'is_waterproof', 'style', 'color', 'weight_capacity', 'price',
        # selected pairwise weight-combined features
        'brand_color_weight_combined',
        'brand_is_waterproof_weight_combined',
        'brand_material_weight_combined',
        'brand_size_weight_combined',
        'laptop_compartment_is_waterproof_weight_combined',
        'material_color_weight_combined',
        'material_is_waterproof_weight_combined',
        'material_laptop_compartment_weight_combined',
        'material_size_weight_combined',
        'material_style_weight_combined',
        'size_style_weight_combined',
        'style_color_weight_combined',
    ]].copy(),
    presets='best',  # preset reference: https://auto.gluon.ai/dev/tutorials/tabular/tabular-essentials.html#presets
    time_limit=6 * 60 * 60,  # in seconds: a 6-hour training budget
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250212_193222"


Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.10
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.3.0: Thu Jan  2 20:24:16 PST 2025; root:xnu-11215.81.4~3/RELEASE_ARM64_T6000
CPU Count:          10
Memory Avail:       34.21 GB / 64.00 GB (53.5%)
Disk Space Avail:   462.20 GB / 926.35 GB (49.9%)
Presets specified: ['best']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then ho

[36m(_ray_fit pid=4517)[0m [1000]	valid_set's rmse: 38.8623


[36m(_dystack pid=4450)[0m 	-38.8848	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4450)[0m 	137.14s	 = Training   runtime
[36m(_dystack pid=4450)[0m 	106.79s	 = Validation runtime
[36m(_dystack pid=4450)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 3401.13s of the 5198.95s of remaining time.
[36m(_dystack pid=4450)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=6.37%)
[36m(_dystack pid=4450)[0m 	-38.8758	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4450)[0m 	75.64s	 = Training   runtime
[36m(_dystack pid=4450)[0m 	49.96s	 = Validation runtime
[36m(_dystack pid=4450)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 3311.28s of the 5109.10s of remaining time.
[36m(_dystack pid=4450)[0m 	-38.9351	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4450)[0m 	1876.28s	 = Training   runtime
[

[36m(_ray_fit pid=7177)[0m [1000]	valid_set's rmse: 38.846[32m [repeated 6x across cluster][0m
[36m(_ray_fit pid=7173)[0m [1000]	valid_set's rmse: 38.8716[32m [repeated 6x across cluster][0m
[36m(_ray_fit pid=7177)[0m [2000]	valid_set's rmse: 38.8435
[36m(_ray_fit pid=7172)[0m [2000]	valid_set's rmse: 38.839
[36m(_ray_fit pid=7173)[0m [2000]	valid_set's rmse: 38.8705[32m [repeated 5x across cluster][0m
[36m(_ray_fit pid=7177)[0m [3000]	valid_set's rmse: 38.8419
[36m(_ray_fit pid=7172)[0m [3000]	valid_set's rmse: 38.8366
[36m(_ray_fit pid=7173)[0m [3000]	valid_set's rmse: 38.8692[32m [repeated 5x across cluster][0m
[36m(_ray_fit pid=7177)[0m [4000]	valid_set's rmse: 38.8413
[36m(_ray_fit pid=7174)[0m [4000]	valid_set's rmse: 38.8798
[36m(_ray_fit pid=7176)[0m [4000]	valid_set's rmse: 38.8545[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=7177)[0m [5000]	valid_set's rmse: 38.8398[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=7176)[0m [

[36m(_dystack pid=4450)[0m 	-38.8616	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4450)[0m 	961.14s	 = Training   runtime
[36m(_dystack pid=4450)[0m 	2917.07s	 = Validation runtime
[36m(_dystack pid=4450)[0m Fitting model: LightGBM_BAG_L2 ... Training model for up to 106.03s of the 105.86s of remaining time.
[36m(_dystack pid=4450)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=6.74%)
[36m(_ray_fit pid=7740)[0m 	Ran out of time, early stopping on iteration 714. Best iteration is:
[36m(_ray_fit pid=7740)[0m 	[714]	valid_set's rmse: 38.8809
[36m(_dystack pid=4450)[0m 	-38.8646	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=4450)[0m 	80.44s	 = Training   runtime
[36m(_dystack pid=4450)[0m 	62.84s	 = Validation runtime
[36m(_ray_fit pid=7743)[0m 	Ran out of time, early stopping on iteration 710. Best iteration is:[32m [repeated 7x across cluster

In [6]:
# Every three-feature group ends with the same combined feature, so only the
# two leading members of each group are spelled out.
shared_third_feature = 'laptop_compartment_is_waterproof_weight_combined'
leading_feature_pairs = [
    ('brand_material_weight_combined', 'size_style_weight_combined'),
    ('material_color_weight_combined', 'size_style_weight_combined'),
    ('brand_color_weight_combined', 'size_style_weight_combined'),
    ('material_size_weight_combined', 'style_color_weight_combined'),
    ('brand_is_waterproof_weight_combined', 'size_style_weight_combined'),
    ('material_is_waterproof_weight_combined', 'size_style_weight_combined'),
    ('material_laptop_compartment_weight_combined', 'size_style_weight_combined'),
    ('brand_size_weight_combined', 'material_style_weight_combined'),
    ('material_size_weight_combined', 'size_style_weight_combined'),
    ('brand_size_weight_combined', 'style_color_weight_combined'),
]
three_feature_list = [
    [first, second, shared_third_feature]
    for first, second in leading_feature_pairs
]

# Flatten the groups, then show the distinct feature names they mention.
f_list = [name for group in three_feature_list for name in group]
set(f_list)

{'brand_color_weight_combined',
 'brand_is_waterproof_weight_combined',
 'brand_material_weight_combined',
 'brand_size_weight_combined',
 'laptop_compartment_is_waterproof_weight_combined',
 'material_color_weight_combined',
 'material_is_waterproof_weight_combined',
 'material_laptop_compartment_weight_combined',
 'material_size_weight_combined',
 'material_style_weight_combined',
 'size_style_weight_combined',
 'style_color_weight_combined'}

In [7]:
# Every three-feature group ends with the same combined feature, so only the
# two leading members of each group are spelled out.
shared_third_feature = 'laptop_compartment_is_waterproof_weight_combined'
leading_feature_pairs = [
    ('brand_material_weight_combined', 'size_style_weight_combined'),
    ('material_color_weight_combined', 'size_style_weight_combined'),
    ('brand_color_weight_combined', 'size_style_weight_combined'),
    ('material_size_weight_combined', 'style_color_weight_combined'),
    ('brand_is_waterproof_weight_combined', 'size_style_weight_combined'),
    ('material_is_waterproof_weight_combined', 'size_style_weight_combined'),
    ('material_laptop_compartment_weight_combined', 'size_style_weight_combined'),
    ('brand_size_weight_combined', 'material_style_weight_combined'),
    ('material_size_weight_combined', 'size_style_weight_combined'),
    ('brand_size_weight_combined', 'style_color_weight_combined'),
]
three_feature_list = [
    [first, second, shared_third_feature]
    for first, second in leading_feature_pairs
]

# LightGBM settings shared by every model configured below.
shared_lgbm_params = dict(
    learning_rate=0.04748156996966733,
    num_leaves=256,
    max_bin=69779,
    lambda_l1=0.0021556304294933115,
    lambda_l2=0.05627364738597358,
    feature_fraction=0.8,
    min_data_in_leaf=24,
)

# One GBM entry per feature group: shared settings, a numbered name suffix
# ('_3F_1', '_3F_2', ...), and the group passed as 'only_use_features'.
# NOTE(review): confirm that the installed AutoGluon version actually honors
# 'only_use_features' under 'ag_args_fit'.
hyperparams = {
    'GBM': [
        {
            **shared_lgbm_params,
            'ag_args': {'name_suffix': f'_3F_{position}'},
            'ag_args_fit': {'only_use_features': features},
        }
        for position, features in enumerate(three_feature_list, start=1)
    ]
}

# Create the regressor, then train it with the per-feature-group LightGBM
# configurations collected in `hyperparams`.
predictor = TabularPredictor(
    eval_metric='root_mean_squared_error',
    problem_type='regression',
    label=target,
)
predictor = predictor.fit(
    train_df,
    hyperparameters=hyperparams,
    presets='medium_quality',  # alternative: 'best_quality'
    # num_stack_levels=2,  # enable multi-layer stacking
    # num_bag_folds=5,     # or 10, the higher the better (usually)
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250212_154700"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.10
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6000
CPU Count:          10
Memory Avail:       22.95 GB / 64.00 GB (35.9%)
Disk Space Avail:   436.02 GB / 926.35 GB (47.1%)
Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "/Users/jordanbarker/Documents/Kaggle/backpack-prediction-challenge/notebooks/AutogluonModels/ag-20250212_154700"
Train Data Rows:    3994318
Train Data Columns: 37
Label Column:       price
Problem Type:       regression
Preproces