In [2]:
from backpack_predictor import prepare_data, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error

# import xgboost as xgb
import lightgbm as lgb

from optuna.integration import LightGBMPruningCallback #XGBoostPruningCallback, CatBoostPruningCallback
import optuna

import warnings
warnings.filterwarnings('ignore')

# Read the raw competition files (paths are relative to the notebook location).
train_df = pd.read_csv(r'..//data//train.csv')
test_df = pd.read_csv(r'..//data//test.csv')
# Optional: append the extra training file to the raw train frame (disabled).
# train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
# train_df = pd.concat([train_df, train_extra_df], ignore_index=True)

# Run both frames through the shared `prepare_data` step; only the training
# frame is flagged with is_train=True.
train_df, test_df = (
    prepare_data(train_df, is_train=True),
    prepare_data(test_df, is_train=False),
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Summarise `col2` within each value of the grouping column `col1`.
col1, col2 = 'brand', 'weight_capacity'
# Alternative pairings tried earlier:
# col1 = 'weight_capacity'
# col2 = 'is_waterproof'
stats = ['mean', 'skew', 'var', 'count', 'min', 'max']

grouped_col2 = train_df.groupby(col1)[col2]
agg_stats = grouped_col2.agg(stats)
agg_stats

Unnamed: 0_level_0,mean,skew,var,count,min,max
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-1,17.774679,-0.088817,53.173022,9705,-1.0,30.0
0,18.074264,-0.073533,48.141221,60077,-1.0,30.0
1,17.949744,-0.049983,49.014051,56814,-1.0,30.0
2,18.019246,-0.060796,47.981299,57336,-1.0,30.0
3,18.000535,-0.068164,48.5591,56076,-1.0,30.0
4,18.096995,-0.08802,48.947975,59992,-1.0,30.0


In [4]:
# Show the column names of the prepared training frame.
train_df.columns

Index(['brand', 'material', 'size', 'compartments', 'laptop_compartment',
       'is_waterproof', 'style', 'color', 'weight_capacity', 'price'],
      dtype='object')

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.cluster import MiniBatchKMeans

# Column roles for the clustering pre-processing.
# NOTE: this rebinds `cat_cols`, which the first cell also imports from
# backpack_predictor.features.
numeric_cols = ['weight_capacity']
cat_cols = [
    'size', 'is_waterproof', 'brand', 'material',
    'laptop_compartment', 'compartments', 'style', 'color',
]

# Numeric columns are standardised, categorical columns are one-hot encoded.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore', drop=None))]
)

column_transformers = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols),
]
# remainder='drop': columns not listed above never reach the clusterer.
preprocessor = ColumnTransformer(transformers=column_transformers, remainder='drop')

### MiniBatchKMeans
model_start_time = time.time()
pipeline_steps = [
    ('preprocessing', preprocessor),
    ('clusterer', MiniBatchKMeans(n_clusters=500, random_state=42)),
]
kmeans_pipeline = Pipeline(steps=pipeline_steps)
kmeans_pipeline.fit(train_df)

# Labels assigned to the rows the pipeline was fitted on; stored on train_df.
kmeans_labels = kmeans_pipeline.named_steps['clusterer'].labels_
train_df['cluster'] = kmeans_labels
print(f"MiniBatchKMeans Training & Prediction time: {time.time() - model_start_time:.2f} seconds")

# Number of rows per cluster label, largest first.
print("MiniBatchKMeans cluster counts:")
cluster_counts = pd.Series(kmeans_labels).value_counts()
display(cluster_counts.reset_index())

MiniBatchKMeans Training & Prediction time: 1.68 seconds
MiniBatchKMeans cluster counts:


Unnamed: 0,index,count
0,214,1554
1,191,1235
2,291,1193
3,59,1118
4,92,1103
...,...,...
495,280,195
496,356,188
497,251,185
498,26,179


In [None]:
# Add original data
# Load the original dataset, run it through the same `prepare_data` step as the
# training data, and suffix every column with "_orig" so the names cannot
# collide with the competition columns.
orig_df = pd.read_csv(r'..//data//orig.csv')
orig_df.insert(loc=0, column='id', value=0)  # Add id to first col to match new train
orig_df = prepare_data(orig_df, is_train=True)
orig_df.columns = [f"{c}_orig" for c in orig_df.columns]
orig_df_columns = orig_df.columns.to_list()

# Only rows with 5 < weight_capacity_orig < 30 are used as join candidates.
in_range = orig_df["weight_capacity_orig"].gt(5) & orig_df["weight_capacity_orig"].lt(30)
# Keep one row per join key. Without this, a weight that occurs several times
# in the original data would multiply the matching training rows in the left
# join, and the copies could end up in both the train and validation folds.
orig_lookup = orig_df.loc[in_range].drop_duplicates(subset="weight_capacity_orig", keep="first")

n_train_rows = len(train_df)
train_df = train_df.merge(
    orig_lookup,
    left_on='weight_capacity',
    right_on='weight_capacity_orig',
    how='left',
    validate='many_to_one',  # fail loudly if the right side is not unique on the key
)
assert len(train_df) == n_train_rows, "left join must not change the number of training rows"

In [None]:
# Name fragments used later to build the Optuna storage file name
# (`model_str`) and the study name (`model_str` + `study_name` + timestamp).
model_str = "lgb_"
study_name = 'many_cols_'

In [None]:
import itertools

def _join_as_strings(frame, columns):
    """Return a Series with `columns` of `frame` cast to str and joined row-wise with "_"."""
    parts = [frame[c].astype(str) for c in columns]
    joined = parts[0]
    for part in parts[1:]:
        # Vectorised concatenation: avoids one Python-level call per row.
        joined = joined + "_" + part
    return joined


def create_comb_features(train_df, test_df, cols, allowed_features, comb_size):
    """Add string "combination" columns to both frames, in place.

    Every `comb_size`-sized combination of `cols` (in `itertools.combinations`
    order) is named by joining its column names with "_". Combinations whose
    name is not in `allowed_features` are skipped. For the rest, a column with
    that name is written to `train_df` and `test_df`; each value is the row's
    source values cast to `str` and joined with "_".

    Args:
        train_df: DataFrame that receives the new columns (mutated).
        test_df: DataFrame that receives the same new columns (mutated).
        cols: Source column names; they must exist in both frames for any
            combination that is created.
        allowed_features: Container of combination names to create.
        comb_size: Number of source columns per combination.

    Returns:
        None. Both frames are modified in place.
    """
    for comb in itertools.combinations(cols, comb_size):
        col_name = "_".join(comb)
        if col_name not in allowed_features:
            continue
        # Build the same feature on both frames so they stay aligned.
        for frame in (train_df, test_df):
            frame[col_name] = _join_as_strings(frame, comb)


# Names of the 2-, 3- and 4-column combinations to create. Each name is the
# source column names joined with "_", in the order they appear in `cols`.
best_2 = [
    'material_is_waterproof',
    'material_laptop_compartment',
    'material_size',
    'material_style',
    'is_waterproof_color',
    'style_color',
]
best_3 = [
    'material_laptop_compartment_is_waterproof',
    'material_laptop_compartment_style',
    'material_is_waterproof_style',
    'material_size_laptop_compartment',
    'laptop_compartment_is_waterproof_color',
]
best_4 = [
    'material_size_laptop_compartment_is_waterproof',
    'material_laptop_compartment_is_waterproof_style',
    'brand_laptop_compartment_is_waterproof_style',
    'material_laptop_compartment_is_waterproof_color',
    'brand_material_laptop_compartment_is_waterproof',
]

# Source columns the combinations are drawn from.
cols = ['brand', 'material', 'size', 'laptop_compartment', 'is_waterproof', 'style', 'color']

# Add the selected combinations of each size to train_df and test_df in place.
for comb_size, allowed_names in ((2, best_2), (3, best_3), (4, best_4)):
    create_comb_features(train_df, test_df, cols, allowed_names, comb_size)

In [None]:
# Statistics computed per category value for every encoded column.
stats = ['mean', 'skew', 'count', 'var']
FOLDS = 3          # outer folds: the (train, valid) pairs stored in `data_splits`
inner_folds = 10   # inner folds: out-of-fold target encoding inside each outer training fold

# Columns stored with pandas `category` dtype in every fold.
categorical_feature_cols = [
    'compartments', 'laptop_compartment', 'is_waterproof', 'brand',
    'color', 'size', 'material', 'style',
]

# --- Target encoding: statistics of `target` grouped by each of these columns ---
target_feature_cols = ['weight_capacity', 'compartments', 'laptop_compartment', 'is_waterproof', 'material', 'brand']
target_feature_cols = target_feature_cols + best_2 + best_3 + best_4 + orig_df_columns
target_feature_sets = {
    f"{col1}": {
        "stats": stats
    }
    for col1 in target_feature_cols
}

all_new_cols = []
for col1, v in target_feature_sets.items():
    # One output column per statistic, named "<column>_<target>_<stat>".
    new_cols = [f"{col1}_{target}_{stat}" for stat in stats]
    target_feature_sets[col1]['new_cols'] = new_cols
    all_new_cols.extend(new_cols)

# --- Feature-on-feature encoding: statistics of each col2 grouped by col1 ---
feature_sets = {
    'material': {
        'col2_list': ['is_waterproof', 'laptop_compartment', 'style', 'size'],
        'stats': stats,
    },
}
for col1, v in feature_sets.items():
    col2_list = v['col2_list']
    # Laid out col2-major: all stats for the first col2, then the next, ...
    new_cols = [f"{col1}_{col2}_{stat}" for col2 in col2_list for stat in stats]
    feature_sets[col1]['new_cols'] = new_cols
    all_new_cols.extend(new_cols)

features = [
    'weight_capacity', 'compartments', 'laptop_compartment', 'is_waterproof', 'brand', 'color', 'size', 'material', 'style'
] + all_new_cols + orig_df_columns

print(f"Using {len(features)} features.")

data_splits = []

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
for train_idx, valid_idx in kf.split(train_df):
    # KFold yields positional indices, so select rows with .iloc; this stays
    # correct even if train_df does not carry a default RangeIndex.
    train_fold = train_df.iloc[train_idx].reset_index(drop=True)
    valid_fold = train_df.iloc[valid_idx].reset_index(drop=True)

    # -----------------------------------------------------
    # Inner K-Fold (for partial target encoding)
    # Each training row gets target statistics computed only from the other
    # inner folds, so its own target never contributes to its encoding.
    # -----------------------------------------------------
    kf_inner = KFold(n_splits=inner_folds, shuffle=True, random_state=42)
    for inner_train_idx, inner_valid_idx in kf_inner.split(train_fold):
        inner_train = train_fold.iloc[inner_train_idx]
        for col1, v in target_feature_sets.items():
            agg_stats = inner_train.groupby(col1)[target].agg(v['stats'])
            for stat, new_col in zip(v['stats'], v['new_cols']):
                train_fold.loc[inner_valid_idx, new_col] = train_fold.loc[inner_valid_idx, col1].map(agg_stats[stat])

    # -----------------------------------------------------
    # Outer K-Fold Add Feature Sets
    # Validation rows are encoded with statistics from the WHOLE training
    # fold. (Previously `inner_train`, left over from the last inner-fold
    # iteration, was used here, which silently ignored part of the fold.)
    # -----------------------------------------------------
    for col1, v in target_feature_sets.items():
        agg_stats = train_fold.groupby(col1)[target].agg(v['stats'])
        for stat, new_col in zip(v['stats'], v['new_cols']):
            valid_fold.loc[:, new_col] = valid_fold.loc[:, col1].map(agg_stats[stat])

    for col1, v in feature_sets.items():
        n_stats = len(v['stats'])
        for k, col2 in enumerate(v['col2_list']):
            # These statistics do not involve the target, so the same
            # training-fold aggregate is mapped onto both folds.
            agg_stats = train_fold.groupby(col1)[col2].agg(v['stats'])
            cols_for_col2 = v['new_cols'][k * n_stats:(k + 1) * n_stats]
            for stat, new_col in zip(v['stats'], cols_for_col2):
                train_fold.loc[:, new_col] = train_fold.loc[:, col1].map(agg_stats[stat])
                valid_fold.loc[:, new_col] = valid_fold.loc[:, col1].map(agg_stats[stat])

    # Convert after all group-bys so grouping runs on the original dtypes.
    train_fold[categorical_feature_cols] = train_fold[categorical_feature_cols].astype('category')
    valid_fold[categorical_feature_cols] = valid_fold[categorical_feature_cols].astype('category')

    data_splits.append((train_fold, valid_fold))

In [None]:
# Cast every object-dtype column to `category` in both halves of each split.
# The column list is taken from the training half and applied to both.
for train_fold, valid_fold in data_splits:
    object_cols = train_fold.select_dtypes(include=['object']).columns
    for fold in (train_fold, valid_fold):
        fold[object_cols] = fold[object_cols].astype('category')

In [None]:
def objective(trial):
    """Optuna objective: mean validation RMSE of LightGBM across `data_splits`.

    Samples one hyper-parameter set from `trial`, trains one booster per
    (train_fold, valid_fold) pair on the columns listed in `features`, and
    returns the mean of the per-fold RMSE values on the validation halves.
    A pruning callback watches the "rmse" metric on the "valid_0" set.
    """
    # Settings held constant for every trial.
    fixed_params = {
        'random_state': 42,
        'verbose': -1,  # -1: Fatal, 0: Warning, 1: Info, 2: Debug
        'objective': 'regression',
        'metric': 'rmse',
        'force_row_wise': True,
        'min_split_gain': 0.5,
    }

    # Search space. The suggest_* calls are kept in their original order.
    tuned_params = {
        # stop when the validation metric has not improved for this many rounds
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 1000),

        # row subsampling without resampling; bagging_freq must be non-zero to enable it
        # (`subsample` is ignored when bagging_fraction is set)
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),

        # regularisation used for the categorical features (default = 10.0)
        'cat_l2': trial.suggest_float('cat_l2', 0.01, 100),

        # when True, only one randomly-chosen threshold per feature is checked at each split
        'extra_trees': trial.suggest_categorical("extra_trees", [True, False]),

        # share of features selected for each tree
        # (`colsample_bytree` is ignored when feature_fraction is set)
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),

        # max number of bins that feature values are bucketed into
        'max_bin': trial.suggest_int('max_bin', 2, 20000),

        # <= 0 means no limit; the tree still grows leaf-wise
        'max_depth': trial.suggest_int('max_depth', -1, 2000),

        # important guard against over-fitting
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 100),

        'n_estimators': trial.suggest_int('n_estimators', 100, 20000),

        # main control of tree complexity; should be smaller than 2^max_depth
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),

        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    params = {**fixed_params, **tuned_params}

    fold_scores = []
    for train_fold, valid_fold in data_splits:
        X_train, y_train = train_fold[features], train_fold[target]
        X_valid, y_valid = valid_fold[features], valid_fold[target]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        booster = lgb.train(
            params=params,
            train_set=train_data,
            valid_sets=[train_data, valid_data],
            valid_names=['train_0', 'valid_0'],
            callbacks=[
                LightGBMPruningCallback(trial, "rmse", valid_name="valid_0"),
                lgb.log_evaluation(-1),  # Suppress training logs
            ],
        )
        preds = booster.predict(X_valid, num_iteration=booster.best_iteration)
        fold_scores.append(root_mean_squared_error(y_valid, preds))

    return np.mean(fold_scores)

# Store the study in the SQLite file named after `model_str`. The timestamp
# suffix (minute resolution) gives every run its own study name.
study = optuna.create_study(
        storage=f"sqlite:///..//optuna//{model_str}db.sqlite3",
        study_name=model_str + study_name + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=1000)

best_trial = study.best_trial
print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", best_trial.number)
print("Best value (RMSE):", best_trial.value)
print("Best hyperparameters:", best_trial.params)

# `best_trial.params` only holds values produced by trial.suggest_*; the
# settings that objective() hard-codes are not part of it. Re-add all of them
# so the model retrained from `best_params` uses the configuration that was
# tuned. `objective` and `min_split_gain` used to be dropped here.
best_params = dict(best_trial.params)
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["objective"] = "regression"
best_params["metric"] = "rmse"
best_params["force_row_wise"] = True
best_params["min_split_gain"] = 0.5

In [None]:
# Display the parameter dict that the next cell passes to lgb.train.
best_params

In [None]:
# Build LightGBM datasets from the first split only.
for train_fold, valid_fold in data_splits[:1]:
    train_data = lgb.Dataset(train_fold[features], label=train_fold[target])
    valid_data = lgb.Dataset(valid_fold[features], label=valid_fold[target], reference=train_data)

# Retrain with the tuned parameters, evaluating on both halves of that split.
model = lgb.train(params=best_params, train_set=train_data, valid_sets=[train_data, valid_data])

# One importance chart per importance type.
importance_plots = (
    ("gain", "LightGBM Feature Importance (Gain)"),
    ("split", "LightGBM Feature Importance (Split)"),
)
for importance_type, plot_title in importance_plots:
    lgb.plot_importance(model, importance_type=importance_type, figsize=(7,6), title=plot_title)
    plt.show()

In [None]:
# Both importance measures from the fitted booster, in the booster's feature order.
feature_importance_split = model.feature_importance(importance_type='split')
feature_importance_gain = model.feature_importance(importance_type='gain')

# One row per feature, most important by gain first.
importance_columns = {
    'Feature': features,
    'Split Importance': feature_importance_split,
    'Gain Importance': feature_importance_gain,
}
importance_df = pd.DataFrame(importance_columns)
importance_df = importance_df.sort_values(by='Gain Importance', ascending=False)
importance_df

In [None]:
# The median importance under each measure is used as the cut-off.
# (A stray trailing comma used to turn `split_lower` into a 1-tuple instead
# of a scalar.)
split_lower = importance_df['Split Importance'].quantile(.5)
gain_lower = importance_df['Gain Importance'].quantile(.5)

# Features to remove based on conditions: below the median on BOTH measures.
is_low_importance = (
    (importance_df['Split Importance'] < split_lower)
    & (importance_df['Gain Importance'] < gain_lower)
)
features_to_remove = importance_df.loc[is_low_importance, 'Feature'].to_list()

# Remove them from the 'features' list, preserving the original feature order.
removed = set(features_to_remove)
filtered_features = [feature for feature in features if feature not in removed]

# Display how many features survive the filter.
print(len(filtered_features), "of", len(features))

In [None]:
def objective(trial):
    """Optuna objective restricted to `filtered_features`.

    Samples one hyper-parameter set from `trial`, trains one LightGBM booster
    per (train_fold, valid_fold) pair in `data_splits` on the columns listed
    in `filtered_features`, and returns the mean of the per-fold validation
    RMSE values. A pruning callback watches "rmse" on the "valid_0" set.
    """
    # Settings held constant for every trial.
    fixed_params = {
        'random_state': 42,
        'verbose': -1,  # -1: Fatal, 0: Warning, 1: Info, 2: Debug
        'objective': 'regression',
        'metric': 'rmse',
        'force_row_wise': True,
        'min_split_gain': 0.5,
    }

    # Search space. The suggest_* calls are kept in their original order.
    tuned_params = {
        # stop when the validation metric has not improved for this many rounds
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 1000),

        # row subsampling without resampling; bagging_freq must be non-zero to enable it
        # (`subsample` is ignored when bagging_fraction is set)
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),

        # regularisation used for the categorical features (default = 10.0)
        'cat_l2': trial.suggest_float('cat_l2', 0.01, 100),

        # when True, only one randomly-chosen threshold per feature is checked at each split
        'extra_trees': trial.suggest_categorical("extra_trees", [True, False]),

        # share of features selected for each tree
        # (`colsample_bytree` is ignored when feature_fraction is set)
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),

        # max number of bins that feature values are bucketed into
        'max_bin': trial.suggest_int('max_bin', 2, 20000),

        # <= 0 means no limit; the tree still grows leaf-wise
        'max_depth': trial.suggest_int('max_depth', -1, 2000),

        # important guard against over-fitting
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 100),

        'n_estimators': trial.suggest_int('n_estimators', 100, 20000),

        # main control of tree complexity; should be smaller than 2^max_depth
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),

        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    params = {**fixed_params, **tuned_params}

    fold_scores = []
    for train_fold, valid_fold in data_splits:
        X_train, y_train = train_fold[filtered_features], train_fold[target]
        X_valid, y_valid = valid_fold[filtered_features], valid_fold[target]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        booster = lgb.train(
            params=params,
            train_set=train_data,
            valid_sets=[train_data, valid_data],
            valid_names=['train_0', 'valid_0'],
            callbacks=[
                LightGBMPruningCallback(trial, "rmse", valid_name="valid_0"),
                lgb.log_evaluation(-1),  # Suppress training logs
            ],
        )
        preds = booster.predict(X_valid, num_iteration=booster.best_iteration)
        fold_scores.append(root_mean_squared_error(y_valid, preds))

    return np.mean(fold_scores)

# Second tuning run (the `objective` defined just above trains on
# `filtered_features`). The study is stored in the SQLite file named after
# `model_str`; the timestamp suffix (minute resolution) keeps its name unique.
study = optuna.create_study(
        storage=f"sqlite:///..//optuna//{model_str}db.sqlite3",
        study_name=model_str + study_name + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=1000)

best_trial = study.best_trial
print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", best_trial.number)
print("Best value (RMSE):", best_trial.value)
print("Best hyperparameters:", best_trial.params)

# `best_trial.params` only holds values produced by trial.suggest_*; the
# settings that objective() hard-codes are not part of it. Re-add all of them
# so the model retrained from `best_params` uses the configuration that was
# tuned. `objective` and `min_split_gain` used to be dropped here.
best_params = dict(best_trial.params)
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["objective"] = "regression"
best_params["metric"] = "rmse"
best_params["force_row_wise"] = True
best_params["min_split_gain"] = 0.5

In [None]:
# Build LightGBM datasets from the first split only, restricted to the
# filtered feature list.
for train_fold, valid_fold in data_splits[:1]:
    train_data = lgb.Dataset(train_fold[filtered_features], label=train_fold[target])
    valid_data = lgb.Dataset(valid_fold[filtered_features], label=valid_fold[target], reference=train_data)

# Retrain with the tuned parameters, evaluating on both halves of that split.
model = lgb.train(params=best_params, train_set=train_data, valid_sets=[train_data, valid_data])

# One importance chart per importance type.
importance_plots = (
    ("gain", "LightGBM Feature Importance (Gain)"),
    ("split", "LightGBM Feature Importance (Split)"),
)
for importance_type, plot_title in importance_plots:
    lgb.plot_importance(model, importance_type=importance_type, figsize=(7,6), title=plot_title)
    plt.show()