In [1]:
from backpack_predictor import prepare_data, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import KBinsDiscretizer, TargetEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error

import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# Read the raw splits; the relative paths resolve against the kernel's working directory.
train_df = pd.read_csv(r'..//data//train.csv')
test_df = pd.read_csv(r'..//data//test.csv')

# Optional experiment: append the extra training rows before preprocessing.
# train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
# train_df = pd.concat([train_df, train_extra_df], ignore_index=True)

# Run the project's preprocessing on each split; `is_train` tells
# prepare_data which split it is handling.
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)

# X = train_df.drop(target, axis=1)
# y = train_df[target]

In [2]:
def cv_te(smooth='auto', cv=5, n_splits=10, df=None):
    """
    Score each column's target encoding on its own with K-fold cross-validation.

    For every outer fold a TargetEncoder is fitted on the training part and
    applied to the validation part. The encoded value of each column is then
    used directly as a prediction of the target and scored with RMSE. A table
    of per-column mean/std RMSE across folds (lowest mean first) is displayed,
    followed by the column-wise mean of that table.

    Parameters:
        smooth: Smoothing setting forwarded to TargetEncoder.
        cv: Forwarded to TargetEncoder and echoed in the printed header.
            NOTE(review): per the scikit-learn docs this controls the internal
            cross fitting used by fit_transform; this function calls fit and
            transform separately, so it likely has no effect here - confirm.
        n_splits (int): Number of outer KFold splits. Defaults to 10.
        df (pd.DataFrame): Frame to evaluate. Defaults to the notebook-level
            train_df.

    Returns:
        None. The results are printed and displayed.
    """
    data = train_df if df is None else df

    # Exclude the target using the same `target` name(s) that are used for
    # fitting and scoring below, instead of a separately hard-coded 'price'.
    target_names = {target} if isinstance(target, str) else set(target)
    feature_cols = [col for col in data.columns if col not in target_names]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []

    for fold, (train_index, val_index) in enumerate(kf.split(data), 1):
        train_fold = data.iloc[train_index]
        val_fold = data.iloc[val_index]

        # Fit on the training part only, so validation rows never contribute
        # to their own encodings.
        te = TargetEncoder(target_type="continuous", smooth=smooth, cv=cv)
        te.fit(train_fold[feature_cols], train_fold[target])
        te_vals = te.transform(val_fold[feature_cols])

        fold_results = {"Fold": fold}
        for i, col in enumerate(feature_cols):
            # Column i of the encoded matrix corresponds to feature_cols[i].
            fold_results[col] = root_mean_squared_error(val_fold[target], te_vals[:, i])

        results.append(fold_results)

    results_df = pd.DataFrame(results)
    summary = results_df.describe().loc[['mean', 'std']].T.round(3).sort_values(by='mean').drop('Fold')
    print(f"\nsmooth = {smooth},  CV = {cv}")
    display(summary)
    display(summary.mean())

In [3]:
# Baseline run: score every existing column's target encoding with smooth=20
# (cv is left at the function's default of 5).
cv_te(smooth=20)


smooth = 20,  CV = 5


Unnamed: 0,mean,std
weight_capacity,39.0,0.064
material,39.034,0.066
color,39.035,0.068
brand,39.037,0.066
size,39.037,0.067
is_waterproof,39.037,0.066
laptop_compartment,39.038,0.067
compartments,39.039,0.067
style,39.04,0.067


mean    39.033000
std      0.066444
dtype: float64

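The price column only has 5 decimal places, so try rounding `weight_capacity` to 0-5 decimal places and check whether any rounded version target-encodes better.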

In [8]:
# Add copies of weight_capacity rounded to 5, 4, ..., 0 decimal places, then
# re-score every column (the new ones included) with the same smoothing.
for n_decimals in (5, 4, 3, 2, 1, 0):
    train_df[f'weight_capacity_{n_decimals}'] = train_df['weight_capacity'].round(n_decimals)
cv_te(smooth=20)


smooth = 20,  CV = 5


Unnamed: 0,mean,std
weight_capacity,39.0,0.064
weight_capacity_0,39.025,0.064
weight_capacity_5,39.031,0.068
weight_capacity_1,39.032,0.071
material,39.034,0.066
color,39.035,0.068
brand,39.037,0.066
size,39.037,0.067
is_waterproof,39.037,0.066
laptop_compartment,39.038,0.067


mean    39.047800
std      0.068067
dtype: float64

In [9]:
# Show the categorical column names imported from backpack_predictor.features.
cat_cols

['brand',
 'material',
 'size',
 'compartments',
 'style',
 'color',
 'laptop_compartment',
 'is_waterproof']

In [22]:
import pandas as pd

def update_category_codes(df, cat_cols):
    """
    Return a copy of `df` whose 'category'-dtype columns hold integer codes.

    Each listed column with 'category' dtype is replaced by its category
    codes. If a column contains the missing-value code -1, every code in that
    column is shifted up by 1 (so -1 becomes 0); columns without a -1 keep
    their codes unchanged. Listed columns that are not 'category' dtype are
    left as they are.

    Parameters:
        df (pd.DataFrame): The DataFrame containing categorical columns.
        cat_cols (list): List of categorical column names.

    Returns:
        pd.DataFrame: A new DataFrame; the input frame is not modified.
    """
    result = df.copy()

    for name in cat_cols:
        if result[name].dtype.name != 'category':
            continue  # Not categorical: nothing to convert.

        codes = result[name].cat.codes
        # Shift only when a missing value (-1) is actually present.
        has_missing = (codes == -1).any()
        result[name] = codes + 1 if has_missing else codes

    return result

# Rebind train_df to a copy whose categorical columns hold integer codes.
train_df = update_category_codes(train_df, cat_cols)

In [23]:
# Make sure each categorical column has 'category' dtype, then store its
# integer codes next to it in a "<col>_code" column.
for col in cat_cols:
    values = train_df[col]
    if values.dtype != 'category':
        values = values.astype('category')
        train_df[col] = values
    train_df[f"{col}_code"] = values.cat.codes

In [24]:
# List the distinct integer codes present in each categorical column.
for col in cat_cols:
    distinct_codes = train_df[col].cat.codes.unique()
    print(sorted(distinct_codes), col)

[0, 1, 2, 3, 4, 5] brand
[0, 1, 2, 3, 4] material
[0, 1, 2, 3] size
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] compartments
[0, 1, 2, 3] style
[0, 1, 2, 3, 4, 5, 6] color
[0, 1, 2] laptop_compartment
[0, 1, 2] is_waterproof


In [25]:
# train_df['weight_capacity']

In [26]:
# [-1, 0, 1, 2, 3, 4] brand

# [0, 1, 2, 3] size
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] compartments

# 111.611723

In [27]:
# Pack weight_capacity and every categorical code into a single number:
# weight_capacity keeps the low digits and each categorical column gets its
# own decimal digit above that (x100, x1000, ...).
# NOTE(review): this assumes weight_capacity < 100 and at most 10 codes per
# column, otherwise the digits overlap - confirm against the data.
combined = train_df['weight_capacity'].copy()
multiplier = 100

for col in cat_cols:
    # cat.codes is stored in a narrow integer dtype (int8 for low-cardinality
    # columns), so `codes * multiplier` can overflow (e.g. 2 * 100 wraps to
    # -56). Widen to int64 before scaling.
    combined += train_df[col].cat.codes.astype('int64') * multiplier
    multiplier *= 10

# Save the new combined feature.
train_df['combined_feature'] = combined

In [29]:
# Re-score all columns, now including the *_code columns and combined_feature.
cv_te(smooth=20)


smooth = 20,  CV = 5


Unnamed: 0,mean,std
weight_capacity,39.0,0.064
weight_capacity_0,39.025,0.064
weight_capacity_5,39.031,0.068
weight_capacity_1,39.032,0.071
material_code,39.034,0.066
material,39.034,0.066
color_code,39.035,0.068
color,39.035,0.068
is_waterproof,39.037,0.066
brand_code,39.037,0.066


mean    39.043875
std      0.067583
dtype: float64

In [30]:
def create_combined_feature(df, cat_cols, leave_out=None):
    """
    Combine 'weight_capacity' and the encoded values of categorical columns
    in cat_cols (except for the one specified in leave_out) into a single number.

    'weight_capacity' supplies the low digits; each included categorical
    column contributes its category code scaled by 100, 1000, 10000, ... in
    the order the columns appear in cat_cols. When a column is left out, the
    columns after it move down one decimal position.

    NOTE(review): the decimal packing assumes weight_capacity < 100 and at
    most 10 codes (0-9) per column; otherwise digits overlap - confirm.

    Parameters:
    - df: DataFrame containing 'weight_capacity' and the columns in cat_cols
      (these must have 'category' dtype, since `.cat.codes` is used).
    - cat_cols: List of categorical column names to combine.
    - leave_out: A column name from cat_cols to leave out (default is None, meaning include all).

    Returns:
    - A Series representing the combined feature. `df` is not modified.
    """
    # Work on a copy so the in-place additions never touch df itself.
    combined = df['weight_capacity'].copy()
    multiplier = 100  # Position of the first categorical digit.

    for col in cat_cols:
        if col == leave_out:
            continue  # Skipped columns do not consume a digit position.
        # cat.codes is stored in a narrow integer dtype (int8 for
        # low-cardinality columns), so `codes * multiplier` can overflow
        # (e.g. 2 * 100 wraps to -56). Widen to int64 before scaling.
        combined += df[col].cat.codes.astype('int64') * multiplier
        multiplier *= 10  # Next column goes one decimal position higher.

    return combined

# Build one leave-one-out variant of the combined feature per categorical
# column, stored as "combined_feature_less_<col>".
for col in cat_cols:
    train_df[f'combined_feature_less_{col}'] = create_combined_feature(
        train_df, cat_cols, leave_out=col
    )

In [31]:
# Re-score once more with the combined_feature_less_* columns included.
cv_te(smooth=20)


smooth = 20,  CV = 5


Unnamed: 0,mean,std
weight_capacity,39.0,0.064
weight_capacity_0,39.025,0.064
weight_capacity_5,39.031,0.068
weight_capacity_1,39.032,0.071
material,39.034,0.066
material_code,39.034,0.066
color_code,39.035,0.068
color,39.035,0.068
is_waterproof_code,39.037,0.066
size_code,39.037,0.067


mean    39.042750
std      0.067437
dtype: float64