<a href="https://www.kaggle.com/code/hopesb/backpack-baseline-predicition?scriptVersionId=222670995" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Backpack Predition

In [1]:
# import all necessary libraries.
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px

import sklearn
from sklearn.metrics import mean_squared_error
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from tqdm.notebook import tqdm
import optuna
import warnings
warnings.simplefilter("ignore")

In [2]:
# Reading the files
test_filepath = "/kaggle/input/playground-series-s5e2/test.csv"
train_filepath = "/kaggle/input/playground-series-s5e2/train.csv"
df = pd.read_csv(train_filepath, index_col="id")
df_test = pd.read_csv(test_filepath, index_col="id")

## Explore

In [3]:
print(df.shape)
df.head()

(300000, 10)


Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 290295 non-null  object 
 1   Material              291653 non-null  object 
 2   Size                  293405 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    292556 non-null  object 
 5   Waterproof            292950 non-null  object 
 6   Style                 292030 non-null  object 
 7   Color                 290050 non-null  object 
 8   Weight Capacity (kg)  299862 non-null  float64
 9   Price                 300000 non-null  float64
dtypes: float64(3), object(7)
memory usage: 25.2+ MB


In [5]:
df.isnull().sum().sort_values()

Compartments               0
Price                      0
Weight Capacity (kg)     138
Size                    6595
Waterproof              7050
Laptop Compartment      7444
Style                   7970
Material                8347
Brand                   9705
Color                   9950
dtype: int64

In [6]:
df.nunique().sort_values()

Laptop Compartment           2
Waterproof                   2
Size                         3
Style                        3
Material                     4
Brand                        5
Color                        6
Compartments                10
Price                    48212
Weight Capacity (kg)    181596
dtype: int64

In [7]:
df.describe()

Unnamed: 0,Compartments,Weight Capacity (kg),Price
count,300000.0,299862.0,300000.0
mean,5.44359,18.029994,81.411107
std,2.890766,6.966914,39.03934
min,1.0,5.0,15.0
25%,3.0,12.097867,47.38462
50%,5.0,18.068614,80.95612
75%,8.0,24.002375,115.01816
max,10.0,30.0,150.0


In [8]:
def wrangle(filepath):
    df = pd.read_csv(filepath, index_col= "id")
    # Fill the weight with mean.
    df["Weight Capacity (kg)"] = df["Weight Capacity (kg)"].fillna(df["Weight Capacity (kg)"].mean())
    # fill the color.
    df["Color"] = df["Color"].fillna("Multi-color")
    # fill the Brand with unknown brand.
    df["Brand"] = df["Brand"].fillna("Unknown")
    # fill the size.
    df["Size"] = df["Size"].fillna("Others")
    # fill the waterproof.
    df["Waterproof"] = df["Waterproof"].fillna("Undetermined")
    # fill the Laptop compartment.
    df["Laptop Compartment"] = df["Laptop Compartment"].fillna("Unknown")
    # Fill the material ans style.
    df["Material"] = df["Material"].fillna("Unknown")
    df["Style"] = df["Style"].fillna("Unknown")

    df["Compartments"] = df["Compartments"].astype(int)
    df["Weight Capacity (kg)"] = round(df["Weight Capacity (kg)"], 2)
    df["Weight Class"] = df["Weight Capacity (kg)"].apply(lambda x: "Minimum" if x < 10 else(
        "Above_Minimum" if x < 15 else(
            "Medium" if x < 25 else "High"
        )
    ))

    return df

In [9]:
df = wrangle(train_filepath)
df.isnull().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
Price                   0
Weight Class            0
dtype: int64

In [10]:
df.head()

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Weight Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Jansport,Leather,Medium,7,Yes,No,Tote,Black,11.61,112.15875,Above_Minimum
1,Jansport,Canvas,Small,10,Yes,Yes,Messenger,Green,27.08,68.88056,High
2,Under Armour,Leather,Small,2,Yes,No,Messenger,Red,16.64,39.1732,Medium
3,Nike,Nylon,Small,8,Yes,No,Messenger,Green,12.94,80.60793,Above_Minimum
4,Adidas,Canvas,Medium,1,Yes,Yes,Messenger,Green,17.75,86.02312,Medium


## Split Dataset.

In [11]:
target = "Price"
X = df.drop(columns= target)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model.

In [12]:
# Create a function that takes in a list of algortithm and then make the prediction then save the prediction.
def score_predict(algorithms, X_train, X_test, y_train, y_test, test_df=None, save=True): 

    data_dict = {}
    # loop throught the algorithms.
    for algorithm in tqdm(algorithms, desc="Starting"):
        # make the pipeline
        model = make_pipeline(
            OrdinalEncoder(),
            StandardScaler(),
            algorithm
        )
        # Fitting the model.
        alg_name = list(model.named_steps.keys())[-1]
        print(f"Fitting the {alg_name}")
        model.fit(X_train, y_train)
        # Get the score.
        print("Making Prediction")
        y_test_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_test_pred))
        data_dict[alg_name] = score

        if save == True:
            y_pred = model.predict(test_df)
            pd.DataFrame({"Price": y_pred}, index=test_df.index).to_csv(f"{alg_name}.csv")

    score_df = pd.DataFrame(data_dict, index=["Accuracy"]).T
    return score_df

In [13]:
test_df = wrangle(test_filepath)

In [14]:
xgb_params = {'max_depth': 10,
 'learning_rate': 0.009808276016092757,
 'min_child_weight': 0.1864061635011552,
 'reg_alpha': 0.6061118910567473,
 'reg_lambda': 1.5811779998996678,
 'colsample_bylevel': 0.6539553755789465,
 'colsample_bytree': 0.8407266876336359,
 'colsample_bynode': 0.6589587521539209}
algorithms = [LGBMRegressor(verbose=-1, random_state=42),
             XGBRegressor(**xgb_params, random_state=42),
             CatBoostRegressor(verbose=0, random_state=42)]
score = score_predict(algorithms, X_train, X_test, y_train, y_test, test_df)
score

Starting:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting the lgbmregressor
Making Prediction
Fitting the xgbregressor
Making Prediction
Fitting the catboostregressor
Making Prediction


Unnamed: 0,Accuracy
lgbmregressor,38.910749
xgbregressor,38.91139
catboostregressor,38.962801


In [15]:
lgb = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    LGBMRegressor(verbose=-1, random_state=42)
)
cat = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    CatBoostRegressor(verbose=0, random_state=42)
)
xgb = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    XGBRegressor(**xgb_params, random_state=42)
)

estimators = [("lgb", lgb), ("cat", cat), ("xgb", xgb)]
vote = VotingRegressor(estimators=estimators, weights=[3, 1, 2])
vote.fit(X_train, y_train)
y_pred = vote.predict(test_df)
pd.DataFrame({"Price": y_pred}, index=test_df.index).to_csv(f"submission.csv")

## Hyperparameter tuning.


In [16]:
def objective(trials):
    params = {
        "max_depth": trials.suggest_int("max_depth", 3, 20),
        "n_estimators": trials.suggest_int("n_estimators", 1000, 3000),
        "learning_rate": trials.suggest_float("learning_rate", 0.0001, 0.01),
        "min_child_weight": trials.suggest_float("min_child_weight", 0.0001, 1.0),  # This should be in float.
        "reg_alpha": trials.suggest_float("reg_alpha", 0.1, 1.0),
        "reg_lambda": trials.suggest_float("reg_lambda", 0.0, 20.0), 
        "colsample_bylevel": trials.suggest_float("colsample_bylevel", 0.1, 1.0),
        "colsample_bytree": trials.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bynode": trials.suggest_float("colsample_bynode", 0.1, 1.0),
    }

    model = make_pipeline(
            OrdinalEncoder(),
            StandardScaler(),
            XGBRegressor(**params, random_state=42)
        )
    # Fitting the model.
    model.fit(X_train, y_train)
    # Get the score.
    y_test_pred = model.predict(X_test)
    score = np.sqrt(mean_squared_error(y_test, y_test_pred))

    return score

In [17]:
study = optuna.create_study(study_name="XGB", direction="minimize")
study.optimize(objective, n_trials=300, show_progress_bar=True)

[I 2025-02-15 11:20:57,553] A new study created in memory with name: XGB


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-02-15 11:21:23,224] Trial 0 finished with value: 38.90222441636352 and parameters: {'max_depth': 16, 'n_estimators': 2622, 'learning_rate': 0.0027277335360516063, 'min_child_weight': 0.2968125048209854, 'reg_alpha': 0.46888951972456216, 'reg_lambda': 8.555366237260507, 'colsample_bylevel': 0.548179851475245, 'colsample_bytree': 0.39946371421554383, 'colsample_bynode': 0.33739109508802323}. Best is trial 0 with value: 38.90222441636352.
[I 2025-02-15 11:21:42,556] Trial 1 finished with value: 38.90131347285729 and parameters: {'max_depth': 10, 'n_estimators': 2069, 'learning_rate': 0.0037190333263578995, 'min_child_weight': 0.4053690752968444, 'reg_alpha': 0.31959136586792003, 'reg_lambda': 1.49958398719783, 'colsample_bylevel': 0.3362671703937707, 'colsample_bytree': 0.39033659402077825, 'colsample_bynode': 0.7804189842202105}. Best is trial 1 with value: 38.90131347285729.
[I 2025-02-15 11:21:54,598] Trial 2 finished with value: 38.902908025725665 and parameters: {'max_depth':

In [18]:
study.best_params

{'max_depth': 4,
 'n_estimators': 2878,
 'learning_rate': 0.009134728536503977,
 'min_child_weight': 0.0008871149116998148,
 'reg_alpha': 0.8340976021595664,
 'reg_lambda': 0.975791576152824,
 'colsample_bylevel': 0.948606321143124,
 'colsample_bytree': 0.2265378445858436,
 'colsample_bynode': 0.4382277169361072}

In [19]:
optuna.visualization.plot_param_importances(study)

In [20]:
def objective(trials):
    params = {
        "max_depth": trials.suggest_int("max_depth", 3, 20),
        "n_estimators": trials.suggest_int("n_estimators", 1000, 3000),
        "learning_rate": trials.suggest_float("learning_rate", 0.0001, 0.01),
        "min_child_weight": trials.suggest_float("min_child_weight", 0.0001, 1.0),  # This should be in float.
        "reg_alpha": trials.suggest_float("reg_alpha", 0.1, 1.0),
        "reg_lambda": trials.suggest_float("reg_lambda", 0.0, 20.0), 
        "colsample_bylevel": trials.suggest_float("colsample_bylevel", 0.1, 1.0),
        "colsample_bytree": trials.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bynode": trials.suggest_float("colsample_bynode", 0.1, 1.0),
    }

    model = make_pipeline(
            OrdinalEncoder(),
            StandardScaler(),
            LGBMRegressor(**params, random_state=42, verbose=-1)
        )
    # Fitting the model.
    model.fit(X_train, y_train)
    # Get the score.
    y_test_pred = model.predict(X_test)
    score = np.sqrt(mean_squared_error(y_test, y_test_pred))

    return score

In [21]:
study = optuna.create_study(study_name="LGB", direction="minimize")
study.optimize(objective, n_trials=300, show_progress_bar=True)

[I 2025-02-15 12:52:53,378] A new study created in memory with name: LGB


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-02-15 12:53:12,864] Trial 0 finished with value: 38.89628489174221 and parameters: {'max_depth': 3, 'n_estimators': 2592, 'learning_rate': 0.007313030395065389, 'min_child_weight': 0.4384799442562647, 'reg_alpha': 0.5221088501341029, 'reg_lambda': 6.60086787448866, 'colsample_bylevel': 0.5534948903188859, 'colsample_bytree': 0.6462944210851804, 'colsample_bynode': 0.484461101213035}. Best is trial 0 with value: 38.89628489174221.
[I 2025-02-15 12:53:40,725] Trial 1 finished with value: 38.89869640136999 and parameters: {'max_depth': 4, 'n_estimators': 2731, 'learning_rate': 0.0036573249004804055, 'min_child_weight': 0.6418186359253978, 'reg_alpha': 0.6978475793049648, 'reg_lambda': 8.887316002997995, 'colsample_bylevel': 0.27779126903062856, 'colsample_bytree': 0.8760557191088142, 'colsample_bynode': 0.23938324238600722}. Best is trial 0 with value: 38.89628489174221.
[I 2025-02-15 12:53:52,152] Trial 2 finished with value: 38.91616062375394 and parameters: {'max_depth': 4, 'n_

In [22]:
study.best_params

{'max_depth': 19,
 'n_estimators': 1768,
 'learning_rate': 0.0071929991812130015,
 'min_child_weight': 0.44804845768071466,
 'reg_alpha': 0.33891999864510775,
 'reg_lambda': 0.3162170955979936,
 'colsample_bylevel': 0.4418001058699319,
 'colsample_bytree': 0.39367050503493833,
 'colsample_bynode': 0.7156736897319109}

In [23]:
optuna.visualization.plot_param_importances(study)