<a href="https://www.kaggle.com/code/hopesb/backpack-baseline-predicition?scriptVersionId=223967930" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Backpack Predition

In [1]:
# import all necessary libraries.
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px

import sklearn
from sklearn.metrics import mean_squared_error
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from cuml.preprocessing import TargetEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import VotingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, plot_importance
from tqdm.notebook import tqdm
import optuna
import warnings
warnings.simplefilter("ignore")

In [2]:
# Reading the files
test_filepath = "/kaggle/input/playground-series-s5e2/test.csv"
train_filepath = "/kaggle/input/playground-series-s5e2/train.csv"
df = pd.read_csv(train_filepath, index_col="id")
df_test = pd.read_csv(test_filepath, index_col="id")

In [3]:
extra_filepath = "/kaggle/input/playground-series-s5e2/training_extra.csv"
df_extra = pd.read_csv(extra_filepath, index_col="id")
df_extra.shape

(3694318, 10)

## Explore

In [4]:
print(df.shape)
df.head()

(300000, 10)


Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 290295 non-null  object 
 1   Material              291653 non-null  object 
 2   Size                  293405 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    292556 non-null  object 
 5   Waterproof            292950 non-null  object 
 6   Style                 292030 non-null  object 
 7   Color                 290050 non-null  object 
 8   Weight Capacity (kg)  299862 non-null  float64
 9   Price                 300000 non-null  float64
dtypes: float64(3), object(7)
memory usage: 25.2+ MB


In [6]:
df.isnull().sum().sort_values()

Compartments               0
Price                      0
Weight Capacity (kg)     138
Size                    6595
Waterproof              7050
Laptop Compartment      7444
Style                   7970
Material                8347
Brand                   9705
Color                   9950
dtype: int64

In [7]:
df.nunique().sort_values()

Waterproof                   2
Laptop Compartment           2
Size                         3
Style                        3
Material                     4
Brand                        5
Color                        6
Compartments                10
Price                    48212
Weight Capacity (kg)    181596
dtype: int64

In [8]:
df.describe()

Unnamed: 0,Compartments,Weight Capacity (kg),Price
count,300000.0,299862.0,300000.0
mean,5.44359,18.029994,81.411107
std,2.890766,6.966914,39.03934
min,1.0,5.0,15.0
25%,3.0,12.097867,47.38462
50%,5.0,18.068614,80.95612
75%,8.0,24.002375,115.01816
max,10.0,30.0,150.0


In [9]:
def wrangle(filepaths, train=True):
    if train: 
        df_1 = pd.read_csv(filepaths[0], index_col= "id")
        df_2 = pd.read_csv(filepaths[-1], index_col="id")
        # Merge the dataset.
        df = pd.concat([df_1, df_2])
    else:
        df = pd.read_csv(filepaths, index_col="id")

    
    # Fill the weight with mean.
    df["Weight Capacity (kg)"] = df["Weight Capacity (kg)"].fillna(df["Weight Capacity (kg)"].mean())
    # fill the color.
    df["Color"] = df["Color"].fillna("Multi-color")
    # fill the Brand with unknown brand.
    df["Brand"] = df["Brand"].fillna("Unknown")
    # fill the size.
    df["Size"] = df["Size"].fillna("Others")
    # fill the waterproof.
    df["Waterproof"] = df["Waterproof"].fillna("Undetermined")
    # fill the Laptop compartment.
    df["Laptop Compartment"] = df["Laptop Compartment"].fillna("Unknown")
    # Fill the material ans style.
    df["Material"] = df["Material"].fillna("Unknown")
    df["Style"] = df["Style"].fillna("Unknown")

    df["Compartments"] = df["Compartments"].astype(int)
    df["Weight Capacity (kg)"] = round(df["Weight Capacity (kg)"], 2)
    df["Weight Class"] = df["Weight Capacity (kg)"].apply(lambda x: "Minimum" if x < 10 else(
        "Above_Minimum" if x < 15 else(
            "Medium" if x < 25 else "High"
        )
    ))
    df["Mat_style"] = df[["Material", "Style"]].sum(axis=1)
    df["Size_Weight"] = df[["Size", "Weight Class"]].sum(axis=1)
    df["Brand_size"] = df[["Brand", "Size"]].sum(axis=1)
    df["Brand_mat"] = df.groupby(["Brand", "Material"])["Weight Capacity (kg)"].transform("mean")

    return df

In [10]:
filepaths = [train_filepath, extra_filepath]
df = wrangle(filepaths)
df.isnull().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
Price                   0
Weight Class            0
Mat_style               0
Size_Weight             0
Brand_size              0
Brand_mat               0
dtype: int64

In [11]:
df.head()

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Weight Class,Mat_style,Size_Weight,Brand_size,Brand_mat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,Jansport,Leather,Medium,7,Yes,No,Tote,Black,11.61,112.15875,Above_Minimum,LeatherTote,MediumAbove_Minimum,JansportMedium,18.057898
1,Jansport,Canvas,Small,10,Yes,Yes,Messenger,Green,27.08,68.88056,High,CanvasMessenger,SmallHigh,JansportSmall,17.896261
2,Under Armour,Leather,Small,2,Yes,No,Messenger,Red,16.64,39.1732,Medium,LeatherMessenger,SmallMedium,Under ArmourSmall,18.087056
3,Nike,Nylon,Small,8,Yes,No,Messenger,Green,12.94,80.60793,Above_Minimum,NylonMessenger,SmallAbove_Minimum,NikeSmall,17.997413
4,Adidas,Canvas,Medium,1,Yes,Yes,Messenger,Green,17.75,86.02312,Medium,CanvasMessenger,MediumMedium,AdidasMedium,17.945289


## Split Dataset.

In [12]:
target = "Price"
X = df.drop(columns= target)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
test_df = wrangle(test_filepath, train=False)

## Model.

In [14]:
# Create a function that takes in a list of algortithm and then make the prediction then save the prediction.
def score_predict(algorithms, X_train, X_test, y_train, y_test, test_df=None, save=True): 

    data_dict = {}
    # loop throught the algorithms.
    for algorithm in tqdm(algorithms, desc="Starting"):
        # make the pipeline
        model = make_pipeline(
            OrdinalEncoder(),
            StandardScaler(),
            algorithm
        )
        # Fitting the model.
        alg_name = list(model.named_steps.keys())[-1]
        print(f"Fitting the {alg_name}")
        model.fit(X_train, y_train)
        # Get the score.
        print("Making Prediction")
        y_test_pred = model.predict(X_test)
        score = np.sqrt(mean_squared_error(y_test, y_test_pred))
        data_dict[alg_name] = score

        if save == True:
            y_pred = model.predict(test_df)
            pd.DataFrame({"Price": y_pred}, index=test_df.index).to_csv(f"{alg_name}.csv")

    score_df = pd.DataFrame(data_dict, index=["Accuracy"]).T
    return score_df

In [15]:
xgb_params = {'max_depth': 8, 
              'n_estimators': 1701, 
              'learning_rate': 0.006899755927601937, 
              'min_child_weight': 0.2751236938771351, 
              'reg_alpha': 0.527480714707444, 
              'reg_lambda': 13.198806851176458, 
              'colsample_bylevel': 0.9772481366672879, 
              'colsample_bytree': 0.7774789504388184, 
              'colsample_bynode': 0.8541180728630634}
lgb_params = {'max_depth': 17, 
              'n_estimators': 2995, 
              'learning_rate': 0.00943875220877273, 
              'min_child_weight': 0.9260630565539196, 
              'reg_alpha': 0.9200603427801367, 
              'reg_lambda': 1.4831257707229202, 
              'colsample_bylevel': 0.5223344478716733, 
              'colsample_bytree': 0.9937966786732069, 
              'colsample_bynode': 0.8479188035499157}
cat_params = {'max_depth': 8, 
              'n_estimators': 2830, 
              'learning_rate': 0.009960055354186655}
algorithms = [LGBMRegressor(**lgb_params, verbose=-1, random_state=42),
             XGBRegressor(**xgb_params, random_state=42),
             CatBoostRegressor(verbose=0, random_state=42)]
score = score_predict(algorithms, X_train, X_test, y_train, y_test, test_df)
score

Starting:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting the lgbmregressor
Making Prediction
Fitting the xgbregressor
Making Prediction
Fitting the catboostregressor
Making Prediction


Unnamed: 0,Accuracy
lgbmregressor,38.848805
xgbregressor,38.853854
catboostregressor,38.854321


In [16]:
lgb = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    LGBMRegressor(verbose=-1, random_state=42)
)
xgb = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    XGBRegressor(random_state=42)
)

estimators = [("lgb", lgb), ("xgb", xgb)]
vote = VotingRegressor(estimators=estimators, weights=[2, 1])
vote.fit(X_train, y_train)
y_pred_test = vote.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
y_pred = vote.predict(test_df)
pd.DataFrame({"Price": y_pred}, index=test_df.index).to_csv(f"submission.csv")

38.85063625977135


In [17]:
stack = StackingRegressor(estimators, Ridge(), cv=5)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))
y_pred = vote.predict(test_df)
pd.DataFrame({"Price": y_pred}, index=test_df.index).to_csv(f"submission_stack.csv")

38.849507067807245
