# 1. Importing libraries

In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn

In [2]:
df = pd.read_csv("data/cleaned_train_dataset.csv")
df.drop("Unnamed: 0",axis=1,inplace=True)
df.head(10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
6,8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
7,9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741
8,12,Adidas,Nylon,Medium,5.0,Yes,No,Backpack,Blue,15.87205,111.80791
9,13,Under Armour,Leather,Medium,8.0,No,No,Tote,Red,26.079409,26.37209


In [3]:
df.shape

(246686, 11)

# 2. Feature engineering

### 2.1 converting categorical features into numerical using get_dummies method(bascially one-hot encoding)

In [4]:
categorical_cols = df.select_dtypes(include=['object','category']).columns.to_list()

In [5]:
categorical_cols

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [6]:
df_encoded = pd.get_dummies(df,columns=categorical_cols,drop_first=False)
df_encoded.shape

(246686, 29)

In [7]:
df_encoded.head(10)

Unnamed: 0,id,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Material_Canvas,...,Waterproof_Yes,Style_Backpack,Style_Messenger,Style_Tote,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red
0,0,7.0,11.611723,112.15875,False,True,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
1,1,10.0,27.078537,68.88056,False,True,False,False,False,True,...,True,False,True,False,False,False,False,True,False,False
2,2,2.0,16.64376,39.1732,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,True
3,3,8.0,12.93722,80.60793,False,False,True,False,False,False,...,False,False,True,False,False,False,False,True,False,False
4,4,1.0,17.749338,86.02312,True,False,False,False,False,True,...,True,False,True,False,False,False,False,True,False,False
5,7,1.0,21.488864,27.15815,False,False,False,True,False,True,...,True,True,False,False,False,True,False,False,False,False
6,8,8.0,10.20778,25.98652,False,False,False,False,True,False,...,False,False,False,True,False,False,True,False,False,False
7,9,2.0,15.8951,38.48741,False,False,False,False,True,False,...,True,False,True,False,False,False,False,False,True,False
8,12,5.0,15.87205,111.80791,True,False,False,False,False,False,...,False,True,False,False,False,True,False,False,False,False
9,13,8.0,26.079409,26.37209,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,True


In [8]:
df_encoded = df_encoded.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)
df_encoded.head()

Unnamed: 0,id,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Material_Canvas,...,Waterproof_Yes,Style_Backpack,Style_Messenger,Style_Tote,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red
0,0,7.0,11.611723,112.15875,0,1,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,1,10.0,27.078537,68.88056,0,1,0,0,0,1,...,1,0,1,0,0,0,0,1,0,0
2,2,2.0,16.64376,39.1732,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
3,3,8.0,12.93722,80.60793,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,4,1.0,17.749338,86.02312,1,0,0,0,0,1,...,1,0,1,0,0,0,0,1,0,0


In [9]:
df_encoded.shape

(246686, 29)

# 3. Model Training

### 3.1 Defining independent variables & Target variable

In [14]:
X = df_encoded.drop('Price',axis=1)

In [15]:
X.head()

Unnamed: 0,id,Compartments,Weight Capacity (kg),Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Material_Canvas,Material_Leather,...,Waterproof_Yes,Style_Backpack,Style_Messenger,Style_Tote,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red
0,0,7.0,11.611723,0,1,0,0,0,0,1,...,0,0,0,1,1,0,0,0,0,0
1,1,10.0,27.078537,0,1,0,0,0,1,0,...,1,0,1,0,0,0,0,1,0,0
2,2,2.0,16.64376,0,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,1
3,3,8.0,12.93722,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,4,1.0,17.749338,1,0,0,0,0,1,0,...,1,0,1,0,0,0,0,1,0,0


In [16]:
Y = df_encoded['Price']
Y.head()

0    112.15875
1     68.88056
2     39.17320
3     80.60793
4     86.02312
Name: Price, dtype: float64

### 3.2 Splitting the dataset

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=23)

### 3.3 Model selection

#### let's start with basic regression models

In [22]:
from sklearn.linear_model import LinearRegression

lr= LinearRegression()

lr.fit(X_train,y_train)

In [27]:
prediction_test_data = lr.predict(X_test)
prediction_test_data

array([79.98239686, 84.1690089 , 81.74161136, ..., 81.4025285 ,
       81.90633362, 81.00562951])

In [29]:
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, prediction_test_data)
mse = mean_squared_error(y_test, prediction_test_data)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, prediction_test_data)

print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

MAE: 33.6805
MSE: 1515.4210
RMSE: 38.9284
R² Score: 0.0006
