In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv("dataset.csv")
train['Compartments'] = train['Compartments'].fillna(train['Compartments'].median())
features = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_category = encoder.fit_transform(train[features])
encoded_df = pd.DataFrame(encoded_category, columns=encoder.get_feature_names_out(features))

train = train.drop(columns=features).reset_index(drop=True)
train = pd.concat([train, encoded_df], axis=1)
train.head()


Unnamed: 0,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,Brand_nan,Material_Canvas,...,Style_Messenger,Style_Tote,Style_nan,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red,Color_nan
0,2.0,13.340058,143.445135,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,4.0,5.91803,72.086319,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,5.0,24.088386,29.699631,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,5.0,27.18199,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,8.0,11.258172,71.953236,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error


train = train.dropna(subset=['Price'])

X = train.drop(columns=['Price'])
y = train['Price']

X_encoded = pd.get_dummies(X, drop_first=True)

xtr, xte, ytr, yte = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(xtr, ytr)

y_pred = model.predict(xte)

mse = mean_squared_error(yte, y_pred)
mae = mean_absolute_error(yte, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")


Mean Squared Error: 1594.5033143876117
Mean Absolute Error: 34.25700570938816


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

train = pd.read_csv("dataset.csv")

train['Compartments'] = train['Compartments'].fillna(train['Compartments'].median())

train['Total_Compartments'] = train['Compartments'] + train['Laptop Compartment'].map({'Yes': 1, 'No': 0})

categorical_features = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_category = encoder.fit_transform(train[categorical_features])
encoded_df = pd.DataFrame(encoded_category, columns=encoder.get_feature_names_out(categorical_features))

train = train.drop(columns=categorical_features).reset_index(drop=True)
train = pd.concat([train, encoded_df], axis=1)

train = train.dropna(subset=['Price'])

X = train.drop(columns=['Price'])
y = train['Price']

xtr, xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)
model.fit(xtr, ytr)

y_pred = model.predict(xte)

mse = mean_squared_error(yte, y_pred)
mae = mean_absolute_error(yte, y_pred)

print(f"Prediction: {y_pred[:50]}")

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

# feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
# feature_importances.nlargest(10).plot(kind='barh')
# plt.title("Top 10 Feature Importances")
# plt.show()


Prediction: [73.96882  78.78145  83.007385 83.53759  88.74695  74.53537  87.04291
 88.052536 66.94466  78.603455 79.02738  85.2419   85.2352   78.70987
 81.89919  95.88264  79.350006 69.07209  82.74044  83.09205  80.608635
 84.01324  83.676735 79.836365 78.0928   76.281204 64.59686  80.55926
 83.546364 82.939415 80.40112  84.30337  95.338806 70.96474  81.09933
 80.19031  89.70524  79.36484  77.47422  83.16909  81.055115 75.914314
 80.65211  76.702835 83.10643  89.873604 77.19324  59.14771  77.48837
 85.7964  ]
Mean Squared Error: 1568.0025634438039
Mean Absolute Error: 34.18342997882912


In [2]:
import pandas as pd

train_df = pd.read_csv("train.csv")
extra_df = pd.read_csv("training_extra.csv")
test_df = pd.read_csv("test.csv")

# train_df.head()
# extra_df.head()
# test_df.head()

def missing_values(df):
    if df.isnull().sum().sum() > 0:
        print(df.isnull().sum())
    else:
        print("No missing values")

# missing_values(train_df)
# missing_values(extra_df)
# missing_values(test_df)


In [3]:
train = pd.concat([train_df, extra_df])

print(f"Obs: ", train.shape[0])
print(f"Features: " ,train.shape[1])

Obs:  3994318
Features:  11


In [4]:
train.drop('id', axis=1, inplace=True)
train.drop_duplicates(inplace=True)
missing_values(train)

Brand                   126758
Material                110962
Size                     87785
Compartments                 0
Laptop Compartment       98533
Waterproof               94324
Style                   104180
Color                   133617
Weight Capacity (kg)      1808
Price                        0
dtype: int64


In [5]:
train_df['Weight Capacity (kg)'].fillna(train_df['Weight Capacity (kg)'].median(), inplace=True)
test_df['Weight Capacity (kg)'].fillna(test_df['Weight Capacity (kg)'].median(), inplace=True)
# missing_values(train_df)
# missing_values(test_df)
train_df.fillna('Unknown', inplace=True)
test_df.fillna('Unknown', inplace=True)
# missing_values(train_df)
# missing_values(test_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Weight Capacity (kg)'].fillna(train_df['Weight Capacity (kg)'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Weight Capacity (kg)'].fillna(test_df['Weight Capacity (kg)'].median(), inplace=True)


In [6]:
print(f"Mean: ", train['Price'].mean())
print(f"Median: ", train['Price'].median())
print(f"Standard Deviation: ", train['Price'].std())
print(f"Minimum: ", train['Price'].min())
print(f"Maximum: ", train['Price'].max())
print(f"Change in Price: ", train['Price'].max() - train['Price'].min())

Mean:  81.36217459275402
Median:  80.98495
Standard Deviation:  38.93868410784115
Minimum:  15.0
Maximum:  150.0
Change in Price:  135.0


In [7]:
brand_price = train.groupby('Brand')['Price'].mean().sort_values(ascending=False).reset_index()
brand_count = train['Brand'].value_counts().reset_index()
brand_count.columns = ['Brand', 'Count']
brand_stats = pd.merge(brand_price, brand_count, on='Brand')
print(brand_stats)

          Brand      Price   Count
0  Under Armour  82.065208  801035
1      Jansport  81.777549  749340
2          Nike  81.284804  764407
3          Puma  81.225577  755778
4        Adidas  80.527683  797000


In [8]:
compartment_price = train.groupby('Compartments')['Price'].mean().sort_values(ascending=False).reset_index()
compartment_count = train['Compartments'].value_counts().reset_index()
compartment_count.columns = ['Compartments', 'Count']
compartment_stats = pd.merge(compartment_price, compartment_count, on='Compartments')
print(compartment_stats)

   Compartments      Price   Count
0           8.0  81.730636  383172
1           6.0  81.642001  360640
2           2.0  81.616355  408150
3           7.0  81.601284  400824
4           4.0  81.573869  417246
5          10.0  81.552311  396303
6           5.0  81.474432  399418
7           3.0  81.166371  406796
8           1.0  81.032016  423577
9           9.0  80.280526  398192


In [9]:
material_price = train.groupby('Material')['Price'].mean().sort_values(ascending=False).reset_index()
material_count = train['Material'].value_counts().reset_index()
material_count.columns = ['Material', 'Count']
material_stats = pd.merge(material_price, material_count, on='Material')
print(material_stats)

    Material      Price    Count
0  Polyester  82.029424  1060882
1     Canvas  81.831022   903632
2      Nylon  81.071794   942656
3    Leather  80.488749   976186


In [10]:
size_price = train.groupby('Size')['Price'].mean().sort_values(ascending=False).reset_index()
size_count = train['Size'].value_counts().reset_index()
size_count.columns = ['Size', 'Count']
size_stats = pd.merge(size_price, size_count, on='Size')
print(size_stats)

     Size      Price    Count
0   Large  81.611747  1312295
1   Small  81.467620  1239751
2  Medium  81.201377  1354487


In [11]:
laptop_compartment_price = train.groupby('Laptop Compartment')['Price'].mean().sort_values(ascending=False).reset_index()
laptop_compartment_count = train['Laptop Compartment'].value_counts().reset_index()
laptop_compartment_count.columns = ['Laptop Compartment', 'Count']
laptop_compartment_stats = pd.merge(laptop_compartment_price, laptop_compartment_count, on='Laptop Compartment')
print(laptop_compartment_stats)

  Laptop Compartment      Price    Count
0                Yes  81.420190  1972937
1                 No  81.350487  1922848


In [12]:
waterproof_price = train.groupby('Waterproof')['Price'].mean().sort_values(ascending=False).reset_index()
waterproof_count = train['Waterproof'].value_counts().reset_index()
waterproof_count.columns = ['Waterproof', 'Count']
waterproof_stats = pd.merge(waterproof_price, waterproof_count, on='Waterproof')
print(waterproof_stats)

  Waterproof      Price    Count
0         No  81.438525  1930789
1        Yes  81.411426  1969205


In [13]:
style_price = train.groupby('Style')['Price'].mean().sort_values(ascending=False).reset_index()
style_count = train['Style'].value_counts().reset_index()
style_count.columns = ['Style', 'Count']
style_stats = pd.merge(style_price, style_count, on='Style')
print(style_stats)

       Style      Price    Count
0       Tote  81.500007  1297942
1   Backpack  81.357717  1262519
2  Messenger  81.185531  1329677


In [14]:
color_price = train.groupby('Color')['Price'].mean().sort_values(ascending=False).reset_index()
color_count = train['Color'].value_counts().reset_index()
color_count.columns = ['Color', 'Count']
color_stats = pd.merge(color_price, color_count, on='Color')
print(color_stats)

   Color      Price   Count
0  Green  82.252359  617024
1   Blue  82.013390  638485
2   Pink  81.596170  688257
3    Red  81.010017  630215
4   Gray  80.917014  666110
5  Black  80.326088  620610


In [15]:
weight_capacity_price = train.groupby('Weight Capacity (kg)')['Price'].mean().sort_values(ascending=False).reset_index()
weight_capacity_count = train['Weight Capacity (kg)'].value_counts().reset_index()
weight_capacity_count.columns = ['Weight Capacity (kg)', 'Count']
weight_capacity_stats = pd.merge(weight_capacity_price, weight_capacity_count, on='Weight Capacity (kg)')
print(weight_capacity_stats)

         Weight Capacity (kg)  Price  Count
0                   27.254424  150.0      1
1                   27.254325  150.0      1
2                   27.254061  150.0      1
3                   27.253784  150.0      1
4                   27.255874  150.0      1
...                       ...    ...    ...
1920340             24.252448   15.0      1
1920341             22.572746   15.0      1
1920342             16.886079   15.0      1
1920343             27.572624   15.0      1
1920344             13.205776   15.0      1

[1920345 rows x 3 columns]


If you look into the data you will see that the specifics dont make alone too big of a cange, this means, that the specifications together make the price change

In [16]:
train_df = pd.get_dummies(train_df, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)
# train_df.head() 
# test_df.head()

In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X = train_df.drop('Price', axis=1)
y = train_df['Price']
X_train, X_val, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(X_train)
x_val_scaler = scaler.transform(X_val)



In [36]:
import numpy as np
from sklearn.metrics import r2_score
def model_evaluation(model):
    model.fit(x_train_scaler, y_train)
    y_pred = model.predict(x_train_scaler)
    y_value_pred = model.predict(x_val_scaler)
    mse = mean_squared_error(y_train, y_pred)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    mae = mean_absolute_error(y_train, y_pred)
    r2 = r2_score(y_train, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R2 Score: {r2}")


In [37]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error


decision_tree = DecisionTreeRegressor()
model_evaluation(decision_tree)


Mean Squared Error: 0.0
Root Mean Squared Error: 0.0
Mean Absolute Error: 0.0
R2 Score: 1.0


In [38]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Separate input (X) and target variable (y)
X = train_df.drop('Price', axis=1)
y = train_df['Price']

# Ensure a proper train-test split (before scaling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features properly
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)  # Fit on training data
X_test_s = scaler.transform(X_test)  # Transform test data only

# Model evaluation function
def model_evaluation(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_test)  # Predictions
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mse, rmse, mae, r2

# Train Decision Tree with constraints to prevent overfitting
decision_tree = DecisionTreeRegressor(max_depth=5, min_samples_split=10, random_state=42)
dt_mse, dt_rmse, dt_mae, dt_r2 = model_evaluation(decision_tree, X_train_s, y_train, X_test_s, y_test)

# Print results with more decimal places
print(f"Decision Tree Mean Squared Error: {dt_mse:.7f}")
print(f"Decision Tree Root Mean Squared Error: {dt_rmse:.7f}")
print(f"Decision Tree Mean Absolute Error: {dt_mae:.7f}")
print(f"Decision Tree R² Score: {dt_r2:.7f}")


Decision Tree Mean Squared Error: 1515.1187617
Decision Tree Root Mean Squared Error: 38.9245265
Decision Tree Mean Absolute Error: 33.6478037
Decision Tree R² Score: 0.0009872
