In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the competition files.
train_df = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv')
extra_train_df = pd.read_csv('/kaggle/input/playground-series-s5e2/training_extra.csv')

# The three frames receive identical preprocessing, so each step loops over this
# list instead of being written out three times.
all_frames = [train_df, test_df, extra_train_df]

# --- Missing values -------------------------------------------------------
# Fill values are always derived from train.csv (computed before train_df itself
# is filled) so that the same raw value is imputed identically in every frame
# and no statistic is taken from the test set.
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
for col in categorical_cols:
    mode_value = train_df[col].mode()[0]
    for df in all_frames:
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place call on a column selection is chained assignment, which
        # pandas warns about and which does not update `df` under Copy-on-Write.
        df[col] = df[col].fillna(mode_value)

numerical_cols = ['Weight Capacity (kg)', 'Compartments']
for col in numerical_cols:
    median_value = train_df[col].median()
    for df in all_frames:
        df[col] = df[col].fillna(median_value)

# --- Feature engineering --------------------------------------------------
for df in all_frames:
    # Interaction of material and size as a single categorical feature.
    df['Material_Size'] = df['Material'] + "_" + df['Size']
    # "Yes" -> 1, anything else -> 0.
    df['Laptop Compartment'] = (df['Laptop Compartment'] == "Yes").astype(int)
    df['Total_Compartments'] = df['Compartments'] + df['Laptop Compartment']

# --- Label encoding -------------------------------------------------------
categorical_features = ['Brand', 'Material_Size', 'Waterproof', 'Style', 'Color']
label_encoders = {}  # column name -> fitted encoder, kept so codes can be inverted later

for col in categorical_features:
    encoder = LabelEncoder()
    # Fit on the labels seen in *any* frame: an encoder fitted on train.csv alone
    # raises ValueError in transform() for a label that only occurs elsewhere.
    # LabelEncoder sorts its classes, so when no such label exists the codes are
    # the same as with a train-only fit.
    encoder.fit(pd.concat([pd.Series(df[col].unique()) for df in all_frames], ignore_index=True))
    for df in all_frames:
        df[col] = encoder.transform(df[col])
    label_encoders[col] = encoder

# The raw columns are now represented by the engineered features above.
dropped_cols = ['Material', 'Size', 'Compartments', 'Laptop Compartment']
train_df, test_df, extra_train_df = (df.drop(columns=dropped_cols) for df in all_frames)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# NOTE: this cell rebinds `train_df` and rescales `test_df` in place. Re-running it
# without first re-running the preprocessing cell samples from the already-combined
# frame and scales `test_df` a second time.

# Build the modelling frame from a sample of each training file. The requested size
# is capped at the frame's row count because DataFrame.sample raises ValueError when
# n exceeds the number of rows (sampling is without replacement).
train_sample = train_df.sample(n=min(1_000_000, len(train_df)), random_state=42)
extra_sample = extra_train_df.sample(n=min(400_000, len(extra_train_df)), random_state=42)
train_df = pd.concat([train_sample, extra_sample], ignore_index=True)

# Standardise the weight capacity; the scaler is fitted on training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
train_df['Weight Capacity (kg)'] = scaler.fit_transform(train_df[['Weight Capacity (kg)']])
test_df['Weight Capacity (kg)'] = scaler.transform(test_df[['Weight Capacity (kg)']])

# Every column except the target is used as a feature.
# NOTE(review): the submission cell reads test_df['id'], so an 'id' column is
# presumably part of X as well - confirm whether it should be a model input. It is
# kept here because the prediction cell passes test_df with the same columns.
X = train_df.drop(columns=['Price'])
y = train_df['Price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# `early_stopping_rounds` is an estimator parameter: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    n_jobs=-1,
    early_stopping_rounds=10,
)

# Training stops once the metric on (X_val, y_val) has not improved for 10 rounds.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

y_pred = xgb_model.predict(X_val)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Hold-out metrics for the first model (xgb_model was fitted on X_train only).
y_pred = xgb_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
# Take the root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, y_pred) ** 0.5

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

In [None]:
# Final model: refit on all of X / y with a fixed number of boosting rounds.
final_xgb_model = xgb.XGBRegressor(n_estimators=150, max_depth=10, learning_rate=0.05, n_jobs=-1)

# No early stopping here. X_val was split off from X, so its rows are part of the
# data this model is fitted on; stopping on it would monitor in-sample error rather
# than generalisation. (Passing `early_stopping_rounds` to fit() is also no longer
# accepted by XGBoost >= 2.0.) The eval_set is kept purely as a progress log, and
# the values it prints are optimistic for the same reason.
final_xgb_model.fit(X, y, eval_set=[(X_val, y_val)], verbose=True)

# Persist the fitted booster in XGBoost's JSON format.
final_xgb_model.save_model("final_xgb_model.json")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Score the final model on the validation rows.
# Caveat: final_xgb_model was fitted on X, and X_val is a subset of X, so these are
# in-sample figures and not an estimate of out-of-sample error.
y_val_pred = final_xgb_model.predict(X_val)

mse = mean_squared_error(y_val, y_val_pred)
mae = mean_absolute_error(y_val, y_val_pred)
rmse = mse ** 0.5  # root of the MSE, reported in the same units as Price

print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    sep="\n",
)

In [None]:
# Predict a price for every test row; test_df is passed with all of its columns,
# matching the columns the model was fitted on.
test_preds = final_xgb_model.predict(test_df)

# Two-column submission: the row id alongside its predicted Price.
submission = test_df[['id']].assign(Price=test_preds)
submission.to_csv("submission.csv", index=False)
print("Submission file saved!")