In [22]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load Data
train_df = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv')
extra_train_df = pd.read_csv('/kaggle/input/playground-series-s5e2/training_extra.csv')

# Fill categorical missing values with mode from train_df
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
for col in categorical_cols:
    mode_value = train_df[col].mode()[0]  
    train_df[col].fillna(mode_value, inplace=True)
    test_df[col].fillna(mode_value, inplace=True)
    extra_train_df[col].fillna(mode_value, inplace=True)

# Fill numerical missing values with median
numerical_cols = ['Weight Capacity (kg)', 'Compartments']
for col in numerical_cols:
    train_df[col].fillna(train_df[col].median(), inplace=True)
    test_df[col].fillna(test_df[col].median(), inplace=True)
    extra_train_df[col].fillna(extra_train_df[col].median(), inplace=True)

# Create a new feature combining 'Material' and 'Size'
train_df['Material_Size'] = train_df['Material'] + "_" + train_df['Size']
test_df['Material_Size'] = test_df['Material'] + "_" + test_df['Size']
extra_train_df['Material_Size'] = extra_train_df['Material'] + "_" + extra_train_df['Size']

# Convert 'Laptop Compartment' to numeric (1 if 'Yes', 0 otherwise)
train_df['Laptop Compartment'] = train_df['Laptop Compartment'].apply(lambda x: 1 if x == "Yes" else 0)
test_df['Laptop Compartment'] = test_df['Laptop Compartment'].apply(lambda x: 1 if x == "Yes" else 0)
extra_train_df['Laptop Compartment'] = extra_train_df['Laptop Compartment'].apply(lambda x: 1 if x == "Yes" else 0)

# Merge 'Compartments' and 'Laptop Compartment' into 'Total_Compartments'
train_df['Total_Compartments'] = train_df['Compartments'] + train_df['Laptop Compartment']
test_df['Total_Compartments'] = test_df['Compartments'] + test_df['Laptop Compartment']
extra_train_df['Total_Compartments'] = extra_train_df['Compartments'] + extra_train_df['Laptop Compartment']

# Encode categorical features using train_df's encoding
categorical_features = ['Brand', 'Material_Size', 'Waterproof', 'Style', 'Color']
encoder = LabelEncoder()

for col in categorical_features:
    train_df[col] = encoder.fit_transform(train_df[col])  
    test_df[col] = encoder.transform(test_df[col])        
    extra_train_df[col] = encoder.transform(extra_train_df[col])

# Drop merged columns
train_df.drop(columns=['Material', 'Size', 'Compartments', 'Laptop Compartment'], inplace=True)
test_df.drop(columns=['Material', 'Size', 'Compartments', 'Laptop Compartment'], inplace=True)
extra_train_df.drop(columns=['Material', 'Size', 'Compartments', 'Laptop Compartment'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [24]:
# Merge the extra training data with the main training data
train_df = pd.concat([train_df, extra_train_df], ignore_index=True)

In [25]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Apply scaling
train_df['Weight Capacity (kg)'] = scaler.fit_transform(train_df[['Weight Capacity (kg)']])
test_df['Weight Capacity (kg)'] = scaler.transform(test_df[['Weight Capacity (kg)']])


In [26]:
from sklearn.model_selection import train_test_split

# Define features and target
X = train_df.drop(columns=['Price'])  # Replace 'Target' with your actual target column name
y = train_df['Price']

# Split data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Best parameters (based on common optimizations)
rf_model = RandomForestRegressor(
    n_estimators=100,   # Balanced between speed & accuracy
    max_depth=20,       # Prevents overfitting
    min_samples_split=5, # Better generalization
    random_state=42,
    n_jobs=-1           # Use all CPU cores
)

# Train the model
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)


In [None]:
best_params = grid_search.best_params_

# Train the final model with best parameters
final_model = RandomForestRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Predict on validation set
y_val_pred = final_model.predict(X_val)

# Evaluate performance
mae = mean_absolute_error(y_val, y_val_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(f"Validation MAE: {mae}")
print(f"Validation RMSE: {rmse}")


In [None]:
# Predict on test data
test_predictions = final_model.predict(X_test)


In [None]:
# Create submission dataframe
submission = pd.DataFrame({
    'Id': test_df['Id'],  # Make sure 'Id' column exists in test data
    'Weight Capacity (kg)': test_predictions
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file saved as 'submission.csv'")