In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Loading data...


In [3]:
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTrain columns: {list(train.columns)}")

Train shape: (4990, 13)
Test shape: (3532, 12)

Train columns: ['Item_ID', 'Store_ID', 'Item_Store_ID', 'Item_Weight', 'Item_Sugar_Content', 'Item_Visibility', 'Item_Type', 'Item_Price', 'Store_Start_Year', 'Store_Size', 'Store_Location_Type', 'Store_Type', 'Item_Store_Returns']


In [4]:
# Display basic info
print("=== TRAIN DATA INFO ===")
print(train.info())
print("\n=== BASIC STATISTICS ===")
print(train.describe())

=== TRAIN DATA INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4990 entries, 0 to 4989
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Item_ID              4990 non-null   object 
 1   Store_ID             4990 non-null   object 
 2   Item_Store_ID        4990 non-null   object 
 3   Item_Weight          4188 non-null   float64
 4   Item_Sugar_Content   4990 non-null   object 
 5   Item_Visibility      4990 non-null   float64
 6   Item_Type            4990 non-null   object 
 7   Item_Price           4990 non-null   float64
 8   Store_Start_Year     4990 non-null   int64  
 9   Store_Size           3540 non-null   object 
 10  Store_Location_Type  4990 non-null   object 
 11  Store_Type           4990 non-null   object 
 12  Item_Store_Returns   4990 non-null   float64
dtypes: float64(4), int64(1), object(8)
memory usage: 506.9+ KB
None

=== BASIC STATISTICS ===
       Item_Weight  Item

In [5]:
# Check missing values
print("=== MISSING VALUES ===")
missing_train = train.isnull().sum()
missing_test = test.isnull().sum()

missing_df = pd.DataFrame({
    'Train_Missing': missing_train,
    'Test_Missing': missing_test,
    'Train_Percent': (missing_train / len(train)) * 100,
    'Test_Percent': (missing_test / len(test)) * 100
})
print(missing_df[missing_df['Train_Missing'] > 0])

=== MISSING VALUES ===
             Train_Missing  Test_Missing  Train_Percent  Test_Percent
Item_Weight            802         661.0      16.072144     18.714609
Store_Size            1450         959.0      29.058116     27.151755


In [6]:
#4. Data Preprocessing and Feature Engineering
# Combine train and test for consistent preprocessing
train['source'] = 'train'
test['source'] = 'test'
test['Item_Store_Returns'] = np.nan


In [7]:
# Combine datasets
data = pd.concat([train, test], ignore_index=True)
print(f"Combined data shape: {data.shape}")

Combined data shape: (8522, 14)


In [8]:
# Feature Engineering
print("Engineering features...")

# Temporal features
data['Store_Age'] = 2025 - data['Store_Start_Year']

# Price-visibility interactions
data['Visibility_Adjusted_Price'] = data['Item_Price'] / (data['Item_Visibility'] + 1e-5)

# Store exposure metric
data['Store_Exposure'] = data['Item_Visibility'] * data['Store_Age']

# Price per weight ratio
data['Item_Price_per_Weight'] = data['Item_Price'] / data['Item_Weight']

# Weight categories
data['Item_Weight_Category'] = pd.cut(
    data['Item_Weight'], 
    bins=[0, 5, 10, 20, 100],
    labels=['Very_Light', 'Light', 'Medium', 'Heavy']
)

# Price categories based on quantiles
price_quantiles = data['Item_Price'].quantile([0.25, 0.5, 0.75]).values
data['Item_Price_Category'] = pd.cut(
    data['Item_Price'],
    bins=[0] + list(price_quantiles) + [float('inf')],
    labels=['Low', 'Medium', 'High', 'Premium']
)

# Visibility categories
data['Item_Visibility_Category'] = pd.cut(
    data['Item_Visibility'],
    bins=5,
    labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High']
)

print("Feature engineering completed!")
print(f"New features created: {data.shape[1] - len(train.columns)}")

Engineering features...
Feature engineering completed!
New features created: 7


In [9]:
# Handle missing values
print("Handling missing values...")

# Numeric columns
numeric_cols = ['Item_Weight', 'Item_Visibility', 'Item_Price']
for col in numeric_cols:
    if col in data.columns:
        data[col].fillna(data[col].mean(), inplace=True)

# Categorical columns
if 'Store_Size' in data.columns:
    data['Store_Size'].fillna(data['Store_Size'].mode()[0], inplace=True)

print("Missing values handled!")

Handling missing values...
Missing values handled!


In [10]:
# Encode categorical variables
print("Encoding categorical variables...")

# Define categorical columns
cat_cols = [
    'Item_ID', 'Store_ID', 'Item_Store_ID', 'Item_Sugar_Content',
    'Item_Type', 'Store_Size', 'Store_Location_Type', 'Store_Type',
    'Item_Weight_Category', 'Item_Price_Category', 'Item_Visibility_Category'
]

Encoding categorical variables...


In [11]:
# Label encode high-cardinality IDs
label_encoders = {}
label_cols = ['Item_ID', 'Store_ID', 'Item_Store_ID']

for col in label_cols:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
        label_encoders[col] = le

In [13]:
# One-hot encode other categoricals
other_cats = [col for col in cat_cols if col not in label_cols and col in data.columns]
if other_cats:
    data = pd.get_dummies(data, columns=other_cats, drop_first=True)

print(f"Encoding completed! Final shape: {data.shape}")

Encoding completed! Final shape: (8522, 47)


In [14]:
#5. Dataset Preparation
# Separate train/test
train_final = data[data['source'] == 'train'].drop(['source'], axis=1)
test_final = data[data['source'] == 'test'].drop(['source', 'Item_Store_Returns'], axis=1)


In [15]:
# Features and target
X = train_final.drop('Item_Store_Returns', axis=1)
y = np.log1p(train_final['Item_Store_Returns'].astype(float))  # Log transform
X_test = test_final

In [16]:
print(f"Training features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Test features shape: {X_test.shape}")

Training features shape: (4990, 45)
Target shape: (4990,)
Test features shape: (3532, 45)


In [17]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=None
)

In [18]:
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")


Train set: 3992 samples
Validation set: 998 samples


In [19]:
#6. Model Training - XGBoost
# XGBoost with hyperparameter tuning
print("Training XGBoost with hyperparameter tuning...")

# Parameter distribution for random search
param_dist = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3, 1]
}

# Random search
xgb_model = xgb.XGBRegressor(random_state=42)
search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)
best_xgb_params = search.best_params_
print(f"Best XGBoost parameters: {best_xgb_params}")


Training XGBoost with hyperparameter tuning...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best XGBoost parameters: {'subsample': 1.0, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 1, 'colsample_bytree': 0.6}


In [20]:
# Train final XGBoost model with early stopping
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': best_xgb_params.get('max_depth', 6),
    'eta': best_xgb_params.get('learning_rate', 0.05),
    'subsample': best_xgb_params.get('subsample', 0.8),
    'colsample_bytree': best_xgb_params.get('colsample_bytree', 0.8),
    'seed': 42
}

xgb_final = xgb.train(
    xgb_params, dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# XGBoost predictions and evaluation
xgb_preds = xgb_final.predict(dval)
xgb_rmse = mean_squared_error(np.expm1(y_val), np.expm1(xgb_preds), squared=False)
print(f"\nXGBoost Validation RMSE: {xgb_rmse:.2f}")


[0]	train-rmse:0.99870	eval-rmse:0.95971
[50]	train-rmse:0.44740	eval-rmse:0.54452
[100]	train-rmse:0.39301	eval-rmse:0.54492
[111]	train-rmse:0.38500	eval-rmse:0.54604

XGBoost Validation RMSE: 3087.71


In [21]:

# ## 7. Model Training - CatBoost

# %%
# Train CatBoost model
print("Training CatBoost model...")

cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=100
)

cat_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    verbose=100
)

Training CatBoost model...
0:	learn: 1.0016579	test: 0.9610068	best: 0.9610068 (0)	total: 133ms	remaining: 2m 12s
100:	learn: 0.5063687	test: 0.5332939	best: 0.5332939 (100)	total: 952ms	remaining: 8.48s
200:	learn: 0.4856904	test: 0.5335299	best: 0.5321506 (150)	total: 1.76s	remaining: 7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5321506196
bestIteration = 150

Shrink model to first 151 iterations.


<catboost.core.CatBoostRegressor at 0x19373facd30>

In [22]:
# CatBoost predictions and evaluation
cat_preds = cat_model.predict(X_val)
cat_rmse = mean_squared_error(np.expm1(y_val), np.expm1(cat_preds), squared=False)
print(f"\nCatBoost Validation RMSE: {cat_rmse:.2f}")


CatBoost Validation RMSE: 3040.58


In [23]:
#8. Model Comparison and Selection

# Compare models
models_performance = {
    'XGBoost': xgb_rmse,
    'CatBoost': cat_rmse
}

print("=== MODEL COMPARISON ===")
for model_name, rmse in models_performance.items():
    print(f"{model_name}: {rmse:.2f}")


=== MODEL COMPARISON ===
XGBoost: 3087.71
CatBoost: 3040.58


In [24]:
# Select best model
best_model_name = min(models_performance.keys(), key=lambda k: models_performance[k])
best_rmse = models_performance[best_model_name]

print(f"\nBest Model: {best_model_name} (RMSE: {best_rmse:.2f})")



Best Model: CatBoost (RMSE: 3040.58)


In [25]:
#10. Final Predictions and Submission
# Generate test predictions using the best model
print("Generating final predictions...")

if best_model_name == 'XGBoost':
    # Retrain XGBoost on full training data
    dfull = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(X_test)
    
    full_xgb_model = xgb.train(
        xgb_params, dfull,
        num_boost_round=xgb_final.best_iteration
    )
    test_preds_log = full_xgb_model.predict(dtest)
    
else:  # CatBoost
    # Retrain CatBoost on full training data
    full_cat_model = CatBoostRegressor(
        iterations=cat_model.get_param('iterations'),
        learning_rate=cat_model.get_param('learning_rate'),
        depth=cat_model.get_param('depth'),
        random_seed=42,
        verbose=False
    )
    full_cat_model.fit(X, y)
    test_preds_log = full_cat_model.predict(X_test)


Generating final predictions...


In [32]:
# Create and save submission file
submission = pd.DataFrame({
    'Item_Store_ID': test['Item_Store_ID'],
    'Predicted_Returns': test_predictions
})

submission_filename = f"submission_{best_model_name.lower()}_optimized.csv"
submission.to_csv(submission_filename, index=False)

print(f"Submission saved to: {submission_filename}")
print(f"Submission shape: {submission.shape}")
print("\nFirst few predictions:")



Submission saved to: submission_catboost_optimized.csv
Submission shape: (3532, 2)

First few predictions:


In [33]:
submission.head(10)

Unnamed: 0,Item_Store_ID,Predicted_Returns
0,DRA59_BABATUNJI010,1448.060402
1,DRA59_BABATUNJI013,10100.657712
2,DRB01_BABATUNJI013,5259.024335
3,DRB13_BABATUNJI010,1154.333716
4,DRB13_BABATUNJI013,4969.162411
5,DRB25_BABATUNJI017,4494.5467
6,DRB25_BABATUNJI027,6218.621464
7,DRB25_BABATUNJI035,4898.263514
8,DRB48_BABATUNJI017,2252.006141
9,DRB48_BABATUNJI027,5273.486595
