In [None]:
!pip install lightgbm
!pip install xgboost


In [14]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor

import xgboost as xgb
import lightgbm as lgb

# Pin the global random number generators of the standard library and NumPy
# to the same fixed seed so that repeated runs draw identical random numbers.
random.seed(42)
np.random.seed(42)


In [15]:

# ---------------------
# 1. Load the datasets
# ---------------------
# The directory holding the two CSV files can be overridden through the
# BIGMART_DATA_DIR environment variable, so the notebook is not tied to one
# user's machine. When the variable is unset, the original location is used.
data_dir = os.environ.get('BIGMART_DATA_DIR', '/Users/manas/Desktop')
train = pd.read_csv(os.path.join(data_dir, 'train_v9rqX0R.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test_AbJTz2l.csv'))

# Preserve original identifiers for submission (later cells drop these
# columns from the modeling frames).
test_ids = test[['Item_Identifier', 'Outlet_Identifier']].copy()



In [16]:

# ----------------------------
# 2. Data Cleaning & Missing Values
# ----------------------------
# Every fill below assigns the result back to the column. The previous form,
# `df[col].fillna(..., inplace=True)` / `df[col].replace(..., inplace=True)`,
# is a chained in-place update: pandas 2.x warns about it and, with
# Copy-on-Write enabled, it no longer modifies the DataFrame at all.

# Impute missing weights with the TRAIN median in both sets, so the test set
# is preprocessed only with statistics learned from the training data (the
# same convention the visibility medians and the scaler follow).
item_weight_median = train['Item_Weight'].median()
train['Item_Weight'] = train['Item_Weight'].fillna(item_weight_median)
test['Item_Weight'] = test['Item_Weight'].fillna(item_weight_median)

# Missing outlet sizes become their own explicit category.
train['Outlet_Size'] = train['Outlet_Size'].fillna('Unknown')
test['Outlet_Size'] = test['Outlet_Size'].fillna('Unknown')

# Standardize Item_Fat_Content labels
fat_content_labels = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(fat_content_labels)
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace(fat_content_labels)

# Correct zero Item_Visibility values using median per Item_Identifier.
# The medians are computed from the non-zero training values only; otherwise
# the zeros being repaired would pull the median down (possibly to 0 again).
nonzero_visibility = train.loc[train['Item_Visibility'] != 0,
                               ['Item_Identifier', 'Item_Visibility']]
visibility_median = nonzero_visibility.groupby('Item_Identifier')['Item_Visibility'].median()
# Fallback for items with no non-zero visibility in train (or absent from
# train altogether): without it, `.map` would write NaN into the column.
visibility_fallback = nonzero_visibility['Item_Visibility'].median()

for frame in (train, test):
    zero_mask = frame['Item_Visibility'] == 0
    replacement = (
        frame.loc[zero_mask, 'Item_Identifier']
        .map(visibility_median)
        .fillna(visibility_fallback)
    )
    frame.loc[zero_mask, 'Item_Visibility'] = replacement


In [17]:

# ----------------------------
# 3. Feature Engineering
# ----------------------------
# The same three derived columns are built on both frames, in place.
for frame in (train, test):
    # Outlet_Age takes the place of Outlet_Establishment_Year: `pop` removes
    # the year column from the frame and hands back its values.
    frame['Outlet_Age'] = 2023 - frame.pop('Outlet_Establishment_Year')

    # Interaction feature: product of Item_Visibility and Item_MRP.
    frame['Visibility_MRP_Interaction'] = frame['Item_Visibility'] * frame['Item_MRP']

    # log(1 + x) of Item_Visibility to reduce skewness.
    frame['Item_Visibility_Log'] = np.log1p(frame['Item_Visibility'])

# The target only exists in the training frame; it is modeled on the
# log(1 + x) scale for stability.
train['Log_Sales'] = np.log1p(train['Item_Outlet_Sales'])

In [18]:

# ----------------------------
# 4. Prepare Modeling Data
# ----------------------------
# Drop identifier columns and also drop the target column ("Item_Outlet_Sales" and "Log_Sales")
cols_to_drop = ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales', 'Log_Sales']
train_model = train.drop(columns=cols_to_drop)
test_model = test.drop(columns=['Item_Identifier', 'Outlet_Identifier'])

# Identify categorical columns for one-hot encoding.
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# Pin every categorical column to the (sorted) levels observed in TRAIN before
# encoding. With drop_first=True, encoding the two frames independently drops
# each frame's own first level; if the test set happened to lack train's first
# level, a different column would be dropped there and the later zero-fill in
# `align` would silently encode those rows as the baseline. Sharing the
# category list makes both frames drop the same level and emit the same
# columns. Test values never seen in train become NaN and therefore get
# all-zero dummies.
for col in categorical_cols:
    train_levels = sorted(train_model[col].dropna().unique())
    train_model[col] = pd.Categorical(train_model[col], categories=train_levels)
    test_model[col] = pd.Categorical(test_model[col], categories=train_levels)

# One-hot encode categorical features for both training and test sets.
train_model = pd.get_dummies(train_model, columns=categorical_cols, drop_first=True)
test_model = pd.get_dummies(test_model, columns=categorical_cols, drop_first=True)

# Align the train and test sets to ensure they have the same features in the
# same order (kept as a safeguard; train's columns are the reference).
# (Since we dropped "Log_Sales" from train_model, it won't be added to test_model.)
train_model, test_model = train_model.align(test_model, join='left', axis=1, fill_value=0)

# Define features and target.
X = train_model.copy()  # All features after encoding and alignment.
y = train['Log_Sales']   # Use the log-transformed target from the original train DataFrame


In [19]:

# ----------------------------
# 5. Scale Numerical Features
# ----------------------------
# Continuous columns to standardize: the raw numeric inputs plus the
# engineered ones. The one-hot columns are left as they are.
num_cols = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Age',
    'Visibility_MRP_Interaction',
    'Item_Visibility_Log',
]

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])
test_model[num_cols] = scaler.transform(test_model[num_cols])


In [None]:

# ----------------------------
# 6. Advanced Modeling: Stacking Ensemble
# ----------------------------
# Three base regressors, each built with fixed hyperparameters and
# random_state=42.
rf_base = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=42)
xgb_base = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
    objective='reg:squarederror',
)
lgb_base = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=12,
    learning_rate=0.05,
    random_state=42,
)

# (name, estimator) pairs in the form StackingRegressor expects.
estimators = [('rf', rf_base), ('xgb', xgb_base), ('lgb', lgb_base)]

# Ridge is the final estimator of the stack; cv=5 and n_jobs=-1 are passed
# straight through to StackingRegressor.
stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1,
)

# Fit on the full training data; y is the log-transformed sales target.
stack_model.fit(X, y)


In [None]:

# ----------------------------
# 7. Test Predictions & Submission
# ----------------------------
# The model was fit on log1p(sales), so predictions come out on that scale;
# expm1 maps them back to the original sales scale.
test_preds_log = stack_model.predict(test_model)
test_preds = np.expm1(test_preds_log)

# Attach the predictions to a fresh copy of the saved identifier columns.
submission = test_ids.assign(Item_Outlet_Sales=test_preds)

# Write the CSV into the current working directory and report its location.
submission_file_path = os.path.join(os.getcwd(), "Advanced_BigMart_Sales_Predictions.csv")
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved at: {submission_file_path}")