In [None]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Suppress warnings for cleaner output.
# NOTE(review): this filter hides every warning category, including
# deprecation notices from pandas / scikit-learn; consider narrowing it.
warnings.filterwarnings("ignore")

# --- 1. Load Data ---
# Location of the Excel workbook that holds the raw dataset.
file_path = "blinkit_grocery_data_orignal.xlsx"  # <--- Make sure this path is correct
try:
    # Only the read itself is guarded; everything after it needs `df` to exist.
    df = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
    # Re-raise rather than call exit(): inside a notebook, exit() takes down
    # the kernel (losing all state) instead of simply halting this cell with
    # a readable traceback.
    raise

print("Dataset loaded successfully.")
# Strip any leading/trailing whitespace from column names so that later
# look-ups by exact column name are not defeated by stray spaces.
df.columns = df.columns.str.strip()
print("Original column names:", df.columns.tolist())

# --- 2. Basic Cleaning & Feature Engineering ---
print("\n--- Cleaning Data ---")

# Collapse the alternative spellings of the fat-content labels onto the two
# canonical values, reporting the distinct values before and after.
fat_content_aliases = {"LF": "Low Fat", "low fat": "Low Fat", "reg": "Regular"}
print("Original 'Item Fat Content' values:", df["Item Fat Content"].unique())
df["Item Fat Content"] = df["Item Fat Content"].replace(fat_content_aliases)
print("Cleaned 'Item Fat Content' values:", df["Item Fat Content"].unique())

# Derive the outlet's age (in years) from its establishment year.
# NOTE(review): the age is measured against the year in which the notebook is
# executed, so the feature values shift when it is re-run in a later year --
# confirm whether a fixed reference year is wanted.
current_year = pd.Timestamp.now().year
if "Outlet Establishment Year" not in df.columns:
    print("Warning: 'Outlet Establishment Year' not found, cannot create 'Outlet Age'.")
else:
    df["Outlet Age"] = current_year - df["Outlet Establishment Year"]
    print("'Outlet Age' feature created.")


# --- 3. Feature Selection ---
# The two identifier columns are removed up front: they are high-cardinality
# labels rather than predictive signals. The raw establishment year is removed
# as well, but only when 'Outlet Age' has been derived from it.
features_to_drop = ["Item Identifier", "Outlet Identifier"] + (
    ["Outlet Establishment Year"] if "Outlet Age" in df.columns else []
)

# Restrict the drop list to columns that are really present so the drop
# cannot fail on a missing label.
present_columns = set(df.columns)
existing_cols_to_drop = [name for name in features_to_drop if name in present_columns]
df = df.drop(columns=existing_cols_to_drop)
print(f"Dropped columns: {existing_cols_to_drop}")

# Everything except the target column is used as a model input.
TARGET = "Sales"
FEATURES = [name for name in df.columns if name != TARGET]

X, y = df[FEATURES], df[TARGET]

print(f"\nPredicting '{TARGET}' using features: {FEATURES}")

# --- 4. Preprocessing ---
print("\n--- Setting up Preprocessing ---")

# Partition the feature columns by dtype: object-typed columns are handled as
# categorical, NumPy numeric dtypes as numerical.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include="object").columns)

if "Outlet Size" in categorical_features:
    # 'Outlet Size' could alternatively be treated as ordinal (OrdinalEncoder);
    # here it is one-hot encoded together with the other categoricals.
    print("'Outlet Size' identified as categorical.")

print(f"Categorical Features: {categorical_features}")
print(f"Numerical Features: {numerical_features}")

# Numerical branch: fill gaps with the column median, then standardise to
# zero mean / unit variance.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value (a constant such
# as 'Missing' would be the alternative), then one-hot encode. Categories that
# were not seen during fitting are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its branch. Columns belonging to neither
# group are passed through untouched.
column_routes = [
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="passthrough")

# --- 5. Train-Test Split ---
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print(f"\nData split into Train ({n_train_rows} samples) and Test ({n_test_rows} samples)")

# --- 6. Model Training ---
# Two candidate regressors are fitted on the same training split, each wrapped
# in a pipeline so that preprocessing and model travel together.

# --- Model 1: Linear Regression ---
print("\n--- Training Linear Regression ---")
# This pipeline receives its own copy of the preprocessor. A Pipeline fits the
# step objects it is given in place, so handing the very same ColumnTransformer
# instance to both pipelines would make them share one fitted transformer, and
# refitting either pipeline later would silently change the other.
lr_pipeline = Pipeline(
    steps=[("preprocessor", clone(preprocessor)), ("regressor", LinearRegression())]
)

lr_pipeline.fit(X_train, y_train)
print("Linear Regression training complete.")

# --- Model 2: Random Forest Regressor ---
print("\n--- Training Random Forest Regressor ---")
# Scaling matters less for a random forest, but the pipeline structure is kept
# for convenience. The module-level `preprocessor` is used directly here, so it
# is still fitted on X_train once this section has run.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # n_estimators=100 is a common default, n_jobs=-1 uses all available CPU cores;
        # depth and minimum split/leaf sizes are capped to limit tree growth.
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=100,
                random_state=42,
                n_jobs=-1,
                max_depth=15,
                min_samples_split=10,
                min_samples_leaf=5,
            ),
        ),
    ]
)

rf_pipeline.fit(X_train, y_train)
print("Random Forest training complete.")


# --- 7. Evaluation ---
print("\n--- Model Evaluation ---")

# Fitted pipelines keyed by the label used in the printed report.
models = {"Linear Regression": lr_pipeline, "Random Forest": rf_pipeline}
results = {}

# Score every model on the held-out split and print a short per-model report.
for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = dict(MAE=mae, MSE=mse, RMSE=rmse, R2=r2)
    report_lines = [
        f"\n{name}:",
        f"  R-squared (R2): {r2:.4f}",
        f"  Mean Absolute Error (MAE): {mae:.2f}",
        f"  Root Mean Squared Error (RMSE): {rmse:.2f}",
    ]
    print("\n".join(report_lines))

# --- Summary ---
print("\n--- Evaluation Summary ---")
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df.round(4))

# The model whose R2 on the test split is highest.
best_r2_model = results_df["R2"].idxmax()
print(f"\nModel with the best R-squared score on the test set: {best_r2_model}")

lr_model_filename = "linear_regression_sales_model.joblib"
rf_model_filename = "random_forest_sales_model.joblib"

# Persist each fitted pipeline (preprocessor + model) to its own file.
# A failure while saving one pipeline is reported and does not prevent the
# attempt to save the other.
for label, fitted_pipeline, target_path in (
    ("Linear Regression", lr_pipeline, lr_model_filename),
    ("Random Forest", rf_pipeline, rf_model_filename),
):
    try:
        joblib.dump(fitted_pipeline, target_path)
        print(f"{label} model pipeline saved successfully to '{target_path}'")
    except Exception as e:
        print(f"Error saving {label} model: {e}")


Dataset loaded successfully.
Original column names: ['Item Fat Content', 'Item Identifier', 'Item Type', 'Outlet Establishment Year', 'Outlet Identifier', 'Outlet Location Type', 'Outlet Size', 'Outlet Type', 'Item Visibility', 'Item Weight', 'Sales', 'Rating']

--- Cleaning Data ---
Original 'Item Fat Content' values: ['Regular' 'Low Fat' 'low fat' 'LF' 'reg']
Cleaned 'Item Fat Content' values: ['Regular' 'Low Fat']
'Outlet Age' feature created.
Dropped columns: ['Item Identifier', 'Outlet Identifier', 'Outlet Establishment Year']

Predicting 'Sales' using features: ['Item Fat Content', 'Item Type', 'Outlet Location Type', 'Outlet Size', 'Outlet Type', 'Item Visibility', 'Item Weight', 'Rating', 'Outlet Age']

--- Setting up Preprocessing ---
'Outlet Size' identified as categorical.
Categorical Features: ['Item Fat Content', 'Item Type', 'Outlet Location Type', 'Outlet Size', 'Outlet Type']
Numerical Features: ['Item Visibility', 'Item Weight', 'Rating', 'Outlet Age']

Data split into