# MODEL TRAINING AND TESTING 

In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import  KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor

In [25]:
df = pd.read_csv('Supermart_cleaned_dataset.csv')
df.head()

Unnamed: 0,Category,Sub Category,City,Region,Sales,Discount,Profit,Order_Day,Order_Month,Order_Year,Order_Weekday,Order_Quarter,High Profit,Revenue_per_Discount,Category_Sales_Rank,Discount_level,Profit_margin_%,Profit_zscore,Profit_Outlier
0,5,14,21,2,1254,0.12,401.28,8.0,11.0,2017.0,6,4.0,1,10449.999913,7.0,medium,32.0,0.109798,0
1,1,13,8,3,749,0.18,149.8,8.0,11.0,2017.0,6,4.0,0,4161.111088,6.0,medium,20.0,-0.938381,0
2,3,0,13,4,2360,0.21,165.2,12.0,6.0,2017.0,1,2.0,0,11238.095185,3.0,high,7.0,-0.874193,0
3,4,12,4,3,896,0.25,89.6,11.0,10.0,2016.0,5,4.0,0,3583.999986,5.0,high,10.0,-1.189297,0
4,3,18,12,3,2355,0.26,918.45,11.0,10.0,2016.0,5,4.0,1,9057.692273,3.0,high,39.0,2.265384,0


In [26]:
# defining input and output features (X) and (y) target
X = df.drop(columns=['Sales', 'Profit'])
y = df[['Sales','Profit']]

In [27]:
X.head(3)

Unnamed: 0,Category,Sub Category,City,Region,Discount,Order_Day,Order_Month,Order_Year,Order_Weekday,Order_Quarter,High Profit,Revenue_per_Discount,Category_Sales_Rank,Discount_level,Profit_margin_%,Profit_zscore,Profit_Outlier
0,5,14,21,2,0.12,8.0,11.0,2017.0,6,4.0,1,10449.999913,7.0,medium,32.0,0.109798,0
1,1,13,8,3,0.18,8.0,11.0,2017.0,6,4.0,0,4161.111088,6.0,medium,20.0,-0.938381,0
2,3,0,13,4,0.21,12.0,6.0,2017.0,1,2.0,0,11238.095185,3.0,high,7.0,-0.874193,0


In [28]:
y.head(3)

Unnamed: 0,Sales,Profit
0,1254,401.28
1,749,149.8
2,2360,165.2


In [29]:
if 'Discount_level' in X.columns:
    X['Discount_level'] = X['Discount_level'].astype('category').cat.codes

In [30]:
X.head(4)

Unnamed: 0,Category,Sub Category,City,Region,Discount,Order_Day,Order_Month,Order_Year,Order_Weekday,Order_Quarter,High Profit,Revenue_per_Discount,Category_Sales_Rank,Discount_level,Profit_margin_%,Profit_zscore,Profit_Outlier
0,5,14,21,2,0.12,8.0,11.0,2017.0,6,4.0,1,10449.999913,7.0,2,32.0,0.109798,0
1,1,13,8,3,0.18,8.0,11.0,2017.0,6,4.0,0,4161.111088,6.0,2,20.0,-0.938381,0
2,3,0,13,4,0.21,12.0,6.0,2017.0,1,2.0,0,11238.095185,3.0,0,7.0,-0.874193,0
3,4,12,4,3,0.25,11.0,10.0,2016.0,5,4.0,0,3583.999986,5.0,0,10.0,-1.189297,0


In [31]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [32]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [33]:
# Define all models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor()
}

In [34]:
results = {
    "Model": [],
    "Target": [],
    "R2_Score": [],
    "MSE": [],
    "MAE": []
}

for name, base_model in models.items():
    print(f"🔍 Training {name}...")
    model = MultiOutputRegressor(base_model)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ✅ This MUST be inside the model loop
    for i, col in enumerate(y.columns):  # y.columns = ['Sales', 'Profit']
        r2 = r2_score(y_test[col], y_pred[:, i])
        mse = mean_squared_error(y_test[col], y_pred[:, i])
        mae = mean_absolute_error(y_test[col], y_pred[:, i])

        results["Model"].append(name)
        results["Target"].append(col)
        results["R2_Score"].append(r2)
        results["MSE"].append(mse)
        results["MAE"].append(mae)

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Display full result
pd.set_option('display.max_rows', None)
print(results_df)


🔍 Training Linear Regression...
🔍 Training Ridge...
🔍 Training Lasso...
🔍 Training ElasticNet...
🔍 Training Decision Tree...
🔍 Training Random Forest...
🔍 Training Gradient Boosting...
🔍 Training AdaBoost...
🔍 Training XGBoost...
🔍 Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 729
[LightGBM] [Info] Number of data points in the train set: 7995, number of used features: 16
[LightGBM] [Info] Start training from score 1496.281551
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 729
[LightGBM] [Info] Number of data points in the train set: 7995, number of used features: 16
[LightGBM] [Info] Start training from score 375.088433




🔍 Training CatBoost...
🔍 Training SVR...
🔍 Training KNN...
                Model  Target  R2_Score           MSE           MAE
0   Linear Regression   Sales  0.898198  3.357602e+04  1.340760e+02
1   Linear Regression  Profit  1.000000  1.450657e-25  3.056844e-13
2               Ridge   Sales  0.898198  3.357589e+04  1.340742e+02
3               Ridge  Profit  1.000000  7.741248e-03  6.522501e-02
4               Lasso   Sales  0.898254  3.355753e+04  1.340946e+02
5               Lasso  Profit  0.999982  1.047533e+00  8.416846e-01
6          ElasticNet   Sales  0.731003  8.871961e+04  2.487727e+02
7          ElasticNet  Profit  0.924424  4.514122e+03  4.927332e+01
8       Decision Tree   Sales  0.999439  1.851651e+02  9.415208e+00
9       Decision Tree  Profit  0.999998  1.311103e-01  1.765333e-01
10      Random Forest   Sales  0.999710  9.569909e+01  6.852136e+00
11      Random Forest  Profit  0.999999  3.887456e-02  9.842471e-02
12  Gradient Boosting   Sales  0.993309  2.206869e+03  3.

In [35]:
results_df

Unnamed: 0,Model,Target,R2_Score,MSE,MAE
0,Linear Regression,Sales,0.898198,33576.02,134.076
1,Linear Regression,Profit,1.0,1.450657e-25,3.056844e-13
2,Ridge,Sales,0.898198,33575.89,134.0742
3,Ridge,Profit,1.0,0.007741248,0.06522501
4,Lasso,Sales,0.898254,33557.53,134.0946
5,Lasso,Profit,0.999982,1.047533,0.8416846
6,ElasticNet,Sales,0.731003,88719.61,248.7727
7,ElasticNet,Profit,0.924424,4514.122,49.27332
8,Decision Tree,Sales,0.999439,185.1651,9.415208
9,Decision Tree,Profit,0.999998,0.1311103,0.1765333


# conclusion:-
since random forest and XGBoost gives the best performance selecting both the models