Task1:- Optimizing Regression Model Performance

Apply lasso in dataset(https://www.kaggle.com/datasets/nishathakkar/100-sales) and calculate MSE,MAE,RMSE and it value should be minimum value come in range(0.111 or 1-12)

In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')

In [39]:
data = pd.read_csv("100_Sales.csv").drop(columns=['Unnamed: 9','Unnamed: 10'])
data

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Ship_Date,Unit_Cost,Total_Revenue,Total_Profit
0,Australia and Oceania,Tuvalu,Baby Food,Offline,H,27/06/2010,159.42,2533654.00,951410.50
1,Central America and the Caribbean,Grenada,Cereal,Online,C,15/09/2012,117.11,576782.80,248406.36
2,Europe,Russia,Office Supplies,Offline,L,05/08/2014,524.96,1158502.59,224598.75
3,Sub_Saharan Africa,Sao Tome and Principe,Fruits,Online,C,07/05/2014,6.92,75591.66,19525.82
4,Sub_Saharan Africa,Rwanda,Office Supplies,Offline,L,02/06/2013,524.96,3296425.02,639077.50
...,...,...,...,...,...,...,...,...,...
95,Sub_Saharan Africa,Mali,Clothes,Online,M,09/03/2011,35.84,97040.64,65214.72
96,Asia,Malaysia,Fruits,Offline,L,28/12/2011,6.92,58471.11,15103.47
97,Sub_Saharan Africa,Sierra Leone,Vegetables,Offline,C,29/06/2016,90.93,228779.10,93748.05
98,North America,Mexico,Personal Care,Offline,M,08/08/2015,56.67,471336.91,144521.02


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Region          100 non-null    object 
 1   Country         100 non-null    object 
 2   Item_Type       100 non-null    object 
 3   Sales_Channel   100 non-null    object 
 4   Order_Priority  100 non-null    object 
 5   Ship_Date       100 non-null    object 
 6   Unit_Cost       100 non-null    float64
 7   Total_Revenue   100 non-null    float64
 8   Total_Profit    100 non-null    float64
dtypes: float64(3), object(6)
memory usage: 7.2+ KB


In [41]:
data.isnull().sum()

Region            0
Country           0
Item_Type         0
Sales_Channel     0
Order_Priority    0
Ship_Date         0
Unit_Cost         0
Total_Revenue     0
Total_Profit      0
dtype: int64

In [42]:
data.head()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Ship_Date,Unit_Cost,Total_Revenue,Total_Profit
0,Australia and Oceania,Tuvalu,Baby Food,Offline,H,27/06/2010,159.42,2533654.0,951410.5
1,Central America and the Caribbean,Grenada,Cereal,Online,C,15/09/2012,117.11,576782.8,248406.36
2,Europe,Russia,Office Supplies,Offline,L,05/08/2014,524.96,1158502.59,224598.75
3,Sub_Saharan Africa,Sao Tome and Principe,Fruits,Online,C,07/05/2014,6.92,75591.66,19525.82
4,Sub_Saharan Africa,Rwanda,Office Supplies,Offline,L,02/06/2013,524.96,3296425.02,639077.5


In [43]:
data.tail()

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Ship_Date,Unit_Cost,Total_Revenue,Total_Profit
95,Sub_Saharan Africa,Mali,Clothes,Online,M,09/03/2011,35.84,97040.64,65214.72
96,Asia,Malaysia,Fruits,Offline,L,28/12/2011,6.92,58471.11,15103.47
97,Sub_Saharan Africa,Sierra Leone,Vegetables,Offline,C,29/06/2016,90.93,228779.1,93748.05
98,North America,Mexico,Personal Care,Offline,M,08/08/2015,56.67,471336.91,144521.02
99,Sub_Saharan Africa,Mozambique,Household,Offline,L,15/02/2012,502.54,3586605.09,889472.91


In [44]:
# Feature engineering
data['Profit_Margin'] = data['Total_Profit'] / data['Total_Revenue']
data['Revenue_per_Unit'] = data['Total_Revenue'] / data['Unit_Cost']

# Prepare the features (X) and the target (y)
X = data[['Unit_Cost', 'Total_Revenue', 'Profit_Margin', 'Revenue_per_Unit']]
y = data['Total_Profit']

# Log transform the target variable to handle skewed distributions
y_log = np.log1p(y)  # log1p applies log(1 + y) to avoid issues with 0 profits

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [45]:
# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    # Fit the model on the training data
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return mse, rmse, mae

In [46]:
# Polynomial features with regularized models
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y_log, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

In [48]:
# Lasso with polynomial features
lasso_poly = make_pipeline(StandardScaler(), Lasso(random_state=42))
lasso_params = {'lasso__alpha': np.logspace(-6, 6, 13)}
lasso_grid = GridSearchCV(lasso_poly, lasso_params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_grid.fit(X_train_poly, y_train)
best_lasso = lasso_grid.best_estimator_
lasso_mse, lasso_rmse, lasso_mae = evaluate_model(best_lasso, X_train_poly, X_test_poly, y_train, y_test)

In [50]:
# Print results
print("Lasso Regression (Poly) - MSE: {:.6f}, RMSE: {:.6f}, MAE: {:.6f}".format(lasso_mse, lasso_rmse, lasso_mae))

Lasso Regression (Poly) - MSE: 0.189604, RMSE: 0.435435, MAE: 0.266815


In [51]:
# Print best hyperparameters
print("Best Lasso alpha:", best_lasso.named_steps['lasso'].alpha)

Best Lasso alpha: 0.001


In [52]:
# Convert log-scale RMSE back to original scale
def rmse_to_original_scale(rmse):
    return np.sqrt(np.expm1(rmse**2))

print("\nRMSE values in original scale:")
print("Lasso Regression (Poly):", rmse_to_original_scale(lasso_rmse))


RMSE values in original scale:
Lasso Regression (Poly): 0.4569142670560878
