In [1]:
import pandas as pd
# reading the dataset
df = pd.read_csv('BigMartTrain.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [2]:
y_train = df['Item_Outlet_Sales']
x_train = df.drop(columns=['Item_Outlet_Sales'])

## Featrure Engineering

In [3]:
x_train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [4]:
x_train['Item_Weight'].fillna(x_train['Item_Weight'].mean(), inplace = True)

x_train['Outlet_Size'].fillna(x_train['Outlet_Size'].mode()[0], inplace = True)


In [5]:
x_train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [6]:
item_type = pd.get_dummies(x_train['Item_Type'], prefix='Item_Type')
outlet_id = pd.get_dummies(x_train['Outlet_Identifier'], prefix='Outlet_Identifier')
outlet_size = pd.get_dummies(x_train['Outlet_Size'], prefix='Outlet_Size')
outlet_location_type = pd.get_dummies(x_train['Outlet_Location_Type'], prefix='Outlet_Location_Type')
outlet_type = pd.get_dummies(x_train['Outlet_Type'], prefix='Outlet_Type')

x_train = x_train.drop(columns=['Item_Type', 'Outlet_Identifier',
                                'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

x_train = pd.concat([x_train, item_type, outlet_id,
                     outlet_size, outlet_location_type, outlet_type], axis=1)

In [7]:
from sklearn.preprocessing import LabelEncoder

item_id_le = LabelEncoder()
x_train['Item_Identifier'] = item_id_le.fit_transform(x_train['Item_Identifier'])

item_fat_le = LabelEncoder()
x_train['Item_Fat_Content'] = item_fat_le.fit_transform(x_train['Item_Fat_Content'])

In [8]:
x_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,156,9.3,1,0.016047,249.8092,1999,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,8,5.92,2,0.019278,48.2692,2009,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,662,17.5,1,0.01676,141.618,1999,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,1121,19.2,2,0.0,182.095,1998,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,1297,8.93,1,0.0,53.8614,1987,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


## model

In [9]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

In [10]:
import plotly.express as px 

bar_chart = pd.DataFrame({'Feature': x_train.columns, 'Coefficient': lr.coef_})

fig = px.bar(bar_chart, x='Coefficient', y='Feature')

fig.update_layout(height=1200,
                  yaxis={'categoryorder':'total ascending'})
fig.show()

# Applying Regularizaion methods 

## 1. Ridge Regression

In [31]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

ridgeReg1 = Ridge(alpha=0.05)
ridgeReg2 = Ridge(alpha=0.5)
ridgeReg3 = Ridge(alpha=300)
ridgeReg4 = Ridge(alpha=1000)

ridgeReg1.fit(x_train_scaled,y_train)
ridgeReg2.fit(x_train_scaled,y_train)
ridgeReg3.fit(x_train_scaled,y_train)
ridgeReg4.fit(x_train_scaled,y_train)

In [32]:
bar_chart = pd.DataFrame({'Feature': x_train.columns,
                          '0.05': ridgeReg1.coef_,
                          '0.5': ridgeReg2.coef_,
                          '300': ridgeReg3.coef_,
                          '1000': ridgeReg4.coef_})

fig = px.bar(bar_chart, x=['0.05', '0.5', '300', '1000'], y='Feature')

fig.update_layout(legend=dict(title="Alpha"),
                  barmode='group',
                  height=1200,
                  yaxis={'categoryorder':'total ascending'})
fig.show()

## Lasso Regression

In [33]:
from sklearn.linear_model import Lasso

lassoReg1 = Lasso(alpha=0.05)
lassoReg2 = Lasso(alpha=0.5)

lassoReg1.fit(x_train_scaled, y_train)
lassoReg2.fit(x_train_scaled, y_train)


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.546e+08, tolerance: 2.482e+06


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.078e+07, tolerance: 2.482e+06



In [34]:
bar_chart = pd.DataFrame({'Feature': x_train.columns,
                          '0.05': lassoReg1.coef_,
                          '0.5': lassoReg2.coef_})

fig = px.bar(bar_chart, x=['0.05', '0.5'], y='Feature')

fig.update_layout(legend=dict(title="Alpha"),
                  barmode='group',
                  height=1200,
                  yaxis={'categoryorder':'total ascending'})
fig.show()

## 3. Elastic net Regression 

In [35]:
from sklearn.linear_model import ElasticNet

ENReg1 = ElasticNet(alpha=1, l1_ratio=0.5)
ENReg2 = ElasticNet(alpha=1, l1_ratio=1)

ENReg1.fit(x_train, y_train)
ENReg2.fit(x_train, y_train)

In [36]:
bar_chart = pd.DataFrame({'Feature': x_train.columns,
                          '0.05': ENReg1.coef_,
                          '0.5': ENReg2.coef_})

fig = px.bar(bar_chart, x=['0.05', '0.5'], y='Feature')

fig.update_layout(legend=dict(title="l1_ratio"),
                  barmode='group',
                  height=1200,
                  yaxis={'categoryorder':'total ascending'})
fig.show()