In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression,Ridge,Lasso

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

from sklearn.metrics import r2_score, mean_absolute_error

In [3]:
df= pd.read_csv('./cleaned_data.csv')

In [4]:
df.shape

(8519, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8519 entries, 0 to 8518
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 8519 non-null   int64  
 1   Item_Identifier            8519 non-null   object 
 2   Item_Weight                8519 non-null   float64
 3   Item_Fat_Content           8519 non-null   object 
 4   Item_Visibility            8519 non-null   float64
 5   Item_Type                  8519 non-null   object 
 6   Item_MRP                   8519 non-null   float64
 7   Outlet_Identifier          8519 non-null   object 
 8   Outlet_Establishment_Year  8519 non-null   int64  
 9   Outlet_Size                8519 non-null   object 
 10  Outlet_Location_Type       8519 non-null   object 
 11  Outlet_Type                8519 non-null   object 
 12  Item_Outlet_Sales          8519 non-null   float64
dtypes: float64(4), int64(2), object(7)
memory usage:

In [6]:
df.columns

Index(['Unnamed: 0', 'Item_Identifier', 'Item_Weight', 'Item_Fat_Content',
       'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [7]:
df = df.drop(columns=['Unnamed: 0', 'Item_Identifier','Outlet_Identifier'])

In [8]:
cat_cols = [col for col in df.select_dtypes('object')]

for col in cat_cols:
    print(f'{col}:{df[col].nunique()}')

Item_Fat_Content:2
Item_Type:16
Outlet_Size:3
Outlet_Location_Type:3
Outlet_Type:4


In [11]:
df_X = df.drop(columns=['Item_Outlet_Sales'])
df_y = df['Item_Outlet_Sales'].ravel()

In [12]:
df_Fat_Content = pd.get_dummies(df_X['Item_Fat_Content'],sparse =False,drop_first=True)
df_Outlet_Size = pd.get_dummies(df_X['Outlet_Size'],sparse =False,drop_first=True)
df_Outlet_Location_Type = pd.get_dummies(df_X['Outlet_Location_Type'],sparse =False,drop_first=True)
df_Outlet_Type = pd.get_dummies(df_X['Outlet_Type'],sparse =False,drop_first=True)

In [13]:
df_X = pd.concat([df_X,df_Fat_Content,df_Outlet_Size,df_Outlet_Location_Type,df_Outlet_Type],axis=1)
df_X.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Regular,Medium,Small,Tier 2,Tier 3,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,9.3,Low Fat,0.01592,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,0,1,0,0,0,1,0,0
1,5.92,Regular,0.019095,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,1,1,0,0,1,0,1,0
2,17.5,Low Fat,0.016621,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,0,1,0,0,0,1,0,0
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,Medium,Tier 3,Grocery Store,1,1,0,0,1,0,0,0
4,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,0,0,0,0,1,1,0,0


In [14]:
df_X = df_X.drop(columns=['Item_Fat_Content','Outlet_Size','Outlet_Location_Type','Outlet_Type'])

In [15]:
le = LabelEncoder()
df_X['Item_Type'] = le.fit_transform(df_X['Item_Type'])

In [16]:
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8519 entries, 0 to 8518
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                8519 non-null   float64
 1   Item_Visibility            8519 non-null   float64
 2   Item_Type                  8519 non-null   int32  
 3   Item_MRP                   8519 non-null   float64
 4   Outlet_Establishment_Year  8519 non-null   int64  
 5   Regular                    8519 non-null   uint8  
 6   Medium                     8519 non-null   uint8  
 7   Small                      8519 non-null   uint8  
 8   Tier 2                     8519 non-null   uint8  
 9   Tier 3                     8519 non-null   uint8  
 10  Supermarket Type1          8519 non-null   uint8  
 11  Supermarket Type2          8519 non-null   uint8  
 12  Supermarket Type3          8519 non-null   uint8  
dtypes: float64(3), int32(1), int64(1), uint8(8)
memo

In [17]:
sc = StandardScaler()

df_X = sc.fit_transform(df_X)

In [19]:
X_train,X_test,y_train,y_test = train_test_split(df_X,df_y,test_size=0.3,random_state=32)

In [20]:
lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()

In [21]:
lr_cv = cross_val_score(estimator=lr,
                        X = X_train,
                        y = y_train,
                        cv = 10,
                        scoring='r2')

In [23]:
ridge_cv = cross_val_score(estimator =ridge,
                           X = X_train,
                           y = y_train,
                           cv = 10,
                           scoring='r2')

In [24]:
lasso_cv = cross_val_score(estimator = lasso,
                           X=X_train,
                           y = y_train,
                           cv=10,
                           scoring='r2')

In [25]:
print(f'cross val avg. r2 score for Linear Regression:{lr_cv.mean()}')
print(f'cross val avg. r2 score for Ridge Regression:{ridge_cv.mean()}')
print(f'cross val avg. r2 score for Lasso Regression:{lasso_cv.mean()}')

cross val avg. r2 score for Linear Regression:0.6855669020857615
cross val avg. r2 score for Ridge Regression:0.6855726045802115
cross val avg. r2 score for Lasso Regression:0.22917873533788527


In [27]:
model = ridge.fit(X_train,y_train)
y_pred = model.predict(X_test)

print(f'r2_score:{r2_score(y_test,y_pred)}')
print(f'mse:{np.sqrt(mean_absolute_error(y_test,y_pred))}')

r2_score:0.6745744189734083
mse:1.2656741493290102


In [28]:
param_grid = {'alpha':[0.001,0.01,0.1,1]}

grid_cv= GridSearchCV(estimator= ridge,
                      param_grid = param_grid,
                      cv = 10,
                      scoring='r2')

grid_cv.fit(X_train,y_train)

print(f'best_score:{grid_cv.best_score_}')
print(f'best_params:{grid_cv.best_params_}')

best_score:0.6855726045802115
best_params:{'alpha': 1}
