In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Load the raw BigMart training and test splits from the working directory.
TRAIN_CSV = 'bigmart_train.csv'
TEST_CSV = 'bigmart_test.csv'
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Report the training frame's dimensions, then its column labels.
print(train.shape, train.columns, sep="\n")

(8523, 12)
Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


In [4]:
# Report the test frame's dimensions, then its column labels.
print(test.shape, test.columns, sep="\n")

(5681, 11)
Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type'],
      dtype='object')


In [5]:
# Tag each split with its origin so the rows can be told apart after the two
# frames are stacked into one for joint preprocessing.
for frame, origin in ((train, 'bigmart_train'), (test, 'bigmart_test')):
    frame['source'] = origin
data = pd.concat([train, test], ignore_index=True)
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,bigmart_train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,bigmart_train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,bigmart_train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,bigmart_train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,bigmart_train


In [10]:
# Count the missing entries in every column of the combined frame.
data.isna().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [12]:
# Summary statistics of the combined frame. Only a cell's final expression is
# rendered, so the summary is printed explicitly; as a bare expression ahead
# of `data.shape` it was computed and then silently discarded.
print(data.describe())
data.shape

(14204, 13)

In [67]:
# List the distinct values found in three of the columns.
for column in ("Item_Fat_Content", "Outlet_Establishment_Year", "Outlet_Size"):
    print(data[column].unique())

['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
[1999 2009 1998 1987 1985 2002 2007 1997 2004]
['Medium' nan 'High' 'Small']


In [69]:
# Frequency of each label; a blank line separates the two tables.
fat_content_counts = data["Item_Fat_Content"].value_counts()
outlet_size_counts = data["Outlet_Size"].value_counts()
print(fat_content_counts, end="\n\n")
print(outlet_size_counts)

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64


In [70]:
# Impute missing values: the most frequent label for Outlet_Size and the
# column mean for Item_Weight.
most_common_size = data["Outlet_Size"].mode()[0]
mean_weight = data["Item_Weight"].mean()
data["Outlet_Size"] = data["Outlet_Size"].fillna(most_common_size)
data["Item_Weight"] = data["Item_Weight"].fillna(mean_weight)

In [74]:
# Removing outliers in the data
# Keep only rows whose Item_Visibility lies within 1.5 * IQR below the first
# quartile and 1.5 * IQR above the third quartile.
# NOTE(review): `data` holds the training and test rows together, so this
# filter can drop test rows as well -- confirm that is intended.
Q1 = data["Item_Visibility"].quantile(0.25)
Q3 = data["Item_Visibility"].quantile(0.75)
IQR = Q3 - Q1
# .copy() detaches the filtered rows from `data`. Without it, the column
# assignments made in later cells write to a slice of the original frame and
# trigger pandas' SettingWithCopyWarning.
fill_data = data.query('(@Q1-1.5*@IQR) <= Item_Visibility <= (@Q3+1.5*@IQR)').copy()
fill_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,bigmart_train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,bigmart_train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,bigmart_train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,bigmart_train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,bigmart_train


In [75]:
# Continue under the working name `data` with the outlier-filtered frame,
# then show its dimensions.
data=fill_data
data.shape

(13943, 13)

In [76]:
# Bucket the continuous Item_Visibility into three labelled ranges and show
# how many rows land in each.
visibility_edges = [0.000, 0.065, 0.13, 0.2]
visibility_labels = ['Low Viz', 'Viz', 'High Viz']
data["Item_Visibility_bins"] = pd.cut(
    data["Item_Visibility"], bins=visibility_edges, labels=visibility_labels
)
data["Item_Visibility_bins"].value_counts()

Low Viz     7363
Viz         4283
High Viz    1418
Name: Item_Visibility_bins, dtype: int64

In [77]:
# Replacing the values
# pd.cut builds right-closed intervals by default, so rows whose
# Item_Visibility is exactly 0.0 (or lies above the last edge) received no
# bin. They are assigned "Low Viz". fillna states that directly; the previous
# replace(np.nan, ..., regex=True) requested regex matching on a NaN, which
# has no meaning.
data["Item_Visibility_bins"] = data["Item_Visibility_bins"].fillna("Low Viz")
# Collapse the inconsistent spellings onto the two canonical fat-content labels.
data["Item_Fat_Content"] = data["Item_Fat_Content"].replace(
    {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}
)

In [78]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Turn each listed categorical column into integer codes. The single encoder
# is refit for every column, so afterwards it holds only the classes of the
# last column processed.
le = LabelEncoder()
for column in ("Item_Fat_Content", "Item_Visibility_bins",
               "Outlet_Size", "Outlet_Location_Type"):
    data[column] = le.fit_transform(data[column])



In [80]:
# Show the identifier column, a blank line, then how often each identifier
# occurs.
item_ids = data["Item_Identifier"]
print(item_ids, end="\n\n")
print(item_ids.value_counts())

0        FDA15
1        DRC01
2        FDN15
3        FDX07
4        NCD19
         ...  
14199    FDB58
14200    FDD47
14201    NCO17
14202    FDJ26
14203    FDU37
Name: Item_Identifier, Length: 13943, dtype: object

FDE33    10
FDM12    10
FDY47    10
FDT03    10
FDO01    10
         ..
FDA10     7
FDO33     7
FDZ60     7
NCW54     7
FDG21     7
Name: Item_Identifier, Length: 1559, dtype: int64


In [91]:
# Derive a coarse item category from the first two letters of
# Item_Identifier (FD / NC / DR) and store it as Item_Type_Combined.
prefix_to_category = {'FD': 'Food', 'NC': 'Non_Consumable', 'DR': 'Drinks'}
data["Item_Type_Combined"] = data["Item_Identifier"].str[:2].map(prefix_to_category)
print(data["Item_Type_Combined"], end="\n\n")
print(data["Item_Type_Combined"].value_counts())

0                  Food
1                Drinks
2                  Food
3                  Food
4        Non_Consumable
              ...      
14199              Food
14200              Food
14201    Non_Consumable
14202              Food
14203              Food
Name: Item_Type_Combined, Length: 13943, dtype: object

Food              9991
Non_Consumable    2652
Drinks            1300
Name: Item_Type_Combined, dtype: int64


In [94]:
# Load the preprocessed training and test frames from disk.
# NOTE(review): no cell visible in this notebook writes these two files --
# confirm which step produces them so a fresh run can recreate them.
TRAIN_MODIFIED_CSV = "train_modified.csv"
TEST_MODIFIED_CSV = "test_modified.csv"
train2 = pd.read_csv(TRAIN_MODIFIED_CSV)
test2 = pd.read_csv(TEST_MODIFIED_CSV)

In [46]:
# Column dtypes of the training frame loaded from train_modified.csv.
train2.dtypes

Item_Identifier                       object
Item_Weight                          float64
Item_Visibility                      float64
Item_MRP                             float64
Outlet_Identifier                     object
Item_Outlet_Sales                    float64
Item_Visibility_bins                   int64
Item_Fat_Content_0                     int64
Item_Fat_Content_1                     int64
Outlet_Location_Type_0                 int64
Outlet_Location_Type_1                 int64
Outlet_Location_Type_2                 int64
Outlet_Size_0                          int64
Outlet_Size_1                          int64
Outlet_Size_2                          int64
Outlet_Type_Grocery Store              int64
Outlet_Type_Supermarket Type1          int64
Outlet_Type_Supermarket Type2          int64
Outlet_Type_Supermarket Type3          int64
Item_Type_Combined_Drinks              int64
Item_Type_Combined_Food                int64
Item_Type_Combined_Non_Consumable      int64
dtype: obj

In [97]:
# Features: every column except the target and the two identifier columns.
non_feature_columns = ["Item_Outlet_Sales", "Outlet_Identifier", "Item_Identifier"]
x_train = train2.drop(columns=non_feature_columns)
# Target: the sales column the model is trained to predict.
y_train = train2["Item_Outlet_Sales"]

In [98]:
# Preview the first rows of the training feature matrix.
x_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Visibility_bins,Item_Fat_Content_0,Item_Fat_Content_1,Outlet_Location_Type_0,Outlet_Location_Type_1,Outlet_Location_Type_2,Outlet_Size_0,Outlet_Size_1,Outlet_Size_2,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Type_Combined_Drinks,Item_Type_Combined_Food,Item_Type_Combined_Non_Consumable
0,9.3,0.016047,249.8092,1,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0
1,5.92,0.019278,48.2692,1,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0
2,17.5,0.01676,141.618,1,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0
3,19.2,0.0,182.095,3,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0
4,8.93,0.0,53.8614,3,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1


In [99]:
# Preview the first values of the training target.
y_train.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

In [49]:
# Feature matrix for the test frame: remove the two identifier columns.
x_test = test2.drop(columns=["Outlet_Identifier", "Item_Identifier"])

In [100]:
# Preview the first rows of the test feature matrix.
x_test.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Visibility_bins,Item_Fat_Content_0,Item_Fat_Content_1,Outlet_Location_Type_0,Outlet_Location_Type_1,Outlet_Location_Type_2,Outlet_Size_0,Outlet_Size_1,Outlet_Size_2,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Type_Combined_Drinks,Item_Type_Combined_Food,Item_Type_Combined_Non_Consumable
0,20.75,0.007565,107.8622,1,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0
1,8.3,0.038428,87.3198,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0
2,14.6,0.099575,241.7538,2,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1
3,7.315,0.015388,155.034,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
4,12.792854,0.118599,234.23,2,0,1,0,0,1,0,1,0,0,0,0,1,0,1,0


In [102]:
# Train-Test split
from sklearn import model_selection

# Hold out 30% of the labelled rows for evaluation (fixed seed so the split
# is repeatable), then fit a linear regression on the remaining rows.
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(
    x_train, y_train, test_size=0.3, random_state=42
)
lr = LinearRegression()
lr.fit(xtrain, ytrain)

In [103]:
# Prediction
# Predict sales for the held-out rows produced by the train/test split.
y_pred=lr.predict(xtest)
print(y_pred)

[2078.5717376  3704.4024433  2887.98953951 ... 3555.48774677 3464.34763541
 1237.58710242]


In [104]:
# Coefficients
# Fitted weights of the linear model, one per feature column of xtrain.
lr.coef_

array([-1.85658150e+00, -3.40412875e+02,  1.58748414e+01, -4.58148552e+00,
       -1.32769560e+00,  1.32769560e+00, -2.46140269e+00,  5.25870778e+01,
       -5.01256751e+01,  5.31871497e+01, -4.99205402e+01, -3.26660954e+00,
       -1.71797975e+03,  1.87038625e+02, -8.06856157e+01,  1.61162674e+03,
        9.19925375e+00,  3.45983137e+01, -4.37975674e+01])

In [105]:
# Intercept
# Constant term of the fitted linear model.
lr.intercept_

-58.69624146451906

In [106]:
# RMSE
import math

# Square root of the mean squared error between the held-out targets and the
# model's predictions for them.
mse = mean_squared_error(ytest, y_pred)
RMSE = math.sqrt(mse)
RMSE

1128.0046994656184

In [107]:
# Cross validation(K-Fold)
from sklearn.model_selection import cross_val_score

# R^2 of the linear model on each of 5 folds of the training portion.
scores = cross_val_score(estimator=lr, X=xtrain, y=ytrain, scoring='r2', cv=5)
print(scores)

[0.56321818 0.56701039 0.54052049 0.55103301 0.53670522]
