# Regression Benchmark 

### Importing Libraries

In [2]:
#importing libraries 

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Disable warnings
import warnings
warnings.filterwarnings("ignore")

### Importing Dataset

In [3]:
df=pd.read_csv('dataBM.csv')

In [4]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Shuffling and Creating Train and Test Set

In [5]:
from sklearn.utils import shuffle

# Shuffling the Dataset
data = shuffle(df, random_state = 42)

#creating 4 divisions
div = int(data.shape[0]/4)

# 3 parts to train set and 1 part to test set
train = data.iloc[:3*div + 1]
test = data.iloc[3*div + 1:]

## Simple Mean ( mean of Item_Outlet_Sales)

In [106]:
# storing simple mean in a new column in the test set as "simple_mean"
test['simple_mean'] = train['Item_Outlet_Sales'].mean()

In [107]:
train['Item_Outlet_Sales'].mean()

2138.713585733518

In [109]:
#calculating mean absolute error
from sklearn.metrics import mean_absolute_error as MAE

simple_mean_error = MAE(test['Item_Outlet_Sales'] , test['simple_mean'])
simple_mean_error

1355.4481105570344

In [113]:
train['Outlet_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type3', 'Supermarket Type2',
       'Grocery Store'], dtype=object)

In [114]:
train.groupby('Outlet_Type')['Item_Outlet_Sales'].mean()

Outlet_Type
Grocery Store         332.606025
Supermarket Type1    2245.228499
Supermarket Type2    1938.684847
Supermarket Type3    3776.185098
Name: Item_Outlet_Sales, dtype: float64

In [116]:
pd.pivot_table(train, values='Item_Outlet_Sales', index = ['Outlet_Type'], aggfunc=np.mean)

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Type,Unnamed: 1_level_1
Grocery Store,332.606025
Supermarket Type1,2245.228499
Supermarket Type2,1938.684847
Supermarket Type3,3776.185098


In [115]:
?pd.pivot_table

## Mean Item Outlet Sales with respect to Outlet_Type

In [117]:
out_type = pd.pivot_table(train, values='Item_Outlet_Sales', index = ['Outlet_Type'], aggfunc=np.mean)
out_type

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Type,Unnamed: 1_level_1
Grocery Store,332.606025
Supermarket Type1,2245.228499
Supermarket Type2,1938.684847
Supermarket Type3,3776.185098


In [125]:
# initializing new column to zero
test['Out_type_mean'] = 0

# For every unique entry in Outlet_Identifier
for i in train['Outlet_Type'].unique():
  # Assign the mean value corresponding to unique entry
  test['Out_type_mean'][test['Outlet_Type'] == str(i)] = train['Item_Outlet_Sales'][train['Outlet_Type'] == str(i)].mean()

In [126]:
#calculating mean absolute error
out_type_error = MAE(test['Item_Outlet_Sales'] , test['Out_type_mean'] )
out_type_error

1138.8026221064356

## Mean Item Outlet Sales with respect to Outlet_Establishment_Year

In [127]:
out_year = pd.pivot_table(train, values='Item_Outlet_Sales', index = ['Outlet_Establishment_Year'], aggfunc=np.mean)
out_year

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Establishment_Year,Unnamed: 1_level_1
1985,2547.909733
1987,2160.883029
1997,2200.803276
1998,320.792473
1999,2245.345807
2002,2136.727766
2004,2415.594646
2007,2320.598343
2009,1938.684847


In [128]:
# initializing new column to zero
test['Out_year_mean'] = 0

# For every unique entry in Outlet_Identifier
for i in train['Outlet_Establishment_Year'].unique():
  # Assign the mean value corresponding to unique entry
  test['Out_year_mean'][test['Outlet_Establishment_Year'] == str(i)] = train['Item_Outlet_Sales'][train['Outlet_Establishment_Year'] == str(i)].mean()

In [129]:
#calculating mean absolute error
out_year_error = MAE(test['Item_Outlet_Sales'] , test['Outlet_Establishment_Year'] )
out_year_error

1341.1462690113135

## Mean Item Outlet Sales with respect to Outlet_Location_Type

In [130]:
out_loc = pd.pivot_table(train, values = 'Item_Outlet_Sales', index = ['Outlet_Location_Type'], aggfunc = np.mean)
out_loc

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Location_Type,Unnamed: 1_level_1
Tier 1,1804.051997
Tier 2,2288.475485
Tier 3,2255.793325


In [131]:
# Initializing empty column
test['out_loc_mean'] = 0

# For every unique entry in Item_Identifier
for i in train['Outlet_Location_Type'].unique():
  # calculate and assign mean corresponding to the uniques entries
  test['out_loc_mean'][test['Outlet_Location_Type'] == str(i)] = train['Item_Outlet_Sales'][train['Outlet_Location_Type'] == str(i)].mean()

In [132]:
#calculating mean absolute error
out_loc_error = MAE(test['Item_Outlet_Sales'] , test['out_loc_mean'] )
out_loc_error

1348.8509267072284

## Mean Item_Outlet_Sales with respect to both Outlet_Location_Type and Outlet_Establishment_Year

In [133]:
combo = pd.pivot_table(train, values = 'Item_Outlet_Sales', index = ['Outlet_Location_Type','Outlet_Establishment_Year'], aggfunc = np.mean)
combo

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Outlet_Sales
Outlet_Location_Type,Outlet_Establishment_Year,Unnamed: 2_level_1
Tier 1,1985,344.882031
Tier 1,1997,2200.803276
Tier 1,1999,2245.345807
Tier 2,2002,2136.727766
Tier 2,2004,2415.594646
Tier 2,2007,2320.598343
Tier 3,1985,3776.185098
Tier 3,1987,2160.883029
Tier 3,1998,320.792473
Tier 3,2009,1938.684847


In [140]:
# Initiating new empty column
test['Super_mean'] = 0

# Assigning variables to strings ( to shorten code length)
s2 = 'Outlet_Location_Type'
s1 = 'Outlet_Establishment_Year'

# For every Unique Value in s1
for i in test[s1].unique():
  # For every Unique Value in s2
  for j in test[s2].unique():
    # Calculate and Assign mean to new column, corresponding to both unique values of s1 and s2 simultaneously
    test['Super_mean'][(test[s1] == i) & (test[s2]==str(j))] = train['Item_Outlet_Sales'][(train[s1] == i) & (train[s2]==str(j))].mean()

1999 Tier 1
1999 Tier 3
1999 Tier 2
1985 Tier 1
1985 Tier 3
1985 Tier 2
1987 Tier 1
1987 Tier 3
1987 Tier 2
1997 Tier 1
1997 Tier 3
1997 Tier 2
2007 Tier 1
2007 Tier 3
2007 Tier 2
2009 Tier 1
2009 Tier 3
2009 Tier 2
2002 Tier 1
2002 Tier 3
2002 Tier 2
2004 Tier 1
2004 Tier 3
2004 Tier 2
1998 Tier 1
1998 Tier 3
1998 Tier 2


In [137]:
test['Super_mean'][(test[s1] == 1999) & (test[s2]=='Tier 3')] = train['Item_Outlet_Sales'][(train[s1] == i) & (train[s2]==str(j))].mean()

nan

In [135]:
#calculating mean absolute error
super_mean_error = MAE(test['Item_Outlet_Sales'] , test['Super_mean'] )
super_mean_error

1140.0522313200124

In [6]:
combo_train = pd.pivot_table(train, values = 'Item_Outlet_Sales', index = ['Outlet_Location_Type','Outlet_Establishment_Year'], aggfunc = np.mean)
combo_test = pd.pivot_table(test, values = 'Item_Outlet_Sales', index = ['Outlet_Location_Type','Outlet_Establishment_Year'], aggfunc = np.mean)

In [7]:
combo_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Outlet_Sales
Outlet_Location_Type,Outlet_Establishment_Year,Unnamed: 2_level_1
Tier 1,1985,350.670086
Tier 1,1997,2213.285178
Tier 1,1999,2316.032587
Tier 2,2002,2163.635786
Tier 2,2004,2373.476783
Tier 2,2007,2295.841815
Tier 3,1985,3735.211455
Tier 3,1987,2255.42999
Tier 3,1998,328.811495
Tier 3,2009,1961.889047
