#### Preliminary Steps

In [2]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import set_config
# Ask scikit-learn to render estimators/pipelines as diagrams when they are displayed.
set_config(display = 'diagram')

In [3]:
# Read the raw sales data into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute path (looks like a Colab mount) — adjust when running elsewhere.
DATA_PATH = '/content/sales_predictions.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


#### Data Inspection

In [4]:
# Flag fully duplicated rows; the cell displays True if at least one exists.
duplicate_mask = df.duplicated()
duplicate_mask.any()

False

There are no duplicates

In [5]:
# Checking where missing values are.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


None

We have two columns containing missing values: one numeric ('Item_Weight') and one categorical ('Outlet_Size').

For 'Item_Weight' I'll impute using the mean strategy. I'll impute 'Outlet_Size' using the most frequent value.

In [6]:
# List the distinct values of every object-typed column to spot
# inconsistent labels and columns that need encoding.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for col in object_columns:
    distinct_values = df[col].unique()
    print(col,':')
    print(distinct_values,'\n-')

Item_Identifier :
['FDA15' 'DRC01' 'FDN15' ... 'NCF55' 'NCW30' 'NCW05'] 
-
Item_Fat_Content :
['Low Fat' 'Regular' 'low fat' 'LF' 'reg'] 
-
Item_Type :
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood'] 
-
Outlet_Identifier :
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019'] 
-
Outlet_Size :
['Medium' nan 'High' 'Small'] 
-
Outlet_Location_Type :
['Tier 1' 'Tier 3' 'Tier 2'] 
-
Outlet_Type :
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3'] 
-


Some things I notice include: useless/droppable 'Item_Identifier' and
'Outlet_Identifier' columns, inconsistent values in the 'Item_Fat_Content' column, an 'Outlet_Size' column with values that need to be encoded ordinally, and two columns, 'Outlet_Location_Type' and 'Outlet_Type' that at first glance seem to need ordinal encoding as well, but don't. They'll need to be OneHotEncoded after our ordinal encoding.

In [7]:
# Fixing inconsistent values using a dictionary: map every spelling variant
# onto its canonical label.
replace = {'low fat':'Low Fat','LF':'Low Fat','reg':'Regular'}
# Assign the result back instead of calling replace(..., inplace=True) on
# df.Item_Fat_Content: the in-place call acts on an intermediate Series, which
# warns on recent pandas and leaves df untouched under Copy-on-Write.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(replace)
# Checking: only the canonical labels should remain.
df['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular'], dtype=object)

In [8]:
# Ordinal encoding of Outlet_Size: Small < Medium < High.
order = {'Medium': 1,'High': 2,'Small':0}
# Assign the result back rather than using replace(..., inplace=True) on
# df.Outlet_Size, which acts on an intermediate Series and does not reliably
# update df (warns on recent pandas, no effect under Copy-on-Write).
# The explicit float cast keeps the column numeric (missing values stay NaN)
# regardless of how replace() infers the resulting dtype.
df['Outlet_Size'] = df['Outlet_Size'].replace(order).astype(float)
# Checking: counts per encoded level (NaN rows are not counted).
df['Outlet_Size'].value_counts()

1.0    2793
0.0    2388
2.0     932
Name: Outlet_Size, dtype: int64

In [9]:
# Validation split.
# The target is separated from the features; the two identifier columns are
# dropped from the features along with it.
target_column = 'Item_Outlet_Sales'
identifier_columns = ['Item_Identifier', 'Outlet_Identifier']

y = df[target_column]
X = df.drop(columns = [target_column] + identifier_columns)

# Fixed seed so the split is reproducible; the split ratio is left at the
# train_test_split default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

#### Preprocessing

In [10]:
## Numeric branch: fill gaps with the column mean, then standardise.
mean_imputer = SimpleImputer(strategy = 'mean')
scaler = StandardScaler()
num_pipe = make_pipeline(mean_imputer, scaler)
num_selector = make_column_selector(dtype_include = 'number')
num_tuple = (num_pipe, num_selector)
# NOTE(review): 'Outlet_Size' was already converted to numbers in an earlier
# cell, so this selector sends it through the mean-imputer/scaler branch
# rather than most-frequent imputation — confirm that is intended.

## Categorical branch: fill gaps with the most frequent label, then one-hot encode.
mf_imputer = SimpleImputer(strategy = 'most_frequent')
ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
cat_pipe = make_pipeline(mf_imputer, ohe)
cat_selector = make_column_selector(dtype_include = 'object')
cat_tuple = (cat_pipe, cat_selector)

## Preprocessor: apply each branch to its columns; any column matched by
## neither selector is passed through unchanged.
transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

In [11]:
# Show the (still unfitted) preprocessor so its structure can be inspected.
display(transformer)

#### Building a linear regression model to predict sales

In [12]:
# Model pipeline: shared preprocessor followed by LinearRegression().
linreg = LinearRegression()
linreg_pipe = make_pipeline(transformer, linreg)
linreg_pipe.fit(X_train, y_train)

# Predictions on both splits; they feed the RMSE figures below.
linreg_train_preds = linreg_pipe.predict(X_train)
linreg_test_preds = linreg_pipe.predict(X_test)

# R² comes from the pipeline's own score(); RMSE is the square root of the MSE.
linreg_train_r2 = linreg_pipe.score(X_train, y_train)
linreg_test_r2 = linreg_pipe.score(X_test, y_test)
linreg_train_rmse = np.sqrt(mean_squared_error(y_train, linreg_train_preds))
linreg_test_rmse = np.sqrt(mean_squared_error(y_test, linreg_test_preds))

print('Train R² Scores:', round(linreg_train_r2, 3))
print('Test R² Scores:', round(linreg_test_r2, 3))
print('Train RMSE Scores:', round(linreg_train_rmse, 3))
print('Test RMSE Scores:', round(linreg_test_rmse, 3))

Train R² Scores: 0.561
Test R² Scores: 0.566
Train RMSE Scores: 1140.388
Test RMSE Scores: 1094.455


This linear regression model doesn't give us very high R² scores for either our training or test set. The RMSE scores aren't very good either.

#### Building a decision tree regression model to predict sales

In [13]:
# Baseline decision tree with default hyperparameters (seeded for reproducibility),
# chained behind the shared preprocessor.
dec_tree = DecisionTreeRegressor(random_state = 42)
dec_tree_pipe = make_pipeline(transformer, dec_tree)
dec_tree_pipe.fit(X_train, y_train)

# Predictions on both splits, needed for the RMSE figures.
dec_train_preds = dec_tree_pipe.predict(X_train)
dec_test_preds = dec_tree_pipe.predict(X_test)

# R² via the pipeline's score(); RMSE as the square root of the MSE.
dec_train_scores = dec_tree_pipe.score(X_train, y_train)
dec_test_scores = dec_tree_pipe.score(X_test, y_test)
dec_train_rmse = np.sqrt(mean_squared_error(y_train, dec_train_preds))
dec_test_rmse = np.sqrt(mean_squared_error(y_test, dec_test_preds))

# Metrics of the default (untuned) decision tree.
print('Train R² Scores:', round(dec_train_scores, 3))
print('Test R² Scores:', round(dec_test_scores, 3))
print('Train RMSE Scores:', round(dec_train_rmse, 3))
print('Test RMSE Scores:', round(dec_test_rmse, 3))

Train R² Scores: 1.0
Test R² Scores: 0.216
Train RMSE Scores: 0.0
Test RMSE Scores: 1470.764


Tuning decision tree

In [19]:
# Depth reached by the default tree fitted above; it determines the range of
# max depths to iterate through when tuning.
default_tree_depth = dec_tree.get_depth()
default_tree_depth

40

In [26]:
# Sweep max_depth from 1 up to the depth the unconstrained tree actually
# reached. The bound is read from the fitted default tree instead of being
# hard-coded from a previous cell's output.
depths = range(1, dec_tree.get_depth() + 1)

# Collect one record per depth and build the frame once, so the score columns
# are proper floats rather than object cells filled one at a time.
score_rows = []
for depth in depths:
    dec = DecisionTreeRegressor(max_depth = depth, random_state = 42)
    dec_pipe = make_pipeline(transformer, dec)
    dec_pipe.fit(X_train, y_train)
    score_rows.append({
        'Train': dec_pipe.score(X_train, y_train),
        'Test': dec_pipe.score(X_test, y_test),
    })
scores = pd.DataFrame(score_rows, index = list(depths), columns = ['Train', 'Test'])

# NOTE(review): the depth is chosen by its score on the test split, so the
# test score reported for the chosen depth is optimistic — consider
# cross-validating on the training data instead.
results = scores.sort_values(by = 'Test', ascending = False).head()
results

Unnamed: 0,Train,Test
5,0.60394,0.59471
4,0.582625,0.584005
6,0.615072,0.582587
7,0.626454,0.576843
8,0.642714,0.560245


Our best score is coming from a max depth of 5.

In [31]:
# Refit the tree with the max depth that scored best in the sweep above.
dec_tree_tuned = DecisionTreeRegressor(max_depth = 5, random_state = 42)
dec_pipe_tuned = make_pipeline(transformer, dec_tree_tuned)
dec_pipe_tuned.fit(X_train, y_train)

# Predictions on both splits, needed for the RMSE figures.
dec_tuned_train_preds = dec_pipe_tuned.predict(X_train)
dec_tuned_test_preds = dec_pipe_tuned.predict(X_test)

# R² via the pipeline's score(); RMSE as the square root of the MSE.
dec_tuned_train_scores = dec_pipe_tuned.score(X_train, y_train)
dec_tuned_test_scores = dec_pipe_tuned.score(X_test, y_test)
dec_tuned_train_rmse = np.sqrt(mean_squared_error(y_train, dec_tuned_train_preds))
dec_tuned_test_rmse = np.sqrt(mean_squared_error(y_test, dec_tuned_test_preds))

print('Train R² Scores:', round(dec_tuned_train_scores, 3))
print('Test R² Scores:', round(dec_tuned_test_scores, 3))
print('Train RMSE Scores:', round(dec_tuned_train_rmse, 3))
print('Test RMSE Scores:', round(dec_tuned_test_rmse, 3))

Train R² Scores: 0.604
Test R² Scores: 0.595
Train RMSE Scores: 1082.646
Test RMSE Scores: 1057.443


Our default model was extremely overfit: it scored a perfect 1.0 R2 on the training data but only 0.216 on the test data. After tuning its max depth we get a good balance of 0.60394 for our Train R2 score and a nearly identical 0.59471 on our Test. This is much better: the model is no longer badly under- or overfit. The model R2 isn't perfect, but it seems to be the best this regressor can do after tuning max depth, and it beats the LinearRegression model (Test R2 of 0.595 vs. 0.566).


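Also, while our train RMSE is higher than it was for the default regressor (which fit the training data perfectly), tuning improved our test RMSE from 1470.764 to 1057.443. Compared with the linear regression model (Test RMSE of 1094.455), the tuned regressor's error is around $37 lower.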

Considering both the R2 and RMSE scores for both regressors, I would recommend using the tuned decision tree regressor because it has higher predictive power than that of the linear regressor. 
The tuned decision tree regressor's error is also lower than that of the linear regressor, with a Test RMSE of 1057.443 compared to the linear regressor's Test RMSE of 1094.455. The decision tree is the better model.