# SUPERMARKET DATA PROJECT PREDICTIVE MODEL NOTEBOOK

# Importing Packages

In [91]:
#importing packages
import pandas as pd
import numpy as np
 

#importing visualization libraries
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns

# Import the warnings
import warnings

# Import statsmodels
import statsmodels.formula.api as smf

# Importing estimators from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the metrics
from sklearn.metrics import confusion_matrix

# configuration settings
%matplotlib inline
sns.set(color_codes=True)
warnings.filterwarnings('ignore') ## Suppress the warnings
pd.options.display.max_columns = None # Display all columns

# For Statistics
import scipy.stats as stats
import pandas_profiling as pp

# NOTE(review): `%matplotlib inline` is issued a second time here, after the late imports;
# presumably to restore the inline backend in case one of them changes it - confirm, otherwise it is redundant
%matplotlib inline 

# Importing Data

In [92]:
# Load the supermarket dataset from its CSV file into a DataFrame
DATA_PATH = "supermarket_dataset_used.csv"
dataset = pd.read_csv(DATA_PATH)

# EXPLORATORY DATA ANALYSIS

## 1. General Analysis

In [93]:
# Preview the first five rows of the dataset
dataset.head(n=5)

Unnamed: 0,SHOP_WEEK,Year,Month,Date,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND,PROD_CODE,PROD_CODE_10,PROD_CODE_20,PROD_CODE_30,PROD_CODE_40,CUST_CODE,CUST_PRICE_SENSITIVITY,CUST_LIFESTAGE,BASKET_ID,BASKET_SIZE,BASKET_PRICE_SENSITIVITY,BASKET_TYPE,BASKET_DOMINANT_MISSION,STORE_CODE,STORE_FORMAT,STORE_REGION
0,200607,2006,4,13,5,20,1,103,PRD0900097,CL00001,DEP00001,G00001,D00001,CUST0000634693,LA,YF,994100000000000,L,LA,Top Up,Fresh,STORE00001,LS,E02
1,200607,2006,4,12,4,19,1,28,PRD0900353,CL00070,DEP00020,G00007,D00002,CUST0000634693,LA,YF,994100000000000,M,MM,Small Shop,Fresh,STORE00001,LS,E02
2,200607,2006,4,13,5,20,3,84,PRD0900550,CL00167,DEP00055,G00016,D00003,CUST0000634693,LA,YF,994100000000000,L,LA,Top Up,Fresh,STORE00001,LS,E02
3,200607,2006,4,12,4,19,1,221,PRD0901647,CL00010,DEP00003,G00002,D00001,CUST0000634693,LA,YF,994100000000000,M,MM,Small Shop,Fresh,STORE00001,LS,E02
4,200607,2006,4,13,5,20,1,334,PRD0902064,CL00073,DEP00021,G00007,D00002,CUST0000634693,LA,YF,994100000000000,L,LA,Top Up,Fresh,STORE00001,LS,E02


In [94]:
# Build the profiling report (statistics and summary) for the whole dataset and display it
profile_report = pp.ProfileReport(dataset)
profile_report

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [127]:
# Column labels of the dataset, i.e. the variables available for analysis
dataset.columns

Index(['SHOP_WEEK', 'Year', 'Month', 'Date', 'SHOP_WEEKDAY', 'SHOP_HOUR',
       'QUANTITY', 'SPEND', 'PROD_CODE', 'PROD_CODE_10', 'PROD_CODE_20',
       'PROD_CODE_30', 'PROD_CODE_40', 'CUST_CODE', 'CUST_PRICE_SENSITIVITY',
       'CUST_LIFESTAGE', 'BASKET_ID', 'BASKET_SIZE',
       'BASKET_PRICE_SENSITIVITY', 'BASKET_TYPE', 'BASKET_DOMINANT_MISSION',
       'STORE_CODE', 'STORE_FORMAT', 'STORE_REGION'],
      dtype='object')

In [82]:
# Data type of each column in the dataset
dataset.dtypes

SHOP_WEEK                    int64
Year                         int64
Month                        int64
Date                         int64
SHOP_WEEKDAY                 int64
SHOP_HOUR                    int64
QUANTITY                     int64
SPEND                        int64
PROD_CODE                   object
PROD_CODE_10                object
PROD_CODE_20                object
PROD_CODE_30                object
PROD_CODE_40                object
CUST_CODE                   object
CUST_PRICE_SENSITIVITY      object
CUST_LIFESTAGE              object
BASKET_ID                    int64
BASKET_SIZE                 object
BASKET_PRICE_SENSITIVITY    object
BASKET_TYPE                 object
BASKET_DOMINANT_MISSION     object
STORE_CODE                  object
STORE_FORMAT                object
STORE_REGION                object
dtype: object

In [83]:
# Dimensions of the dataset as (rows, columns)
# We have 119 rows and 24 columns
row_count, column_count = dataset.shape
(row_count, column_count)

(119, 24)

In [84]:
# Flag every row that repeats an earlier row, then keep only the flagged rows
duplicate_mask = dataset.duplicated()
duplicate_rows = dataset[duplicate_mask]
print("Duplicate rows: ",duplicate_rows)

Duplicate rows:  Empty DataFrame
Columns: [SHOP_WEEK, Year, Month, Date, SHOP_WEEKDAY, SHOP_HOUR, QUANTITY, SPEND, PROD_CODE, PROD_CODE_10, PROD_CODE_20, PROD_CODE_30, PROD_CODE_40, CUST_CODE, CUST_PRICE_SENSITIVITY, CUST_LIFESTAGE, BASKET_ID, BASKET_SIZE, BASKET_PRICE_SENSITIVITY, BASKET_TYPE, BASKET_DOMINANT_MISSION, STORE_CODE, STORE_FORMAT, STORE_REGION]
Index: []

[0 rows x 24 columns]


In [95]:
#code to delete duplicates if we have any
#dataset=dataset.drop_duplicates()

In [96]:
# Number of non-null entries in each column
# CUST_CODE, CUST_PRICE_SENSITIVITY and CUST_LIFESTAGE have missing values
dataset.count()

SHOP_WEEK                   119
Year                        119
Month                       119
Date                        119
SHOP_WEEKDAY                119
SHOP_HOUR                   119
QUANTITY                    119
SPEND                       119
PROD_CODE                   119
PROD_CODE_10                119
PROD_CODE_20                119
PROD_CODE_30                119
PROD_CODE_40                119
CUST_CODE                    73
CUST_PRICE_SENSITIVITY       73
CUST_LIFESTAGE               73
BASKET_ID                   119
BASKET_SIZE                 119
BASKET_PRICE_SENSITIVITY    119
BASKET_TYPE                 119
BASKET_DOMINANT_MISSION     119
STORE_CODE                  119
STORE_FORMAT                119
STORE_REGION                119
dtype: int64

In [97]:
# Looking for columns with missing values: number of nulls per column
dataset.isna().sum()

SHOP_WEEK                    0
Year                         0
Month                        0
Date                         0
SHOP_WEEKDAY                 0
SHOP_HOUR                    0
QUANTITY                     0
SPEND                        0
PROD_CODE                    0
PROD_CODE_10                 0
PROD_CODE_20                 0
PROD_CODE_30                 0
PROD_CODE_40                 0
CUST_CODE                   46
CUST_PRICE_SENSITIVITY      46
CUST_LIFESTAGE              46
BASKET_ID                    0
BASKET_SIZE                  0
BASKET_PRICE_SENSITIVITY     0
BASKET_TYPE                  0
BASKET_DOMINANT_MISSION      0
STORE_CODE                   0
STORE_FORMAT                 0
STORE_REGION                 0
dtype: int64

## 2. Exploring The Target Variable

The objective is to learn more about the target variable, which is the store region. This is done through visualization and descriptive analysis.

In [99]:
# Pull out the store region column and preview its first five entries
store_region_data = dataset["STORE_REGION"]
store_region_data.head(n=5)

0    E02
1    E02
2    E02
3    E02
4    E02
Name: STORE_REGION, dtype: object

In [126]:
# Distinct store regions present in the store region column
available_store_regions = store_region_data.unique()
available_store_regions

array(['E02', 'W01', 'N01', 'N02', 'W02', 'S02', 'S01', 'N03', 'S03'],
      dtype=object)

In [129]:
# Number of rows in the dataset recorded for each store region, most frequent region first
dataset["STORE_REGION"].value_counts()

N01    21
W01    17
N02    16
S01    15
N03    14
E02    12
W02     9
S03     8
S02     7
Name: STORE_REGION, dtype: int64

In [None]:
#creating the visualization to show the distribution of the number of purchases customers made in each store region
