<a href="https://colab.research.google.com/github/PigeonLore/Sales-Prediction-Modeling/blob/main/Outlet_Sales_Predictions_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data. (*There is a way to do this after the split, but for this project, you may perform this step before the split*)


Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.
Perform a train test split


Create a preprocessing object to prepare the dataset for Machine Learning


Make sure your imputation of missing values occurs after the train test split using SimpleImputer.

In [331]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer

In [332]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [333]:
df = pd.read_csv('/content/drive/MyDrive/Coding Dojo | Data Science/Stack 2: Machine Learning/Week 1: Machine Learning/DataSets/sales_predictions.csv')

# 1. Inspecting Data

In [334]:
# Display Column names, counts of non-null values, and respective datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [335]:
# Display statistics for numerical columns
df.describe(include='number')

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [336]:
# Display 
df.describe(include='object')

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


# 2. Correct Discrepancies

In [337]:
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

## 1. This standardizes all elements representing low fat and regular to discrete items

In [338]:
# Correct 

df['Item_Fat_Content'].replace({'low fat':0,'LF':0,'Low Fat':0,'reg':1,'Regular':1},inplace=True)

In [339]:
df['Item_Fat_Content'].value_counts()

0    5517
1    3006
Name: Item_Fat_Content, dtype: int64

## 2. Check for duplicates

In [340]:
# Display duplicated data

print('There are',df.duplicated().sum(),'duplicates')

There are 0 duplicates


## 3. Check for missing values

In [341]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [342]:
#Displays the percentage of missing values of each column

print(df.isna().sum()/len(df)*100,'%')

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64 %


## These values can be imputed during 

# 3. Define target and feature sets then perform a train test split.

## 1. Spliting data

In [343]:
X = df.drop(columns=(['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier']))

y = df['Item_Outlet_Sales']

In [344]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [345]:
X_train.isna().any()

Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
dtype: bool

## 2. PreProcessing

In [346]:
# X_train['Outlet_Size'].replace({'High':0, 'Medium':1,'Small':2}, inplace = True)
X_train['Outlet_Size'].unique()

array(['Medium', 'Small', nan, 'High'], dtype=object)

### Column Selectors

In [347]:
#Instantiate column selectors

num_selector = make_column_selector(dtype_include='number')

cat_selector = make_column_selector(dtype_include='object')

In [348]:
# Instantiate Columns selectors

num_columns = num_selector(X_train)
cat_columns = cat_selector(X_train)

In [349]:
# Instantiate imputers

freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

# Instantiate Scalers

scaler = StandardScaler()

# Instantiate OneHotEncoding

ohe = OneHotEncoder(sparse_output = False, handle_unknown='ignore')

In [350]:
cat_selector(X_train)

['Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

In [351]:
# create subset for catagorical data

train_cat_data = X_train[cat_selector(X_train)]

test_cat_data = X_test[cat_selector(X_test)]

train_cat_data

Unnamed: 0,Item_Type,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Household,Medium,Tier 3,Supermarket Type2
7510,Snack Foods,Medium,Tier 3,Supermarket Type2
5828,Meat,Medium,Tier 1,Supermarket Type1
5327,Baking Goods,Small,Tier 2,Supermarket Type1
4810,Frozen Foods,,Tier 2,Supermarket Type1
...,...,...,...,...
5734,Fruits and Vegetables,,Tier 3,Grocery Store
5191,Frozen Foods,,Tier 2,Supermarket Type1
5390,Health and Hygiene,,Tier 2,Supermarket Type1
860,Snack Foods,,Tier 2,Supermarket Type1


In [352]:
ohe.fit(train_cat_data)

#transform both the training and the testing data

train_ohe = ohe.transform(train_cat_data)

train_ohe

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [353]:
mean_imputer.fit(X_train[num_columns])

In [354]:
X_train.loc[:, num_columns] = mean_imputer.transform(X_train[num_columns])
X_test.loc[:, num_columns] = mean_imputer.transform(X_test[num_columns])

## Create Pipelines

In [355]:
num_pipe = make_pipeline(mean_imputer, scaler)
num_pipe

In [356]:
cat_pipe = make_pipeline(freq_imputer, ohe)
cat_pipe

### Tuples

In [357]:
# Create Tupes

num_tuple = (scaler, num_selector)

cat_tuple = (ohe, cat_selector)

### PreProcessor

In [358]:
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
preprocessor

In [359]:
preprocessor.fit(X_train)

# 5. Pipelines

In [360]:
X_train_processed = preprocessor.transform(X_train)

X_test_processed = preprocessor.transform(X_test)

X_train_processed[:10]

array([[ 8.17248678e-01, -7.40320602e-01, -7.12775072e-01,
         1.82810922e+00,  1.32784893e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00],
       [ 5.56339503e-01,  1.35076614e+00, -1.29105225e+00,
         6.03368881e-01,  1.32784893e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00

In [361]:
X_train_df = pd.DataFrame(X_train_processed)

X_train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.817249,-0.740321,-0.712775,1.828109,1.327849,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.556340,1.350766,-1.291052,0.603369,1.327849,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.131512,1.350766,1.813319,0.244541,0.136187,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.169219,-0.740321,-1.004931,-0.952591,0.732018,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.528819,-0.740321,-0.965484,-0.336460,0.493686,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,-0.832409,1.350766,4.309657,-0.044657,0.017021,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
6388,0.639356,-0.740321,1.008625,-1.058907,1.089517,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6389,1.113736,-0.740321,-0.920527,1.523027,0.493686,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6390,1.766009,-0.740321,-0.227755,-0.383777,1.089517,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [362]:
print(np.isnan(X_train_processed).sum().sum(), 'missing values in training data')
print(np.isnan(X_test_processed).sum().sum(), 'missing values in testing data')
print('\n')
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')
print('shape of data is', X_train_processed.shape)
print('\n')
X_train_processed.shape

0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (6392, 32)




(6392, 32)

# Linear Regression modeling for Outlet Sales prediction

In [363]:
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [364]:
# Function from the codalong 

def regression_metrics(model ,X_train=X_train_processed, X_test=X_test_processed, y_train=y_train, y_test=y_test):

    # Left commented since fitting will be outside of function

    # model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test_processed)
    
    # calculate the regression metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # print the results
    print(model)
    print("Mean Absolute Error (MAE): {:.4f}".format(mae))
    print("Mean Squared Error (MSE): {:.4f}".format(mse))
    print("R^2 Score: {:.4f}".format(r2))
    print("Root Mean Squared Error (RMSE): {:.4f}".format(rmse))
    
    # return the metrics as a dictionary
    return {'MAE': mae, 'MSE': mse, 'R2': r2, 'RMSE': rmse}

In [365]:
def model_score(model, X_train=X_train_processed, X_test=X_test_processed, y_train=y_train, y_test=y_test):
  train_score = model.score(X_train_processed, y_train)
  test_score = model.score(X_test_processed, y_test)
  print(train_score)
  print(test_score)

# Random Forest Regression

In [366]:
rfr = RandomForestRegressor()

In [367]:
rfr.fit(X_train_processed, y_train)

In [368]:
regression_metrics(rfr)

RandomForestRegressor()
Mean Absolute Error (MAE): 773.5087
Mean Squared Error (MSE): 1234541.4423
R^2 Score: 0.5525
Root Mean Squared Error (RMSE): 1111.0992


{'MAE': 773.5087374059126,
 'MSE': 1234541.4423041395,
 'R2': 0.552536569343932,
 'RMSE': 1111.0992045286232}

## RFR model tuning

In [369]:
rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [370]:
rfr_est_depths = [estimator.get_depth() for estimator in rfr.estimators_]
max(rfr_est_depths)

46

In [371]:
model_score(rfr)

0.9381899279184717
0.552536569343932


In [372]:
print(max(rfr_est_depths))
print(min(rfr_est_depths))

46
29


In [373]:
rfr_depths = range(1, max(rfr_est_depths))
rfr_scores = pd.DataFrame(index=rfr_depths, columns=['Test Score'])
for depth in rfr_depths:    
   model = RandomForestRegressor(max_depth=depth)
   model.fit(X_train_processed, y_train)
   rfr_scores.loc[depth, 'Train Score'] = model.score(X_train_processed, y_train)
   rfr_scores.loc[depth, 'Test Score'] = model.score(X_test_processed, y_test)
rfr_scores.head

%timeit I = [i for i in range(1)]

KeyboardInterrupt: ignored

In [None]:
rfr_scores.head(10)

In [None]:
fig, ax = plt.subplots(figsize=(12,10))

plt.plot(rfr_scores['Test Score'])
plt.show()

In [None]:
rfr_scores_sorted = rfr_scores.sort_values(by='Test Score', ascending= False)

In [None]:
rfr_scores_sorted.head()

In [None]:
rfr_tune_param = range(1,10)
rfr_tune = pd.DataFrame(index=rfr_tune_param, columns=['Test Score'])

for depth in rfr_tune_param:
  model = RandomForestRegressor(max_depth=depth)
  model.fit(X_train_processed, y_train)
  rfr_tune.loc[depth, 'Train Score'] = model.score(X_train_processed, y_train)
  rfr_tune.loc[depth, 'Test Score'] = model.score(X_test_processed, y_test)

rfr_tune

In [None]:
fig, ax = plt.subplots(figsize=(12,10))

plt.plot(rfr_tune['Test Score'])
plt.show()

# Unfinished