<a href="https://colab.research.google.com/github/Nanungi/Prediction-of-Product-Sales/blob/main/final_core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#importing libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#loading data

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics
def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)
  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )
  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [4]:
link='/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023 (1).csv'
df=pd.read_csv(link)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
#identifying duplicates
df.duplicated().sum()

0

In [6]:
#dropping item identifier.
df.drop(columns=['Item_Identifier'],inplace=True)
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [7]:
#addressing inconsistencies
cat_columns= df.select_dtypes('object')
cat_columns

Unnamed: 0,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1
1,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2
2,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1
3,Regular,Fruits and Vegetables,OUT010,,Tier 3,Grocery Store
4,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...
8518,Low Fat,Snack Foods,OUT013,High,Tier 3,Supermarket Type1
8519,Regular,Baking Goods,OUT045,,Tier 2,Supermarket Type1
8520,Low Fat,Health and Hygiene,OUT035,Small,Tier 2,Supermarket Type1
8521,Regular,Snack Foods,OUT018,Medium,Tier 3,Supermarket Type2


In [8]:
cat_columns.value_counts()

Item_Fat_Content  Item_Type      Outlet_Identifier  Outlet_Size  Outlet_Location_Type  Outlet_Type      
Low Fat           Household      OUT013             High         Tier 3                Supermarket Type1    99
                                 OUT046             Small        Tier 1                Supermarket Type1    94
                                 OUT027             Medium       Tier 3                Supermarket Type3    93
                                 OUT035             Small        Tier 2                Supermarket Type1    89
                                 OUT018             Medium       Tier 3                Supermarket Type2    89
                                                                                                            ..
low fat           Hard Drinks    OUT035             Small        Tier 2                Supermarket Type1     1
                                 OUT027             Medium       Tier 3                Supermarket Type3     1
       

In [9]:
#item fat content
item_fat_content=df['Item_Fat_Content'].value_counts()
item_fat_content

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [10]:
df['Item_Fat_Content']=df['Item_Fat_Content'].replace({'low fat':'Low Fat','LF':'Low Fat','reg':'Regular','regular':'Regular'})
df['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

In [11]:
#defining features
y=df['Item_Outlet_Sales']
X=df.drop(columns='Item_Outlet_Sales')

In [12]:
#test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [13]:
#inspecting data
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6392 entries, 4776 to 7270
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                5285 non-null   float64
 1   Item_Fat_Content           6392 non-null   object 
 2   Item_Visibility            6392 non-null   float64
 3   Item_Type                  6392 non-null   object 
 4   Item_MRP                   6392 non-null   float64
 5   Outlet_Identifier          6392 non-null   object 
 6   Outlet_Establishment_Year  6392 non-null   int64  
 7   Outlet_Size                4580 non-null   object 
 8   Outlet_Location_Type       6392 non-null   object 
 9   Outlet_Type                6392 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 549.3+ KB


#ORDINAL COLUMNS

In [14]:
X_train.select_dtypes(object)

Unnamed: 0,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Low Fat,Household,OUT018,Medium,Tier 3,Supermarket Type2
7510,Regular,Snack Foods,OUT018,Medium,Tier 3,Supermarket Type2
5828,Regular,Meat,OUT049,Medium,Tier 1,Supermarket Type1
5327,Low Fat,Baking Goods,OUT035,Small,Tier 2,Supermarket Type1
4810,Low Fat,Frozen Foods,OUT045,,Tier 2,Supermarket Type1
...,...,...,...,...,...,...
5734,Regular,Fruits and Vegetables,OUT010,,Tier 3,Grocery Store
5191,Low Fat,Frozen Foods,OUT017,,Tier 2,Supermarket Type1
5390,Low Fat,Health and Hygiene,OUT045,,Tier 2,Supermarket Type1
860,Low Fat,Snack Foods,OUT017,,Tier 2,Supermarket Type1


In [15]:
#item fat content
ord_cols =['Item_Fat_Content','Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type']
impute_common = SimpleImputer(strategy='most_frequent')
qual_cond_order1= ['Low Fat','Regular']
qual_cond_order2=['OUT010','OUT013','OUT017','OUT018','OUT019','OUT027','OUT035','OUT045','OUT046','OUT049']
qual_cond_order3=['Small','Medium','High']
qual_cond_order4=['Tier 1','Tier 2','Tier 3']
qual_cond_order5=['Supermarket Type1','Supermarket Type2','Supermarket Type3','Grocery Store']
ordinal_category_orders = [qual_cond_order1, qual_cond_order2, qual_cond_order3, qual_cond_order4,qual_cond_order5]
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)
scaler = StandardScaler()
ord_pipe = make_pipeline(impute_common, ord_encoder, scaler)
ord_pipe

#NUMERIC COLUMNS

In [16]:
num_cols = X_train.select_dtypes("number").columns
X_train[num_cols].isna().sum()

Item_Weight                  1107
Item_Visibility                 0
Item_MRP                        0
Outlet_Establishment_Year       0
dtype: int64

In [17]:
impute_mean = SimpleImputer(strategy='mean')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_mean, scaler)
num_pipe


#CATEGORICAL COLUMNS

In [18]:
ohe_cols = X_train.select_dtypes('object').drop(columns=ord_cols).columns
impute_na = SimpleImputer(strategy='constant', fill_value = "MISSING")
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_pipe = make_pipeline(impute_na, ohe_encoder)
ohe_pipe

#COLUMN TRANSFER

In [19]:
# Making a numeric tuple for ColumnTransformer
num_tuple = ('numeric', num_pipe, num_cols)
num_tuple

('numeric',
 Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
        'Outlet_Establishment_Year'],
       dtype='object'))

In [20]:
# Making an ordinal_tuple for ColumnTransformer
ord_tuple = ('ordinal', ord_pipe, ord_cols)
ord_tuple

('ordinal',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('ordinalencoder',
                  OrdinalEncoder(categories=[['Low Fat', 'Regular'],
                                             ['OUT010', 'OUT013', 'OUT017',
                                              'OUT018', 'OUT019', 'OUT027',
                                              'OUT035', 'OUT045', 'OUT046',
                                              'OUT049'],
                                             ['Small', 'Medium', 'High'],
                                             ['Tier 1', 'Tier 2', 'Tier 3'],
                                             ['Supermarket Type1',
                                              'Supermarket Type2',
                                              'Supermarket Type3',
                                              'Grocery Store']])),
                 ('standardscaler', StandardScaler())]),
 ['Item_Fat_Content',
  'Outlet_Identifier',


In [21]:
# Making a ohe_tuple for ColumnTransformer
ohe_tuple = ('categorical', ohe_pipe, ohe_cols)
ohe_tuple

('categorical',
 Pipeline(steps=[('simpleimputer',
                  SimpleImputer(fill_value='MISSING', strategy='constant')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 Index(['Item_Type'], dtype='object'))

In [22]:
# Instantiate with verbose_feature_names_out=False
preprocessor= ColumnTransformer([num_tuple, ord_tuple, ohe_tuple],
                                    verbose_feature_names_out=False)
preprocessor

#linear regression

In [23]:
# Fit the preprocessor on training data
preprocessor.fit(X_train)
# Transform the training and test data
X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)

In [24]:
#instantiate, train , make predictions
lin_reg = LinearRegression()
lin_reg
lin_reg.fit(X_train_tf, y_train)
y_predictions_train = lin_reg.predict(X_train_tf)
y_predictions_test = lin_reg.predict(X_test_tf)


In [25]:
#evaluation
prediction_df = X_test_tf.copy()
prediction_df['True Item_Outlet_Sales'] = y_test
prediction_df['Predicted Item_Outlet_Sales'] = y_predictions_test.round(1)
prediction_df['Error'] = (y_predictions_test - y_test).round(1)
prediction_df.head(10)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Baking Goods,...,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,True Item_Outlet_Sales,Predicted Item_Outlet_Sales,Error
7503,0.3310089,-0.776646,-0.998816,-1.293807,-0.740321,-1.316038,1.958796,1.084948,-0.654429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1743.0644,1832.6,89.5
2957,-1.179892,0.100317,-1.585194,-0.102145,-0.740321,1.163636,-1.384048,-1.384777,-0.654429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,356.8688,801.6,444.8
7031,0.3784469,-0.482994,-1.595784,0.136187,1.350766,1.517875,0.287374,-1.384777,-0.654429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,377.5086,826.4,448.9
1084,4.213344e-16,-0.41544,0.506592,-1.532139,1.350766,0.100918,0.287374,1.084948,1.185514,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5778.4782,3508.5,-2269.9
856,-0.6426567,-1.047426,0.886725,0.732018,1.350766,0.455157,-1.384048,-0.149914,-0.654429,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2356.932,3603.7,1246.7
4304,-0.8075039,-0.470511,-1.748367,0.136187,-0.740321,1.517875,0.287374,-1.384777,-0.654429,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,865.54,547.0,-318.6
2132,4.213344e-16,1.189692,1.070615,-1.532139,-0.740321,0.100918,0.287374,1.084948,1.185514,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4613.994,3948.7,-665.3
1385,-0.5703138,-1.025995,0.000559,1.327849,1.350766,-0.60756,0.287374,1.084948,0.265542,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2410.8618,2215.2,-195.7
5239,0.2598518,-0.824923,-0.620321,1.327849,-0.740321,-0.60756,0.287374,1.084948,0.265542,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1948.1308,1554.0,-394.2
6516,-1.042322,-0.974654,0.801084,1.327849,-0.740321,-0.60756,0.287374,1.084948,0.265542,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1937.478,2936.5,999.0


#RANDOM FOREST

In [26]:
#train and evaluate in random forest.
rf = RandomForestRegressor(random_state = 42)
rf_pipe = make_pipeline(preprocessor, rf)
rf_pipe.fit(X_train, y_train)


In [27]:
#evaluating default random forest
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 297.149
- MSE = 183,326.014
- RMSE = 428.166
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 769.739
- MSE = 1,232,766.141
- RMSE = 1,110.300
- R^2 = 0.553


R squared is higher in training data than testing data.

#in comparison with linear regression it is easier to analyse the values of random forest.

#tuning with GridSearchCV

In [28]:

# Define , instatiate,fit
params = {'randomforestregressor__max_depth': [None,10,15,20],
          'randomforestregressor__n_estimators':[10,100,150,200],
          'randomforestregressor__min_samples_leaf':[2,3,4],
          'randomforestregressor__max_features':['sqrt','log2',None],
          'randomforestregressor__oob_score':[True,False],
          }
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 3, verbose=1)
gridsearch.fit(X_train, y_train)


Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [29]:
# Define and refit best model
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train, y_train, X_test, y_test)


------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 637.406
- MSE = 824,314.583
- RMSE = 907.918
- R^2 = 0.721

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 739.538
- MSE = 1,113,345.154
- RMSE = 1,055.152
- R^2 = 0.596


#the model did not improve.

#CRISP-DM EVALUATION
##i would recommend random forest that is not tuned.
##this is because it fits better on both the test and training data seen with the difference in MAE in both the tuned and not tuned.
##Considering R2 again, this model is still somewhat overfit, but is an improvement over the default model.

##R2 on the tuned test data has not improved comparing 0.553(default) and 0.596(tuned).

##On the tuned test data, the predicted price has an MAE of about 769.739 dollars in tuned and 739.538 dollars in the default.

##RMSE, the error reduced in tuned test model to 1,000dollars.



the model fits better on default model than the tuned model.