<a href="https://colab.research.google.com/github/JouhainaB/SalesPredictions/blob/main/sales_predictions_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config
# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays.
set_config(transform_output='pandas')

In [3]:
# Location of the raw sales CSV on the mounted Google Drive.
DATA_PATH='/content/drive/MyDrive/Colab Notebooks/sales_predictions_2023 (1).csv'
df=pd.read_csv(DATA_PATH)
# Preview the first rows to check the file loaded as expected.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Item_Identifier is removed here and is not used as a feature anywhere below.
# errors='ignore' keeps this cell idempotent: it rebinds `df`, so re-running it
# without reloading the CSV would otherwise raise a KeyError.
df=df.drop(columns='Item_Identifier',errors='ignore')

In [5]:
# (rows, columns) of the frame at this point.
df.shape

(8523, 11)

In [6]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [7]:
# List the distinct Item_Fat_Content labels to spot inconsistent spellings.
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [8]:
# Collapse the inconsistent fat-content labels into two lower-case categories:
# expand the abbreviations first, then lower-case whatever is left.
fat_label_fixes={'LF':'low fat','reg':'regular'}
df['Item_Fat_Content']=df['Item_Fat_Content'].replace(fat_label_fixes).str.lower()
# Confirm that only the two cleaned labels remain.
df['Item_Fat_Content'].value_counts()

low fat    5517
regular    3006
Name: Item_Fat_Content, dtype: int64

In [9]:
# Frequency of each product category.
df['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [10]:
# Frequency of each outlet size; value_counts() leaves out missing values by default.
df['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [11]:
# Number of rows per outlet establishment year.
df['Outlet_Establishment_Year'].value_counts()

1985    1463
1987     932
1999     930
1997     930
2004     930
2002     929
2009     928
2007     926
1998     555
Name: Outlet_Establishment_Year, dtype: int64

In [12]:
# Frequency of each distinct item weight.
df['Item_Weight'].value_counts()

12.150    86
17.600    82
13.650    77
11.800    76
15.100    68
          ..
7.275      2
7.685      1
9.420      1
6.520      1
5.400      1
Name: Item_Weight, Length: 415, dtype: int64

In [13]:
# Separate the prediction target from the feature matrix.
target_col='Item_Outlet_Sales'
y=df[target_col]
x=df.drop(columns=target_col)

In [14]:
# Hold out a test set. No test_size is given, so scikit-learn's default split is
# used (25% test); random_state fixes the shuffle so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

In [15]:
# Check column dtypes to see which features are numeric and which are text.
x_train.dtypes

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object

In [16]:
# Names of the numeric feature columns, selected by dtype from the training split.
num_cols=x_train.select_dtypes('number').columns


In [17]:
# Columns treated as ordinal. The order of this list matters: the category lists
# handed to the OrdinalEncoder must follow the same column order.
ordinal_names=['Outlet_Size','Item_Fat_Content']
ord_cols=df[ordinal_names].columns
# Every remaining text column is treated as nominal (one-hot encoded later).
cat_cols=x_train.select_dtypes('object').columns.drop(ordinal_names)

In [18]:
# Display the ordinal column names to confirm their order.
ord_cols

Index(['Outlet_Size', 'Item_Fat_Content'], dtype='object')

In [19]:
# Building blocks for numeric preprocessing.
# NOTE: despite its name, `impute_mode` fills missing values with the column
# median (strategy='median'), not the mode.
impute_mode=SimpleImputer(strategy='median')
scaler=StandardScaler()

In [20]:
# Category orderings, from lowest to highest encoded value, one list per ordinal column.
ord_Outlet_Size=['Small','Medium','High']
ord_Item_Fat_Content=['low fat','regular']
# The lists are passed in the same order as the ordinal columns are selected
# (Outlet_Size first, then Item_Fat_Content).
ord_encoder=OrdinalEncoder(categories=[ord_Outlet_Size,ord_Item_Fat_Content])
# Imputer that fills missing values with the most frequent category.
impute_most_freq=SimpleImputer(strategy='most_frequent')


In [21]:
# Building blocks for nominal preprocessing: fill missing values with the literal
# 'Missing', then one-hot encode. handle_unknown='ignore' encodes categories not
# seen during fit as all zeros instead of raising an error.
impute_missing=SimpleImputer(strategy='constant' , fill_value='Missing')
ohe_encoder=OneHotEncoder(sparse_output=False , handle_unknown='ignore')

In [22]:
# One preprocessing pipeline per feature group: impute first, then scale or encode.
num_pip=make_pipeline(impute_mode,scaler)
ord_pip=make_pipeline(impute_most_freq,ord_encoder)
ohe_pip=make_pipeline(impute_missing,ohe_encoder)

In [23]:
# (name, transformer, columns) triples in the form ColumnTransformer expects.
num_tuple=('numeric',num_pip,num_cols)
ord_tuple=('ordinal',ord_pip,ord_cols)
ohe_tuple=('nominal',ohe_pip,cat_cols)

In [24]:
# Combine the three pipelines into a single preprocessor.
# verbose_feature_names_out=False keeps the output column names free of the
# 'numeric__' / 'ordinal__' / 'nominal__' prefixes.
col_tran=ColumnTransformer([num_tuple,ord_tuple,ohe_tuple],verbose_feature_names_out=False)
col_tran

In [31]:
# Baseline model: the shared preprocessor followed by a linear regression.
reg=LinearRegression()
reg_pip=make_pipeline(col_tran,reg)


In [32]:
# Fit the preprocessing steps and the linear model on the training split only.
reg_pip.fit(x_train,y_train)

In [33]:
# Predict on both splits so training and testing metrics can be compared.
train_pred=reg_pip.predict(x_train)
test_pred=reg_pip.predict(x_test)

In [35]:
# R^2 of the linear regression on the training and testing splits.
train_score=r2_score(y_train,train_pred)
test_score=r2_score(y_test,test_pred)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain Python
# float; calling .round() on the result fails for a plain float.
print(f'training R2 is {round(train_score,3)}')
print(f'testing R2 is {round(test_score,3)}')

training R2 is 0.562
testing R2 is 0.567


In [37]:
# RMSE of the linear regression on the training and testing splits.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
train_rmse=np.sqrt(mean_squared_error(y_train,train_pred))
test_rmse=np.sqrt(mean_squared_error(y_test,test_pred))
# Built-in round() is used so the formatting does not depend on the scalar type.
print(f'training RMSE is {round(train_rmse,3)}')
print(f'testing RMSE is {round(test_rmse,3)}')

training RMSE is 1139.101
testing RMSE is 1092.873


In [42]:
# Random forest on top of the same preprocessor.
# random_state is pinned (same seed as the train/test split) so the forest's
# randomness, and therefore the reported scores, are reproducible across runs.
rf=RandomForestRegressor(random_state=42)
rf_pip=make_pipeline(col_tran,rf)

In [43]:
# Fit the random forest pipeline; the target is passed as a flattened NumPy array.
rf_pip.fit(x_train,y_train.values.ravel())

In [44]:
# Reuses the names train_pred/test_pred: from here on they hold the random
# forest's predictions, not the linear regression's.
train_pred=rf_pip.predict(x_train)
test_pred=rf_pip.predict(x_test)

In [45]:
# R^2 of the random forest on the training and testing splits.
train_score=r2_score(y_train,train_pred)
test_score=r2_score(y_test,test_pred)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain Python
# float; calling .round() on the result fails for a plain float.
print(f'training R2 is {round(train_score,3)}')
print(f'testing R2 is {round(test_score,3)}')

training R2 is 0.938
testing R2 is 0.556


The random forest is overfitting: its training R2 is far higher than its testing R2.

In [46]:
# RMSE of the random forest on the training and testing splits.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
train_rmse=np.sqrt(mean_squared_error(y_train,train_pred))
test_rmse=np.sqrt(mean_squared_error(y_test,test_pred))
# Built-in round() is used so the formatting does not depend on the scalar type.
print(f'training RMSE is {round(train_rmse,3)}')
print(f'testing RMSE is {round(test_rmse,3)}')

training RMSE is 427.691
testing RMSE is 1106.571


The training RMSE is significantly lower than the testing RMSE, which indicates overfitting.