In [1]:
import pandas as pd

In [2]:
# Load the BigBasket product catalogue from the CSV file next to the notebook.
CSV_PATH = "BigBasket Products.csv"
data = pd.read_csv(CSV_PATH)

In [3]:
##

##### Describe the data

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,index,sale_price,market_price,rating
count,27555.0,27555.0,27555.0,18929.0
mean,13778.0,322.514808,382.056664,3.94341
std,7954.58767,486.263116,581.730717,0.739063
min,1.0,2.45,3.0,1.0
25%,6889.5,95.0,100.0,3.7
50%,13778.0,190.0,220.0,4.1
75%,20666.5,359.0,425.0,4.3
max,27555.0,12500.0,12500.0,5.0


In [5]:
# Clean the data: remove every row that has a missing value in any column.
# (In the summary above `rating` has 18929 non-null values out of 27555 rows,
# so this step discards a sizeable share of the products.)
data.dropna(axis="index", how="any", inplace=True)

##### See the columns to find the features (X) and target (Y)

In [6]:
# List the column names so we can pick the features and the target.
data.columns

Index(['index', 'product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description'],
      dtype='object')

##### We see here that sale_price looks like a good target (Y), while fields such as category, sub_category, brand, market_price, type and rating can serve as features (X)

In [7]:
# Target variable: the price each product is actually sold at.
y = data["sale_price"]

We cannot pass the non-numeric features to the model as they are: the model works on numbers, so the string-valued columns first have to be converted into numeric codes that it can understand.

In [8]:
# import sklearn's preprocessing module; its LabelEncoder is used below to add new columns that hold the string data as numeric codes
from sklearn import preprocessing

In [9]:
# Create a label encoder and let it learn the distinct values of `category`.
le_category = preprocessing.LabelEncoder().fit(data["category"])
# Show the fitted encoder as the cell output.
le_category

LabelEncoder()

In [10]:
# The encoder is fitted, but the data does not contain the encoded values yet:
# store the integer code of every row's category in a new column.
data["category_label"] = le_category.transform(data["category"])

In [11]:
# Check that the new `category_label` column has been added to the data.
data.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,category_label
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,2
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",7
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",4
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...,4
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,2


In [12]:
# `category_label` is in place; encode `sub_category` and `type` the same way.
# Each encoder is kept in its own variable so the integer codes can be mapped
# back to the original strings later if needed.
le_sub_category = preprocessing.LabelEncoder()
data["sub_category_label"] = le_sub_category.fit_transform(data["sub_category"])

le_type = preprocessing.LabelEncoder()
data["type_label"] = le_type.fit_transform(data["type"])


In [13]:
# Confirm that the encoded columns now appear among the column names.
data.columns

Index(['index', 'product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description', 'category_label',
       'sub_category_label', 'type_label'],
      dtype='object')

In [14]:
# Preview the first rows together with the newly added label columns.
data.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,category_label,sub_category_label,type_label
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,2,42,169
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",7,73,352
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",4,62,207
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...,4,9,208
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,2,8,35


In [15]:
# Make sure every brand value is a string before it is label-encoded.
# NOTE(review): rows with missing values were already dropped earlier, so this
# cast presumably guards against mixed-type entries - confirm it is still needed.
data['brand'] = data['brand'].astype(str)

In [16]:
# Encode `brand` like the other categorical columns: learn the distinct brand
# names and store each row's integer code in `brand_label`.
le_brand = preprocessing.LabelEncoder()
data["brand_label"] = le_brand.fit_transform(data["brand"])

In [17]:
# Preview the first rows, now including the `brand_label` column.
data.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,category_label,sub_category_label,type_label,brand_label
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,2,42,169,1648
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",7,73,352,1044
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",4,62,207,1780
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...,4,9,208,1146
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,2,8,35,1209


In [18]:
# Model inputs: the encoded categorical columns plus the numeric ones.
features = [
    'category_label',
    'sub_category_label',
    'type_label',
    'brand_label',
    'market_price',
    'rating',
]
X = data.loc[:, features]
# Target: the selling price we want to predict.
y = data['sale_price']

In [19]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so that repeated runs build the same model. Without a fixed
# random_state the estimator may choose differently between equally good
# splits from run to run, which makes the reported error drift.
# 0 matches the seed used for the train/validation split in this notebook.
big_basket_sale_model = DecisionTreeRegressor(random_state=0)

In [20]:
from sklearn.model_selection import train_test_split

# Split features and target into a training part and a validation part.
# random_state=0 fixes the shuffle so the same split is produced on every run.
(tr_X, val_X,
 tr_y, val_y) = train_test_split(X, y, random_state=0)

In [21]:

# Train the decision tree on the training split only; the validation rows
# (val_X, val_y) are held back for evaluating the model afterwards.
big_basket_sale_model.fit(tr_X, tr_y)

DecisionTreeRegressor()

In [22]:
# The model has now been trained on the training data.
# Get its predictions for the validation rows that were set aside during the split.

predicted_y = big_basket_sale_model.predict(val_X)

In [23]:
# These are the predicted values of y, i.e. the predicted sale prices of all
# the products in the validation part (val_X).
predicted_y

array([210.  ,  35.  ,  53.  , ..., 299.  , 214.53, 999.  ])

In [24]:
# Measure how far the predictions are from the actual values on average:
# mean absolute error = sum(|predicted_y - val_y|) / n
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=val_y, y_pred=predicted_y)

36.91728910120312

In [1]:
# The mean absolute error is about 37 rupees per product. For scale, the mean sale_price in the raw data is about 322 rupees (median 190), so this is a good finding.

##### Accuracy of the model

In [None]:
# sale_price is a continuous target, so this is a regression problem.
# accuracy_score is a classification metric and raises a ValueError when it is
# given continuous values, so report the coefficient of determination (R^2)
# instead: 1.0 is a perfect fit, and 0.0 is no better than always predicting
# the mean of val_y.
from sklearn.metrics import r2_score
score = r2_score(val_y, predicted_y)
print("R^2 SCORE : ", score)