# Aim
This project fits a Naive Bayes model to the same sales dataset so that we can use the model to predict whether an order falls into the high-profit or the low-profit class, based on its product category and unit price.

In [15]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [16]:
# Load the orders extract and preview its first row to confirm the columns.
Orders = pd.read_csv('Orders_1.csv')
Orders.iloc[:1]

Unnamed: 0,row,order_priority,order_date,order_id,discount,unit_price,order_quantity,sales,shipping_cost,product_base_margin,...,city,postal_code,ship_date,ship_mode,subregion,new_order_date,year_order_date,new_discount,new_margin,profit
0,3689,Medium,28/01/2011,97,3%,3.0,26,75.66,1.0,38.0%,...,Jakarta,,29/01/2011,Regular Air,,2011-01-28,2011,0.03,0.38,26.3


##### 1. Prepare data
Categorizing the 'profit' column so that 1 means high profit (profit above 1000) and 0 means low profit

In [17]:
# Label each order '1' when its profit exceeds 1000 (high profit), otherwise '0'.
is_high_profit = Orders['profit'] > 1000
Orders['Profit_level'] = is_high_profit.map({True: '1', False: '0'})

In [18]:
# Show the last ten rows to check the new Profit_level column.
Orders.iloc[-10:]

Unnamed: 0,row,order_priority,order_date,order_id,discount,unit_price,order_quantity,sales,shipping_cost,product_base_margin,...,postal_code,ship_date,ship_mode,subregion,new_order_date,year_order_date,new_discount,new_margin,profit,Profit_level
138,1277,Medium,12/05/2010,1314,0%,9.0,6,54.0,4.0,38.0%,...,,14/05/2010,Regular Air,,2010-05-12,2010,0.0,0.38,16.52,0
139,8769,High,18/05/2012,1317,6%,12.0,69,778.32,6.0,40.0%,...,,20/05/2012,Regular Air,Canada ...,2012-05-18,2012,0.06,0.4,275.52,0
140,12865,Low,15/04/2013,1344,1%,156.0,18,2779.92,9.0,58.0%,...,,19/04/2013,Regular Air,,2013-04-15,2013,0.01,0.58,1591.56,1
141,15909,Low,27/10/2013,1345,8%,101.0,50,4646.0,36.0,62.0%,...,,31/10/2013,Delivery Truck,,2013-10-27,2013,0.08,0.62,2691.0,1
142,2515,Medium,20/09/2010,960,7%,14.0,84,1093.68,7.0,54.0%,...,,21/09/2010,Regular Air,,2010-09-20,2010,0.07,0.54,545.72,0
143,13889,High,17/06/2013,1346,9%,101.0,72,6617.52,26.0,60.0%,...,,19/06/2013,Delivery Truck,,2013-06-17,2013,0.09,0.6,3682.72,1
144,5713,Low,16/08/2011,1382,6%,21.0,9,177.66,3.0,81.0%,...,,20/08/2011,Regular Air,,2011-08-16,2011,0.06,0.81,138.75,0
145,14771,Low,16/08/2013,1382,2%,221.0,62,13427.96,65.0,62.0%,...,,23/08/2013,Delivery Truck,,2013-08-16,2013,0.02,0.62,8156.2,1
146,4555,High,26/04/2011,1383,7%,416.0,43,16635.84,11.0,57.0%,...,,27/04/2011,Regular Air,,2011-04-26,2011,0.07,0.57,8933.0,1
147,3337,High,20/12/2010,1411,9%,161.0,72,10548.72,35.0,72.0%,...,,21/12/2010,Delivery Truck,,2010-12-20,2010,0.09,0.72,7267.96,1


##### 1-1. Prepare data
Encoding the 'category' column as integer codes (between 0 and 18) so that each of the 16 product categories is represented by its own number

In [19]:
# Integer code for each product category, with every category listed exactly once.
# A dict literal silently keeps only the LAST value given for a repeated key, so
# each category appears here with the single code that is actually applied.
# Codes 2, 3 and 9 are intentionally left unused so existing codes stay stable.
CATEGORY_CODES = {
    'Labels': 0,
    'Chairs & Chairmats': 1,
    'Office Machines': 4,
    'Telephones and Communication': 5,
    'Pens & Art Supplies': 6,
    'Scissors, Rulers and Trimmers': 7,
    'Appliances': 8,
    'Paper': 10,
    'Envelopes': 11,
    'Binders and Binder Accessories': 12,
    'Bookcases': 13,
    'Tables': 14,
    'Rubber Bands': 15,
    'Office Furnishings': 16,
    'Computer Peripherals': 17,
    'Storage & Organization': 18,
}

# Replace each category name by its code; names missing from the mapping are
# left unchanged by `replace`.
Orders['Category_code'] = Orders['category'].replace(CATEGORY_CODES)
Orders['Category_code'].head(5)

0     0
1     1
2     7
3    10
4     4
Name: Category_code, dtype: int64

##### 1-2. Prepare data
Creating the two lists that will be passed to the sklearn model: `Data` (the features) and `Target` (the labels)

In [20]:
# Feature columns as NumPy arrays: encoded category and unit price.
Data_1 = Orders['Category_code'].to_numpy(copy=True)
Data_2 = Orders['unit_price'].to_numpy(copy=True)

# One (category_code, unit_price) tuple per order, in row order.
Data = [(code, price) for code, price in zip(Data_1, Data_2)]
Data[:5]

[(0, 3.0), (1, 151.0), (7, 2.0), (10, 6.0), (4, 151.0)]

In [21]:
# Class labels ('1' / '0') as a plain list, in the same row order as Data.
Target = Orders['Profit_level'].tolist()
len(Target)

148

##### 2 Apply Sklearn model
Use sklearn's `train_test_split` to separate the data into a training set and a test set

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing (70% stay in the training set);
# the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    Data,
    Target,
    test_size=0.3,
    random_state=109,
)

##### 3. Apply Naive Bayes model
Fit a Gaussian Naive Bayes classifier on the training set and use it to predict the labels of the test set

In [23]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier and train it on the training split
# (fit returns the estimator itself, so creation and fitting can be chained).
gnb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = gnb.predict(X_test)

In [24]:
from sklearn import metrics

# Share of test rows whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.7777777777777778


##### 4. use Naive Bayes to predict and then compare the prediction result against supplied data

In [25]:
# Predict the label of every row with a single vectorised call instead of one
# `predict` call per row.
# NOTE: Data also contains the rows the model was trained on, so the agreement
# measured on this frame is not a purely out-of-sample estimate.
predict_column = gnb.predict(Data)

# Side-by-side view of the predicted label and the supplied label for each row.
new_df = pd.DataFrame({'Predicts': predict_column, 'original': Target})
new_df.head()


Unnamed: 0,Predicts,original
0,0,0
1,0,1
2,0,0
3,0,0
4,1,0


##### 5. checking prediction accuracy

In [26]:
# Flag each row '1' when the prediction agrees with the supplied label, else '0',
# then count how many rows fall in each group.
is_correct = new_df['Predicts'] == new_df['original']
new_df['Accuracy'] = np.where(is_correct, '1', '0')
new_df['Accuracy'].value_counts()

1    115
0     33
Name: Accuracy, dtype: int64

In [27]:
# Fraction of rows whose prediction matches the supplied label, computed from
# the frame itself so it stays correct if the data, split or model changes
# (the mean of a boolean column is the share of True values).
Accurate_rate = float((new_df['Predicts'] == new_df['original']).mean())
Accurate_rate

0.777027027027027

# Conclusion
In this project we use product category and unit price to predict whether an order is a high-profit or a low-profit one. The accuracy is very close to 80% (about 78%). It is suggested to try data from other columns and see which columns lead to higher accuracy.