In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
# Load the two raw datasets.
#
# groceries.csv holds one basket per line, and the commas inside a line are
# part of the description, so the file must NOT be split on commas.
# The previous `pd.read_csv(..., sep="\n")` is rejected by pandas >= 1.3
# (ValueError: a line terminator cannot be used as separator), so the lines
# are read directly instead. Blank lines are skipped, as read_csv did.
# NOTE(review): assumes the lines are not CSV-quoted — confirm against the file.
with open("datasets/PROCESSED/groceries.csv", encoding="utf-8") as grocery_file:
    grocery_lines = [line.rstrip("\n") for line in grocery_file if line.strip()]

# Column label 0 mirrors what read_csv(header=None) produced for a single column.
grocery_df = pd.DataFrame({0: grocery_lines})

# household.csv is a regular comma-separated file with a header row.
household_df = pd.read_csv("datasets/PROCESSED/household.csv")

In [3]:
# Number of grocery rows kept for the classifier.
# NOTE(review): presumably chosen to match the household row count so the two
# classes are balanced — confirm.
N_GROCERY_ROWS = 4233

# Build the labelled grocery frame in one chain:
#   * take the first N_GROCERY_ROWS rows of the first (only text) column,
#   * name that column "Description",
#   * add the constant class label.
# Selecting by position and creating a new frame avoids the
# SettingWithCopyWarning raised when a column is assigned on a filtered slice,
# and keeps the cell safe to re-run (the first column is always the text).
grocery_df = (
    grocery_df.iloc[:N_GROCERY_ROWS, 0]
    .to_frame(name="Description")
    .assign(Label="Grocery")
)
grocery_df

Unnamed: 0,Description,Label
0,"citrus fruit,semi-finished bread,margarine,rea...",Grocery
1,"tropical fruit,yogurt,coffee",Grocery
2,whole milk,Grocery
3,"pip fruit,yogurt,cream cheese,meat spreads",Grocery
4,"other vegetables,whole milk,condensed milk,lon...",Grocery
...,...,...
4228,other vegetables,Grocery
4229,"hamburger meat,citrus fruit,other vegetables,w...",Grocery
4230,"sausage,finished products,hamburger meat,citru...",Grocery
4231,"citrus fruit,other vegetables,whole milk,butte...",Grocery


In [4]:
# Tag every household row with its class label and keep only the two columns
# used downstream (label first, then the free-text description).
household_df = household_df.assign(Label="Household")[["Label", "Description"]]
household_df

Unnamed: 0,Label,Description
0,Household,BATH BUILDING BLOCK WORD
1,Household,PAPER CHAIN KIT 50'S CHRISTMAS
2,Household,"HOT WATER BOTTLE TEA AND SYMPATHY, RED HANGING..."
3,Household,JAM MAKING SET PRINTED
4,Household,HOMEMADE JAM SCENTED CANDLES
...,...,...
4228,Household,RETROSPOT LARGE MILK JUG
4229,Household,15CM CHRISTMAS GLASS BALL 20 LIGHTS
4230,Household,RABBIT NIGHT LIGHT
4231,Household,"VICTORIAN GLASS HANGING T-LIGHT, ZINC T-LIGHT ..."


In [5]:
# Stack the grocery and household rows into a single frame with a fresh
# 0..n-1 index. Columns are aligned by name, so the differing column order of
# the two inputs does not matter.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent.
combined_df = pd.concat([grocery_df, household_df], ignore_index=True)

In [6]:
# Encoder that maps the string class labels to integer codes.
le = LabelEncoder()

# Token pattern: a word counts as a token only if it contains at least one
# letter a-z (case-insensitive), so purely numeric tokens are dropped.
TOKEN_PATTERN = u'(?ui)\\b\\w*[a-z]+\\w*\\b'

# Bag-of-words vectorizer: ASCII-folds accents, removes English stop words and
# tokenizes with the pattern above.
cv = CountVectorizer(
    token_pattern=TOKEN_PATTERN,
    stop_words='english',
    strip_accents='ascii',
)

In [7]:
# Learn the set of class labels; `fit` returns the encoder itself, so the
# learned classes can be displayed straight from the same expression.
le.fit(combined_df["Label"]).classes_

array(['Grocery', 'Household'], dtype=object)

In [8]:
# Translate the string labels into the integer codes learned by the encoder
# and store them next to the original labels.
encoded_labels = le.transform(combined_df["Label"])
combined_df["Label_enc"] = encoded_labels
combined_df

Unnamed: 0,Description,Label,Label_enc
0,"citrus fruit,semi-finished bread,margarine,rea...",Grocery,0
1,"tropical fruit,yogurt,coffee",Grocery,0
2,whole milk,Grocery,0
3,"pip fruit,yogurt,cream cheese,meat spreads",Grocery,0
4,"other vegetables,whole milk,condensed milk,lon...",Grocery,0
...,...,...,...
8461,RETROSPOT LARGE MILK JUG,Household,1
8462,15CM CHRISTMAS GLASS BALL 20 LIGHTS,Household,1
8463,RABBIT NIGHT LIGHT,Household,1
8464,"VICTORIAN GLASS HANGING T-LIGHT, ZINC T-LIGHT ...",Household,1


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    combined_df["Description"],
    combined_df["Label_enc"],
    test_size=0.2,
    random_state=0,
)

In [10]:
# Learn the vocabulary from the training text only, then encode both splits
# with it. The tuple is evaluated left to right, so the vectorizer is fitted
# on the training data before the test data is transformed.
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)

In [11]:
# Train a multinomial Naive Bayes model on the word counts (`fit` returns the
# fitted estimator), then predict the class codes of the held-out rows.
nb_clf = MultinomialNB().fit(X_train_cv, y_train)
y_pred = nb_clf.predict(X_test_cv)

In [12]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9988193624557261

In [15]:
# Per-class precision / recall / F1, with rows named after the original
# string labels rather than their integer codes.
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

precision    recall  f1-score   support

     Grocery       1.00      1.00      1.00       865
   Household       1.00      1.00      1.00       829

    accuracy                           1.00      1694
   macro avg       1.00      1.00      1.00      1694
weighted avg       1.00      1.00      1.00      1694



In [17]:
# Headline metrics on the held-out set. Precision and recall use
# scikit-learn's default positive label (1), which the label encoder assigned
# to 'Household'.
for caption, metric in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
):
    print(caption, metric(y_test, y_pred))

Accuracy score:  0.9988193624557261
Precision score:  0.9975932611311673
Recall score:  1.0
