In [15]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [8]:
# Read the train and test splits from their CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [9]:
# Bag-of-words featurizer counting unigrams, bigrams and trigrams.
vec = CountVectorizer(ngram_range=(1, 3))

# Labels come straight from the Category column of each split.
y_train, y_test = df_train["Category"], df_test["Category"]

# Learn the vocabulary from the training text only, then reuse it for the test text.
X_train = vec.fit_transform(df_train["Text"])
X_test = vec.transform(df_test["Text"])

In [10]:
# Train a multinomial Naive Bayes classifier on the bag-of-words counts.
model = MultinomialNB().fit(X_train, y_train)

# Predict the test split and print the per-class classification report.
preds = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

    Clothing       1.00      1.00      1.00         5
        Food       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [14]:
# Classify a single ad-hoc sentence with the fitted model.
test_text = "dia orangnya suka pake kacamata mulu"
bow = vec.transform([test_text])  # wrapped in a list: one document in, one row out

# Hard label plus per-class probabilities; columns are [clothing, food],
# i.e. the order of model.classes_.
pred, conf = model.predict(bow), model.predict_proba(bow)
pred, conf

(array(['Clothing'], dtype='<U8'), array([[0.71439936, 0.28560064]]))

In [17]:
# Persist the fitted classifier and its vectorizer; both are needed to predict later.
joblib.dump(value=model, filename="model.joblib")
joblib.dump(value=vec, filename="vec.joblib")

['vec.joblib']

In [18]:
# Restore the persisted classifier and vectorizer from disk.
new_model, new_vec = map(joblib.load, ("model.joblib", "vec.joblib"))

In [22]:
# Verify that the reloaded model and vectorizer reproduce the in-memory ones
# on the same sentence (prediction and confidence must match).
test_text = "dia orangnya suka pake kacamata mulu"
new_bow = new_vec.transform([test_text])

new_pred = new_model.predict(new_bow)
new_conf = new_model.predict_proba(new_bow) # [clothing, food]

# Recompute the reference with the original objects instead of trusting values
# left over from an earlier cell, then fail loudly on any mismatch rather than
# relying on a visual comparison of two cell outputs.
ref_bow = vec.transform([test_text])
np.testing.assert_array_equal(new_pred, model.predict(ref_bow))
np.testing.assert_allclose(new_conf, model.predict_proba(ref_bow))

new_pred, new_conf

(array(['Clothing'], dtype='<U8'), array([[0.71439936, 0.28560064]]))