### General Concept

1. Train an NLP model until it is optimal
2. Save the NLP model
3. Load the NLP model without retraining it, while retaining its optimal performance

In [31]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

### 1. Loading and Vectorizing the Dataset

In [32]:
# Read the training and test CSV files (train first, then test).
df_train, df_test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [33]:
# Bag-of-words vectorizer that counts unigrams, bigrams and trigrams.
vec = CountVectorizer(ngram_range=(1, 3))

# Learn the vocabulary from the training text only, then reuse that same
# vocabulary to encode the test text.
X_train = vec.fit_transform(df_train["Text"])
X_test = vec.transform(df_test["Text"])

# Targets come from the Category column of each frame.
y_train, y_test = df_train["Category"], df_test["Category"]

### 2. Define Model

In [34]:
# Fit a multinomial Naive Bayes classifier on the bag-of-words counts
# (fit() returns the estimator itself, so construction and fitting can be chained).
model = MultinomialNB().fit(X_train, y_train)

# Per-class precision / recall / F1 on the test set.
preds = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

    Clothing       1.00      1.00      1.00         5
        Food       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [35]:
# Classify a single free-text sentence with the freshly trained model.
test_text = "dia orangnya suka pake kacamata mulu"
bow = vec.transform([test_text])  # wrap in a list: transform takes an iterable of documents

# predict_proba columns follow the order of model.classes_.
pred, conf = model.predict(bow), model.predict_proba(bow)
(pred, conf)

(array(['Clothing'], dtype='<U8'), array([[0.71439936, 0.28560064]]))

In [36]:
# Mean accuracy of the classifier on the test set.
model.score(X=X_test, y=y_test)

1.0

### 3. Save Model

In [37]:
# Persist both fitted objects: the classifier is only usable together with
# the vectorizer that produced its input features.
joblib.dump(value=model, filename="model.joblib")
joblib.dump(value=vec, filename="vec.joblib")

['vec.joblib']

### 4. Test Loaded Model

In [38]:
# load model and vec again
new_model = joblib.load("model.joblib")
new_vec = joblib.load("vec.joblib")

In [39]:
# Sanity-check the reloaded objects: the prediction and the class
# probabilities should match what the in-memory model produced.
test_text = "dia orangnya suka pake kacamata mulu"
new_bow = new_vec.transform([test_text])  # transform takes an iterable of documents

# predict_proba columns follow the order of new_model.classes_.
new_pred, new_conf = new_model.predict(new_bow), new_model.predict_proba(new_bow)
(new_pred, new_conf)

(array(['Clothing'], dtype='<U8'), array([[0.71439936, 0.28560064]]))

In [40]:
# Load the new labelled dataset used for retraining.
# NOTE(review): the name `re` would shadow the standard-library `re` module
# if that module is ever imported in this notebook; it is kept here only so
# that any other cell referring to `re` keeps working.
re = pd.read_csv("re.csv")

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every downstream metric and probability --
# reproducible across "Restart Kernel -> Run All".
# NOTE: this rebinds X_train / X_test / y_train / y_test, so the splits built
# from train.csv / test.csv earlier in the notebook are no longer reachable
# under these names after this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    re.Sentence, re.Category, test_size=0.2, random_state=42)

# Refit the loaded vectorizer on the new training text: fit_transform learns
# the vocabulary again from this data, replacing the one restored from disk.
# The test text is then encoded with that same new vocabulary.
X_train = new_vec.fit_transform(X_train)
X_test = new_vec.transform(X_test)

In [41]:
# retrain loaded model
new_model.fit(X_train, y_train)

In [42]:
# test retrained model
re_text = "dia orangnya suka pake kacamata mulu"
re_bow = new_vec.transform([re_text])

re_pred = new_model.predict(re_bow)
re_conf = new_model.predict_proba(re_bow)
re_pred, re_conf

(array(['clothing'], dtype='<U8'), array([[0.9070424, 0.0929576]]))