In [1]:
import numpy as np

from pathlib import Path
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

from skl2onnx import convert_sklearn, to_onnx
from skl2onnx.common.data_types import StringTensorType

# Run configuration. These values are combined below into the run name that
# prefixes the storing folder and every artifact file.
DATASET_SIZE = 120
DATASET_IS_BALANCED = False


MAX_FEATURES = 20000        # max_features params for CountVectorizer

# With the values above this evaluates to 'tfidf-rf-20000_120k_imbal'.
training_name = 'tfidf-rf-{}_{}k_{}'.format(
    MAX_FEATURES,
    DATASET_SIZE,
    'bal' if DATASET_IS_BALANCED else 'imbal'
)

# Relative path, resolved against the current working directory.
training_storing_folder = Path(f"{training_name}/").resolve()
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate exists() check is unnecessary. Calling mkdir unconditionally also
# surfaces a FileExistsError if the path is occupied by a regular file instead
# of silently carrying on.
training_storing_folder.mkdir(parents=True, exist_ok=True)

# Date stamp that is formatted as YYYY-MM-DD into the artifact file names.
training_args_datetime = datetime(year=2023, month=12, day=20)

print('Training storing folder:')
print(training_storing_folder)
print('\n\n')

Training storing folder:
/Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/dev-workspace/sa/tfidf-rf_2023-12-16/tfidf-rf-20000_120k_imbal





In [2]:
# load the model, the count vectorizer and the tf-idf transformer

import pickle


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (or fails), instead of being left to the garbage collector.

    SECURITY: unpickling can execute arbitrary code. Only load artifact files
    that come from a trusted source.
    """
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


# Every artifact file name is "<training_name>_<YYYY-MM-DD>_<suffix>".
_artifact_date = training_args_datetime.strftime("%Y-%m-%d")

rf_model_path = Path.joinpath(training_storing_folder, "{}_{}_model.sav".format(
    training_name,
    _artifact_date
))
model = _load_pickle(rf_model_path)

count_vectorizer_path = Path.joinpath(training_storing_folder, "{}_{}_count_vectorizer.pkl".format(
    training_name,
    _artifact_date
))
vectorizer = _load_pickle(count_vectorizer_path)

tfidf_transformer_path = Path.joinpath(training_storing_folder, "{}_{}_tfidf.pkl".format(
    training_name,
    _artifact_date
))
tfidf = _load_pickle(tfidf_transformer_path)

print('Loaded model from {}'.format(rf_model_path))
print('Loaded count vectorizer from {}'.format(count_vectorizer_path))
print('Loaded tfidf transformer from {}'.format(tfidf_transformer_path))

Loaded model from /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/dev-workspace/sa/tfidf-rf_2023-12-16/tfidf-rf-20000_120k_imbal/tfidf-rf-20000_120k_imbal_2023-12-20_model.sav
Loaded count vectorizer from /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/dev-workspace/sa/tfidf-rf_2023-12-16/tfidf-rf-20000_120k_imbal/tfidf-rf-20000_120k_imbal_2023-12-20_count_vectorizer.pkl
Loaded tfidf transformer from /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/dev-workspace/sa/tfidf-rf_2023-12-16/tfidf-rf-20000_120k_imbal/tfidf-rf-20000_120k_imbal_2023-12-20_tfidf.pkl


In [3]:
# Chain the three loaded objects in processing order: raw text -> token
# counts -> tf-idf weights -> classifier.
# NOTE(review): the steps are presumably already fitted (they were loaded from
# disk and the pipeline is never fit in the visible cells) - confirm.
inference_steps = [
    ('vect', vectorizer),
    ('tfidf', tfidf),
    ('model', model),
]
pipeline_inference = Pipeline(steps=inference_steps)

In [4]:
# Converter options are keyed by the id() of the object they apply to.
# 'zipmap': False asks for the probabilities without the ZipMap wrapping.
onnx_options = {id(pipeline_inference): {'zipmap': False}}

# Single string input named 'strfeat': any number of rows, one column.
onnx_input_signature = [('strfeat', StringTensorType([None, 1]))]

model_onnx = convert_sklearn(
    pipeline_inference,
    initial_types=onnx_input_signature,
    options=onnx_options,
    # supports version 1.13 or above (current ver is 1.16)
    # NOTE(review): which package's version this refers to is not stated - confirm.
    target_opset=13,
)

# Stored next to the other artifacts as "<training_name>_<YYYY-MM-DD>_pipeline.onnx".
onnx_file_name = "{}_{}_pipeline.onnx".format(
    training_name,
    training_args_datetime.strftime("%Y-%m-%d"),
)
model_onnx_path = training_storing_folder / onnx_file_name

# Binary write of the serialized model; an existing file is overwritten.
model_onnx_path.write_bytes(model_onnx.SerializeToString())

In [5]:
# Three sample sentences used to compare the two runtimes.
# Each sample is wrapped in its own one-element list.
test_data = [
    ['I like the game'],
    ["I do not like it."],
    ["It crashes when I just run on my pc."],
]

In [6]:
import onnxruntime as rt

# Rebuild the path of the exported pipeline from the run name and date stamp.
exported_pipeline_path = Path.joinpath(
    training_storing_folder,
    "{}_{}_pipeline.onnx".format(training_name, training_args_datetime.strftime("%Y-%m-%d")),
)

# Run inference on CPU only.
sess = rt.InferenceSession(exported_pipeline_path, providers=['CPUExecutionProvider'])

# only one input in this model
input_name = sess.get_inputs()[0].name

# Names of every graph output (it outputs the label and the probability).
label_names = [output.name for output in sess.get_outputs()]

In [7]:
# Collect predictions from both runtimes, one sample at a time.
pred_sklearn = []
pred_onnx = []

for sample in test_data:
    # scikit-learn receives the one-element list holding the sentence.
    pred_sklearn.append(pipeline_inference.predict_proba(sample))

    # The ONNX session receives the same sample wrapped once more (one row,
    # one column) and returns one array per requested output name.
    pred_onnx.append(sess.run(label_names, {input_name: [sample]}))

print(pred_sklearn)
print(pred_onnx)

for result_sklearn, onnx_outputs in zip(pred_sklearn, pred_onnx):
    # Index 1 is the second session output, which is compared against
    # scikit-learn's predict_proba result.
    result_onnx = onnx_outputs[1]

    # test whether the results are the same
    np.testing.assert_allclose(result_sklearn, result_onnx, rtol=1e-5, atol=1e-5)

[array([[0., 1.]]), array([[0.18988889, 0.81011111]]), array([[0.0612605, 0.9387395]])]
[[array([1], dtype=int64), array([[-1.1920929e-07,  1.0000001e+00]], dtype=float32)], [array([1], dtype=int64), array([[0.18988895, 0.81011105]], dtype=float32)], [array([1], dtype=int64), array([[0.0612604, 0.9387396]], dtype=float32)]]


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:    0.0s
