In [None]:
import json
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType
import joblib

In [None]:
!pip install skl2onnx onnx onnxruntime


Collecting skl2onnx
  Downloading skl2onnx-1.19.1-py3-none-any.whl.metadata (3.8 kB)
Collecting onnx
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading skl2onnx-1.19.1-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx

In [None]:
# Load the labelled examples. The encoding is stated explicitly so the text is
# decoded the same way on every platform instead of using the locale default.
with open('text_intents_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fail with a clear message instead of an opaque unpacking error further down.
if not data:
    raise ValueError("text_intents_dataset.json contains no examples")

# Transpose the records into two parallel tuples.
# Assumes every record is a [text, label] pair -- TODO confirm against the dataset.
texts, labels = zip(*data)

In [None]:
# Map each distinct label to an integer class id for the classifier.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# .tolist() converts the NumPy scalars in classes_ to native Python values, so
# the names print cleanly and stay JSON-serialisable whatever the label dtype.
label_names = label_encoder.classes_.tolist()
print("Classes:", label_names)

Classes: [np.str_('cycle_font_next'), np.str_('cycle_font_prev'), np.str_('decrease_contrast'), np.str_('decrease_font'), np.str_('decrease_letter_spacing'), np.str_('decrease_line_spacing'), np.str_('decrease_word_spacing'), np.str_('increase_contrast'), np.str_('increase_font'), np.str_('increase_letter_spacing'), np.str_('increase_line_spacing'), np.str_('increase_word_spacing'), np.str_('other'), np.str_('undo_changes')]


In [None]:
# Bag-of-words counts feeding a logistic-regression intent classifier.
# NOTE(review): stop_words='english' applies scikit-learn's built-in stop list;
# confirm it does not remove words these intents rely on -- TODO confirm.
pipeline = Pipeline([
    ('vect', CountVectorizer(
        lowercase=True,
        max_features=300,    # keep small for ONNX size
        stop_words='english'
    )),
    ('clf', LogisticRegression(
        # `multi_class` is no longer passed: it is deprecated since
        # scikit-learn 1.5, and with the lbfgs solver and more than two
        # classes the model is fitted as multinomial anyway.
        solver='lbfgs',
        max_iter=300,
        C=3.0,
        random_state=42
    )),
])

pipeline.fit(texts, y)
# Scored on the same examples the model was fitted on: this is training
# accuracy, not an estimate of performance on unseen text.
print("Train accuracy:", pipeline.score(texts, y))

Train accuracy: 0.9593023255813954




In [None]:
# Convert the fitted pipeline to ONNX and write the serialized graph to disk.
# The graph takes one string tensor named 'input' declared with shape [None, 1].
input_spec = [('input', StringTensorType([None, 1]))]
# NOTE(review): the zipmap option is keyed on the pipeline object's id;
# confirm skl2onnx applies it to the final classifier step.
conversion_options = {id(pipeline): {'zipmap': False}}

onnx_model = convert_sklearn(
    pipeline,
    initial_types=input_spec,
    options=conversion_options,
)

with open("intent-classifier.onnx", "wb") as f:
    serialized_graph = onnx_model.SerializeToString()
    f.write(serialized_graph)
print("Saved intent-classifier.onnx")

Saved intent-classifier.onnx


In [None]:
# Store the class names as a JSON list, in the encoder's class order.
with open("intent_labels.json", "w") as f:
    f.write(json.dumps(label_names))

# Token -> column-index mapping learned by the fitted vectorizer step.
vocab = pipeline.named_steps['vect'].vocabulary_

In [None]:
# Flip token -> index into index -> token, then list the tokens by column
# index so that position i in the JSON array is the token for feature column i.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))
sorted_vocab = [inv_vocab[column] for column in range(len(inv_vocab))]

with open("intent_vectorizer_vocab.json", "w") as f:
    f.write(json.dumps(sorted_vocab))

print("Saved intent_labels.json and intent_vectorizer_vocab.json")

Saved intent_labels.json and intent_vectorizer_vocab.json


In [None]:
# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(pipeline, filename="intent_pipeline.joblib")
print("Done!")

Done!


In [None]:
# Read the fitted pipeline back from the joblib file written earlier in this
# notebook. joblib files are pickle-based: only load files you trust.
pipeline = joblib.load(filename="intent_pipeline.joblib")

In [None]:
# Pull the two fitted steps out of the reloaded pipeline by step name.
fitted_steps = pipeline.named_steps
vectorizer = fitted_steps["vect"]   # CountVectorizer
clf = fitted_steps["clf"]           # LogisticRegression
# The pipeline has no LabelEncoder step, so there is none to recover from it.
# NOTE: this overwrites the fitted encoder created earlier in the notebook.
label_encoder = None

In [None]:
# Read the class names back from the JSON file saved earlier in this notebook.
with open("intent_labels.json", "r") as f:
    label_names = json.loads(f.read())

# Token -> column-index mapping of the reloaded vectorizer.
vocab = vectorizer.vocabulary_

In [None]:
# Rebuild the column-ordered token list: position i is the token whose count
# lands in feature column i.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))
ordered_vocab = [inv_vocab[column] for column in range(len(inv_vocab))]

# Bundle everything a plain re-implementation of the classifier needs:
# class names, token order, and the fitted coefficients and intercepts.
weights = dict(
    labels=label_names,
    vocab=ordered_vocab,
    coef=clf.coef_.tolist(),            # shape: [num_classes, vocab_size]
    intercept=clf.intercept_.tolist(),
)

with open("intent_logreg_weights.json", "w") as f:
    f.write(json.dumps(weights))

print("Wrote intent_logreg_weights.json")

Wrote intent_logreg_weights.json
