Note: The following code is just for reference. I am using a dataset from Kaggle; dataset link: https://www.kaggle.com/competitions/sentiment-analysis-company-reviews/data.

I am using only a small part of the dataset, as the main focus is on converting a Python scikit-learn model to an ONNX model.

In [10]:
# Importing Required libraries

In [38]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer


from sklearn.pipeline import Pipeline


from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Loading Data

In [39]:
import os

# Folder that holds the Kaggle competition files.
path = r"sentiment-analysis-company-reviews"
# Join the folder and file name with os.path.join so the separator matches the
# current OS; a hard-coded "\\" only resolves on Windows.
df = pd.read_csv(os.path.join(path, "train.csv"))
df.head(5)

Unnamed: 0,Id,Review,Rating
0,0,Very good value and a great tv very happy and ...,5
1,1,After 6 month still can't access my account,3
2,2,I couldn't make an official review on a produc...,1
3,3,"Fantastic! Extremely easy to use website, fant...",5
4,4,So far annoyed as hell with this bt monthly pa...,1


In [40]:
# Sampling a smaller part; please comment out the code below if you want to use the entire dataset

In [41]:
# Keep a random 10% of the rows, drawn without replacement; random_state=1
# makes the draw repeatable.
df = df.sample(frac=0.1, replace=False, random_state=1)

# Preprocessing

In [42]:
# Import the corpus reader under a private alias: the assignment below rebinds
# the name `stopwords` to a plain list, so calling `stopwords.words(...)` on
# the top-level import would raise AttributeError when this cell is run again.
from nltk.corpus import stopwords as _stopwords_corpus

stemmer = PorterStemmer()
stopwords = _stopwords_corpus.words('english')
# stopwords.extend(["we're", "i" , 'if', 'this', "im" , "cant","i'm"])
# print(stopwords)
def lower_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_number(text):
    """Strip numeric tokens from ``text``.

    A numeric token is an optional sign followed by digits, which may be
    mixed with '.', ',' and ':' (for example "12", "-3.5", "1,000", "10:30").
    """
    return re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', r'', text)

def remove_punct(text):
    """Delete punctuation characters from ``text``.

    Removed characters: @ # ! ? + & * [ ] - % . : / ( ) ; $ = > < | { } ^
    as well as the apostrophe, the backtick and the comma. Double quotes are
    not touched by this function.
    """
    unwanted = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + ","
    # Every unwanted character maps to None, so one translate pass drops them.
    deletion_table = str.maketrans('', '', unwanted)
    return text.translate(deletion_table)

def remove_quotes(text):
    """Return ``text`` with every double-quote character removed."""
    return text.replace('"', '')

def remove_stopwords(text):
    """Drop the whitespace-separated tokens of ``text`` found in ``stopwords``.

    The remaining tokens are joined back together with single spaces.
    """
    kept_tokens = [token for token in text.split() if token not in stopwords]
    return ' '.join(kept_tokens)

def stem(utterance):
    """Tidy up ``utterance`` and reduce every word to its stem.

    The input is converted to ``str``, isolated single letters are removed,
    runs of whitespace are collapsed, the text is lower-cased and each word
    is passed through the module-level ``stemmer``.

    Returns the stemmed words joined by single spaces.
    """
    cleaned = str(utterance)

    # Replace a single letter that has whitespace on both sides by one space.
    # NOTE: matches cannot overlap, so in a run such as " a b c " only every
    # other letter is removed.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Same for a single letter at the very start of the text.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse any run of whitespace into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Stemming (this is not lemmatization): stem each lower-cased word.
    stemmed_words = [stemmer.stem(word) for word in cleaned.lower().split()]
    return ' '.join(stemmed_words)


def clean_text(text):
    """Run the full cleaning chain on one review string.

    Steps, in order: lower-case, strip numbers, strip double quotes, strip
    punctuation, stem. ``remove_stopwords`` is not part of the chain.
    """
    cleaning_steps = (
        lower_text,
        remove_number,
        remove_quotes,
        remove_punct,
        stem,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [43]:
# Clean every review with clean_text and store the result in a new column.
df["clean_input"] = df["Review"].apply(clean_text)

In [44]:
# Take the feature and label columns straight from the DataFrame instead of
# walking it row by row with iterrows(), which is slow on larger samples.
X = df["clean_input"].tolist()      # cleaned review texts
y = np.array(df["Rating"].tolist())  # ratings as a NumPy array

In [45]:
# Hold out 20% of the samples for evaluation; random_state=42 makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Always use a Pipeline of the components (e.g. vectorizer, classifier, etc.) instead of training them separately, for improved accuracy, because a pipeline helps the components get optimized together rather than separately.
As this is sample code, I am not doing any preprocessing.

In [46]:
# Text-classification pipeline: n-gram counts -> TF-IDF weighting -> random forest.
# The vectorizer keeps at most 1500 features built from 1- to 3-grams and
# ignores terms found in more than 75% of the documents; the forest uses
# 1000 trees with a fixed seed.
model_pipeline = Pipeline(steps=[('countVectorizer', CountVectorizer(max_features=1500, min_df=1, max_df=0.75,ngram_range = (1, 3))), 
                                 ('tfidfconverter',TfidfTransformer()),
                                 ('classifier', RandomForestClassifier(n_estimators=1000, random_state=0))
                                 ])

In [47]:
# Fit every pipeline step on the training texts and their ratings.
model_pipeline.fit(X_train, y_train)

In [48]:
# Predict a rating for each held-out test text.
y_pred = model_pipeline.predict(X_test)

Do not mind the accuracy, as this is just sample modelling.

In [49]:
# Ratings that are never predicted have an undefined precision; zero_division=0
# reports those scores as 0 without emitting an UndefinedMetricWarning.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))
print(accuracy_score(y_test, y_pred))

[[346   0   0   0  35]
 [ 35   0   0   0  11]
 [ 14   0   0   0  10]
 [ 20   0   0   0  68]
 [ 29   0   0   0 632]]
              precision    recall  f1-score   support

           1       0.78      0.91      0.84       381
           2       0.00      0.00      0.00        46
           3       0.00      0.00      0.00        24
           4       0.00      0.00      0.00        88
           5       0.84      0.96      0.89       661

    accuracy                           0.81      1200
   macro avg       0.32      0.37      0.35      1200
weighted avg       0.71      0.81      0.76      1200

0.815


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Convert SKlearn to ONNX model

In [50]:
from skl2onnx.common.data_types import StringTensorType
import onnx
import onnxmltools
import numpy as np
import onnxruntime as rt

In [51]:
# Define input_type based on type of your data. string in our case
input_type = [('input', StringTensorType([]))]

# Convert the pipeline to ONNX model
onnx_model = onnxmltools.convert_sklearn(model_pipeline, initial_types=input_type)

In [52]:
import os

# Build the path with os.path.join so it also resolves on non-Windows systems,
# where a backslash is not a directory separator.
onnx_path = os.path.join('models', 'ReviewSentimentAnalysis.onnx')

In [53]:
import os

# Make sure the target folder exists before writing; saving into a missing
# directory would otherwise fail. `or '.'` covers a path without a folder part.
os.makedirs(os.path.dirname(onnx_path) or '.', exist_ok=True)
onnx.save(onnx_model, onnx_path)

# Inference from Onnx

Below is the inference code, where I use the trained ONNX model for prediction. ONNX supports a wide range of languages such as Python, C++, C#, Java, JavaScript, and more.  
Based on your requirements, you can rewrite the inference code below. I am just giving reference code in Python.


In [57]:
# Open an ONNX Runtime inference session on the saved model file. This rebinds
# `onnx_model`, which held the converted model object until now.
onnx_model = rt.InferenceSession(onnx_path)   # onnx_path points at the saved ONNX model

def predictOnnxNew(texts):
    """Run the ONNX session on ``texts`` and return the first prediction.

    Args:
        texts: A single string or a sequence of strings. It is converted to
            a string array and reshaped into a column of shape (n, 1).

    Returns:
        A tuple ``(label_num, probability)``: the label predicted for the
        first row, and the value stored for that label in the second model
        output.
    """
    # Use the builtin ``str`` as dtype: the ``np.str`` alias was deprecated in
    # NumPy 1.20 and removed in later releases, where it raises AttributeError.
    input_data = np.array(texts, dtype=str).reshape(-1, 1)

    result = onnx_model.run(None, {'input': input_data})

    # As used here, output 0 holds the predicted labels and output 1 holds,
    # per row, a mapping from label to probability. Only row 0 is returned.
    label_num = result[0][0]
    probability = result[1][0][label_num]

    return label_num, probability

In [58]:
# Run the ONNX model on the test texts one at a time, collecting the predicted
# label and its probability for each text.
output_list = []
Probability_list = []
for user_input  in X_test:
    y_label,Probability = predictOnnxNew(user_input)
    output_list.append(y_label)
    Probability_list.append(Probability)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  input_data = np.array(texts, dtype=np.str).reshape(-1, 1)


I am just ensuring that the sklearn model and the ONNX model have the same accuracy on the test set.

In [59]:
testy_pred = np.array(output_list)

# Ratings that are never predicted have an undefined precision; zero_division=0
# reports those scores as 0 without emitting an UndefinedMetricWarning.
print(confusion_matrix(y_test, testy_pred))
print(classification_report(y_test, testy_pred, zero_division=0))
print(accuracy_score(y_test, testy_pred))

[[345   0   0   0  36]
 [ 35   0   0   0  11]
 [ 14   0   0   0  10]
 [ 20   0   0   0  68]
 [ 29   0   0   0 632]]
              precision    recall  f1-score   support

           1       0.78      0.91      0.84       381
           2       0.00      0.00      0.00        46
           3       0.00      0.00      0.00        24
           4       0.00      0.00      0.00        88
           5       0.83      0.96      0.89       661

    accuracy                           0.81      1200
   macro avg       0.32      0.37      0.35      1200
weighted avg       0.71      0.81      0.76      1200

0.8141666666666667


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
