In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn

In [3]:
# Let's load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')

# Let's specify the features and the target
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
# Select data based on variance (not the final version yet, for now just for testing)
selector = VarianceThreshold()

In [5]:
# Define a gradient boosting classifier
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [6]:
# Create a pipeline object with our selector and classifier
# NOTE: You can create custom pipeline objects but they must be registered to onnx or it will not recognise them
# Because of this we recommend using the onnx known objects as defined in the documentation
pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])

In [7]:
# Let's train a simple model
pipeline.fit(X_train, y_train)

# Let's evaluate the model
y_pred = pipeline.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)

Accuracy of the original model:  0.9456040480708412


In [None]:
# Define equivalent partitions
partitions = [
    {"name": "Men", "condition": lambda df: df['persoon_geslacht_vrouw'] == 0.0},
    {"name": "Women", "condition": lambda df: df['persoon_geslacht_vrouw'] == 1.0},
    {"name": "sick days <250", "condition": lambda df: df['ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden'] < 200.0},
    {"name": "sick days <1500", "condition": lambda df: df['ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden'] < 1500.0},
    {"name": "sick days >1500", "condition": lambda df: df['ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden'] >= 1500.0},
    {"name": "ghetto", "condition": lambda df: df['adres_recentste_buurt_groot_ijsselmonde'] = 1.0},
    {"name": "not ghetto", "condition": lambda df: df['adres_recentste_buurt_groot_ijsselmonde'] = 0.0},
]

In [None]:
X_test['ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden'][0:]

895       765.0
173      2102.0
6503      584.0
4375     1051.0
1602     1095.0
          ...  
5646     2056.0
10391    1578.0
4083      389.0
4023      537.0
2759      998.0
Name: ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden, Length: 3062, dtype: float32

In [59]:
# Apply equivalent partitioning
for partition in partitions:
    partition_data = X_test[partition["condition"](X_test)]
    partition_indices = partition_data.index  # Get the indices of the partition
    partition_labels = y_test.loc[partition_indices]  # Get the actual labels for the partition

    if not partition_data.empty:
        # Predictions using the model
        predictions = pipeline.predict(partition_data)

        # Calculate accuracy for this partition
        accuracy = accuracy_score(partition_labels, predictions)

        # Print partition details
        print(f"Partition: {partition['name']}")
        print(f"Number of data points: {len(partition_data)}")
        print(f"Accuracy: {accuracy:.2f}")
        res = np.unique(predictions, return_counts=True)
        print(f"Predictions: {res}\n")
        print(f"Ratio: {'{0:.3f}'.format(res[1][1]/res[1][0])}\n")

Partition: Men
Number of data points: 1664
Accuracy: 0.94
Predictions: (array([0, 1]), array([1532,  132]))

Ratio: 0.086

Partition: Women
Number of data points: 1498
Accuracy: 0.95
Predictions: (array([0, 1]), array([1388,  110]))

Ratio: 0.079

Partition: sick days <250
Number of data points: 90
Accuracy: 0.87
Predictions: (array([0, 1]), array([79, 11]))

Ratio: 0.139

Partition: sick days <1500
Number of data points: 1401
Accuracy: 0.92
Predictions: (array([0, 1]), array([1230,  171]))

Ratio: 0.139

Partition: sick days >1500
Number of data points: 1761
Accuracy: 0.96
Predictions: (array([0, 1]), array([1690,   71]))

Ratio: 0.042



In [8]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9456040480708412


In [9]:
# Let's save the model
onnx.save(onnx_model, "model/gboost.onnx")

# Let's load the model
new_session = rt.InferenceSession("model/gboost.onnx")

# Let's predict the target
y_pred_onnx2 =  new_session.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)


Accuracy of the ONNX model:  0.9456040480708412
