# <left> Benchmark Serialization </left> 

In [1]:
import time
import os

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import pickle
import joblib

import onnxruntime
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn2pmml import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from pypmml import Model

from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

# Locations used for the serialized model files: a "models" folder that sits
# next to (one level above) the current working directory.
models_dir = "models"
models_path = os.path.join(os.pardir, models_dir)


In [2]:
def create_models_dir():
    """Make sure the model output directory exists and print its location.

    The directory is taken from the module-level ``models_path``. Missing
    parent directories are created as well, and an already existing
    directory is not an error.
    """
    os.makedirs(name=models_path, exist_ok=True)
    message = f"\nüìÅ –î–∏—Ä–µ–∫—Ç–æ—Ä–∏—è –¥–ª—è –º–æ–¥–µ–ª–µ–π —Å–æ–∑–¥–∞–Ω–∞: {models_path}"
    print(message)

def generate_data(n_samples=10000, n_features=20):
    """Generate a synthetic binary-classification dataset and split it.

    Args:
        n_samples: Total number of rows to generate.
        n_features: Total number of feature columns. Values below 2 are
            still rejected by ``make_classification``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with a 20% test share and ``random_state=42``.
    """
    print("\nüîß –ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Å–∏–Ω—Ç–µ—Ç–∏—á–µ—Å–∫–∏—Ö –¥–∞–Ω–Ω—ã—Ö...")

    # make_classification rejects n_informative + n_redundant > n_features,
    # so the former hard-coded 15 + 5 crashed for any n_features < 20.
    # Keep 15 / 5 whenever they fit (n_features >= 20 behaves exactly as
    # before) and shrink them proportionally (about 3:1) otherwise.
    # At least 2 informative features are needed for the default
    # 2 classes x 2 clusters per class.
    n_informative = min(15, max(2, (3 * n_features) // 4))
    n_redundant = max(0, min(5, n_features - n_informative))

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"‚úÖ –î–∞–Ω–Ω—ã–µ —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω—ã: {n_samples} —Å—ç–º–ø–ª–æ–≤, {n_features} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")
    return X_train, X_test, y_train, y_test

def create_pipeline():
    """Build the unfitted scaler -> PCA -> random-forest pipeline.

    Returns:
        A ``Pipeline`` with the steps ``'scaler'`` (``StandardScaler``),
        ``'pca'`` (``PCA`` with 4 components) and ``'clf'``
        (``RandomForestClassifier`` with 100 trees, ``random_state=42``).
    """
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=4)),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
    return Pipeline(steps)

def train_model(X_train, y_train, X_test, y_test):
    """Fit a fresh pipeline on the training split and print its accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features, used only for the reported test score.
        y_test: Held-out labels, used only for the reported test score.

    Returns:
        The fitted pipeline created by ``create_pipeline``.
    """
    print("\nüöÄ –û–±—É—á–µ–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞...")
    fitted = create_pipeline()
    fitted.fit(X_train, y_train)

    # Score the training split first, then the test split.
    train_score, test_score = (
        fitted.score(features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test))
    )
    print(f"‚úÖ –ü–∞–π–ø–ª–∞–π–Ω –æ–±—É—á–µ–Ω: Train accuracy = {train_score:.4f}, Test accuracy = {test_score:.4f}")
    return fitted

def save_models(model, X_train, y_train):
    """Serialize a fitted pipeline as pickle, joblib, ONNX and PMML files.

    All files are written into ``models_path`` (created on demand).

    Args:
        model: The already fitted scikit-learn pipeline to export.
        X_train: Training features; only the column count is read, to
            declare the ONNX input shape.
        y_train: Unused. Kept so existing callers keep working (the PMML
            export previously refitted the model with it).

    Returns:
        A dict mapping ``'pickle'``, ``'joblib'``, ``'onnx'`` and ``'pmml'``
        to the path of the corresponding file.
    """
    # Imported locally so this function stays self-contained; the helper
    # ships with sklearn2pmml, which this file already depends on.
    from sklearn2pmml import make_pmml_pipeline

    print("\nüíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –≤ —Ä–∞–∑–ª–∏—á–Ω—ã—Ö —Ñ–æ—Ä–º–∞—Ç–∞—Ö...")
    create_models_dir()
    save_paths = {}

    # Pickle
    pickle_path = os.path.join(models_path, 'model.pkl')
    with open(pickle_path, 'wb') as f:
        pickle.dump(model, f)
    save_paths['pickle'] = pickle_path

    # Joblib
    joblib_path = os.path.join(models_path, 'model.joblib')
    joblib.dump(model, joblib_path)
    save_paths['joblib'] = joblib_path

    # ONNX: the input is declared as float32 with a free batch dimension.
    initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
    onnx_model = convert_sklearn(model, initial_types=initial_type)
    onnx_path = os.path.join(models_path, 'model.onnx')
    with open(onnx_path, "wb") as f:
        f.write(onnx_model.SerializeToString())
    save_paths['onnx'] = onnx_path

    # PMML: wrap the fitted pipeline instead of building a PMMLPipeline and
    # calling fit() again. Refitting retrained the whole model a second time
    # and mutated the caller's estimator after the other formats were saved.
    pmml_pipeline = make_pmml_pipeline(model)
    pmml_path = os.path.join(models_path, 'model.pmml')
    sklearn2pmml(pmml_pipeline, pmml_path)
    save_paths['pmml'] = pmml_path

    print("‚úÖ –ü–∞–π–ø–ª–∞–π–Ω—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤–æ –≤—Å–µ—Ö —Ñ–æ—Ä–º–∞—Ç–∞—Ö")
    return save_paths

def _build_inference_call(format_name, path, X_test):
    """Load the model stored at *path* and return a zero-argument predict call.

    Loading happens here, outside the timed region; only the returned
    callable is meant to be timed.

    Raises:
        ValueError: If *format_name* is not one of the supported formats.
    """
    if format_name == 'pickle':
        # NOTE: unpickling executes arbitrary code; only load files that
        # were written by this notebook itself.
        with open(path, 'rb') as f:
            model = pickle.load(f)
        return lambda: model.predict(X_test)

    if format_name == 'joblib':
        model = joblib.load(path)
        return lambda: model.predict(X_test)

    if format_name == 'onnx':
        session = onnxruntime.InferenceSession(path)
        input_name = session.get_inputs()[0].name
        # The float32 cast stays inside the timed call, so its cost is part
        # of the reported ONNX time.
        return lambda: session.run(None, {input_name: X_test.astype(np.float32)})[0]

    if format_name == 'pmml':
        model = Model.load(path)
        return lambda: model.predict(X_test)

    raise ValueError(f"Unsupported model format: {format_name!r}")


def benchmark_models(save_paths, X_test):
    """Measure file size and single-batch inference time for each format.

    Args:
        save_paths: Mapping of format name (``'pickle'``, ``'joblib'``,
            ``'onnx'``, ``'pmml'``) to the serialized model file.
        X_test: Feature matrix passed to every model's predict call.

    Returns:
        A list of ``[format_name, size_mb, inference_ms]`` rows, where the
        two numbers are strings formatted with two decimals. The same rows
        are printed as a grid table.

    Raises:
        ValueError: If ``save_paths`` contains an unsupported format name.
            Previously this reused the timing of the preceding format, or
            failed with ``NameError`` on the first entry.
    """
    print("\nüìä –ù–∞—á–∞–ª–æ –±–µ–Ω—á–º–∞—Ä–∫–∏–Ω–≥–∞...")
    results = []

    for format_name, path in save_paths.items():
        # File size on disk, in MiB.
        size_mb = os.path.getsize(path) / (1024 * 1024)

        # Load first, then time only the prediction itself.
        run_inference = _build_inference_call(format_name, path, X_test)

        # perf_counter is monotonic and high-resolution; time.time() can
        # jump with system clock adjustments and is too coarse on some
        # platforms for millisecond measurements.
        start_time = time.perf_counter()
        run_inference()
        inference_time = (time.perf_counter() - start_time) * 1000

        results.append([
            format_name,
            f"{size_mb:.2f}",
            f"{inference_time:.2f}"
        ])

    # Print the results as a table.
    headers = ["–§–æ—Ä–º–∞—Ç", "–†–∞–∑–º–µ—Ä (MB)", "–í—Ä–µ–º—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ (ms)"]
    print("\nüìà –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –±–µ–Ω—á–º–∞—Ä–∫–∞:")
    print(tabulate(results, headers=headers, tablefmt="grid"))
    return results

def cleanup_files(save_paths):
    """Delete the serialized model files that are still present on disk.

    Args:
        save_paths: Mapping whose values are file paths. Paths that do not
            exist are skipped.
    """
    # filter() is lazy, so each path is checked right before it is removed.
    for stale_file in filter(os.path.exists, save_paths.values()):
        os.remove(stale_file)
    print("\nüßπ –í—Ä–µ–º–µ–Ω–Ω—ã–µ —Ñ–∞–π–ª—ã —É–¥–∞–ª–µ–Ω—ã")

In [3]:
# Generate the synthetic train/test data
X_train, X_test, y_train, y_test = generate_data()

# Train the model
model = train_model(X_train, y_train, X_test, y_test)

# Save it in the different formats
save_paths = save_models(model, X_train, y_train)

# Run the benchmark
results = benchmark_models(save_paths, X_test)

# Clean up the files (optional)
cleanup_files(save_paths)


üîß –ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Å–∏–Ω—Ç–µ—Ç–∏—á–µ—Å–∫–∏—Ö –¥–∞–Ω–Ω—ã—Ö...
‚úÖ –î–∞–Ω–Ω—ã–µ —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω—ã: 10000 —Å—ç–º–ø–ª–æ–≤, 20 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤

üöÄ –û–±—É—á–µ–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞...
‚úÖ –ü–∞–π–ø–ª–∞–π–Ω –æ–±—É—á–µ–Ω: Train accuracy = 0.9999, Test accuracy = 0.7600

üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞ –≤ —Ä–∞–∑–ª–∏—á–Ω—ã—Ö —Ñ–æ—Ä–º–∞—Ç–∞—Ö...

üìÅ –î–∏—Ä–µ–∫—Ç–æ—Ä–∏—è –¥–ª—è –º–æ–¥–µ–ª–µ–π —Å–æ–∑–¥–∞–Ω–∞: ../models
‚úÖ –ü–∞–π–ø–ª–∞–π–Ω—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤–æ –≤—Å–µ—Ö —Ñ–æ—Ä–º–∞—Ç–∞—Ö

üìä –ù–∞—á–∞–ª–æ –±–µ–Ω—á–º–∞—Ä–∫–∏–Ω–≥–∞...

üìà –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –±–µ–Ω—á–º–∞—Ä–∫–∞:
+----------+---------------+------------------------+
| –§–æ—Ä–º–∞—Ç   |   –†–∞–∑–º–µ—Ä (MB) |   –í—Ä–µ–º—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ (ms) |
| pickle   |         18.61 |                  30.52 |
+----------+---------------+------------------------+
| joblib   |         18.62 |                  32.22 |
+----------+---------------+------------------------+
| onnx     |          9.03 |       