In [None]:
import os
import time
import numpy as np
import pandas as pd
import torch
import onnx
import onnxruntime as ort
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import neural_compressor
from neural_compressor import quantization


In [None]:
# Evaluation data: the CSV location comes from the NBA_TEST_DATA environment
# variable, falling back to a file in the working directory.
csv_path = os.environ.get("NBA_TEST_DATA", "nba_test.csv")
df = pd.read_csv(csv_path)

# 'score_diff' is the regression target; every other column is used as a feature.
y = df["score_diff"].values
X = df.drop("score_diff", axis=1).values

# Standardize the features.
# NOTE(review): the scaler is fit on this evaluation data itself. If the model
# was trained on features scaled with training-set statistics, those statistics
# should be reused here instead — confirm against the training pipeline.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# float32 tensors; the target gets a trailing axis so it has shape (n_samples, 1).
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

# Deterministic (unshuffled) batches of 32 for the accuracy pass.
test_dataset = TensorDataset(X_tensor, y_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
def benchmark_session(ort_session, loader=None, samples=None):
    """Print accuracy and timing metrics for an inference session.

    Reports, in order: the session's execution providers, the mean absolute
    error over an evaluation loader, the median single-sample latency and
    throughput over 100 runs, and the throughput for batches of up to 32
    samples over 50 runs.

    Args:
        ort_session: Session object exposing ``get_providers()``,
            ``get_inputs()`` and ``run(output_names, feed_dict)``.
        loader: Iterable of ``(features, labels)`` batches whose elements
            expose ``.numpy()``. Defaults to the notebook-level ``test_loader``.
        samples: 2-D tensor or array of model inputs used for the timing runs.
            Defaults to the notebook-level ``X_tensor``.

    Returns:
        None. All results are printed.
    """
    if loader is None:
        loader = test_loader
    if samples is None:
        samples = X_tensor

    print(f"Execution provider: {ort_session.get_providers()}")

    # Resolve the input name once so the lookup is not repeated per batch and,
    # more importantly, is not counted inside the timed regions below.
    input_name = ort_session.get_inputs()[0].name

    total_mae = 0
    total = 0
    for features, labels in loader:
        targets = labels.numpy()
        outputs = np.asarray(ort_session.run(None, {input_name: features.numpy()})[0])
        # A model emitting shape (batch,) against targets of shape (batch, 1)
        # would otherwise broadcast to (batch, batch) and inflate the error.
        if outputs.shape != targets.shape and outputs.size == targets.size:
            outputs = outputs.reshape(targets.shape)
        total_mae += np.abs(outputs - targets).sum()
        total += targets.shape[0]
    print(f"Mean Absolute Error (MAE): {total_mae / total:.2f}")

    samples_np = samples.numpy() if hasattr(samples, "numpy") else np.asarray(samples)

    # Single-sample latency; one untimed warm-up run precedes the measurements.
    num_trials = 100
    single_feed = {input_name: samples_np[:1]}
    ort_session.run(None, single_feed)
    latencies = []
    for _ in range(num_trials):
        # perf_counter is the high-resolution clock intended for short intervals.
        start = time.perf_counter()
        ort_session.run(None, single_feed)
        latencies.append(time.perf_counter() - start)
    print(f"Inference Latency (median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Throughput: {num_trials / np.sum(latencies):.2f} FPS")

    # Batch throughput on the first (up to) 32 samples, again after a warm-up run.
    num_batches = 50
    batch_input = samples_np[:32]
    batch_feed = {input_name: batch_input}
    ort_session.run(None, batch_feed)
    batch_times = []
    for _ in range(num_batches):
        start = time.perf_counter()
        ort_session.run(None, batch_feed)
        batch_times.append(time.perf_counter() - start)
    print(f"Batch Throughput: {(batch_input.shape[0] * num_batches) / np.sum(batch_times):.2f} FPS")


In [None]:
# Source FP32 model and the destination for its graph-optimized copy.
onnx_model_path = "models/nba_model.onnx"
optimized_model_path = "models/nba_model_optimized.onnx"

# Request extended graph optimizations and have the optimized graph written
# to `optimized_model_path`.
session_options = ort.SessionOptions()
session_options.optimized_model_filepath = optimized_model_path
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

# This cell only creates the session; the next cell loads the optimized file
# from disk and benchmarks it.
ort_session = ort.InferenceSession(
    onnx_model_path,
    sess_options=session_options,
    providers=["CPUExecutionProvider"],
)


In [None]:
# Benchmark the graph-optimized FP32 model on the CPU execution provider.
onnx_model_path = "models/nba_model_optimized.onnx"
# Report the on-disk size as the other benchmark cells do, so the quantized
# model sizes have a baseline to be compared against.
print(f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB")
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


In [None]:
# Post-training *dynamic* quantization of the original FP32 model; no
# calibration dataloader is passed for this approach.
config_ptq = neural_compressor.PostTrainingQuantConfig(approach="dynamic")

# Wrap the ONNX file in Neural Compressor's model class. `fp32_model` is also
# the input to the static-quantization cells further down.
model_path = "models/nba_model.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

q_model = quantization.fit(fp32_model, config_ptq)

# Persist the quantized graph for the benchmark cell that follows.
q_model.save_model_to_file("models/nba_model_quantized_dynamic.onnx")


In [None]:
# Dynamically-quantized model: report its on-disk size (in units of 1e6 bytes),
# then run the accuracy/latency benchmark on the CPU execution provider.
onnx_model_path = "models/nba_model_quantized_dynamic.onnx"
print(f"Model Size on Disk: {os.stat(onnx_model_path).st_size / 1e6:.2f} MB")
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)


In [None]:
# Calibration data for static quantization, built from the scaled feature
# matrix only (no labels are included).
calib_dataset = neural_compressor.data.datasets.NumpyDataset(X_tensor.numpy())
calib_dataloader = neural_compressor.data.DataLoader(
    dataset=calib_dataset,
    framework="onnxruntime",
)

# Static PTQ configuration with the looser settings: absolute tolerable loss
# of 0.05 and quant_level=1.
# NOTE(review): fit() below receives no eval_func/eval_metric and the
# dataloader carries no labels, so it is unclear what the accuracy criterion
# is evaluated against — confirm against the Neural Compressor documentation.
config_static_aggressive = neural_compressor.PostTrainingQuantConfig(
    approach="static",
    device="cpu",
    quant_format="QOperator",
    quant_level=1,
    calibration_sampling_size=128,
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"},
    accuracy_criterion=neural_compressor.config.AccuracyCriterion(
        criterion="absolute",
        tolerable_loss=0.05,
    ),
)

# The same dataloader is used for both calibration and evaluation.
q_model = quantization.fit(
    fp32_model,
    config_static_aggressive,
    calib_dataloader=calib_dataloader,
    eval_dataloader=calib_dataloader,
)

q_model.save_model_to_file("models/nba_model_quantized_aggressive.onnx")


In [None]:
# Statically-quantized ("aggressive" config) model: report its on-disk size
# (in units of 1e6 bytes), then benchmark it on the CPU execution provider.
onnx_model_path = "models/nba_model_quantized_aggressive.onnx"
print(f"Model Size on Disk: {os.stat(onnx_model_path).st_size / 1e6:.2f} MB")
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)


In [None]:
# Static PTQ configuration with the tighter settings: absolute tolerable loss
# of 0.01 and quant_level=0. It reuses `fp32_model` and `calib_dataloader`
# from the earlier cells.
# NOTE(review): fit() below receives no eval_func/eval_metric, so it is unclear
# what the accuracy criterion is evaluated against — confirm against the
# Neural Compressor documentation.
config_static_conservative = neural_compressor.PostTrainingQuantConfig(
    approach="static",
    device="cpu",
    quant_format="QOperator",
    quant_level=0,
    calibration_sampling_size=128,
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"},
    accuracy_criterion=neural_compressor.config.AccuracyCriterion(
        criterion="absolute",
        tolerable_loss=0.01,
    ),
)

# The same dataloader is used for both calibration and evaluation.
q_model = quantization.fit(
    fp32_model,
    config_static_conservative,
    calib_dataloader=calib_dataloader,
    eval_dataloader=calib_dataloader,
)

q_model.save_model_to_file("models/nba_model_quantized_conservative.onnx")


In [None]:
# Statically-quantized ("conservative" config) model: report its on-disk size
# (in units of 1e6 bytes), then benchmark it on the CPU execution provider.
onnx_model_path = "models/nba_model_quantized_conservative.onnx"
print(f"Model Size on Disk: {os.stat(onnx_model_path).st_size / 1e6:.2f} MB")
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)
benchmark_session(ort_session)
