In [1]:
import os
import time
import numpy as np
import pandas as pd
import torch
import onnx
import onnxruntime as ort
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler


In [2]:
import neural_compressor
from neural_compressor import quantization

In [3]:
# Read data
base_data_dir = os.getenv("NBA_DATA_DIR", "nba_data")

X1_train = pd.read_csv(os.path.join(base_data_dir, 'train/X_train_model1.csv'))
X1_train = X1_train.drop('gameId', axis=1)
X1_test = pd.read_csv(os.path.join(base_data_dir, 'test/X_test_model1.csv'))
X1_test = X1_test.drop('gameId', axis=1)

Y1_train = pd.read_csv(os.path.join(base_data_dir, 'train/Y_train_model1.csv'))
Y1_test = pd.read_csv(os.path.join(base_data_dir, 'test/Y_test_model1.csv'))
full1_df = pd.read_csv(os.path.join(base_data_dir, 'train/full_stats.csv'))

X_save_cols = X1_train.columns

# Convert to tensors, pass to dataloader
X1_train = torch.tensor(X1_train.values, dtype=torch.float32)
X1_test = torch.tensor(X1_test.values, dtype=torch.float32)
Y1_train = torch.tensor(Y1_train.values, dtype=torch.float32)
Y1_test = torch.tensor(Y1_test.values, dtype=torch.float32)

train1_data = TensorDataset(X1_train, Y1_train)
test1_data = TensorDataset(X1_test, Y1_test)

train1_loader = DataLoader(train1_data, batch_size=32, shuffle=True)
test1_loader = DataLoader(test1_data, batch_size=32, shuffle=False)

game_ids = full1_df['gameId'].values


In [4]:
model_path = "models/point_diff.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1 = torch.load(model_path, map_location=device)
model1.eval()

predictions = []
extract_df = full1_df[X_save_cols].values
X_tensor = torch.FloatTensor(extract_df).to(device)
with torch.no_grad():
    # Processsing in batches
    batch_size = 32

    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]

        # FF, convert to numpy
        batch_preds = model1(batch)
        predictions.append(batch_preds.cpu().numpy())

# Combine all predictions
all_predictions = np.concatenate(predictions, axis=0)

# Return results
result_df = pd.DataFrame({
    'gameId': game_ids,
    'predicted_point_diff': all_predictions.flatten()
})

  model1 = torch.load(model_path, map_location=device)


In [5]:
X2_train = pd.read_csv(os.path.join(base_data_dir, 'train/X_train_model2.csv'))
X2_test = pd.read_csv(os.path.join(base_data_dir, 'test/X_test_model2.csv'))
Y2_train = pd.read_csv(os.path.join(base_data_dir, 'train/Y_train_model2.csv'))
Y2_test = pd.read_csv(os.path.join(base_data_dir, 'test/Y_test_model2.csv'))

X2_train = X2_train.merge(result_df, on='gameId', how='inner')
X2_train = X2_train.drop('gameId', axis=1)
X2_test = X2_test.merge(result_df, on='gameId', how='inner')
X2_test = X2_test.drop('gameId', axis=1)
full2_df = pd.read_csv(os.path.join(base_data_dir, 'train/full_attendance.csv'))
full2_df = full2_df.merge(result_df, on='gameId', how='inner')

X2_train = torch.tensor(X2_train.values, dtype=torch.float32)
X2_test = torch.tensor(X2_test.values, dtype=torch.float32)
Y2_train = torch.tensor(Y2_train.values, dtype=torch.float32)
Y2_test = torch.tensor(Y2_test.values, dtype=torch.float32)

train2_data = TensorDataset(X2_train, Y2_train)
test2_data = TensorDataset(X2_test, Y2_test)
train2_loader = DataLoader(train2_data, batch_size=32, shuffle=True)
test2_loader = DataLoader(test2_data, batch_size=32, shuffle=False)

In [10]:
def benchmark_session(ort_session):
    print(f"Execution provider: {ort_session.get_providers()}")

    total_mse = 0
    total = 0
    for features, labels in test2_loader:
        outputs = ort_session.run(None, {ort_session.get_inputs()[0].name: features.numpy()})[0]
        mse = ((outputs - labels.numpy()) ** 2).sum()
        total_mse += mse
        total += labels.size(0)
    print(f"Mean Absolute Error (MAE): {total_mse / total:.2f}")

    num_trials = 100
    single_sample = X2_test[0].unsqueeze(0).numpy()
    ort_session.run(None, {ort_session.get_inputs()[0].name: single_sample})
    latencies = []
    for _ in range(num_trials):
        start = time.time()
        ort_session.run(None, {ort_session.get_inputs()[0].name: single_sample})
        latencies.append(time.time() - start)
    print(f"Inference Latency (median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Throughput: {num_trials / np.sum(latencies):.2f} FPS")

    num_batches = 50
    batch_input = X2_test[:32].numpy()
    ort_session.run(None, {ort_session.get_inputs()[0].name: batch_input})
    batch_times = []
    for _ in range(num_batches):
        start = time.time()
        ort_session.run(None, {ort_session.get_inputs()[0].name: batch_input})
        batch_times.append(time.time() - start)
    print(f"Batch Throughput: {(batch_input.shape[0] * num_batches) / np.sum(batch_times):.2f} FPS")


In [11]:
onnx_model2_path = "models/model2.onnx"
optimized_model2_path = "models/model2_optimized.onnx"

session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
session_options.optimized_model_filepath = optimized_model2_path

ort_session = ort.InferenceSession(onnx_model2_path, sess_options=session_options, providers=['CPUExecutionProvider'])


In [12]:
onnx_model_path = "models/model2_optimized.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


Execution provider: ['CPUExecutionProvider']
Mean Absolute Error (MAE): 1363096.76
Inference Latency (median): 0.11 ms
Inference Throughput: 8211.08 FPS
Batch Throughput: 60010.79 FPS


In [13]:
model_path = "models/model2.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

config_ptq = neural_compressor.PostTrainingQuantConfig(approach="dynamic")

q_model = quantization.fit(model=fp32_model, conf=config_ptq)

q_model.save_model_to_file("models/model2_quantized_dynamic.onnx")


2025-05-12 00:27:37 [INFO] Start auto tuning.
2025-05-12 00:27:37 [INFO] Quantize model without tuning!
2025-05-12 00:27:37 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-05-12 00:27:38 [INFO] Adaptor has 5 recipes.
2025-05-12 00:27:38 [INFO] 0 recipes specified by user.
2025-05-12 00:27:38 [INFO] 3 recipes require future tuning.
2025-05-12 00:27:38 [INFO] *** Initialize auto tuning
2025-05-12 00:27:38 [INFO] {
2025-05-12 00:27:38 [INFO]     'PostTrainingQuantConfig': {
2025-05-12 00:27:38 [INFO]         'AccuracyCriterion': {
2025-05-12 00:27:38 [INFO]             'criterion': 'relative',
2025-05-12 00:27:38 [INFO]             'higher_is_better': True,
2025-05-12 00:27:38 [INFO]             'tolerable_loss': 0.01,
2025-05-12 00:27:38 [INFO]             'absolute': None,
2025-05-12 00:27:38 [INFO]     

In [14]:
onnx_model_path = "models/model2_quantized_dynamic.onnx"
print(f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB")
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


Model Size on Disk: 0.75 MB
Execution provider: ['CPUExecutionProvider']
Mean Absolute Error (MAE): 1367935.12
Inference Latency (median): 0.09 ms
Inference Throughput: 11230.03 FPS
Batch Throughput: 58410.90 FPS


In [44]:
from neural_compressor.data import DataLoader
from neural_compressor import PostTrainingQuantConfig
from neural_compressor.config import AccuracyCriterion
from neural_compressor.metric import Metric

model_path = "models/model2.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

eval_dataloader = DataLoader(
    framework='onnxruntime',
    dataset=test2_data  # regression: X and y paired
)

# Configure the quantizer
config_ptq = PostTrainingQuantConfig(
    accuracy_criterion=AccuracyCriterion(
        criterion="relative",  
        tolerable_loss=0.1  # tolerance for regression metric loss (e.g., MSE increase)
    ),
    approach="static", 
    device='cpu', 
    quant_level=1,
    quant_format="QOperator", 
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"}, 
    calibration_sampling_size=128
)

# Use built-in regression metric or a custom eval_func
q_model = quantization.fit(
    model=fp32_model, 
    conf=config_ptq, 
    calib_dataloader=eval_dataloader,
    eval_dataloader=eval_dataloader, 
    eval_metric=neural_compressor.metric.Metric(name='RMSE')
)


q_model.save_model_to_file("models/model2_quantized_aggressive.onnx")

2025-05-12 00:55:03 [INFO] Start basic tuning.
2025-05-12 00:55:03 [INFO] Create evaluation function according to evaluation dataloader and metric                and Execute the tuning process.
2025-05-12 00:55:03 [INFO] Adaptor has 5 recipes.
2025-05-12 00:55:03 [INFO] 0 recipes specified by user.
2025-05-12 00:55:03 [INFO] 3 recipes require future tuning.
2025-05-12 00:55:03 [INFO] {
2025-05-12 00:55:03 [INFO]     'PostTrainingQuantConfig': {
2025-05-12 00:55:03 [INFO]         'AccuracyCriterion': {
2025-05-12 00:55:03 [INFO]             'criterion': 'relative',
2025-05-12 00:55:03 [INFO]             'higher_is_better': True,
2025-05-12 00:55:03 [INFO]             'tolerable_loss': 0.1,
2025-05-12 00:55:03 [INFO]             'absolute': None,
2025-05-12 00:55:03 [INFO]             'keys': <bound method AccuracyCriterion.keys of <neural_compressor.config.AccuracyCriterion object at 0x7ac254559d30>>,
2025-05-12 00:55:03 [INFO]             'relative': 0.1
2025-05-12 00:55:03 [INFO]     

In [45]:
onnx_model_path = "models/model2_quantized_aggressive.onnx"
print(f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB")
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


Model Size on Disk: 2.93 MB
Execution provider: ['CPUExecutionProvider']
Mean Absolute Error (MAE): 1365813.24
Inference Latency (median): 0.13 ms
Inference Throughput: 7724.89 FPS
Batch Throughput: 60007.03 FPS


In [48]:

model_path = "models/model2.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

config_ptq = PostTrainingQuantConfig(
    accuracy_criterion=AccuracyCriterion(
        criterion="relative",  
        tolerable_loss=0.1  # tolerance for regression metric loss (e.g., MSE increase)
    ),
    approach="static", 
    device='cpu', 
    quant_level=0,
    quant_format="QOperator", 
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"}, 
    calibration_sampling_size=128
)

q_model = quantization.fit(
    model=fp32_model, 
    conf=config_ptq, 
    calib_dataloader=eval_dataloader,
    eval_dataloader=eval_dataloader, 
    eval_metric=neural_compressor.metric.Metric(name='RMSE')
)

q_model.save_model_to_file("models/model2_quantized_conservative.onnx")


2025-05-12 00:58:20 [INFO] Start conservative tuning.
2025-05-12 00:58:20 [INFO] Create evaluation function according to evaluation dataloader and metric                and Execute the tuning process.
2025-05-12 00:58:20 [INFO] Adaptor has 5 recipes.
2025-05-12 00:58:20 [INFO] 0 recipes specified by user.
2025-05-12 00:58:20 [INFO] 3 recipes require future tuning.
2025-05-12 00:58:20 [INFO] *** Initialize conservative tuning
2025-05-12 00:58:20 [INFO] {
2025-05-12 00:58:20 [INFO]     'PostTrainingQuantConfig': {
2025-05-12 00:58:20 [INFO]         'AccuracyCriterion': {
2025-05-12 00:58:20 [INFO]             'criterion': 'relative',
2025-05-12 00:58:20 [INFO]             'higher_is_better': True,
2025-05-12 00:58:20 [INFO]             'tolerable_loss': 0.1,
2025-05-12 00:58:20 [INFO]             'absolute': None,
2025-05-12 00:58:20 [INFO]             'keys': <bound method AccuracyCriterion.keys of <neural_compressor.config.AccuracyCriterion object at 0x7ac25758f7d0>>,
2025-05-12 00:58:

In [49]:
onnx_model_path = "models/model2_quantized_conservative.onnx"
print(f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB")
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


Model Size on Disk: 2.93 MB
Execution provider: ['CPUExecutionProvider']
Mean Absolute Error (MAE): 1365811.42
Inference Latency (median): 0.13 ms
Inference Throughput: 5138.06 FPS
Batch Throughput: 35998.94 FPS
