In [None]:
import os
import time
import numpy as np
import pandas as pd
import torch
import onnx
import onnxruntime as ort
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import neural_compressor
from neural_compressor import quantization


In [None]:
# Dataset root; override by setting the NBA_DATA_DIR environment variable.
base_data_dir = os.getenv("NBA_DATA_DIR", "nba_data")


def _csv_to_float_tensor(relative_path):
    """Read a CSV located under ``base_data_dir`` and return its values as a float32 tensor."""
    frame = pd.read_csv(os.path.join(base_data_dir, relative_path))
    return torch.tensor(frame.values, dtype=torch.float32)


# Model-1 features and targets, loaded straight into tensors.
# NOTE(review): the model-1 *test* split is read from the train/ folder — confirm this is intended.
X1_train = _csv_to_float_tensor('train/X_train_model1.csv')
X1_test = _csv_to_float_tensor('train/X_test_model1.csv')
Y1_train = _csv_to_float_tensor('train/Y_train_model1.csv')
Y1_test = _csv_to_float_tensor('train/Y_test_model1.csv')

# Full stats table; only its gameId column is extracted here.
full1_df = pd.read_csv(os.path.join(base_data_dir, 'train/full_stats.csv'))
game_ids = full1_df['gameId'].values

# Wrap the tensors in datasets and batch them (only the training loader shuffles).
train1_data = TensorDataset(X1_train, Y1_train)
test1_data = TensorDataset(X1_test, Y1_test)

train1_loader = DataLoader(train1_data, batch_size=32, shuffle=True)
test1_loader = DataLoader(test1_data, batch_size=32, shuffle=False)


In [None]:
# Load the trained point-differential model and run it over the model-1 test split.
model_path = "models/point_diff.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SECURITY NOTE: torch.load unpickles arbitrary Python objects — only load
# checkpoints from a trusted source. The checkpoint is used as a full module
# (``.eval()`` is called on it), so weights-only loading is disabled explicitly;
# recent PyTorch releases default to ``weights_only=True`` and would refuse it.
model1 = torch.load(model_path, map_location=device, weights_only=False)
model1.eval()

predictions = []
with torch.no_grad():
    # Processing in batches
    batch_size = 32

    for i in range(0, len(X1_test), batch_size):
        # The model lives on `device`; move each CPU batch there before the
        # forward pass, otherwise a CUDA run fails with a device mismatch.
        batch = X1_test[i:i + batch_size].to(device)

        # Forward pass, then bring the result back to the CPU as numpy.
        batch_preds = model1(batch)
        predictions.append(batch_preds.cpu().numpy())

# Combine all predictions
all_predictions = np.concatenate(predictions, axis=0)
flat_predictions = all_predictions.flatten()

# Fail early with an explicit message if ids and predictions cannot be paired.
if len(game_ids) != len(flat_predictions):
    raise ValueError(
        f"game_ids has {len(game_ids)} entries but {len(flat_predictions)} "
        "predictions were produced; they must line up one-to-one."
    )

# Pair every game id with its predicted point differential.
result_df = pd.DataFrame({
    'gameId': game_ids,
    'predicted_point_diff': flat_predictions
})

In [None]:
def _with_point_diff(frame):
    """Inner-join the model-1 predictions in ``result_df`` onto ``frame`` by ``gameId``."""
    return frame.merge(result_df, on='gameId', how='inner')


# Model-2 features: attach the predicted point differential, then drop the join key.
# NOTE(review): the inner join can drop rows, while Y2_train / Y2_test are not
# filtered the same way — confirm features and targets stay row-aligned.
X2_train = _with_point_diff(
    pd.read_csv(os.path.join(base_data_dir, 'train/X_train_model2.csv'))
).drop(columns='gameId')
X2_test = _with_point_diff(
    pd.read_csv(os.path.join(base_data_dir, 'test/X_test_model2.csv'))
).drop(columns='gameId')

# Model-2 targets are loaded unchanged.
Y2_train = pd.read_csv(os.path.join(base_data_dir, 'train/Y_train_model2.csv'))
Y2_test = pd.read_csv(os.path.join(base_data_dir, 'test/Y_test_model2.csv'))

# Full attendance table, enriched with the same predictions (gameId is kept here).
full2_df = _with_point_diff(
    pd.read_csv(os.path.join(base_data_dir, 'train/full_attendance.csv'))
)

# TODO: TensorDataset / DataLoader objects for model 2 (train2_data, test2_data,
# train2_loader, test2_loader) are not built here; X2_* and Y2_* remain DataFrames.

In [None]:
def benchmark_session(ort_session, eval_loader=None, bench_inputs=None,
                      num_trials=100, num_batches=50, batch_size=32):
    """Print accuracy, latency and throughput figures for an ONNX Runtime session.

    Args:
        ort_session: Object exposing ``get_providers()``, ``get_inputs()`` and
            ``run()`` (an ``onnxruntime.InferenceSession``).
        eval_loader: Iterable of ``(features, labels)`` tensor batches used for
            the error metric. Defaults to the notebook-level ``test1_loader``.
        bench_inputs: 2-D feature table (DataFrame, ndarray or CPU tensor) used
            for the timing runs. Defaults to the notebook-level ``X2_test``.
        num_trials: Number of timed single-sample runs.
        num_batches: Number of timed batch runs.
        batch_size: Number of rows taken from ``bench_inputs`` for batch runs.

    Returns:
        None. All results are printed.
    """
    # NOTE(review): the defaults evaluate on model-1 data but time on model-2
    # data; confirm both match the input shape of the session being measured.
    if eval_loader is None:
        eval_loader = test1_loader
    if bench_inputs is None:
        bench_inputs = X2_test

    # X2_test is a pandas DataFrame in this notebook, so tensor-style indexing
    # (``[0].unsqueeze(0).numpy()``) fails on it; np.asarray accepts DataFrames,
    # ndarrays and CPU tensors alike.
    bench_array = np.asarray(bench_inputs, dtype=np.float32)
    input_name = ort_session.get_inputs()[0].name

    def _run(batch):
        return ort_session.run(None, {input_name: batch})

    def _time_runs(batch, repeats):
        # One untimed warm-up call, then `repeats` timed calls. perf_counter is
        # a monotonic high-resolution clock, unlike time.time().
        _run(batch)
        durations = []
        for _ in range(repeats):
            start = time.perf_counter()
            _run(batch)
            durations.append(time.perf_counter() - start)
        return durations

    print(f"Execution provider: {ort_session.get_providers()}")

    # Accuracy: summed squared error divided by the number of evaluated rows.
    total_sq_err = 0.0
    total = 0
    for features, labels in eval_loader:
        outputs = _run(features.numpy())[0]
        total_sq_err += ((outputs - labels.numpy()) ** 2).sum()
        total += labels.size(0)
    if total:
        # The metric is a squared error, so it is reported as MSE (not MAE).
        print(f"Mean Squared Error (MSE): {total_sq_err / total:.2f}")
    else:
        print("Mean Squared Error (MSE): n/a (evaluation loader was empty)")

    # Single-sample latency and throughput.
    single_sample = bench_array[:1]
    latencies = _time_runs(single_sample, num_trials)
    print(f"Inference Latency (median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Throughput: {num_trials / np.sum(latencies):.2f} FPS")

    # Batch throughput.
    batch_input = bench_array[:batch_size]
    batch_times = _time_runs(batch_input, num_batches)
    print(f"Batch Throughput: {(batch_input.shape[0] * num_batches) / np.sum(batch_times):.2f} FPS")


In [None]:
# Source FP32 model and the destination for the graph-optimized copy.
# (Path fixed: the file is "model2.onnx", as used by the rest of the notebook.)
onnx_model2_path = "models/model2.onnx"
optimized_model2_path = "models/model2_optimized.onnx"

# Enable extended graph optimizations and set the path for the optimized model.
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
session_options.optimized_model_filepath = optimized_model2_path

# This session is built with the options above; the next cell loads the file
# at optimized_model2_path.
ort_session = ort.InferenceSession(onnx_model2_path, sess_options=session_options, providers=['CPUExecutionProvider'])


In [None]:
# Benchmark the graph-optimized model; the on-disk size is printed as well so
# this cell reports the same figures as the other benchmark cells.
onnx_model_path = "models/model2_optimized.onnx"
print(f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB")
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
benchmark_session(ort_session)


In [None]:
# Wrap the FP32 ONNX file for Neural Compressor; `fp32_model` is reused by the
# static-quantization cells further down.
model_path = "models/model2.onnx"
fp32_model = neural_compressor.model.onnx_model.ONNXModel(model_path)

# Post-training quantization with the "dynamic" approach (no dataloader is passed).
config_ptq = neural_compressor.PostTrainingQuantConfig(approach="dynamic")
q_model = quantization.fit(
    model=fp32_model,
    conf=config_ptq,
)

# Persist the quantized graph for the benchmark cell that follows.
q_model.save_model_to_file("models/model2_quantized_dynamic.onnx")


In [None]:
# Report the file size of the dynamically quantized model, then benchmark it on CPU.
onnx_model_path = "models/model2_quantized_dynamic.onnx"
print(
    f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB"
)
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['CPUExecutionProvider'],
)
benchmark_session(ort_session)


In [None]:
# Calibration data for static quantization; the same loader is also passed as
# the evaluation loader below and is reused by the conservative configuration.
# NOTE(review): calibration uses the model-1 test features (X1_test) while the
# model being quantized is model 2 — confirm the input shapes match.
calib_dataset = neural_compressor.data.datasets.NumpyDataset(X1_test.numpy())
calib_dataloader = neural_compressor.data.DataLoader(
    framework='onnxruntime',
    dataset=calib_dataset,
)

# "Aggressive" static PTQ: quant_level=1 with an absolute tolerable loss of 0.05.
config_static_aggressive = neural_compressor.PostTrainingQuantConfig(
    approach="static",
    device='cpu',
    quant_level=1,
    quant_format="QOperator",
    calibration_sampling_size=128,
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"},
    accuracy_criterion=neural_compressor.config.AccuracyCriterion(
        criterion="absolute",
        tolerable_loss=0.05,
    ),
)

q_model = quantization.fit(
    model=fp32_model,
    conf=config_static_aggressive,
    calib_dataloader=calib_dataloader,
    eval_dataloader=calib_dataloader,
)

# Persist the quantized graph for the benchmark cell that follows.
q_model.save_model_to_file("models/model2_quantized_aggressive.onnx")


In [None]:
# Report the file size of the aggressively quantized model, then benchmark it on CPU.
onnx_model_path = "models/model2_quantized_aggressive.onnx"
print(
    f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB"
)
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['CPUExecutionProvider'],
)
benchmark_session(ort_session)


In [None]:
# "Conservative" static PTQ: quant_level=0 with an absolute tolerable loss of 0.01.
# Relies on `fp32_model` and `calib_dataloader` created in earlier cells.
config_static_conservative = neural_compressor.PostTrainingQuantConfig(
    approach="static",
    device='cpu',
    quant_level=0,
    quant_format="QOperator",
    calibration_sampling_size=128,
    recipes={"graph_optimization_level": "ENABLE_EXTENDED"},
    accuracy_criterion=neural_compressor.config.AccuracyCriterion(
        criterion="absolute",
        tolerable_loss=0.01,
    ),
)

q_model = quantization.fit(
    model=fp32_model,
    conf=config_static_conservative,
    calib_dataloader=calib_dataloader,
    eval_dataloader=calib_dataloader,
)

# Persist the quantized graph for the benchmark cell that follows.
q_model.save_model_to_file("models/model2_quantized_conservative.onnx")


In [None]:
# Report the file size of the conservatively quantized model, then benchmark it on CPU.
onnx_model_path = "models/model2_quantized_conservative.onnx"
print(
    f"Model Size on Disk: {os.path.getsize(onnx_model_path) / 1e6:.2f} MB"
)
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['CPUExecutionProvider'],
)
benchmark_session(ort_session)
