In [1]:
import os
import time
import numpy as np
import pandas as pd
import torch
import onnx
import onnxruntime as ort
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler


In [2]:
# Dataset root; set the NBA_DATA_DIR environment variable to point somewhere else.
base_data_dir = os.getenv("NBA_DATA_DIR", "nba_data")

# Model-1 feature frames, with the gameId column removed right after loading.
X1_train = pd.read_csv(os.path.join(base_data_dir, 'train/X_train_model1.csv')).drop('gameId', axis=1)
X1_test = pd.read_csv(os.path.join(base_data_dir, 'test/X_test_model1.csv')).drop('gameId', axis=1)

# Model-1 targets and the full stats table.
Y1_train = pd.read_csv(os.path.join(base_data_dir, 'train/Y_train_model1.csv'))
Y1_test = pd.read_csv(os.path.join(base_data_dir, 'test/Y_test_model1.csv'))
full1_df = pd.read_csv(os.path.join(base_data_dir, 'train/full_stats.csv'))

# Capture the feature column names and the game identifiers while the data is
# still in DataFrame form; the frames are replaced by tensors just below.
X_save_cols = X1_train.columns
game_ids = full1_df['gameId'].values

# Replace each frame with a float32 tensor holding the same values.
X1_train, X1_test, Y1_train, Y1_test = (
    torch.tensor(frame.to_numpy(), dtype=torch.float32)
    for frame in (X1_train, X1_test, Y1_train, Y1_test)
)

# Pair features with targets and wrap them in loaders; only the training
# loader shuffles.
train1_data, test1_data = TensorDataset(X1_train, Y1_train), TensorDataset(X1_test, Y1_test)
train1_loader = DataLoader(train1_data, batch_size=32, shuffle=True)
test1_loader = DataLoader(test1_data, batch_size=32, shuffle=False)


In [3]:
model_path = "models/point_diff.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The checkpoint is used as a ready-to-call module (``.eval()`` and a forward
# call below), i.e. it is a fully pickled model rather than a state_dict.
# ``weights_only`` is therefore set to False explicitly: PyTorch 2.6+ defaults
# it to True, which refuses to unpickle arbitrary module classes.
# SECURITY: weights_only=False runs the pickle machinery and can execute
# arbitrary code -- only load checkpoints from a trusted source.
model1 = torch.load(model_path, map_location=device, weights_only=False)
model1.eval()

predictions = []
# Select from the full stats table the same feature columns the model-1
# training frame had, and move them to the inference device as float32.
extract_df = full1_df[X_save_cols].values
X_tensor = torch.tensor(extract_df, dtype=torch.float32, device=device)
with torch.no_grad():
    # Processing in batches
    batch_size = 32

    for i in range(0, len(X_tensor), batch_size):
        batch = X_tensor[i:i+batch_size]

        # Forward pass, then bring the result back to the host as numpy
        batch_preds = model1(batch)
        predictions.append(batch_preds.cpu().numpy())

# Combine all predictions
all_predictions = np.concatenate(predictions, axis=0)

# One row per row of full1_df: its gameId next to the model's prediction.
result_df = pd.DataFrame({
    'gameId': game_ids,
    'predicted_point_diff': all_predictions.flatten()
})

  model1 = torch.load(model_path, map_location=device)


In [4]:
# Model-2 features: load each split, attach the predicted_point_diff column from
# result_df by joining on gameId, then drop the join key.
X2_train = (
    pd.read_csv(os.path.join(base_data_dir, 'train/X_train_model2.csv'))
    .merge(result_df, on='gameId', how='inner')
    .drop('gameId', axis=1)
)
X2_test = (
    pd.read_csv(os.path.join(base_data_dir, 'test/X_test_model2.csv'))
    .merge(result_df, on='gameId', how='inner')
    .drop('gameId', axis=1)
)

# Model-2 targets are loaded as-is.
Y2_train = pd.read_csv(os.path.join(base_data_dir, 'train/Y_train_model2.csv'))
Y2_test = pd.read_csv(os.path.join(base_data_dir, 'test/Y_test_model2.csv'))

# Full attendance table joined with the same predictions (gameId is kept here).
full2_df = pd.read_csv(os.path.join(base_data_dir, 'train/full_attendance.csv')).merge(
    result_df, on='gameId', how='inner'
)

# Replace each split with a float32 tensor holding the same values.
X2_train, X2_test, Y2_train, Y2_test = (
    torch.tensor(frame.to_numpy(), dtype=torch.float32)
    for frame in (X2_train, X2_test, Y2_train, Y2_test)
)

# Datasets and loaders; only the training loader shuffles.
train2_data, test2_data = TensorDataset(X2_train, Y2_train), TensorDataset(X2_test, Y2_test)
train2_loader = DataLoader(train2_data, batch_size=32, shuffle=True)
test2_loader = DataLoader(test2_data, batch_size=32, shuffle=False)

In [5]:
def benchmark_session(ort_session, loader=None, samples=None):
    """Report accuracy, latency and throughput of an ONNX Runtime session.

    Prints the session's execution providers, the mean squared error over
    ``loader``, the median single-sample latency with the matching throughput,
    and the throughput for a batch made of the first 32 rows of ``samples``.

    Args:
        ort_session: Object exposing ``get_providers()``, ``get_inputs()`` and
            ``run(output_names, feed)``, such as an
            ``onnxruntime.InferenceSession``. Only its first input is fed.
        loader: Iterable of ``(features, labels)`` torch tensor pairs used for
            the error metric. Defaults to the notebook-level ``test2_loader``.
        samples: Torch tensor of model inputs used for the timing runs.
            Defaults to the notebook-level ``X2_test``.

    Returns:
        None. All results are printed.
    """
    # Fall back to the notebook's model-2 test split when no data is passed in.
    if loader is None:
        loader = test2_loader
    if samples is None:
        samples = X2_test

    print(f"Execution provider: {ort_session.get_providers()}")

    # The input name does not change between runs, so look it up once.
    input_name = ort_session.get_inputs()[0].name

    def _time_runs(feed, repeats):
        """Run ``feed`` once as warm-up, then return ``repeats`` timed durations."""
        ort_session.run(None, feed)
        durations = []
        for _ in range(repeats):
            # perf_counter is monotonic and high resolution, which matters for
            # sub-millisecond runs where time.time() can be too coarse.
            start = time.perf_counter()
            ort_session.run(None, feed)
            durations.append(time.perf_counter() - start)
        return durations

    # Accuracy: squared error summed over every batch, divided by the number
    # of label rows. Accumulated in float64 to limit rounding on large sums.
    total_sq_err = 0.0
    total = 0
    for features, labels in loader:
        outputs = ort_session.run(None, {input_name: features.numpy()})[0]
        diff = outputs.astype(np.float64) - labels.numpy()
        total_sq_err += float((diff ** 2).sum())
        total += labels.size(0)
    # An empty loader has no defined error; report nan instead of dividing by 0.
    mse = total_sq_err / total if total else float("nan")
    print(f"Mean Squared Error (MSE): {mse:.2f}")

    # Latency and throughput for one sample at a time.
    num_trials = 100
    single_sample = samples[0].unsqueeze(0).numpy()
    latencies = _time_runs({input_name: single_sample}, num_trials)
    print(f"Inference Latency (median): {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"Inference Throughput: {num_trials / np.sum(latencies):.2f} FPS")

    # Throughput for a batch of up to 32 samples.
    num_batches = 50
    batch_input = samples[:32].numpy()
    batch_times = _time_runs({input_name: batch_input}, num_batches)
    print(f"Batch Throughput: {(batch_input.shape[0] * num_batches) / np.sum(batch_times):.2f} FPS")


In [6]:
# Benchmark the exported model-2 graph with the CPU execution provider requested.
onnx_model_path = "models/model2.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['CPUExecutionProvider'],
)
benchmark_session(ort_session)


Execution provider: ['CPUExecutionProvider']
Mean Absolute Error (MAE): 1363096.76
Inference Latency (median): 0.11 ms
Inference Throughput: 8797.15 FPS
Batch Throughput: 64025.40 FPS


In [None]:
# Benchmark the same model with the CUDA execution provider requested.
onnx_model_path = "models/model2.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['CUDAExecutionProvider'],
)
benchmark_session(ort_session)
# Cell output: whatever onnxruntime reports from get_device().
ort.get_device()


In [None]:
# Benchmark the same model with the TensorRT execution provider requested.
onnx_model_path = "models/model2.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['TensorrtExecutionProvider'],
)
benchmark_session(ort_session)
# Cell output: whatever onnxruntime reports from get_device().
ort.get_device()


In [None]:
# Benchmark the same model with the OpenVINO execution provider requested.
onnx_model_path = "models/model2.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path,
    providers=['OpenVINOExecutionProvider'],
)
benchmark_session(ort_session)
# Cell output: whatever onnxruntime reports from get_device().
ort.get_device()
