Note: these exercises exist only to validate inference time and to explore inference-time improvements using parallelization techniques. Many practices that would be wrong for real training and testing are therefore used on purpose (e.g. deliberately duplicating rows to artificially enlarge the dataset, and testing on the same data used for training).

In [48]:
import os
import sys
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(repo_root)

import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import datetime



# Number of rows the loaded dataset is resized to (by repeating/sampling rows).
NUM_ROWS = 10_000_000
# Number of jobs supported within the model itself.
# os.cpu_count() returns None when the core count cannot be determined, so fall
# back to a single job rather than handing None to the models.
N_JOBS_MODEL = os.cpu_count() or 1
# Project root, resolved as three levels above the current working directory.
# NOTE(review): this depends on where the kernel was started - confirm the
# notebook is always launched from the expected folder.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
DATASET_PATH = os.path.join(ROOT_DIR, "data", "healthcare_noshows_appointments.csv")
MODELSAVE_PATH = os.path.join(ROOT_DIR, "src", "ml", "saved_models")

def pipeline(X, model):
    """Wrap ``model`` in a scikit-learn Pipeline that preprocesses the columns of ``X``.

    Columns are routed by the dtypes found on the ``X`` passed in here:

    * ``category`` columns are one-hot encoded; categories that were not seen
      during fitting are ignored at transform time (``handle_unknown='ignore'``).
    * ``int64`` / ``float64`` / ``bool`` columns are standardised.

    Columns of any other dtype (e.g. datetimes) are not listed in either
    transformer, so they do not reach the model (ColumnTransformer's default
    ``remainder='drop'``).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column names and dtypes decide the preprocessing. It is
        only inspected here, not fitted on.
    model : estimator
        Final estimator appended after the preprocessing step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocess'`` and ``'logreg'``.
        The final step keeps the name ``'logreg'`` whatever ``model`` is, so
        existing lookups by step name keep working.
    """
    categorical_features = [col for col in X.columns if X[col].dtype == 'category']
    numeric_features = [col for col in X.columns if X[col].dtype in ['int64', 'float64', 'bool']]

    preprocessor = ColumnTransformer(
        transformers=[
            # Use the detected categorical columns instead of hard-coded names,
            # so the helper also works on frames with a different column set.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('num', StandardScaler(), numeric_features)
        ]
    )
    modelwpipeline = Pipeline([
        ('preprocess', preprocessor),
        ('logreg', model)
    ])
    return modelwpipeline

#### basic data processing

The dataset is over- or under-sampled to `NUM_ROWS` rows, for speed testing only.

In [49]:
# Load the appointments CSV: identifier and categorical text columns are read
# as pandas categories, the two day columns are parsed as datetimes.
df = pd.read_csv(
    DATASET_PATH,
    dtype={
        'PatientId': 'category',
        'AppointmentID': 'category',
        'Gender': 'category',
        'Neighbourhood': 'category',
    },
    parse_dates=['ScheduledDay', 'AppointmentDay'],
)
# Identifier columns are not used as features.
df = df.drop(columns=['AppointmentID', 'PatientId'])

# Resize the frame to exactly NUM_ROWS rows: repeat it whole as many times as it
# fits, then top up with a random sample of the remaining count.
n_rows = df.shape[0]
nrepeats, remainder = divmod(NUM_ROWS, n_rows)
df = pd.concat(
    # Fixed seed so every run builds the same dataset and timings stay comparable.
    [df] * nrepeats + [df.sample(remainder, random_state=42)],
    ignore_index=True,
)

# DataFrame.info() prints its report and returns None, so it is called directly
# instead of being wrapped in display().
df.info()
display(df.head())

# Target and feature frame (the features still contain the datetime columns).
y = df["Showed_up"]
X = df.drop(columns=["Showed_up"])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 13 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Gender          category      
 1   ScheduledDay    datetime64[ns]
 2   AppointmentDay  datetime64[ns]
 3   Age             int64         
 4   Neighbourhood   category      
 5   Scholarship     bool          
 6   Hipertension    bool          
 7   Diabetes        bool          
 8   Alcoholism      bool          
 9   Handcap         bool          
 10  SMS_received    bool          
 11  Showed_up       bool          
 12  Date.diff       int64         
dtypes: bool(7), category(2), datetime64[ns](2), int64(2)
memory usage: 391.0 MB


None

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Showed_up,Date.diff
0,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,False,True,False,False,False,False,True,0
1,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,False,False,False,False,False,False,True,0
2,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,False,False,False,False,False,False,True,0
3,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,False,False,False,False,False,False,True,0
4,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,False,True,True,False,False,False,True,0


# SKLearn Models

#### Logistic Regression

In [61]:
# MODEL DEFINITION
# NOTE: run this cell before the LightGBM data-preparation cell below, which
# rebinds X without the 'Gender' / 'Neighbourhood' columns.
from sklearn.linear_model import LogisticRegression
# NOTE(review): n_jobs may have no effect for a binary lbfgs fit - check the
# installed scikit-learn version's documentation.
model = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight="balanced", n_jobs=N_JOBS_MODEL)
modelp = pipeline(X, model)

# TRAINING
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
modelp.fit(X, y)
end_time = time.perf_counter()
print(f"Training time: {end_time - start_time} seconds")

# SAVING
# Create the target folder first so the dump does not fail on a fresh checkout.
os.makedirs(MODELSAVE_PATH, exist_ok=True)
datetime_now = datetime.datetime.now().strftime("%H%M_%d%m%Y")
model_name = model.__class__.__name__ + f"_{datetime_now}.joblib"
model_save_path = os.path.join(MODELSAVE_PATH, model_name)
joblib.dump(modelp, model_save_path)


Training time: 5.957391023635864 seconds


['c:\\Users\\jjaramil\\OneDrive - InterSystems Corporation\\Documents\\model_parallelization\\models\\LogisticRegression_1150_28112025.joblib']

# LightGBM

## Additional data preparation for LightGBM

In [50]:
# DATA PREPARATION (slight difference for LightGBM)
date_cols = ["ScheduledDay", "AppointmentDay"]

# Rebuild the feature frame from `df` instead of modifying the current X in
# place. The cell then gives the same result every time it is run; otherwise a
# second run would fail because the date columns have already been dropped.
X = df.drop(columns=["Showed_up"])

# 1. Extract useful components
for col in date_cols:
    stamps = X[col].dt
    hour = stamps.hour
    day_of_week = stamps.dayofweek  # 0=Mon, 6=Sun

    X[col + "_year"] = stamps.year
    X[col + "_month"] = stamps.month
    X[col + "_day"] = stamps.day
    X[col + "_dow"] = day_of_week
    X[col + "_hour"] = hour
    X[col + "_is_weekend"] = (day_of_week >= 5).astype("int8")

    # Optional: Part-of-day feature
    X[col + "_part_of_day"] = pd.cut(
        hour,
        bins=[-1, 6, 12, 17, 24],
        labels=[0, 1, 2, 3],        # 0=night,1=morning,2=afternoon,3=evening
        ordered=True
    ).astype("int8")

# 2. Drop the raw datetime columns, and the categorical features (just for now)
X = X.drop(columns=date_cols + ["Gender", "Neighbourhood"])
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 22 columns):
 #   Column                      Dtype
---  ------                      -----
 0   Age                         int64
 1   Scholarship                 bool 
 2   Hipertension                bool 
 3   Diabetes                    bool 
 4   Alcoholism                  bool 
 5   Handcap                     bool 
 6   SMS_received                bool 
 7   Date.diff                   int64
 8   ScheduledDay_year           int32
 9   ScheduledDay_month          int32
 10  ScheduledDay_day            int32
 11  ScheduledDay_dow            int32
 12  ScheduledDay_hour           int32
 13  ScheduledDay_is_weekend     int8 
 14  ScheduledDay_part_of_day    int8 
 15  AppointmentDay_year         int32
 16  AppointmentDay_month        int32
 17  AppointmentDay_day          int32
 18  AppointmentDay_dow          int32
 19  AppointmentDay_hour         int32
 20  AppointmentDay_is_weeke

## Training and saving

In [51]:
import lightgbm as lgb
import gc


# MODEL DEFINITION
# free_raw_data=True lets LightGBM release its own reference to the raw data
# once the inner Dataset has been constructed, so no manual reset is needed.
train_data = lgb.Dataset(X, label=y, free_raw_data=True)
# The validation set is deliberately the training data again: this notebook
# measures speed only, not model quality.
validation_data = lgb.Dataset(X, label=y, reference=train_data)
gc.collect()
base_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_jobs': N_JOBS_MODEL,
}

# NOTE: despite the name, these parameters currently select the CPU device.
# NOTE(review): the commented-out ids and "gpu_use_dp" presumably only matter
# once "device_type" is switched to a GPU backend - confirm before enabling.
gpu_params = base_params | {
    "device_type": "cpu",
    # 'gpu_platform_id': 0,
    # 'gpu_device_id': 0,
    "max_bin": 15,
    "gpu_use_dp": False
}

# TRAINING
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted.
num_round = 10
start_time = time.perf_counter()
bst = lgb.train(gpu_params, train_data, num_round, valid_sets=[validation_data])
end_time = time.perf_counter()
print(f"Training time: {end_time - start_time} seconds")

# TEST INFERENCE TIME
start_time = time.perf_counter()
y_pred = bst.predict(X)
end_time = time.perf_counter()
print(f"Inference time: {end_time - start_time} seconds")

# SAVING
# Create the target folder first so saving does not fail on a fresh checkout.
os.makedirs(MODELSAVE_PATH, exist_ok=True)
datetime_now = datetime.datetime.now().strftime("%H%M_%d%m%Y")
model_name = "LightGBM_" + f"{datetime_now}.txt"
model_save_path = os.path.join(MODELSAVE_PATH, model_name)
# Trailing ';' keeps the returned Booster repr out of the cell output.
bst.save_model(model_save_path);

[LightGBM] [Info] Number of positive: 7973594, number of negative: 2026406
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.239529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 10000000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.797359 -> initscore=1.369872
[LightGBM] [Info] Start training from score 1.369872
Training time: 4.935598611831665 seconds
Inference time: 0.9510607719421387 seconds


<lightgbm.basic.Booster at 0x23a7091b1d0>

## ONNX Acceleration

In [38]:
# Show the path of the most recently saved model.
# NOTE(review): the ONNX conversion cell below converts the in-memory `bst`,
# not a file loaded from this path - confirm whether loading is done elsewhere.
model_save_path # Reassign here if a different saved model should be loaded

'c:\\Users\\jjaramil\\OneDrive - InterSystems Corporation\\Documents\\model_parallelization\\src\\ml\\saved_models\\LightGBM_1207_04122025.txt'

In [52]:
import numpy as np
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType

# CONVERSION TO ONNX
# One float32 input tensor: any number of rows, one column per feature of X.
initial_type = [('input', FloatTensorType([None, X.shape[1]]))]
# zipmap=False keeps the class probabilities as a plain tensor. The default
# ZipMap output is a list with one dict per row, which is expensive to build
# for millions of rows and distorts the inference-time comparison.
onnx_model = onnxmltools.convert_lightgbm(
    bst,
    initial_types=initial_type,
    zipmap=False,
)

# SAVING ONNX MODEL
# Create the target folder first so saving does not fail on a fresh checkout.
os.makedirs(MODELSAVE_PATH, exist_ok=True)
# Reuses the timestamp of the LightGBM training cell so both files share a name.
onnx_model_name = "LightGBM_" + f"{datetime_now}.onnx"
onnx_model_save_path = os.path.join(MODELSAVE_PATH, onnx_model_name)
onnxmltools.utils.save_model(onnx_model, onnx_model_save_path)

In [53]:
# INFERENCE TEST
import onnxruntime as ort

# Load onnx model with providers, listed in order of preference
session = ort.InferenceSession(
    onnx_model_save_path,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

# Check which provider is actually used
print("Providers:", session.get_providers())
print("Using:", session.get_provider_options())

# The model was converted with a float tensor input, so cast the features to
# float32. The conversion happens before the timer starts and is not measured.
X_test_onnx = X.to_numpy(dtype=np.float32)   # or: X.values.astype(np.float32)

input_name = session.get_inputs()[0].name
print("Input name:", input_name)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
# NOTE(review): only the first output is kept - presumably the predicted labels
# rather than probabilities; verify with session.get_outputs().
preds = session.run(
    None,  # means "return all outputs"
    {input_name: X_test_onnx}
)[0]
end_time = time.perf_counter()
print(f"ONNX Inference time: {end_time - start_time} seconds")



Providers: ['CPUExecutionProvider']
Using: {'CPUExecutionProvider': {}}
Input name: input
ONNX Inference time: 9.925181865692139 seconds


# FIL Acceleration

In [54]:
from cuml import ForestInference


ModuleNotFoundError: No module named 'cuml'