In [1]:
import importlib.resources

import seaborn as sns
import yaml

from jlab_datascience_toolkit.data_prep import make as make_prep
from jlab_datascience_toolkit.models import make as make_model
from jlab_datascience_toolkit.trainers import make as make_trainer
from jlab_datascience_toolkit.analysis import make as make_analysis

In [2]:
# Load the iris measurements and attach an integer-coded copy of the species label.
df = sns.load_dataset('iris')

# Codes follow the order in which each species name first appears in the frame;
# `classes_list` holds the resulting (name, code) pairs.
species_names = df['species'].unique().tolist()
classes_list = list(zip(species_names, range(len(species_names))))
df['species_int'] = df['species'].map(dict(classes_list))

# Display a random handful of rows as a sanity check.
df.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_int
47,4.6,3.2,1.4,0.2,setosa,0
57,4.9,2.4,3.3,1.0,versicolor,1
116,6.5,3.0,5.5,1.8,virginica,2
140,6.7,3.1,5.6,2.4,virginica,2
56,6.3,3.3,4.7,1.6,versicolor,1
24,4.8,3.4,1.9,0.2,setosa,0
5,5.4,3.9,1.7,0.4,setosa,0
62,6.0,2.2,4.0,1.0,versicolor,1
106,4.9,2.5,4.5,1.7,virginica,2
126,6.2,2.8,4.8,1.8,virginica,2


In [3]:
# Configure the registered dataframe splitter: which columns are features,
# which column is the target, and the row fractions of the three partitions.
split_configs = {
    "feature_columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"],
    "target_columns": "species_int",
    "rows_fractions": [0.7, 0.15, 0.15],
}
prep = make_prep("SplitDataFrame_v0", configs=split_configs)

# The splitter returns the feature partitions first, then the target partitions.
x_train, x_val, x_test, y_train, y_val, y_test = prep.run(df)

# Display the shape of every partition.
tuple(part.shape for part in (x_train, x_val, x_test, y_train, y_val, y_test))

Attempting to load jlab_datascience_toolkit.data_prep.split_dataframe_v0 with SplitDataFrame


((105, 4), (22, 4), (23, 4), (105,), (22,), (23,))

In [4]:
# TODO: Replace this with the registered standard scaler
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to the training, validation and test features.
scaler = StandardScaler().fit(x_train)
x_train, x_val, x_test = (
    scaler.transform(split) for split in (x_train, x_val, x_test)
)

In [5]:
# Locate the example config relative to the `jlab_datascience_toolkit` package
# rather than through a user-specific absolute path, so the notebook runs on any
# machine where the toolkit is importable.
# NOTE(review): assumes `cfgs/defaults/example_cfg.yaml` ships inside the package
# directory (true for a source checkout / editable install) — confirm for wheels.
cfg_resource = (
    importlib.resources.files('jlab_datascience_toolkit')
    / 'cfgs' / 'defaults' / 'example_cfg.yaml'
)
with cfg_resource.open('r') as file:
    configs = yaml.safe_load(file)

# Pull out the per-stage sections once the file has been parsed and closed.
model_configs = configs['model_configs']
trainer_configs = configs['trainer_configs']
analysis_configs = configs['analysis_configs']

In [6]:
# Instantiate the model and the trainer from the names registered in the config.
model = make_model(
    model_configs['registered_name'],
    configs=model_configs,
)
trainer = make_trainer(
    trainer_configs['registered_name'],
    configs=trainer_configs,
)

# Fit on the training split, passing the validation split as validation data.
history = trainer.fit(
    model=model,
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
)

Attempting to load jlab_datascience_toolkit.models.keras_mlp_v0 with KerasMLP


2024-11-13 10:17:21.385765: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-13 10:17:21.404522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731511041.417688  900056 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731511041.421588  900056 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-13 10:17:21.438189: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Attempting to load jlab_datascience_toolkit.trainers.keras_trainer_v0 with Trainer
Epoch 1/400


2024-11-13 10:17:23.225578: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 1.3176 - val_loss: 0.9536 - learning_rate: 0.0100
Epoch 2/400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.7311 - val_loss: 0.8194 - learning_rate: 0.0100
Epoch 3/400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.5334 - val_loss: 0.7115 - learning_rate: 0.0100
Epoch 4/400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.4608 - val_loss: 0.6369 - learning_rate: 0.0100
Epoch 5/400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.3908 - val_loss: 0.5947 - learning_rate: 0.0100
Epoch 6/400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.3138 - val_loss: 0.5562 - learning_rate: 0.0100
Epoch 7/400
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.2675 - val_loss: 0.5241 - learning_rate: 0.0100
Epoch 8/400
[1m4/

In [7]:
# Predict per-class scores for the test split, shape (n_samples, c_classes),
# and keep the index of the highest-scoring class per sample, shape (n_samples).
y_pred = model.predict(x_test).argmax(axis=1)

# Display the true labels next to the predicted ones.
y_test, y_pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step


(array([2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 0, 1, 1, 1, 2, 1, 2, 0,
        2]),
 array([2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 0, 1, 1, 1, 2, 1, 2, 0,
        2]))

In [8]:
# Integer labels 0..k-1, one for each entry of `classes_list`.
class_labels = list(range(len(classes_list)))

# Build the analysis module registered in the config and run it on the
# true vs. predicted test labels.
multiclass_ana = make_analysis(
    analysis_configs["registered_name"],
    configs=analysis_configs,
)
multiclass_ana.run(y_test, y_pred, labels=class_labels)

Attempting to load jlab_datascience_toolkit.analysis.multiclass_analysis_v0 with Analysis


[array([[ 3,  0,  0],
        [ 0,  7,  0],
        [ 0,  0, 13]]),
 1.0,
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0)]