In [1]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.metrics import roc_auc_score

import tensorflow as tf

from armed.models.mlp_classifiers import BaseMLP, MixedEffectsMLP, MixedEffectsMLPNonlinearSlope

from armed.misc import expand_results_path, make_random_onehot

 The versions of TensorFlow you are currently using is 2.12.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [None]:
# !pip install --upgrade tensorflow==2.12 --user

In [2]:
RS = 42
test_ratio = 0.1
val_ratio = 0.1

In [4]:
# !pip install protobuf==3.20 --user



### Download and preprocess example data

In [3]:
import xmltodict
import os
import arff # make sure to pip install liac-arff
import urllib.request
import urllib3
import pandas as pd

def getxml(url):
    # From https://stackoverflow.com/questions/24124643/parse-xml-from-url-into-python-object
    http = urllib3.PoolManager()

    response = http.request('GET', url)
    try:
        data = xmltodict.parse(response.data)
    except:
        print("Failed to parse xml from response (%s)" % traceback.format_exc())
    return data

In [4]:
dataset_ids = {"road-safety-drivers-sex": 41447}
dataset_name = "road-safety-drivers-sex"        

if not os.path.exists(f"./{dataset_name}"):
    os.mkdir(f"./{dataset_name}")

if not os.path.exists(f"./{dataset_name}/{dataset_name}.csv"):
    print(f"Download {dataset_name} dataset...")
    xml_data = getxml(f"https://api.openml.org/api/v1/data/{dataset_ids[dataset_name]}")
    url = xml_data['oml:data_set_description']["oml:url"]

    urllib.request.urlretrieve(url, f"./{dataset_name}/{dataset_name}.arff")

    try:
        dataset = arff.load(open(f"./{dataset_name}/{dataset_name}.arff", 'rt'))
    except:
        print(f"Unable to download {dataset_name}")
    data = pd.DataFrame(dataset['data'], columns=[i[0] for i in dataset["attributes"]])

    data.to_csv(f"./{dataset_name}/{dataset_name}.csv")
    print(f"Finished {dataset_name} dataset Download")


In [5]:
df = pd.read_csv(f"./{dataset_name}/{dataset_name}.csv", index_col=0)

df.drop(["model"], axis=1, inplace = True)



In [6]:
df

Unnamed: 0,Was_Vehicle_Left_Hand_Drive,Engine_Capacity,Propulsion_Code,Age_of_Vehicle,make,Sex_of_Driver
0,1.0,,,,,male
1,1.0,124.0,1.0,4.0,YAMAHA,male
2,1.0,1461.0,2.0,1.0,MERCEDES,female
3,1.0,1390.0,1.0,5.0,VOLKSWAGEN,male
4,1.0,6692.0,2.0,7.0,OTHER BRITISH,male
...,...,...,...,...,...,...
233959,1.0,1598.0,1.0,14.0,MINI,female
233960,1.0,1598.0,2.0,,VAUXHALL,female
233961,1.0,,,,,male
233962,1.0,1796.0,1.0,13.0,VAUXHALL,male


In [7]:
df = df.dropna()
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Was_Vehicle_Left_Hand_Drive,Engine_Capacity,Propulsion_Code,Age_of_Vehicle,make,Sex_of_Driver
0,1.0,124.0,1.0,4.0,YAMAHA,male
1,1.0,1461.0,2.0,1.0,MERCEDES,female
2,1.0,1390.0,1.0,5.0,VOLKSWAGEN,male
3,1.0,6692.0,2.0,7.0,OTHER BRITISH,male
4,1.0,1596.0,1.0,11.0,BMW,male
...,...,...,...,...,...,...
176644,1.0,1598.0,2.0,1.0,NISSAN,female
176645,1.0,1422.0,2.0,8.0,SEAT,female
176646,1.0,1598.0,1.0,14.0,MINI,female
176647,1.0,1796.0,1.0,13.0,VAUXHALL,male


In [8]:
df_original = df.copy()

In [9]:
df_dummy = df_original.copy()

In [10]:
for dum in [f"dummy_{no}" for no in range(5)]:
    df_dummy[dum] = np.random.randint(0, np.random.randint(1, 6, size=1), size=len(df))
    df_dummy[dum] = df_dummy[dum].astype(float)

In [11]:
type(df_dummy.dummy_4.values[1])

numpy.float64

In [12]:
df = df_dummy

In [13]:
# 1. Identify target
y_col = "Sex_of_Driver"
# 2. Identify binary columns = zwei Ausprägungen
bin_cols = list(set(df.nunique()[df.nunique() == 2].index) - set([y_col]))
# 3. Identify high cardinality = dytpes==object & >hct Ausprägunge
z_cols = list(df.nunique()[np.logical_and(df.nunique() >= 10, df.dtypes == "object")].index)
# 4. Identify cat cols = Rest dytpes==object
cat_cols = list(set(df.dtypes[df.dtypes == "object"].index) - set([y_col] + bin_cols + z_cols))
# 5. Rest is numeric
numeric_cols = list(set(df.columns[df.dtypes != "object"]) - set([y_col]) - set(bin_cols))
# 6. Label encode dtypes==object

le_ = LabelEncoder()
df[y_col] = le_.fit_transform(df[y_col].astype(str))


In [14]:
bin_cols

['dummy_0', 'Was_Vehicle_Left_Hand_Drive']

In [15]:
encoder = OrdinalEncoder()
df['make'] = encoder.fit_transform(df['make'].astype(str).values.reshape(-1,1))
df

Unnamed: 0,Was_Vehicle_Left_Hand_Drive,Engine_Capacity,Propulsion_Code,Age_of_Vehicle,make,Sex_of_Driver,dummy_0,dummy_1,dummy_2,dummy_3,dummy_4
0,1.0,124.0,1.0,4.0,319.0,1,1.0,0.0,3.0,0.0,0.0
1,1.0,1461.0,2.0,1.0,190.0,0,0.0,0.0,0.0,0.0,2.0
2,1.0,1390.0,1.0,5.0,309.0,1,1.0,0.0,0.0,0.0,1.0
3,1.0,6692.0,2.0,7.0,223.0,1,1.0,0.0,2.0,0.0,1.0
4,1.0,1596.0,1.0,11.0,27.0,1,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
176644,1.0,1598.0,2.0,1.0,214.0,0,1.0,0.0,3.0,0.0,1.0
176645,1.0,1422.0,2.0,8.0,264.0,0,1.0,0.0,3.0,0.0,0.0
176646,1.0,1598.0,1.0,14.0,195.0,0,1.0,0.0,0.0,0.0,1.0
176647,1.0,1796.0,1.0,13.0,304.0,1,0.0,0.0,0.0,0.0,0.0


In [16]:
# Split data and target
y = df[y_col]
X = df.drop(y_col, axis=1)

test_indices = X.sample(frac=test_ratio, random_state=RS).index
split = [(np.array(list(set(X.index).difference(test_indices))), np.array(test_indices))]

for num, (train_indices, test_indices) in enumerate(split):
    X_train = X.loc[train_indices]
    y_train = y.loc[train_indices]
    X_test = X.loc[test_indices]
    y_test = y.loc[test_indices]
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio, random_state=RS, shuffle=True)


In [17]:
X_train

Unnamed: 0,Was_Vehicle_Left_Hand_Drive,Engine_Capacity,Propulsion_Code,Age_of_Vehicle,make,dummy_0,dummy_1,dummy_2,dummy_3,dummy_4
152391,1.0,1968.0,2.0,3.0,309.0,0.0,0.0,0.0,0.0,3.0
88258,1.0,4398.0,1.0,13.0,27.0,1.0,0.0,0.0,0.0,3.0
45646,1.0,748.0,1.0,6.0,142.0,0.0,0.0,0.0,0.0,0.0
151722,1.0,2099.0,1.0,14.0,126.0,1.0,0.0,3.0,0.0,1.0
137293,1.0,2401.0,2.0,11.0,310.0,1.0,0.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
133152,1.0,2148.0,2.0,7.0,190.0,0.0,0.0,2.0,0.0,2.0
115191,1.0,2664.0,2.0,16.0,171.0,1.0,0.0,2.0,0.0,0.0
146588,1.0,2198.0,2.0,1.0,52.0,0.0,0.0,2.0,0.0,0.0
163195,1.0,1299.0,1.0,14.0,87.0,1.0,0.0,1.0,0.0,1.0


In [20]:
X_train.dummy_1.value_counts()

0.0    143085
Name: dummy_1, dtype: int64

In [None]:
numeric_cols = ["Engine_Capacity", "Age_of_Vehicle", ""]

In [18]:
scaler = StandardScaler()
# fit and transform scaler on X_train and X_test
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [19]:
X_train

Unnamed: 0,Was_Vehicle_Left_Hand_Drive,Engine_Capacity,Propulsion_Code,Age_of_Vehicle,make,dummy_0,dummy_1,dummy_2,dummy_3,dummy_4
152391,1.0,0.055392,0.547571,-0.991562,309.0,0.0,0.0,-1.347355,0.0,1.345900
88258,1.0,1.535496,-0.567974,0.954882,27.0,1.0,0.0,-1.347355,0.0,1.345900
45646,1.0,-0.687706,-0.567974,-0.407629,142.0,0.0,0.0,-1.347355,0.0,-1.339475
151722,1.0,0.135183,-0.567974,1.149526,126.0,1.0,0.0,1.340312,0.0,-0.444350
137293,1.0,0.319131,0.547571,0.565593,310.0,1.0,0.0,0.444423,0.0,-0.444350
...,...,...,...,...,...,...,...,...,...,...
133152,1.0,0.165029,0.547571,-0.212984,190.0,0.0,0.0,0.444423,0.0,0.450775
115191,1.0,0.479323,0.547571,1.538814,171.0,1.0,0.0,0.444423,0.0,-1.339475
146588,1.0,0.195484,0.547571,-1.380850,52.0,0.0,0.0,0.444423,0.0,-1.339475
163195,1.0,-0.352094,-0.567974,1.149526,87.0,1.0,0.0,-0.451466,0.0,-0.444350


In [188]:
unknown = []
X_val_make = X_val['make'].unique()
for val in X_val_make:
    if val not in X_train['make'].unique():
        unknown.append(val)
unknown

['HUONIAU', 'LONCIN', 'TOMOS', 'OTHER COUNTRIES']

In [152]:
X_test.loc[X_test['make'].isin(["CZ", "HYMER"]), 'make']

194911    HYMER
167470       CZ
Name: make, dtype: object

In [45]:
# le_ = OrdinalEncoder()
# X_train['make'] = le_.fit_transform(X_train['make'].astype(str).values.reshape(-1,1))
# X_test['make'] = le_.transform(X_test['make'].astype(str).values.reshape(-1,1))
for i in X_test['make'].astype(str).values.reshape(-1,1):
    print(type(i))
    break

<class 'numpy.ndarray'>


In [10]:
for col in cat_cols + z_cols + bin_cols:
    print(col)

make
Was_Vehicle_Left_Hand_Drive


In [184]:
# label encode categorical features
bin_impute = {}
for col in cat_cols + z_cols + bin_cols:
#     le_ = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2, encoded_missing_value=-1)
    le_ = LabelEncoder()
    X_train[col] = le_.fit_transform(X_train[col].astype(str).values.reshape(-1,1))
    X_val[col]   = le_.transform(X_val[col].astype(str).values.reshape(-1,1))
    X_test[col]  = le_.transform(X_test[col].astype(str).values.reshape(-1,1))

    if col in z_cols+cat_cols:
        # Recode categorical column missings as new category
        X_train.loc[X_train[col]==-1,col] = X_train[col].max()+1
        X_val.loc[X_val[col]==-1,col] = X_train[col].max()+1
        X_test.loc[X_test[col]==-1,col] = X_train[col].max()+1

        # Recode categorical column unknown categories as new category
        X_train.loc[X_train[col] == -2, col] = X_train[col].max() + 2
        X_val.loc[X_val[col] == -2, col] = X_train[col].max() + 2
        X_test.loc[X_test[col] == -2, col] = X_train[col].max() + 2
    elif col in bin_cols:
        # Impute binary columns with train mode
        u, c = np.unique(X_train[col][X_train[col]!=-1], return_counts=True)
        bin_impute[col] = u[np.argmax(c)]
        X_train.loc[X_train[col]==-1,col] = bin_impute[col]
        X_val.loc[X_val[col]==-1,col] = bin_impute[col]
        X_test.loc[X_test[col]==-1,col] = bin_impute[col]

    X_train[col] = X_train[col].astype(int)
    X_val[col] = X_val[col].astype(int)
    X_test[col] = X_test[col].astype(int)


# Impute continuous columns with train mean & standardize
cont_impute = {}
if len(numeric_cols) > 0:
    for col in numeric_cols:
        cont_impute[col] = X_train[col][~X_train[col].isna()].mean()
        X_train.loc[X_train[col].isna(),col] = cont_impute[col]
        X_val.loc[X_val[col].isna(),col] = cont_impute[col]
        X_test.loc[X_test[col].isna(),col] = cont_impute[col]

    # Standardize
    scaler = StandardScaler()
    # fit and transform scaler on X_train and X_test
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ValueError: y contains previously unseen labels: 'HUONIAU'

In [154]:
X_train['make'].max()+2

367.0

In [155]:
X_train.make, X_test.make

(24835     365.0
 110393    365.0
 48486     340.0
 57363     266.0
 105943     96.0
           ...  
 133084    335.0
 115109    335.0
 146524    227.0
 163087     96.0
 135392    335.0
 Name: make, Length: 189511, dtype: float64,
 166883         MAZDA
 142586        NISSAN
 130010      MERCEDES
 193494      VAUXHALL
 193677          FORD
              ...    
 184198          MINI
 73142         NISSAN
 80164     VOLKSWAGEN
 37330           MINI
 141243           NaN
 Name: make, Length: 23396, dtype: object)

In [148]:
X_test.loc[X_test.index == 194911 , 'make']

194911    HYMER
Name: make, dtype: object

### XGB Baseline

In [21]:
from xgboost import XGBClassifier

In [22]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train,eval_set=[(X_val,y_val)],verbose=0)
y_test_pred = xgb.predict_proba(X_test)



In [23]:
y_test_pred

array([[0.3678329 , 0.6321671 ],
       [0.39878535, 0.60121465],
       [0.26765817, 0.7323418 ],
       ...,
       [0.5886309 , 0.4113691 ],
       [0.45475912, 0.5452409 ],
       [0.04946363, 0.9505364 ]], dtype=float32)

In [24]:
print(f"XGB AUC Performance: {roc_auc_score(y_test,y_test_pred[:,1])}")

XGB AUC Performance: 0.7403703200605267


### ARMED

In [25]:
Z_train = X_train["make"]
Z_val = X_val["make"]
Z_test = X_test["make"]

X_train.drop("make",axis=1,inplace=True)
X_val.drop("make",axis=1,inplace=True)
X_test.drop("make",axis=1,inplace=True)

Z_train_ohe = tf.one_hot(Z_train.values.ravel(),Z_train.nunique()).numpy()
Z_val_ohe = tf.one_hot(Z_val.values.ravel(),Z_train.nunique()).numpy()
Z_test_ohe = tf.one_hot(Z_test.values.ravel(),Z_train.nunique()).numpy()


In [26]:
dictBuild = {'n_features': X_train.shape[1],
             'n_clusters': Z_train.nunique(),
             'adversary_layer_units': [4, 4],
             'slope_posterior_init_scale': 0.3, 
             'intercept_posterior_init_scale': 0.1, 
             'slope_prior_scale': 0.3, 
             'intercept_prior_scale': 0.1,
             'kl_weight': 0.00001}
dictCompile = {'loss_class_fe_weight': 1.0, 
               'loss_gen_weight': 0.5, 
               'loss_class_me_weight': 1.0,
               'metric_class_me': tf.keras.metrics.BinaryAccuracy('acc'),
               'metric_class_fe': tf.keras.metrics.BinaryAccuracy('acc_fe'),
               'metric_adv': tf.keras.metrics.CategoricalAccuracy('adv_acc')}




In [27]:
batch_size = 256
epochs = 200

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-4, decay=1e-6)
lsCallbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_acc', 
                                                mode='max',
                                                patience=5, 
                                                restore_best_weights=True)]


model_armed = MixedEffectsMLP(**dictBuild)
model_armed.compile(**dictCompile)

log = model_armed.fit((X_train,Z_train_ohe), y_train,
                validation_data=((X_val,Z_val_ohe), y_val),
                callbacks=lsCallbacks,
                epochs=epochs,
                batch_size=batch_size,                
                verbose=1,
#                 class_weight=dictClassWeights
               )

pred = model_armed.predict((X_test,Z_test_ohe),batch_size=128)[0]


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200




In [28]:
pred

array([[0.56365883],
       [0.553501  ],
       [0.722055  ],
       ...,
       [0.40625268],
       [0.54693913],
       [0.92173463]], dtype=float32)

In [29]:
print(f"ARMED AUC Performance: {roc_auc_score(y_test,pred)}")

ARMED AUC Performance: 0.7211081483239434
