In [1]:
!pip install efficientnet tqdm

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
%load_ext nb_black
import os
from pathlib import Path

# from tensorflow.keras.applications import (
#     MobileNetV2,
# )
from efficientnet.tfkeras import EfficientNetB0

<IPython.core.display.Javascript object>

In [3]:
# dataset_name = "santander-customer-satisfaction"
# IMAGE_SIZE = 640
dataset_name = "census-income"
IMAGE_SIZE = 224
target = "taxable income amount"
# dataset_name = "springleaf-marketing-response"
# IMAGE_SIZE = 1408
# dataset_name = "segment"
# IMAGE_SIZE = 160
# dataset_name = "rl"
# IMAGE_SIZE = 160
# dataset_name = "open-payments"
# target = "status"
# IMAGE_SIZE = 96
# dataset_name = "bank-marketing"
# IMAGE_SIZE = 160
# dataset_name = "springleaf-marketing-response"
# IMAGE_SIZE = 1408
# dataset_name = "bnp-cardif"
# IMAGE_SIZE = 384
# dataset_name = "albert"
# IMAGE_SIZE = 288
# dataset_name = "titanic"
# IMAGE_SIZE = 128
# target = "Survived"

# IMAGE_SIZE = 24
# dataset_name = "rl"
# target = "target"
# dataset_name = "bank-marketing"
# target = "y"

# dataset_name = "cat-in-the-dat-ii"
# IMAGE_SIZE = 96
# dataset_name = "give-me-some-credit"
# IMAGE_SIZE = 128

# IMAGE_SIZE = 160

DATASET_FOLDER = Path(os.getcwd()) / f"data/{dataset_name}"

BATCH_SIZE = 32
PREFETCH = 50


ONE_CHANNEL = True
NB_CHANNEL = 1 if ONE_CHANNEL else 3

# Target size for model to use
# TARGET_SIZE = IMAGE_SIZE
PRETRAINED_MODEL = EfficientNetB0  # MobileNetV2  # EfficientNetB0  # EfficientNetB4  # MobileNetV2  # EfficientNetB2  # EfficientNetB0  # MobileNetV2  # NASNetMobile

# Training params for only new layers

# Training params for refit
epochs = 200
patience = 20

# Model name => image size, last_block retrain
# NASNetMobile => 224,
# NASNetLarge => 331,
# MobileNetV2 => 96, ... 160..... 224, last layer => 128 (3 blocks), 137(2 blocks), 146(1 block)
# InceptionV3 => 299,
# Xception => 299,

<IPython.core.display.Javascript object>

## Import + utilities

In [4]:
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from sklearn.metrics import roc_auc_score, accuracy_score

from thc_net.image.pretrained_model import (
    build_dataset,
    build_process_path,
    build_model,
)

from thc_net.image.tabular_preproc import csv_to_pixel

%matplotlib inline

<IPython.core.display.Javascript object>

In [5]:
def plot_metric(history, metric):
    # Plot training & validation loss values
    plt.plot(history.history[metric])
    plt.plot(history.history[f"val_{metric}"])
    plt.title(f"Model {metric}")
    plt.ylabel(f"{metric}")
    plt.xlabel("Epoch")
    plt.legend(["Train", "Test"], loc="upper left")
    plt.show()

<IPython.core.display.Javascript object>

## Preparing datasets

In [6]:
import pandas as pd

panda_kwargs = {}
DATASET_FILENAME = "train_bench.csv"
dataset_path = DATASET_FOLDER / DATASET_FILENAME
TAILORED_COLUMN = "Set"

columns = pd.read_csv(dataset_path, **panda_kwargs, nrows=1).columns.tolist()
columns

['age',
 'class_of_worker',
 'industry_code',
 'occupation_code',
 'education',
 'adjusted gross income',
 'wage per hour',
 'enrolled in edu inst last wk',
 'marital status',
 'major industry code',
 'major occupation code',
 'mace',
 'hispanic Origin',
 'sex',
 'member of a labor union',
 'reason for unemployment',
 'full or part time employment stat',
 'capital gains',
 'capital losses',
 'divdends from stocks',
 'federal income tax liability',
 'tax filer status',
 'region of previous residence',
 'state of previous residence',
 'detailed household and family stat',
 'detailed household summary in household',
 'instance weight',
 'migration code-change in msa',
 'migration code-change in reg',
 'migration code-move within reg',
 'live in this house 1 year ago',
 'migration prev res in sunbelt',
 'num persons worked for employer',
 'family members under 18',
 'total person earnings',
 'country of birth father',
 'country of birth mother',
 'country of birth self',
 'citizenship',
 '

<IPython.core.display.Javascript object>

In [7]:
split = None
panda_kwargs = {}
if TAILORED_COLUMN not in columns:
    split = np.random.choice(
        ["train", "valid", "test"], p=[0.8, 0.1, 0.1], size=(NB_LINES,)
    )
else:
    split = pd.read_csv(
        dataset_path, **panda_kwargs, usecols=[TAILORED_COLUMN]
    ).values.reshape(-1)


train_indices = np.argwhere(split == "train").reshape(-1)
np.random.shuffle(train_indices)
valid_indices = np.argwhere(split == "valid").reshape(-1)
test_indices = np.argwhere(split == "test").reshape(-1)

<IPython.core.display.Javascript object>

In [8]:
used_columns = list(set(columns) - set([TAILORED_COLUMN, target]))
used_columns

['education',
 'wage per hour',
 'divdends from stocks',
 'reason for unemployment',
 'country of birth self',
 'adjusted gross income',
 'detailed household summary in household',
 'major industry code',
 'federal income tax liability',
 'migration code-move within reg',
 'migration prev res in sunbelt',
 'instance weight',
 'migration code-change in reg',
 'total person earnings',
 'citizenship',
 'full or part time employment stat',
 'capital gains',
 'region of previous residence',
 'occupation_code',
 'mace',
 'own business or self employed',
 'live in this house 1 year ago',
 'detailed household and family stat',
 'num persons worked for employer',
 'hispanic Origin',
 'major occupation code',
 'migration code-change in msa',
 'total person income',
 'age',
 'enrolled in edu inst last wk',
 'industry_code',
 'marital status',
 'country of birth mother',
 'sex',
 'tax filer status',
 'state of previous residence',
 'country of birth father',
 'class_of_worker',
 'member of a labor

<IPython.core.display.Javascript object>

In [9]:
df = pd.read_csv(dataset_path, low_memory=False)

<IPython.core.display.Javascript object>

In [10]:
X = df[used_columns].values
Y = df[target].values.reshape(-1)

extract_df = df[used_columns]
nb_idx = (extract_df.dtypes == "int64") | (extract_df.dtypes == "float64")
nb_idx = extract_df.columns[nb_idx]
is_numeric = np.isin(extract_df.columns.values, nb_idx.values)

<IPython.core.display.Javascript object>

In [11]:
(extract_df.nunique() / extract_df.shape[0]) < 0.02

education                                   True
wage per hour                               True
divdends from stocks                        True
reason for unemployment                     True
country of birth self                       True
adjusted gross income                       True
detailed household summary in household     True
major industry code                         True
federal income tax liability                True
migration code-move within reg              True
migration prev res in sunbelt               True
instance weight                             True
migration code-change in reg                True
total person earnings                       True
citizenship                                 True
full or part time employment stat           True
capital gains                               True
region of previous residence                True
occupation_code                             True
mace                                        True
own business or self

<IPython.core.display.Javascript object>

In [12]:
# https://www.tensorflow.org/tutorials/load_data/images
X, Y = csv_to_pixel(df, used_columns, target, ascii_only=ONE_CHANNEL)
IMAGE_SIZE = X.shape[1]
TARGET_SIZE = IMAGE_SIZE if IMAGE_SIZE > 32 else 32

Process ForkProcess-16:
Process ForkProcess-1:
Process ForkProcess-12:
Process ForkProcess-14:
Process ForkProcess-6:
Process ForkProcess-7:
Process ForkProcess-2:
Process ForkProcess-3:
Process ForkProcess-11:
Process ForkProcess-9:
Process ForkProcess-13:
Traceback (most recent call last):
Process ForkProcess-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 205, in _sendback_result
    exception=exception))
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 205, in _sendback_result
    exception=exception))
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 205, in _sendback_resu

<IPython.core.display.Javascript object>

  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 244, in _process_worker
    _sendback_result(result_queue, call_item.work_id, result=r)
KeyboardInterrupt
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 244, in _process_worker
    _sendback_result(result_queue, call_item.work_id, result=r)
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _boot

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
KeyboardInterrupt
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 244, in _process_worker
    _sendback_result(result_queue, call_item.work_id, result=r)
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 208, in _sendback_result
    result_queue.put(_ResultItem(work_id, exception=exc))
  File "/usr/local/lib/python3.7/multiprocessing/queues.py", line 363, in put
    with self._wlock:
  File "/usr/local/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/local/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
Process ForkProcess-15:
P

  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 242, in _process_worker
    _sendback_result(result_queue, call_item.work_id, exception=exc)
Traceback (most recent call last):
KeyboardInterrupt
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 242, in _process_worker
    _sendback_result(result_queue, call_item.work_id, exception=exc)
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 208, in _sendback_result
    result_queue.put(_ResultItem(work_id, exception=exc))
  File "/usr/local/lib/python3.7/multiprocessing/queues.py", line 363, in put
    with self._wlock:
  File "/usr/local/lib/python3.7/concurrent/futures/process.py", line 208, in _sendback_result
    result_queue.put(_ResultItem(work_id, except

In [None]:
X.shape

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

Y = LabelEncoder().fit_transform(Y)

In [None]:
Y = to_categorical(Y.reshape(-1, 1))
Y

In [None]:
OUTPUT_DIM = len(np.unique(Y))
OUTPUT_DIM

In [None]:
X.nbytes / 1024 ** 2

In [None]:
from tensorflow.data import Dataset

In [None]:
dataset_train = (
    Dataset.from_tensor_slices((X[train_indices], Y[train_indices]))
    #     .repeat()
    .batch(BATCH_SIZE)
)
dataset_valid = (
    Dataset.from_tensor_slices((X[valid_indices], Y[valid_indices]))
    #     .repeat()
    .batch(BATCH_SIZE)
)
dataset_test = Dataset.from_tensor_slices((X[test_indices], Y[test_indices])).batch(
    BATCH_SIZE
)

In [None]:
for image, label in dataset_train.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy().shape)

### Images Example

In [None]:
def show_image(image, image_size, one_channel=False):
    if one_channel:
        imshow(image.reshape(IMAGE_SIZE, IMAGE_SIZE))
    else:
        imshow(image)

In [None]:
for image, label in dataset_train.take(1):
    print("Label: ", label[0].numpy())
    show_image(image[0].numpy(), IMAGE_SIZE, ONE_CHANNEL)

### Creating model, using existing one

In [None]:
model, callbacks = build_model(
    EfficientNetB0,
    image_size=IMAGE_SIZE,
    nb_channel=NB_CHANNEL,
    input_size=TARGET_SIZE,  # IMAGE_SIZE,
    output_dim=OUTPUT_DIM,
    weights=None,
    patience=5,
)

In [None]:
model.summary()

### Fit new layers

In [None]:
truth_test = []
for _, labels in dataset_test:
    truth_test.append(np.argmax(labels, axis=1))
truth_test = np.hstack(truth_test)
truth_test

truth_valid = []
for i, (_, labels) in enumerate(dataset_valid):
    truth_valid.append(np.argmax(labels, axis=1))
#     if i >= steps_per_epoch_val - 1:
#         break
truth_valid = np.hstack(truth_valid)
truth_valid.shape

### Unfreeze and fit more/all layers

In [None]:
# we train our model again (this time fine-tuning the top 2 inception blocks
# alongside the top Dense layers
history = model.fit(
    dataset_train,
    callbacks=callbacks,
    epochs=epochs,
    # steps_per_epoch=steps_per_epoch,
    validation_data=dataset_valid,
    # validation_steps=steps_per_epoch_val,
)

In [None]:
plot_metric(history, "loss")

## Evaluation

In [None]:
preds_valid = model.predict(dataset_valid)  # , steps=steps_per_epoch_val)
preds_valid.shape
if OUTPUT_DIM > 2:
    print(
        f"Accuracy valid: {accuracy_score(truth_valid, np.argmax(preds_valid, axis=1))}"
    )
if OUTPUT_DIM == 2:
    print(f"ROC AUC valid: {roc_auc_score(truth_valid, preds_valid[:, 1])}")
preds_test = model.predict(dataset_test)
preds_test.shape
if OUTPUT_DIM > 2:
    print(f"Accuracy test: {accuracy_score(truth_test, np.argmax(preds_test, axis=1))}")
if OUTPUT_DIM == 2:
    print(f"ROC AUC test: {roc_auc_score(truth_test, preds_test[:, 1])}")

In [None]:
# Open payments 13min, 0.9338321957865205 (batch 256)
# cat-in-the-dat-ii 0.7643929251726093 1h10 (batch 256)
# RL ROC AUC test: 0.9430279072306678 mobile net patience 20
# BNP cardif (image size : 96) ROC AUC test: 0.7168011573597879 1h16
# give-me-some-credit ROC AUC test: 0.8591709381962962 2h4

In [None]:
# Open payments 1h10
# ROC AUC valid: 0.9432323040315754
# ROC AUC test: 0.9345809216081824

In [None]:
# EfficientNetB0
# EfficientNetB2 2h => ROC AUC valid: 0.9357130861335854 ROC AUC test: 0.9330937133279599

In [None]:
# "santander-customer-satisfaction"
# 0.8164865598696714 => target size 96, whole re train
# 0.8141824599511267 => target size 224, whole re train
# ROC AUC valid: 0.8268400760249797 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time

# give me some credit
# 0.8462134942186483 => target size 160, whole train (batch 128, 2layers 1024, 128)

# ROC AUC valid: 0.8348917439829162
# RL
# ROC AUC valid: 0.892118469133795 => 160, whole, batch 64, 1layer 1024
# ROC AUC valid: 0.9233534348199217 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout
# ROC AUC valid: 0.9465346534653467 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time

# Open payment
# ROC AUC valid: 0.9186458210299415 => 96 =>160, whole, batch 64, 1layer 1024
# ROC AUC valid: 0.8778868370932499 => 96 => 96, whole, batch 64, 1layer 1024
# ROC AUC valid: 0.8917381493730192 => 96 => 224, whole, batch 64, 1layer 1024
# ROC AUC valid: 0.9090374872044725 => 96 => 96, whole, batch 64, 1layer 1024 -> 512 -> 128
# ROC AUC valid: 0.886417393797122 => 96 => 160, whole, batch 64, 1layer 1024 -> 512 -> 128
# ROC AUC valid: 0.9045819676568436 => 96 => 96, whole, batch 64, 1layer 1024 -> 128, no dropout
# ROC AUC valid: 0.9045819676568436 => 96 => 96, whole, batch 64, 1layer 1024 -> 128, no dropout
# ROC AUC valid: 0.9406275221953189 => 96 => 96, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout
# ROC AUC valid: 0.9399110034154216 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time
# ROC AUC valid: 0.8811034128677376 => 96 => 96, whole, batch 128, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time

# Bank marketing
# ROC AUC valid: 0.7970734141661526 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout

# Albert
# ROC AUC valid: 0.7500980687987842 => 288 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout

# bnp-cardif
# ROC AUC valid: 0.7206667869818926 => 384 => 96, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout


In [None]:
# "santander-customer-satisfaction"
# 0.833417731838137 => target size 96, whole train
# 0.8170226029745679 => target size 224, whole train
# ROC AUC test: 0.8348549041045967 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time

# give me some credit
# 0.8453497574694486 => target size 160, whole train (batch 128, 2layers 1024, 128)
# ROC AUC test: 0.8447058873195916

# RL
# ROC AUC test: 0.9051288159651395 => 160, whole, batch 64, 1layer 1024
# ROC AUC test: 0.9128674518211912 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout
# ROC AUC test: 0.9456874816987527 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time
# ROC AUC test: 0.9399110034154216 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout, earlystoping eevry time

# Open payment
# ROC AUC test: 0.916907759155486 => 96 =>160, whole, batch 64, 1layer 1024
# ROC AUC test: 0.8931101859362467 => 96 => 96, whole, batch 64, 1layer 1024
# ROC AUC test: 0.8938387451368033 => 96 => 224, whole, batch 64, 1layer 1024
# ROC AUC test: 0.9067445823812874 => 96 => 96, whole, batch 64, 1layer 1024 -> 512 -> 128
# ROC AUC test: 0.8954549557710788 => 96 => 160, whole, batch 64, 1layer 1024 -> 512 -> 128
# ROC AUC test: 0.9062895529860363 => 96 => 96, whole, batch 64, 1layer 1024 -> 128, no dropout
# ROC AUC test: 0.9403517762951931 => 96 => 96, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout

# Bank marketing
# ROC AUC valid: 0.7959000291791145 => 160 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout

# Albert
# ROC AUC test: 0.7487660412524685 => 288 => 160, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout

# bnp-cardif
# ROC AUC test: 0.725934546476426 => 384 => 96, whole, batch 32, 1layer 1024 -> 512 -> 128, no dropout
