In [None]:
%load_ext nb_black
import os
from pathlib import Path


In [None]:
import json
import io
from requests import get
from pathlib import Path
import shutil
import gzip
import gc

from itertools import repeat

from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

np.random.seed(0)

from PIL import Image, ImageDraw, ImageFont
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt

%matplotlib inline

import gc
import json
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    ZeroPadding2D,
    Dense,
    GlobalAveragePooling2D,
    AveragePooling2D,
    Input,
    Dropout,
)

from tensorflow_addons.optimizers import RectifiedAdam, Lookahead
from tensorflow_addons.activations import mish
from concurrent.futures import ProcessPoolExecutor as PoolExecutor

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

np.random.seed(0)

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

%matplotlib inline

In [None]:
# --- Dataset configuration -------------------------------------------------
dataset_name = "open-payments"
# Name of the column holding the label to predict.
target = "status"

# Extra keyword arguments forwarded to every pd.read_csv call in this notebook.
panda_kwargs = {}

# All locations are resolved against the current working directory.
FONT_FOLDER = Path(os.getcwd())
DATASET_FOLDER = FONT_FOLDER / "data" / dataset_name
DATASET_FILENAME = "train_bench.csv"
dataset_path = DATASET_FOLDER.joinpath(DATASET_FILENAME)

# Optional column carrying a predefined "train" / "valid" / "test" assignment.
TAILORED_COLUMN = "Set"

In [None]:
# --- Image rendering -------------------------------------------------------
# Side length, in pixels, of the square image built for each row.
IMAGE_SIZE = 96
# Maximum number of characters kept per feature value (None keeps everything).
CUT_LENGTH = None
# Render grayscale (1 channel) when True, RGB (3 channels) otherwise.
ONE_CHANNEL = True
NB_CHANNEL = 3 if not ONE_CHANNEL else 1

# --- Training --------------------------------------------------------------
epochs = 200
# Early-stopping patience, in epochs.
patience = 5

In [None]:
# Only the header is needed here, so parse a single data row.
columns = pd.read_csv(dataset_path, nrows=1, **panda_kwargs).columns.tolist()
columns

In [None]:
# Load the target column alone to learn the class labels and the row count
# without keeping the whole file in memory.
target_values = pd.read_csv(
    dataset_path, usecols=[target], **panda_kwargs
).values.ravel()
NB_LINES = len(target_values)
CLASSNAMES = np.unique(target_values).astype("str")
del target_values
print(NB_LINES)
print(CLASSNAMES)

In [None]:
# One output unit per class; loss and metric names depend on whether the
# problem is binary or multi-class.
OUTPUT_DIM = len(CLASSNAMES)
if OUTPUT_DIM == 2:
    LOSS, METRIC = "binary_crossentropy", "AUC"
else:
    LOSS, METRIC = "categorical_crossentropy", "accuracy"

In [None]:
if TAILORED_COLUMN in columns:
    # The file ships its own train/valid/test assignment: use it as is.
    split = pd.read_csv(
        dataset_path, usecols=[TAILORED_COLUMN], **panda_kwargs
    ).values.ravel()
else:
    # No predefined split: draw an 80/10/10 assignment for every row.
    split = np.random.choice(
        ["train", "valid", "test"], size=(NB_LINES,), p=[0.8, 0.1, 0.1]
    )

# Row positions of each subset; only the training rows are shuffled.
train_indices = np.flatnonzero(split == "train")
np.random.shuffle(train_indices)
valid_indices = np.flatnonzero(split == "valid")
test_indices = np.flatnonzero(split == "test")

In [None]:
# Feature columns = every column except the split marker and the target.
# The CSV column order is kept on purpose: a set difference would yield an
# order that depends on string hashing and may change between kernel restarts,
# which would move each feature to a different tile of the rendered image.
used_columns = [
    column for column in columns if column not in (TAILORED_COLUMN, target)
]
used_columns

In [None]:
# Load the feature columns together with the target; the split column is skipped.
df = pd.read_csv(dataset_path, usecols=[*used_columns, target], **panda_kwargs)

In [None]:
# Materialise each subset as string arrays: X has one column per feature,
# Y keeps a trailing axis of length 1 because the target is selected with
# df[[target]].
# Each subset is sliced with its OWN index array; slicing validation and test
# with train_indices would evaluate the model on the rows it was trained on.
X_train = df[used_columns].values[train_indices].astype("str")
Y_train = df[[target]].values[train_indices].astype("str")

X_valid = df[used_columns].values[valid_indices].astype("str")
Y_valid = df[[target]].values[valid_indices].astype("str")

X_test = df[used_columns].values[test_indices].astype("str")
Y_test = df[[target]].values[test_indices].astype("str")

In [None]:
# Number of rows per batch.
BATCH_SIZE = 32
# Buffer size handed to Dataset.prefetch; it is applied after batching, so the
# unit is batches.
# NOTE(review): 100000 batches is a very large buffer - confirm it fits in
# memory or consider tf.data's AUTOTUNE.
PREFETCH = 100_000

In [None]:
# Number of batches needed to see every row once (the last batch may be
# partial). np.ceil returns a float, whereas Keras documents its step-count
# arguments as integers, so cast explicitly.
steps_per_epoch = int(np.ceil(X_train.shape[0] / BATCH_SIZE))
steps_per_epoch_val = int(np.ceil(X_valid.shape[0] / BATCH_SIZE))

In [None]:
# Drop the reference to the full DataFrame: the X_* / Y_* arrays built above
# are separate string copies, so the frame is no longer needed.
del df

## Functions

In [None]:
def download(url, out, force=False, verify=True):
    """Download ``url`` to the file ``out`` unless it is already present.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    out : pathlib.Path
        Destination file; missing parent directories are created.
    force : bool, default False
        When True, an existing destination file is removed and downloaded
        again. A missing destination is not an error.
    verify : bool, default True
        Forwarded to ``requests.get`` (TLS certificate verification).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. No file is left behind
        in that case, so a later call does not mistake an error page or a
        truncated file for a completed download.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")
    # Send the request before creating the file so a refused request does not
    # leave an empty file that would later be reported as "already exists".
    response = get(url, verify=verify, stream=True)
    try:
        response.raise_for_status()
        # open in binary mode and write the body chunk by chunk
        with out.open(mode="wb") as file:
            for chunk in response.iter_content(100000):
                file.write(chunk)
    except BaseException:
        # Interrupted or failed transfer: remove the partial file, then re-raise.
        if out.exists():
            out.unlink()
        raise
    finally:
        response.close()

In [None]:
# Font used to render the feature values; it is stored in FONT_FOLDER.
font_url = "https://ff.static.1001fonts.net/r/o/roboto-condensed.regular.ttf"
out_font = FONT_FOLDER / "RobotoCondensed-Regular.ttf"

dataset_path = DATASET_FOLDER / DATASET_FILENAME

download(font_url, out_font)

In [None]:
def word_to_square_image(text, size, cut_length=None, one_channel=False):
    """Render one UTF-8 encoded string as a ``size`` x ``size`` pixel tile.

    The (optionally truncated) string is laid out on the smallest square grid
    of characters that can hold it, filled left to right and then top to
    bottom, one character per grid cell.

    Parameters
    ----------
    text : bytes
        UTF-8 encoded value to draw.
    size : int
        Side length of the returned tile, in pixels.
    cut_length : int or None, default None
        Keep only the first ``cut_length`` characters; None keeps them all.
    one_channel : bool, default False
        True draws in PIL mode "L" (result shape ``(size, size)``), False in
        "RGB" (result shape ``(size, size, 3)``).

    Returns
    -------
    numpy.ndarray
        uint8 pixel array. An all-zero tile is returned when there is nothing
        to draw (empty string) or when the string is too long for ``size``
        to leave at least one pixel per character; these cases previously
        failed with a division by zero or a zero-sized font.
    """
    text = text.decode("utf-8")
    truncated = text[:cut_length] if cut_length is not None else text

    # Side of the square character grid, and pixels available per character.
    max_x = int(np.ceil(np.sqrt(len(truncated))))
    character_size = int(size // max_x) if max_x > 0 else 0
    if character_size < 1:
        blank_shape = (size, size) if one_channel else (size, size, 3)
        return np.zeros(blank_shape, dtype="uint8")
    # Centre the grid inside the tile.
    padding = int((size - max_x * character_size) // 2)
    # Do we need pt to px conversion ? Seems like not
    # font_size =  int(np.floor(character_size*0.75))
    font_size = character_size

    fnt = ImageFont.truetype(out_font.as_posix(), font_size)

    # 1 (1-bit pixels, black and white, stored with one pixel per byte)
    # L (8-bit pixels, black and white)
    # RGB (3x8-bit pixels, true color)
    # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
    mode = "L" if one_channel else "RGB"
    # NOTE(review): in 8-bit mode "L" a fill of 1 yields pixel values of 0/1
    # rather than 0/255 - presumably intended as a binary mask; confirm.
    WHITE = 1 if one_channel else (255, 255, 255)
    BLACK = 0 if one_channel else (0, 0, 0)

    image = Image.new(mode, (size, size), BLACK)
    # Get the drawing context
    draw = ImageDraw.Draw(image)
    for index, letter in enumerate(truncated):
        # Grid cell of this character: row y, column x.
        y, x = divmod(index, max_x)
        draw.text(
            (padding + x * character_size, padding + y * character_size),
            letter,
            font=fnt,
            fill=WHITE,
        )
    return np.array(image)

In [None]:
def features_to_square_image(
    features, image_size=224, cut_length=None, one_channel=False
):
    """Tile the rendered values of one row into a single square image.

    The features are placed on the smallest square grid that can hold them,
    filled along the second array axis first and then along the first axis;
    each grid cell receives the tile produced by ``word_to_square_image``.

    Parameters
    ----------
    features : sequence of bytes
        Encoded feature values of one row, passed one by one to
        ``word_to_square_image``.
    image_size : int, default 224
        Side length of the returned image, in pixels.
    cut_length : int or None, default None
        Forwarded to ``word_to_square_image``.
    one_channel : bool, default False
        True returns a ``(image_size, image_size)`` array, False a
        ``(image_size, image_size, 3)`` one.

    Returns
    -------
    numpy.ndarray
        uint8 image. It is left all-zero when ``features`` is empty or when
        there are more grid cells per side than pixels (previously a division
        by zero / zero-sized tiles).
    """
    nb_features = len(features)
    shape = (image_size, image_size) if one_channel else (image_size, image_size, 3)
    result_image = np.zeros(shape, dtype="uint8")

    # Cells per side of the grid, and pixels per cell.
    square_nb = int(np.ceil(np.sqrt(nb_features)))
    word_size = int(image_size // square_nb) if square_nb > 0 else 0
    if word_size < 1:
        return result_image
    # Centre the grid inside the image.
    padding = int((image_size - square_nb * word_size) // 2)

    for i_feature in range(nb_features):
        row, col = divmod(i_feature, square_nb)
        x_pos = row * word_size + padding
        y_pos = col * word_size + padding
        result_image[
            x_pos : x_pos + word_size, y_pos : y_pos + word_size
        ] = word_to_square_image(
            features[i_feature],
            size=word_size,
            cut_length=cut_length,
            one_channel=one_channel,
        )
    return result_image

In [None]:
def features_to_square_image_params(X, Y):
    """Turn one (features, label) pair into an (image, class mask) pair.

    The image is rendered with the notebook-level IMAGE_SIZE, CUT_LENGTH and
    ONE_CHANNEL settings; the mask is the element-wise comparison of
    CLASSNAMES with ``Y``.
    """
    image = features_to_square_image(
        X, image_size=IMAGE_SIZE, cut_length=CUT_LENGTH, one_channel=ONE_CHANNEL,
    )
    class_mask = CLASSNAMES == Y
    return image, class_mask

In [None]:
@tf.function(
    input_signature=[
        tf.TensorSpec(shape=None, dtype=tf.string),
        tf.TensorSpec(shape=None, dtype=tf.string),
    ]
)
def tf_features_to_square_image_params(X, Y):
    """TensorFlow wrapper around ``features_to_square_image_params``.

    The Python function is run through ``tf.numpy_function``; its two outputs
    are then reshaped to the fixed image shape
    ``(IMAGE_SIZE, IMAGE_SIZE, NB_CHANNEL)`` and label shape
    ``(len(CLASSNAMES),)``.
    """
    image, class_mask = tf.numpy_function(
        func=features_to_square_image_params, inp=[X, Y], Tout=(tf.uint8, tf.bool),
    )
    image = tf.reshape(image, shape=(IMAGE_SIZE, IMAGE_SIZE, NB_CHANNEL))
    class_mask = tf.reshape(class_mask, shape=(len(CLASSNAMES),))
    return image, class_mask

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE


def build_dataset(X, Y, *, repeat, batch_size, prefetch):
    """Build the tf.data pipeline turning (features, label) rows into images.

    Parameters
    ----------
    X, Y :
        Feature and label arrays, sliced together along their first axis.
    repeat : bool
        Repeat the data indefinitely when true.
    batch_size : int or None
        Batch size; None leaves the dataset unbatched.
    prefetch : int or None
        Buffer size given to ``Dataset.prefetch``; None disables prefetching.

    Returns
    -------
    tf.data.Dataset
        Dataset yielding the outputs of ``tf_features_to_square_image_params``.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((X, Y))
    pipeline = pipeline.repeat() if repeat else pipeline

    # Render the images in parallel.
    pipeline = pipeline.map(
        tf_features_to_square_image_params, num_parallel_calls=AUTOTUNE
    )

    if batch_size is not None:
        pipeline = pipeline.batch(batch_size)
    if prefetch is not None:
        pipeline = pipeline.prefetch(prefetch)
    return pipeline

In [None]:
# Training and validation streams repeat indefinitely (they are consumed with
# explicit step counts); the test stream is a single pass.
dataset_train = build_dataset(
    X=X_train, Y=Y_train, batch_size=BATCH_SIZE, prefetch=PREFETCH, repeat=True
)
dataset_valid = build_dataset(
    X=X_valid, Y=Y_valid, batch_size=BATCH_SIZE, prefetch=PREFETCH, repeat=True
)
dataset_test = build_dataset(
    X=X_test, Y=Y_test, batch_size=BATCH_SIZE, prefetch=PREFETCH, repeat=False
)

In [None]:
def show_image(image, image_size, one_channel=False):
    """Display one image with matplotlib's ``imshow``.

    Parameters
    ----------
    image : numpy.ndarray
        Pixel data to display.
    image_size : int
        Side length of the image, in pixels. Used to reshape single-channel
        data to ``(image_size, image_size)``.
    one_channel : bool, default False
        True for single-channel data, which is reshaped to 2-D before being
        shown; False shows ``image`` unchanged.
    """
    if one_channel:
        # Honour the image_size argument instead of the global IMAGE_SIZE.
        imshow(image.reshape(image_size, image_size))
    else:
        imshow(image)

In [None]:
# Sanity check: show the first sample of the first training batch.
for batch_images, batch_labels in dataset_train.take(1):
    first_label = batch_labels[0].numpy()
    print("Label: ", first_label)
    show_image(batch_images[0].numpy(), IMAGE_SIZE, ONE_CHANNEL)

In [None]:
# Sanity check: show the first sample of the first validation batch.
for batch_images, batch_labels in dataset_valid.take(1):
    first_label = batch_labels[0].numpy()
    print("Label: ", first_label)
    show_image(batch_images[0].numpy(), IMAGE_SIZE, ONE_CHANNEL)

In [None]:
# Sanity check: show the first sample of the first test batch.
for batch_images, batch_labels in dataset_test.take(1):
    first_label = batch_labels[0].numpy()
    print("Label: ", first_label)
    show_image(batch_images[0].numpy(), IMAGE_SIZE, ONE_CHANNEL)

In [None]:
from efficientnet.tfkeras import (
    EfficientNetB0,
    EfficientNetB4,
    EfficientNetB2,
    EfficientNetB3,
)

In [None]:
# Activation used by the dense layers of the classification head.
activation = mish
# Rectified Adam wrapped in Lookahead.
optimizer = Lookahead(RectifiedAdam(), slow_step_size=0.5, sync_period=6)

In [None]:
# Now, we can use PRE TRAINED model
base_model = EfficientNetB0(
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, NB_CHANNEL),
    weights=None,  # "imagenet",
    # weights="noisy-student",
    # weights=None,
    include_top=False,
)


In [None]:
# Stop when the validation loss has not improved for `patience` epochs and
# roll back to the best weights seen.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=patience,
    restore_best_weights=True,
    verbose=1,
)

# Classification head, attached to the output of the backbone's fourth layer
# from the end.
x = base_model.layers[-4].output
# Collapse the spatial dimensions.
x = GlobalAveragePooling2D()(x)
# Two fully-connected stages, each followed by dropout.
x = Dense(256, activation=activation, kernel_initializer="he_normal")(x)
x = Dropout(0.2)(x)
x = Dense(128, activation=activation, kernel_initializer="he_normal")(x)
x = Dropout(0.2)(x)
# Softmax output with one unit per class.
predictions = Dense(OUTPUT_DIM, activation="softmax")(x)

# Full model: backbone input to class probabilities. No layer is frozen, so
# the backbone is trained together with the head.
model = Model(inputs=base_model.input, outputs=predictions)

es.set_model(model)
# NOTE(review): METRIC is defined earlier in the notebook but is not passed
# to compile(); only the loss is tracked during training - confirm intended.
model.compile(optimizer=optimizer, loss=LOSS)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Display the training dataset object as a quick check before fitting.
dataset_train

In [None]:
# Train the whole network. Both datasets repeat indefinitely, so the length
# of an epoch and of a validation pass is set by the explicit step counts.
history = model.fit(
    dataset_train,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=dataset_valid,
    validation_steps=steps_per_epoch_val,
    callbacks=[es],
)

In [None]:
# Ground-truth class indices of the test set (single pass over the dataset).
truth_test = np.hstack(
    [np.argmax(labels, axis=1) for _, labels in dataset_test]
)

# The validation dataset repeats forever: read exactly one pass worth of
# batches.
truth_valid = np.hstack(
    [
        np.argmax(labels, axis=1)
        for _, labels in dataset_valid.take(int(steps_per_epoch_val))
    ]
)
truth_valid.shape

In [None]:
# Validation: one pass worth of batches from the repeating dataset.
preds_valid = model.predict(dataset_valid, steps=steps_per_epoch_val)
if OUTPUT_DIM == 2:
    # Binary case: score the probability of the second class.
    print(f"ROC AUC valid: {roc_auc_score(truth_valid, preds_valid[:, 1])}")
elif OUTPUT_DIM > 2:
    print(
        f"Accuracy valid: {accuracy_score(truth_valid, np.argmax(preds_valid, axis=1))}"
    )

# Test: the dataset is finite, so no step count is needed.
preds_test = model.predict(dataset_test)
if OUTPUT_DIM == 2:
    print(f"ROC AUC test: {roc_auc_score(truth_test, preds_test[:, 1])}")
elif OUTPUT_DIM > 2:
    print(f"Accuracy test: {accuracy_score(truth_test, np.argmax(preds_test, axis=1))}")