In [None]:
%load_ext nb_black
import os
from pathlib import Path

In [None]:
# List the contents of /work/data (shell command run through IPython).
# NOTE(review): absolute path; presumably where the datasets are mounted — confirm.
!ls /work/data

In [None]:
# --- Dataset selection -------------------------------------------------------
# Exactly one (dataset_name, target) pair must be left uncommented.
# dataset_name = "santander-customer-satisfaction"
# target = "TARGET"
# dataset_name = "census-income"
# target = "taxable income amount"
dataset_name = "bank-marketing"
target = "y"
# dataset_name = "open-payments"
# target = "status"
# dataset_name = "bnp-cardif"
# target = "target"
# dataset_name = "give-me-some-credit"
# target = "SeriousDlqin2yrs"
# dataset_name = "springleaf-marketing-response" #(9h)
# target = "target"
# dataset_name = "segment"
# target = "class"
# dataset_name = "rl"
# target = "target"
# dataset_name = "portoseguro"
# target = "target"
# dataset_name = "road-safety"  # (3h)
# target = "Sex_of_Driver_df_res"
# dataset_name = "titanic"
# target = "Survived"
# dataset_name = "cat-in-the-dat-ii"  # 20min
# target = "target"

# Extra keyword arguments forwarded to every pd.read_csv call of the notebook.
panda_kwargs = dict()

# --- Image geometry ----------------------------------------------------------
FEATURE_SIZE = 32  # side, in pixels, of the tile used for one feature
IMAGE_SIZE = None  # 96  (None: derived later from the number of features)
CUT_LENGTH = None  # maximum number of characters kept per feature (None: all)
ONE_CHANNEL = True  # True: grayscale ("L") images, False: RGB images
NB_CHANNEL = 3 if not ONE_CHANNEL else 1

# --- Files and folders -------------------------------------------------------
FONT_FOLDER = Path.cwd()
DATASET_FOLDER = FONT_FOLDER / "data" / dataset_name
DATASET_FILENAME = "train_bench.csv"
DATASET_URL = None  # when set, the dataset is downloaded from this URL

# --- Processing --------------------------------------------------------------
TAILORED_COLUMN = "Set"  # column holding a predefined train/valid/test split
MAX_MEMORY_USE = 1  # IN GB

## Imports and utilities

In [None]:
import json
import io
from requests import get
from pathlib import Path
import shutil
import gzip
import gc

from itertools import repeat

from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

np.random.seed(0)

from PIL import Image, ImageDraw, ImageFont
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def download(url, out, force=False, verify=True):
    """Download ``url`` to the file ``out`` unless that file already exists.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    out : pathlib.Path
        Destination file. Missing parent folders are created.
    force : bool
        When True, an existing ``out`` is removed first and downloaded again.
    verify : bool
        Forwarded to ``requests.get`` (TLS certificate verification).

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status. Nothing is left at
        ``out`` in that case, so a later call retries the download.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    # Only remove what is actually there: ``unlink`` on a missing file raises.
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")

    # Write to a temporary sibling first so that a failed or interrupted
    # download never leaves a truncated file that the "already exists" check
    # above would later mistake for a complete one.
    partial = out.with_name(out.name + ".part")
    try:
        with get(url, verify=verify, stream=True) as response:
            # Fail loudly instead of saving an HTTP error page as the payload.
            response.raise_for_status()
            with partial.open(mode="wb") as file:
                for chunk in response.iter_content(100000):
                    file.write(chunk)
        partial.replace(out)
    finally:
        if partial.exists():
            partial.unlink()

In [None]:
def do_parallel_numpy(map_func, iter_params, constant_params=None):
    """Map ``map_func`` over the given iterables with a pool executor.

    ``iter_params`` is a sequence of iterables consumed in lockstep (one item
    of each per call). Every value of ``constant_params`` is passed unchanged
    to each call, after the iterated arguments. The individual results are
    stacked along a new first axis and returned as one NumPy array.
    """
    if constant_params is None:
        broadcast_args = []
    else:
        # itertools.repeat yields the same value for every mapped call.
        broadcast_args = [repeat(value) for value in constant_params]

    with PoolExecutor() as executor:
        mapped = list(executor.map(map_func, *iter_params, *broadcast_args))
    return np.stack(mapped, axis=0)

In [None]:
def save_numpy_as_image_gz(arr, path, one_channel=False):
    """Encode ``arr`` as a JPEG and write it gzip-compressed to ``path``.

    ``arr`` is interpreted as a grayscale ("L") image when ``one_channel`` is
    True and as an RGB image otherwise. Always returns True so that callers
    can aggregate the results of many saves.
    """
    jpeg_buffer = io.BytesIO()
    # The JPEG is produced in memory so that only the gzip file touches disk.
    Image.fromarray(arr, mode="L" if one_channel else "RGB").save(
        jpeg_buffer, "jpeg", optimize=True
    )
    with gzip.open(path, "wb") as archive:
        archive.write(jpeg_buffer.getvalue())
    return True

## Download dataset (optional) and font

In [None]:
# Font used to draw every character of the generated images.
font_url = "https://ff.static.1001fonts.net/r/o/roboto-condensed.regular.ttf"
out_font = FONT_FOLDER / "RobotoCondensed-Regular.ttf"

# Location of the CSV read by the rest of the notebook.
dataset_path = DATASET_FOLDER / DATASET_FILENAME

# The dataset is only fetched when a URL is configured; the font always is.
if DATASET_URL is not None:
    download(DATASET_URL, dataset_path)
download(font_url, out_font)

In [None]:
def format_number(nb):
    """Return ``nb`` in fixed-width scientific notation.

    The text always carries an explicit sign, 9 digits after the decimal
    point and a 2-digit exponent, e.g. ``15.5`` -> ``"+1.550000000e+01"``.
    """
    layout = dict(precision=9, unique=False, pad_left=None, exp_digits=2, sign=True)
    return np.format_float_scientific(nb, **layout)

## Numpy to img preprocessing

In [None]:
# https://he-arc.github.io/livre-python/pillow/index.html#methodes-de-dessin
# https://stackoverflow.com/questions/26649716/how-to-show-pil-image-in-ipython-notebook
# https://stackoverflow.com/questions/384759/how-to-convert-a-pil-image-into-a-numpy-array
# line = np.array(pic, dtype="uint8")
# from https://arxiv.org/pdf/1902.02160.pdf page 2

In [None]:
def word_to_square_image(text, size, cut_length=None, one_channel=False):
    """Draw ``text`` on a black ``size`` x ``size`` image, one character per cell.

    The characters are laid out left to right, top to bottom, on a square grid
    whose side is ``ceil(sqrt(number of characters))``. The font is loaded from
    the module-level ``out_font`` path.

    Parameters
    ----------
    text : str or number
        Value to draw. Finite numbers are rendered with ``format_number``;
        any other non-string value (NaN, inf, None, ...) is rendered with
        ``str``.
    size : int
        Side of the output image, in pixels.
    cut_length : int or None
        When given, only the first ``cut_length`` characters are drawn.
    one_channel : bool
        True for a grayscale image, False for RGB.

    Returns
    -------
    numpy.ndarray
        The image as an array: ``(size, size)`` when ``one_channel`` is True,
        ``(size, size, 3)`` otherwise. Empty text gives an all-black image.
    """
    if not isinstance(text, str):
        try:
            is_finite_number = bool(np.isfinite(text))
        except (TypeError, ValueError):
            # Not something NumPy can test for finiteness (None, objects, ...).
            is_finite_number = False
        # Non-finite and non-numeric values used to stay un-stringified and
        # crash on slicing / len() below.
        text = format_number(text) if is_finite_number else str(text)
    truncated = text[:cut_length] if cut_length is not None else text

    # 1 (1-bit pixels, black and white, stored with one pixel per byte)
    # L (8-bit pixels, black and white)
    # RGB (3x8-bit pixels, true color)
    # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
    mode = "L" if one_channel else "RGB"
    WHITE = 255 if one_channel else (255, 255, 255)
    BLACK = 0 if one_channel else (0, 0, 0)

    image = Image.new(mode, (size, size), BLACK)
    if len(truncated) == 0:
        # Nothing to draw; also avoids a division by zero in the grid maths.
        return np.array(image)

    max_x = int(np.ceil(np.sqrt(len(truncated))))
    # At least 1 px per character: a font size of 0 is rejected by the font
    # loader. Characters that no longer fit are simply clipped by the image.
    character_size = max(1, int(np.floor(size / max_x)))
    padding = max(0, int(np.floor((size - (max_x * character_size)) / 2)))
    # Do we need pt to px conversion ? Seems like not
    # font_size =  int(np.floor(character_size*0.75))
    font_size = character_size

    fnt = ImageFont.truetype(out_font.as_posix(), font_size)

    # Get the drawing context
    draw = ImageDraw.Draw(image)
    x = 0
    y = 0
    for letter in truncated:
        draw.text(
            (padding + x * character_size, padding + y * character_size),
            letter,
            font=fnt,
            fill=WHITE,
        )
        if x + 1 < max_x:
            x += 1
        else:
            # End of the grid row: continue at the start of the next one.
            y += 1
            x = 0
    return np.array(image)

In [None]:
# Visual check: render a short word as a 24x24 RGB tile, then show the array
# shape and the picture.
img_1 = word_to_square_image("Example", 24, cut_length=None, one_channel=False)
print(img_1.shape)
imshow(img_1)

In [None]:
# Same word rendered as a 32x32 single-channel (grayscale) tile.
img_2 = word_to_square_image("Example", 32, cut_length=None, one_channel=True)
print(img_2.shape)
imshow(img_2)

In [None]:
# A number already written in scientific notation: 16 characters, so they are
# laid out on a 4x4 character grid.
img_2 = word_to_square_image("+1.550000000e+01", 32, cut_length=None, one_channel=True)
print(img_2.shape)
imshow(img_2)

In [None]:
# A longer text (23 characters) squeezed into a 24x24 tile: the characters get
# smaller as the grid grows.
imshow(
    word_to_square_image(
        "This is a long sentence", 24, cut_length=None, one_channel=False
    )
)

In [None]:
# Same text truncated to its first 9 characters through cut_length.
imshow(
    word_to_square_image("This is a long sentence", 32, cut_length=9, one_channel=False)
)

In [None]:
def features_to_square_image(
    features, image_size=224, cut_length=None, one_channel=False
):
    """Render a row of feature values as one square image.

    Every feature is converted to text and drawn on its own square tile with
    ``word_to_square_image``. Tiles fill a square grid row by row; cells with
    no feature stay black.

    Parameters
    ----------
    features : numpy.ndarray
        1-D array of feature values (converted with ``astype("str")``).
    image_size : int
        Side of the output image, in pixels.
    cut_length : int or None
        Forwarded to ``word_to_square_image``.
    one_channel : bool
        True for a 2-D grayscale array, False for a 3-channel array.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(image_size, image_size)`` or
        ``(image_size, image_size, 3)``.
    """
    feature_count = len(features)
    grid_side = np.ceil(np.sqrt(feature_count)).astype("int")
    tile_size = np.floor(image_size / grid_side).astype("int")
    # Center the grid when the tiles do not fill the image exactly.
    margin = np.floor((image_size - grid_side * tile_size) / 2).astype("int")

    canvas_shape = (
        (image_size, image_size) if one_channel else (image_size, image_size, 3)
    )
    canvas = np.zeros(canvas_shape, dtype="uint8")

    labels = features.astype("str")
    for index in range(feature_count):
        # First array axis = grid row, second axis = grid column.
        row, col = divmod(index, grid_side)
        top = row * tile_size + margin
        left = col * tile_size + margin
        canvas[top : top + tile_size, left : left + tile_size] = word_to_square_image(
            labels[index],
            size=tile_size,
            cut_length=cut_length,
            one_channel=one_channel,
        )
    return canvas

In [None]:
# Visual check with nine heterogeneous values on a 3x3 grid of 16 px tiles.
# NumPy stores this mixed list as an array of strings, so numbers, booleans
# and NaN are drawn from their string form.
img_3 = features_to_square_image(
    np.array(
        [
            10,
            "test",
            1.0,
            True,
            np.nan,
            "blabla",
            150000,
            "a long sentence just to see",
            "A",
        ]
    ),
    image_size=3 * 16,
)
print(img_3.shape)
imshow(img_3)

In [None]:
def features_to_square_image_params(values, params):
    """Call ``features_to_square_image`` with its options taken from a dict.

    ``params`` must provide the keys ``"image_size"``, ``"cut_length"`` and
    ``"one_channel"``. Packing the options in one mapping lets them be passed
    as a single constant argument when the call is mapped over many rows.
    """
    image_size, cut_length, one_channel = (
        params[key] for key in ("image_size", "cut_length", "one_channel")
    )
    return features_to_square_image(
        values, image_size=image_size, cut_length=cut_length, one_channel=one_channel
    )

## Load info from dataset

In [None]:
# A single data row is enough to discover the column names of the CSV.
columns = list(pd.read_csv(dataset_path, nrows=1, **panda_kwargs).columns)
print(len(columns))
columns

In [None]:
# Load only the target column to get the class names and the number of rows.
target_values = (
    pd.read_csv(dataset_path, usecols=[target], **panda_kwargs).to_numpy().ravel()
)
CLASSNAMES = np.unique(target_values).astype("str")
NB_LINES = len(target_values)
# The values themselves are not needed any more.
del target_values
print(NB_LINES)
print(CLASSNAMES)

## Prepare split

In [None]:
# Use the split stored in the dataset when it has one, otherwise draw a random
# 80/10/10 train/valid/test assignment for every row.
if TAILORED_COLUMN in columns:
    split = (
        pd.read_csv(dataset_path, usecols=[TAILORED_COLUMN], **panda_kwargs)
        .to_numpy()
        .ravel()
    )
else:
    split = np.random.choice(
        ["train", "valid", "test"], p=[0.8, 0.1, 0.1], size=(NB_LINES,)
    )

# Row positions belonging to each subset.
train_indices = np.flatnonzero(split == "train")
# np.random.shuffle(train_indices)
valid_indices = np.flatnonzero(split == "valid")
test_indices = np.flatnonzero(split == "test")

In [None]:
# Feature columns = every column except the split column and the target.
# Keep the CSV column order: going through a set made the order depend on
# string hashing, which changes between kernel sessions, so the position of
# each feature inside the generated images was not reproducible.
used_columns = [
    column for column in columns if column not in (TAILORED_COLUMN, target)
]
used_columns

In [None]:
# Infer the numeric (int64 / float64) feature columns from the first 1000 rows.
extract_df = pd.read_csv(
    dataset_path, nrows=1000, usecols=used_columns, low_memory=False, **panda_kwargs
)
nb_idx = extract_df.columns[
    (extract_df.dtypes == "int64") | (extract_df.dtypes == "float64")
]
del extract_df
nb_idx

## Compute image size (side length and memory footprint)

In [None]:
# Number of feature tiles along one side of the (square) image.
square_side_nb_feature = np.ceil(np.sqrt(len(used_columns))).astype("int")
# Unless a size was configured explicitly, give every feature a
# FEATURE_SIZE x FEATURE_SIZE tile.
if IMAGE_SIZE is None:
    IMAGE_SIZE = square_side_nb_feature * FEATURE_SIZE
# One byte per pixel and per channel (uint8). Based on the final IMAGE_SIZE so
# that an explicit override is accounted for as well.
memory_image_size = IMAGE_SIZE ** 2 * NB_CHANNEL  # in bytes
# Number of rows whose images fit in the memory budget; never less than one,
# since a chunk size of 0 cannot be used to read the CSV.
chunk_size = max(
    1, int(np.floor((MAX_MEMORY_USE * 1024 ** 3) / memory_image_size))
)

In [None]:
# Side of the generated images, in pixels.
IMAGE_SIZE

In [None]:
# Estimated size of one image, in bytes.
memory_image_size

In [None]:
# Number of CSV rows converted per iteration of the main loop.
CHUNK = chunk_size
CHUNK

In [None]:
# Rendering options handed to features_to_square_image_params for every row.
params = dict(
    image_size=IMAGE_SIZE,
    cut_length=CUT_LENGTH,
    one_channel=ONE_CHANNEL,
)

## Preparing datasets

In [None]:
# Paths of the generated files, grouped by subset.
file_list = {label: [] for label in ("train", "valid", "test")}

# Start every subset from an empty folder holding one sub-folder per class.
for set_label in file_list:
    prep_data_folder = DATASET_FOLDER / "prep_data" / str(IMAGE_SIZE) / set_label
    if prep_data_folder.exists():
        shutil.rmtree(prep_data_folder)
    prep_data_folder.mkdir(parents=True, exist_ok=True)
    for classname in CLASSNAMES:
        out_folder = prep_data_folder / classname
        out_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Root folder of the generated images for this image size; the subset and
# class sub-folders are appended when the file paths are built.
prep_data_folder = DATASET_FOLDER / "prep_data" / str(IMAGE_SIZE)

In [None]:
# `idx in <numpy array>` scans the whole array for every row, which makes the
# loop quadratic in the number of rows; sets give constant-time lookups.
train_index_set = set(train_indices.tolist())
valid_index_set = set(valid_indices.tolist())

# Start from empty lists so that re-running this cell does not register every
# file a second time.
for paths in file_list.values():
    paths.clear()

nb_chunks = (NB_LINES // CHUNK) + (1 if NB_LINES % CHUNK > 0 else 0)

for i, chunk in tqdm(
    enumerate(
        pd.read_csv(
            dataset_path,
            **panda_kwargs,
            chunksize=CHUNK,
            usecols=used_columns + [target]
        )
    ),
    total=nb_chunks,
):
    X = chunk[used_columns].values
    Y = chunk[target].values.reshape(-1)  # .astype("str")
    # One image per row, rendered in parallel.
    image_X = do_parallel_numpy(features_to_square_image_params, [X], [params])

    chunk_list = []
    for j, label in enumerate(Y):
        # Position of the row in the whole file (chunks are read in order).
        idx = int(i * CHUNK + j)
        if idx in train_index_set:
            set_label = "train"
        elif idx in valid_index_set:
            set_label = "valid"
        else:
            set_label = "test"
        full_path = (
            prep_data_folder / set_label / str(label) / (str(idx) + ".jpeg.gz")
        ).as_posix()
        chunk_list.append(full_path)
        file_list[set_label].append(full_path)

    # Do the saving outside of the assert statement: asserts are stripped when
    # Python runs with -O, which would silently skip writing the files.
    saved = do_parallel_numpy(
        save_numpy_as_image_gz, [image_X, chunk_list], [ONE_CHANNEL]
    )
    assert saved.reshape(-1).all()

## Output data info

In [None]:
# Save the list of generated files (grouped by subset) next to the images.
json_file = DATASET_FOLDER / "prep_data" / str(IMAGE_SIZE) / "file_list.json"
json_file.write_text(json.dumps(file_list))
json_file

In [None]:
# Save the class names (as strings) next to the images.
classnames_file = DATASET_FOLDER / "prep_data" / str(IMAGE_SIZE) / "classnames.json"
classnames_file.write_text(json.dumps(CLASSNAMES.tolist()))
classnames_file