In [70]:
import json
import time
from decimal import Decimal
from pathlib import Path
from pprint import pprint
from typing import List, Dict, Union, Tuple, Any

import cv2
import lmdb
import matplotlib.pyplot as plt
import japanize_matplotlib  # side-effect import: configures Japanese fonts for matplotlib
import numpy as np
import six
from PIL import Image
from tqdm import tqdm

In [188]:
# Locate the dataset relative to the notebook's working directory.
root_dir = Path.cwd().parents[2]
data_dir = root_dir / 'data' / 'data0007'

# Prefer <data_dir>/lmdb; otherwise fall back to <data_dir>/valid/lmdb.
lmdb_dir = (
    data_dir / 'lmdb'
    if (data_dir / 'lmdb').exists()
    else data_dir / 'valid' / 'lmdb'
)

# Open the database read-only, without file locking.
env = lmdb.open(
    str(lmdb_dir),
    max_readers=32,
    readonly=True,
    lock=False,
    readahead=False,
    meminit=False,
)

# The sample count is stored in the database under the 'num-samples' key.
with env.begin(write=False) as txn:
    n_samples = int(txn.get('num-samples'.encode()))

In [189]:
# Number of samples, as read from the LMDB 'num-samples' key.
n_samples

1107

In [190]:
def round_float(value):
    """
    Format a float as a string with its decimal part truncated (not rounded).

    If the absolute value of the integer part is greater than 1, one decimal
    digit is kept; otherwise (integer part 0 or 1) up to four are kept.
    Digits are cut off, never padded: ``0.4`` stays ``'0.4'``.

    Floats that Python prints in exponent notation (e.g. ``1.5e-07``) are first
    expanded to plain positional notation, so the truncation never slices
    through the exponent.

    Args:
        value (int, float, str): The value to convert. Only floats are
            converted; any other type is returned unchanged.

    Returns:
        str: The truncated representation when ``value`` is a float
        (``'nan'`` / ``'inf'`` come back as those strings). Otherwise the
        original ``value``.
    """
    if not isinstance(value, float):
        return value

    text = str(value)
    if "e" in text or "E" in text:
        # Expand e.g. '1.5e-07' -> '0.00000015' before looking at the digits.
        text = format(Decimal(text), "f")

    if "." not in text:
        return text

    integer, decimal = text.split(".")
    kept_digits = 1 if abs(float(integer)) > 1 else 4
    return integer + "." + decimal[:kept_digits]

In [191]:
anns = []
# A single read-only transaction serves every lookup; there is no need to
# begin a new one for each sample.
with env.begin(write=False) as txn:
    for idx in tqdm(range(n_samples), total=n_samples):
        # load json: labels are stored under 1-based, zero-padded keys
        label_key = f'label-{str(idx+1).zfill(8)}'.encode()
        raw_label = txn.get(label_key)
        if raw_label is None:
            # Fail with the offending key instead of an AttributeError on None.
            raise KeyError(f'missing LMDB entry: {label_key.decode()}')
        label = raw_label.decode('utf-8')
        json_dict = json.loads(label)
        # Keep scatter charts only.
        if json_dict['chart-type'] not in ['scatter']:
            continue
        anns.append(json_dict)

100%|██████████| 1107/1107 [00:00<00:00, 41954.41it/s]


In [196]:
# Inspect the data series of one of the collected scatter annotations.
pprint(anns[2]['data-series'])

[{'x': 1.0601275432736108, 'y': 45.302159391020666},
 {'x': 1.0674157303370786, 'y': 51.578375019418985},
 {'x': 1.1484500922703171, 'y': 55.86429416474469},
 {'x': 1.1578041672660242, 'y': 46.7068673703834},
 {'x': 1.349029711117424, 'y': 46.004750760472476},
 {'x': 1.3672125020439627, 'y': 49.03478686918178},
 {'x': 1.4421717062166501, 'y': 41.79897467764487},
 {'x': 1.5233323210851302, 'y': 43.494899280203},
 {'x': 1.5132098390525357, 'y': 45.048418000103574},
 {'x': 1.4544994432634883, 'y': 46.13588110403398},
 {'x': 1.460572932483045, 'y': 46.4465848480141},
 {'x': 1.6286061342241118, 'y': 46.601936720004154},
 {'x': 1.7197084725174614, 'y': 44.27165864015329},
 {'x': 2.1590241927320575, 'y': 40.33607788307183},
 {'x': 2.6955157404595607, 'y': 48.6215110558749},
 {'x': 2.401963761514323, 'y': 52.1428201543162},
 {'x': 2.3331308836926814, 'y': 53.178499300916585},
 {'x': 2.1731956675776902, 'y': 52.194604111646214},
 {'x': 2.037554408340925, 'y': 51.00357309305578},
 {'x': 1.798663

In [50]:
# Take the first y value of the fifth collected annotation for inspection.
# NOTE(review): a bare assignment displays nothing, so the recorded output
# ('714950.7') presumably comes from an earlier version of this cell; confirm.
f = anns[4]['data-series'][0]['y']

'714950.7'

In [52]:
# Sample inputs for round_float: small and large floats, plus one int (541),
# which round_float returns unchanged because it only converts floats.
num = [
    0.00045262,
    0.54354,
    543523.2543,
    0.4,
    541,
    543.51
]

# Print every input next to its formatted result.
for n in num:
    f_num = round_float(n)
    print(f'{n} -> {f_num}')

0.00045262 -> 0.0004
0.54354 -> 0.5435
543523.2543 -> 543523.2
0.4 -> 0.4
541 -> 541
543.51 -> 543.5


In [167]:
def rmse(y_true: List[float], y_pred: List[float]) -> float:
    """
    Root Mean Square Error (RMSE) of predictions against the true values.

    Args:
        y_true (List[float]): The true values.
        y_pred (List[float]): The predicted values. They are combined with
            ``y_true`` through ``np.subtract``, so NumPy broadcasting applies.

    Returns:
        float: Square root of the mean squared difference.
    """
    residuals = np.subtract(y_true, y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)


def normalized_rmse(y_true: List[float], y_pred: List[float]) -> float:
    """
    Score predictions by their RMSE relative to the spread of the true values.

    The RMSE of ``y_pred`` is divided by the RMSE of a constant prediction
    equal to ``mean(y_true)``, and that ratio is passed through ``sigmoid``.

    Args:
        y_true (List[float]): The true values.
        y_pred (List[float]): The predicted values.

    Returns:
        float: ``sigmoid(ratio)``. When the baseline RMSE is zero the ratio is
        undefined, and the result is 1.0 if the prediction error is also zero,
        otherwise 0.0.
    """
    prediction_error = rmse(y_true, y_pred)
    baseline_error = rmse(y_true, np.mean(y_true))

    # https://www.kaggle.com/competitions/benetech-making-graphs-accessible/discussion/396947
    # A zero baseline means every true value equals the mean (e.g. a single
    # ground-truth value), so only an exact prediction earns the full score.
    if baseline_error == 0:
        return 1.0 if prediction_error == 0 else 0.0

    return sigmoid(prediction_error / baseline_error)

def sigmoid(x):
    """
    Evaluate ``2 - 2 / (1 + exp(-x))``.

    The value is 1.0 at ``x = 0`` and falls towards 0.0 as ``x`` grows.

    Args:
        x: A number or NumPy array; evaluated element-wise via ``np.exp``.

    Returns:
        The transformed value(s), as produced by NumPy.
    """
    decay = np.exp(-x)
    return 2 - 2 / (1 + decay)

def reduce_precision(arr, threshold=0.96):
    """
    Round ``arr`` to the coarsest precision that still scores well against it.

    Decimal places from -7 up to 6 are tried in increasing order. The first
    rounding whose ``normalized_rmse(arr, rounded)`` reaches ``threshold`` is
    returned.

    Args:
        arr: Sequence of numbers to round.
        threshold (float): Minimum ``normalized_rmse`` score a rounding must
            reach to be accepted. Defaults to 0.96, the value that used to be
            hard-coded.

    Returns:
        list: The accepted rounding as a list (for a 1-D input, a list of
        NumPy scalars), cast to integers when it has no decimal places.
        If no tried precision reaches ``threshold``, ``arr`` itself is
        returned unchanged.
    """
    for decimals in range(-7, 7):
        # Round array
        rounded = np.round(arr, decimals=decimals)
        if decimals <= 0:
            # No digits remain after the decimal point: store as integers.
            rounded = rounded.astype(int)
        rounded = list(rounded)
        # Check if nrmse is close enough
        if normalized_rmse(arr, rounded) >= threshold:
            return rounded
    return arr

In [168]:
def round_data(value):
    """
    Format a float as a string with its decimal part truncated (not rounded).

    If the absolute value of the integer part is greater than 1, one decimal
    digit is kept; otherwise (integer part 0 or 1) up to four are kept.
    Digits are cut off, never padded: ``0.4`` stays ``'0.4'``.

    Floats that Python prints in exponent notation (e.g. ``1.5e-07``) are first
    expanded to plain positional notation, so the truncation never slices
    through the exponent.

    NOTE(review): this notebook also defines ``round_float`` for the same
    purpose; consider keeping a single definition.

    Args:
        value (int, float, str): The value to convert. Only floats are
            converted; any other type is returned unchanged.

    Returns:
        str: The truncated representation when ``value`` is a float.
        Otherwise the original ``value``.
    """
    if not isinstance(value, float):
        return value

    text = str(value)
    if "e" in text or "E" in text:
        # Expand e.g. '1.5e-07' -> '0.00000015' before looking at the digits.
        text = format(Decimal(text), "f")

    if "." not in text:
        return text

    integer, decimal = text.split(".")
    kept_digits = 1 if abs(float(integer)) > 1 else 4
    return integer + "." + decimal[:kept_digits]

In [172]:
# Try reduce_precision on a small sample that mixes sub-1 values with a larger one.
reduce_precision([0.43, 0.56, 0.435, 0.64, 0.45, 3])

[0.4, 0.6, 0.4, 0.6, 0.4, 3.0]

In [152]:
# Show what np.round yields for decimals from -4 to 4 on two sample values.
for i in range(-4, 5):
    print(np.round([0.54265, 0.654625], i))

[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[1. 1.]
[0.5 0.7]
[0.54 0.65]
[0.543 0.655]
[0.5426 0.6546]


In [154]:
# Score the two sample values against their 2-decimal rounding.
normalized_rmse([0.54265, 0.654625], [0.54, 0.65])

0.9663519848851483