In [None]:
# Extract the dataset archive into sample_data/
!tar xzf EnglishFnt.tgz -C sample_data/

In [None]:
!pip install "protobuf==3.20.3" tflite-support

1. EnglishFnt에서 D/N/E 데이터셋 자동 구축

In [None]:
import os
from pathlib import Path
from PIL import Image
import shutil
import random

# Extract only the 'D', 'N' and 'E' classes from the EnglishFnt dataset.
FNT_BASE = Path("sample_data/English/Fnt")
OUT_BASE = Path("dataset")
# EnglishFnt sample folder -> class label (also the output sub-folder name).
CLASS_MAP = {'Sample014': 'D', 'Sample024': 'N', 'Sample015': 'E'}
MAX_PER_CLASS = 2000    # keep at most this many images per class
TARGET_SIZE = (32, 32)  # size every image is resized to before saving
DATASET_SEED = 42       # fixed seed so the sampled subset is reproducible

# Recreate dataset/D, dataset/N, dataset/E from scratch so files left over
# from a previous run never mix into the new dataset.
for c in CLASS_MAP.values():
    d = OUT_BASE / c
    if d.exists():
        shutil.rmtree(d)
    d.mkdir(parents=True, exist_ok=True)

# A private RNG keeps the selection reproducible without touching the
# global `random` state used elsewhere.
_rng = random.Random(DATASET_SEED)

# Save up to MAX_PER_CLASS images per class.
for sample, label in CLASS_MAP.items():
    src = FNT_BASE / sample
    dst = OUT_BASE / label
    # glob() order depends on the filesystem; sort first so the seeded
    # shuffle always starts from the same sequence.
    files = sorted(src.glob("*.png"))
    if not files:
        # Fail early instead of silently producing an empty class folder.
        raise FileNotFoundError(f"No PNG files found for class '{label}' in {src}")
    _rng.shuffle(files)
    for i, f in enumerate(files[:MAX_PER_CLASS]):
        # The context manager closes the source file handle deterministically.
        with Image.open(f) as img:
            # Convert to 8-bit grayscale ("L") and resize before saving.
            img.convert("L").resize(TARGET_SIZE).save(dst / f"{label}_{i:04d}.png")

print("EnglishFnt → dataset/D,N,E 자동 구축 완료")

2. Keras 데이터셋 로드 및 증강

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# ---- Configuration -------------------------------------------------------
IMG_SIZE = (32, 32)
BATCH_SIZE = 64
EPOCHS = 20
TFLITE_PATH = "dne_classifier.tflite"
CLASS_NAMES = ["D", "N", "E"]
VALID_SPLIT = 0.2  # fraction of the images held out for validation
SEED = 42

# Arguments common to the training and validation loaders. Both calls use the
# same seed and split fraction; only `subset` differs between them.
_DATASET_KWARGS = dict(
    labels="inferred",
    label_mode="categorical",  # one-hot encoded labels
    class_names=CLASS_NAMES,
    color_mode="grayscale",    # single-channel images
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
    validation_split=VALID_SPLIT,
    seed=SEED,
)

train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "dataset", subset="training", **_DATASET_KWARGS
)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    "dataset", subset="validation", **_DATASET_KWARGS
)

# Rescale pixel values by 1/255.
normalization = layers.Rescaling(1.0 / 255)
# Random augmentations; applied to training batches only (see preprocess_train).
data_augmentation = tf.keras.Sequential([
    layers.RandomBrightness(0.2),  # brightness
    layers.RandomContrast(0.2),    # contrast
    layers.RandomRotation(0.1),    # rotation
])

def preprocess_train(x, y):
    """Augment and normalize one training batch.

    Args:
        x: batch of images from the training dataset.
        y: labels for the batch; passed through unchanged.

    Returns:
        Tuple ``(x, y)`` with ``x`` augmented and then rescaled.
    """
    # Add a trailing channel axis when the last dimension is not already 1.
    x = tf.expand_dims(x, -1) if x.shape[-1] != 1 else x
    # Outside of Model.fit() the training mode is not set by Keras, so request
    # it explicitly; this keeps the random layers active inside Dataset.map
    # regardless of how a given Keras version resolves the default.
    x = data_augmentation(x, training=True)
    x = normalization(x)
    return x, y

train_ds = train_ds.map(preprocess_train).prefetch(buffer_size=tf.data.AUTOTUNE)
# Validation data is only normalized, never augmented.
val_ds   = val_ds.map(lambda x, y: (normalization(x), y)).prefetch(buffer_size=tf.data.AUTOTUNE)

3. 셀 크롭 및 리사이즈

In [None]:
def tight_crop_and_resize(cell_img, out_size=(32,32), extra_crop=3):
    """Crop a cell image to its dark content and resize it to ``out_size``.

    Pixels with value <= 200 count as content. The bounding box of all content
    pixels is shrunk by ``extra_crop`` pixels on every side; if that would
    leave an empty region, the unshrunk bounding box is used instead. When no
    content pixel exists, the whole cell is resized.

    Args:
        cell_img: image array, or None. Expected to be single-channel, since
            ``cv2.findNonZero`` only accepts single-channel input.
        out_size: target size as ``(width, height)``, the order ``cv2.resize``
            expects.
        extra_crop: number of pixels trimmed from each side of the content
            bounding box.

    Returns:
        The cropped and resized image. For a ``None`` or empty ``cell_img`` an
        all-white (255) ``uint8`` array of shape ``(height, width)`` is
        returned, matching the shape the normal path produces.
    """
    if cell_img is None or cell_img.size == 0:
        # out_size is (width, height) but NumPy shapes are (rows, cols), so the
        # blank image must be built as (height, width) to agree with cv2.resize.
        out_w, out_h = out_size
        return np.full((out_h, out_w), 255, dtype=np.uint8)
    # Binarize: values above 200 become 255 (background), the rest become 0.
    _, threshed = cv2.threshold(cell_img, 200, 255, cv2.THRESH_BINARY)
    # Invert so content pixels are the non-zero ones.
    inv = 255 - threshed
    coords = cv2.findNonZero(inv)
    if coords is not None:
        x, y, w, h = cv2.boundingRect(coords)
        # Crop tighter: trim extra_crop pixels from every side of the box,
        # clamped to the image bounds.
        x1 = max(x + extra_crop, 0)
        y1 = max(y + extra_crop, 0)
        x2 = min(x + w - extra_crop, cell_img.shape[1])
        y2 = min(y + h - extra_crop, cell_img.shape[0])
        if x2 > x1 and y2 > y1:
            cropped = cell_img[y1:y2, x1:x2]
        else:
            # Trimming collapsed the box; fall back to the plain bounding box.
            cropped = cell_img[y:y+h, x:x+w]
    else:
        # No content pixels at all: keep the whole cell.
        cropped = cell_img
    resized = cv2.resize(cropped, out_size)
    return resized

4. 셀 분류 함수

In [None]:
def classify_dne(cell_img, interpreter, input_details, output_details, debug=False, r=0, c=0,
                 min_prob=0.7):
    """Classify one table cell as a class label or as blank ('-').

    The cell is tightly cropped and resized to the model's input size. It is
    reported as blank when the share of bright pixels exceeds ``WHITE_THRESH``
    or when the model's top probability is below ``min_prob``.

    Args:
        cell_img: image of the cell, forwarded to ``tight_crop_and_resize``.
        interpreter: TFLite interpreter with tensors already allocated.
        input_details: result of ``interpreter.get_input_details()``.
        output_details: result of ``interpreter.get_output_details()``.
        debug: when True, write the cropped cell to
            ``debug_cell_r{r}_c{c}.png`` and print diagnostic information.
        r: row index, used only in the debug file name.
        c: column index, used only in the debug file name.
        min_prob: minimum top probability required to accept a prediction.

    Returns:
        ``'-'`` for a blank or low-confidence cell, otherwise
        ``CLASS_LABELS[argmax]`` of the model output.
    """
    # The input tensor shape is read as (batch, height, width, channels).
    # Resize the crop to exactly that size so the reshape below cannot fail
    # for models whose input is not the default 32x32.
    h, w = (int(v) for v in input_details[0]['shape'][1:3])
    tight_img = tight_crop_and_resize(cell_img, out_size=(w, h))
    # Fraction of pixels brighter than 180.
    white_ratio = np.mean(tight_img > 180)
    if debug:
        cv2.imwrite(f'debug_cell_r{r}_c{c}.png', tight_img)
    # 1. Filter blank cells by white_ratio: above the threshold means '-'.
    if white_ratio > WHITE_THRESH:
        if debug: print("Blank cell detected by white_ratio:", white_ratio)
        return '-'
    # 2. Model prediction on the image scaled to [0, 1].
    inp = tight_img.astype(np.float32) / 255.0
    sample = inp.reshape(1, h, w, 1)
    interpreter.set_tensor(input_details[0]['index'], sample)
    interpreter.invoke()
    out = interpreter.get_tensor(output_details[0]['index'])[0]
    max_prob = np.max(out)
    if debug:
        print(f"Probabilities: D={out[0]:.3f}, N={out[1]:.3f}, E={out[2]:.3f} (max={max_prob:.3f})")
    # 3. Treat the cell as blank when the top probability is below min_prob.
    if max_prob < min_prob:
        if debug: print("Blank cell detected by prob:", max_prob)
        return '-'
    # 4. Confident enough: return the predicted class.
    return CLASS_LABELS[np.argmax(out)]

5. 메인 실행 및 JSON 저장

In [None]:
def main():
    """Classify every table cell of IMAGE_PATH and save the result as JSON.

    Cell boundaries come from ``get_table_cells``; each cell is classified by
    ``classify_dne`` using the TFLite model at ``TFLITE_MODEL``. The result is
    a nested dict ``{row_number: {column_number: label}}`` with 1-based string
    keys, written to ``schedule_inferred.json`` and printed.
    """
    img, data_rows, data_cols = get_table_cells(
        IMAGE_PATH,
        header_skip_y=HEADER_SKIP_Y,
        header_skip_x=HEADER_SKIP_X,
        vis_path="table_detected.png",
    )

    # Load the model once and reuse it for every cell.
    interpreter = tf.lite.Interpreter(model_path=TFLITE_MODEL)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    schedule = {}
    debugged_cells = 0  # debug output is produced for the first 30 cells only
    for row_no, (top, bottom) in enumerate(data_rows, start=1):
        row_labels = {}
        for col_no, (left, right) in enumerate(data_cols, start=1):
            verbose = debugged_cells < 30
            row_labels[str(col_no)] = classify_dne(
                img[top:bottom, left:right],
                interpreter,
                input_details,
                output_details,
                debug=verbose,
                r=row_no - 1,
                c=col_no - 1,
            )
            if verbose:
                debugged_cells += 1
        schedule[str(row_no)] = row_labels

    # Write the JSON file.
    with open("schedule_inferred.json", "w", encoding="utf-8") as out_file:
        json.dump(schedule, out_file, ensure_ascii=False, indent=2)
    print("schedule_inferred.json 저장 완료")
    print(json.dumps(schedule, ensure_ascii=False, indent=2))

In [None]:
# Entry point: run the full pipeline when this code is executed as the main module.
if __name__ == '__main__':
    main()