In [6]:
import os
from glob import glob
import shutil
from zipfile import ZipFile
import json
from tqdm import tqdm
import pandas as pd
from PIL import Image
from io import BytesIO
import hashlib
# Root directory holding the downloaded dataset archives (*.zip).
# The default is the original author's location; set the TABLE_PAIR_SRC
# environment variable to run the notebook on another machine without editing it.
src = os.environ.get(
    "TABLE_PAIR_SRC",
    "/home/kai/workspace/DeepDocs_Project/datalake/source/table_image_text_pair",
)

In [2]:
# Every zip archive anywhere below the source root (recursive search).
zip_pattern = f"{src}/**/*.zip"
data = glob(zip_pattern, recursive=True)

In [None]:
# Extract every archive into the current working directory, mirroring the
# archive's sub-directory layout below `src`.
for file in data:
    # Path of the archive's folder relative to `src` ("." when the archive sits
    # directly in `src`); relative paths resolve against the working directory.
    dest_dir = os.path.relpath(os.path.dirname(file), src)
    print(f"Unzipping {file}...")
    with ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall(dest_dir)
    # Report the directory the files were actually written to.
    print(f"Unzipped {file} to {dest_dir}")

In [3]:
# Dataset root, given relative to the working directory.
dir_ = "032.표 이미지-텍스트 쌍 데이터"

# Collect all table HTML files and all JPEG images below the root.
# Both lists are sorted lexicographically by path.
html_files = glob(f"{dir_}/**/*.html", recursive=True)
html_files.sort()
images = glob(f"{dir_}/**/*.jpg", recursive=True)
images.sort()

In [4]:
# JSON files found under a "Validation/02.라벨링데이터" folder, sorted by path.
json_pattern = f"{dir_}/**/Validation/02.라벨링데이터/**/*.json"
jsons = glob(json_pattern, recursive=True)
jsons.sort()

In [5]:
from bs4 import BeautifulSoup
def extract_tables_with_thead(html):
    """Return the HTML of every ``<table>`` in ``html`` with trivial spans removed.

    ``<style>`` and ``<script>`` elements are dropped from the document first.
    For each table, every ``<th>``/``<td>`` that carries both ``rowspan="1"``
    and ``colspan="1"`` loses those two attributes and has its content replaced
    by its stripped plain text. Cells with any other span values are left
    untouched.

    NOTE(review): despite the name, nothing here treats ``<thead>`` specially.

    Args:
        html: HTML document (or fragment) as a string.

    Returns:
        list[str]: one serialized ``<table>...</table>`` string per table found;
        empty when the document contains no table.
    """
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all("table")
    for tag in soup(["style", "script"]):
        tag.decompose()

    trivial_span = {"rowspan": "1", "colspan": "1"}
    results = []
    for table in tables:
        cells = table.find_all("th", attrs=trivial_span)
        cells += table.find_all("td", attrs=trivial_span)
        for cell in cells:
            cell.attrs.pop("rowspan", None)
            cell.attrs.pop("colspan", None)
            # Flatten the cell to plain text (nested markup is discarded).
            cell.string = cell.get_text(strip=True)

        # Rows carrying the same redundant attributes only lose the attributes.
        # Assigning `.string` on a <tr> would delete all of its cells.
        for row in table.find_all("tr", attrs=trivial_span):
            row.attrs.pop("rowspan", None)
            row.attrs.pop("colspan", None)

        results.append(str(table))
    return results

In [33]:
from PIL import Image
import numpy as np
import hashlib
def get_sha256(file_path, chunk_size=1 << 20):
    """Return the SHA-256 hex digest of a file's bytes.

    The file is read in fixed-size chunks so memory use does not grow with the
    file size; the digest is identical to hashing the whole content at once.

    Args:
        file_path: path of the file to hash.
        chunk_size: number of bytes read per iteration (default 1 MiB).

    Returns:
        str: 64-character lowercase hexadecimal digest.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

In [36]:
# Build one record per (HTML table, image) pair and copy each image into a
# content-addressed store: <images_dir>/<first 2 hex chars>/<sha256>.jpg
images_dir = "./images"
os.makedirs(images_dir, exist_ok=True)
records = []

# zip() stops at the shorter list, so make a length difference visible.
if len(html_files) != len(images):
    print(f"Warning: {len(html_files)} html files vs {len(images)} images; unpaired files are ignored")

for html_file, image_file in tqdm(zip(html_files, images), total=min(len(html_files), len(images))):
    # The two lists are sorted independently, so check that this pair really
    # shares a file name (ignoring the extension) before using it.
    html_name = os.path.splitext(os.path.basename(html_file))[0]
    image_name = os.path.splitext(os.path.basename(image_file))[0]
    if html_name != image_name:
        print(f"Mismatch: {html_name} != {image_name}")
        # Recording this pair would attach a table to the wrong image.
        continue

    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()
    tables = extract_tables_with_thead(html_content)
    if len(tables) != 1:
        # Covers both "no table" and "more than one table".
        print(f"Skipping {html_file}: expected exactly 1 table, found {len(tables)}")
        continue
    table = tables[0]

    # Store the image under its content hash; identical files map to one path.
    img_sha256 = get_sha256(image_file)
    dest_path = os.path.join(images_dir, img_sha256[:2], img_sha256 + ".jpg")
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    shutil.copy(image_file, dest_path)
    with Image.open(dest_path) as img:
        width, height = img.size

    # `image_path` is stored relative to images_dir, e.g. "e3/e38f...jpg".
    record = {
        "image_path": os.path.relpath(dest_path, images_dir),
        "width": width,
        "height": height,
        "label": table,
    }
    records.append(record)

 10%|█         | 41753/404023 [03:38<24:02, 251.09it/s]  

Skipping 032.표 이미지-텍스트 쌍 데이터/3.개방데이터/1.데이터/Training/01.원천데이터/T01_C01_50002_1180_168.html due to multiple tables: 0 found


 45%|████▌     | 183467/404023 [12:49<10:20, 355.17it/s] 

Skipping 032.표 이미지-텍스트 쌍 데이터/3.개방데이터/1.데이터/Training/01.원천데이터/T02_C06_51000_1012_017.html due to multiple tables: 0 found
Skipping 032.표 이미지-텍스트 쌍 데이터/3.개방데이터/1.데이터/Training/01.원천데이터/T02_C06_51000_1012_018.html due to multiple tables: 0 found


100%|██████████| 404023/404023 [34:29<00:00, 195.26it/s] 


In [40]:
# One row per collected record; the dict keys become the columns.
df = pd.DataFrame(data=records)

In [41]:
# Rows whose image path (content hash) or table HTML occurs more than once.
# keep=False marks every member of a duplicate group; kept for inspection.
duplicates = df[df.duplicated(subset=["image_path"], keep=False)]
duplicates_label = df[df.duplicated(subset=["label"], keep=False)]

# Remove duplicates on BOTH keys, keeping the first occurrence each time.
# The two steps are chained so the label pass runs on the already
# image-de-duplicated frame instead of starting again from `df`.
df_cleaned = (
    df.drop_duplicates(subset=["image_path"], keep="first")
    .drop_duplicates(subset=["label"], keep="first")
    .reset_index(drop=True)
)

In [42]:
# Display the de-duplicated frame.
df_cleaned

Unnamed: 0,image_path,width,height,label
0,e3/e38f0b278ee3cdea8481572e9d977fe85b807c187ce...,998,237,<table><tr><td>달성여부목표치의적극성</td><td>초과달성 (100%초...
1,57/570d580cc81680d2ffbd711588531b20b29af36592d...,1276,216,"<table><tr><td>(전년동월비,%)</td><td>'19.8</td><td..."
2,f9/f9b8583eebf90bf0a7ec409f3e632a5bdd3d4c2020b...,1126,231,<table><tr><td>구분</td><td>'16년</td><td>17년</td...
3,6a/6abcfe60f8f47f475c4b36f37cc2b4ff85119b95f71...,1222,404,<table><tr><td>재정투자</td><td>민간자본활용</td><td>법·제...
4,08/08e527e1a8c2857b4827fbda9948d4338488a77fc33...,894,199,<table><tr><td>항목</td><td>2019년(A)</td><td>201...
...,...,...,...,...
361756,98/98a514782f8c561c485014d18a246b2a1592c0390d8...,1325,790,"<table><tr><td colspan=""2"" rowspan=""1"">구분</td>..."
361757,55/55c8c0ea99210216b9cfab6150c02868887442319d6...,1224,521,<table><tr><td>위치</td><td>2014</td><td>확대설치</t...
361758,0d/0d6d6ae1384f6b1326804a5157ea965b34f64b90e31...,990,1212,<table><tr><td>젠더이슈</td><td>시설기준의 구성</td><td>시...
361759,08/08dc92668cc5c11448d31712be8167ca54987addc59...,971,660,<table><tr><td>젠더이슈</td><td>시설 기준의 구성</td><td>...


In [43]:
# Write the de-duplicated pairs to Parquet without the positional index.
output_parquet = "table_image_text_pair.parquet"
df_cleaned.to_parquet(output_parquet, index=False)

In [76]:
# Compare the number of stored image files with the number of rows kept.
n_stored_images = len(glob(f"{images_dir}/**/*.jpg", recursive=True))
n_rows = len(df_cleaned)
n_stored_images, n_rows

(364660, 361761)

In [88]:
import io

# Scratch check on the first row only: re-encode the stored image as JPEG,
# once into memory and once to ./test.jpg, and hash both results.
for row in df_cleaned.itertuples():
    image_path = row.image_path
    # Context manager closes the underlying file handle when done.
    with Image.open(os.path.join(images_dir, image_path)) as image:
        # Hash of the in-memory re-encoding.
        buf = io.BytesIO()
        image.save(buf, format='JPEG')
        byte_data = buf.getvalue()
        hash_val = hashlib.sha256(byte_data).hexdigest()
        # Hash of the on-disk re-encoding.
        image.save("./test.jpg", "JPEG")
    # Named `file_hash` so the builtin `hash` is not rebound in the kernel.
    file_hash = get_sha256("./test.jpg")
    break
    

**라벨링 데이터(JSON, 위의 `jsons`)는 사용하지 않음** — 데이터프레임의 `label` 컬럼은 HTML 표 문자열이며, 아래 JSON 라벨과는 별개임

라벨링 JSON 예시:
```json
{'file_id': 1254,
 'file_name': 'T01_C01_50000_1025_42.jpg',
 'file_format': 'jpg',
 'img_size': 40692,
 'table_meta': {'table_meta.doc_title': '2021년도 자체평가계획',
  'table_meta.publisher': '기상청',
  'table_meta.publish_year': '2021',
  'table_meta.table_type': '기본표',
  'table_meta.table_field': '경제',
  'table_meta.table_unit': '',
  'table_meta.table_title': '성과지표 달성도',
  'table_meta.table_header': 'Y',
  'table_meta.table_row_number': 4,
  'table_meta.table_column_number': 5,
  'table_meta.table_header_bold': 'N',
  'table_meta.table_background': 'N',
  'table_meta.html_path': '/원천데이터/T01/C01/T01_C01_50000_1025_42.html'},
 'table_data': {'table_data.text_explanation': "1. 적극성이 높을수록, 목표를 초과 달성하거나 100% 달성했을 때의 등급이 'S'로 높다.  2. 적극성이 보통이거나 낮을 경우, 목표를 100% 초과 달성하거나 100% 달성했을 때의 등급이 각각 'S'와 'A', 'A'와 'B'로 낮아진다.  3. 적극성에 관계없이 목표를 90% 미만 달성할 경우 등급이 극도로 저하되는데, 적극성이 높은 경우 'B' 등급, 보통인 경우 'C' 등급, 낮은 경우 'D' 등급으로 분류된다.",
  'table_data.text_summary': '표는 성과지표 달성도에 관한 표로 4행 5열이며 목표치의 적극성과 달성여부에 따른 등급 분류를 나타낸 표이다.'}}
```