In [1]:
import json
import os
import cv2
import tqdm

In [2]:
root = "20231123_dataset2"


In [10]:
def convert_input_yolo_format(path_json, path_json_out):
    """Convert one OCR annotation file into the ``{"image_name", "ocr"}`` layout.

    The input JSON must be a list of boxes, each holding a ``"points"`` list of
    ``[x, y]`` pairs and a ``"transcription"`` string.  Every box is reduced to
    ``[x, y]`` of point 0 followed by ``[x, y]`` of point 2 (presumably the
    top-left and bottom-right corners -- confirm against the annotation tool).

    Args:
        path_json (str): path of the annotation JSON to read.
        path_json_out (str): path of the converted JSON to write.

    Returns:
        None when the output file was written, or when the input could not be
        read/parsed (the error is printed).  "" when the input holds no boxes;
        nothing is written in that case.
    """
    try:
        # Read as UTF-8 explicitly: the output below is written as UTF-8 and
        # the platform's default encoding is not guaranteed to match.
        with open(path_json, encoding="utf-8") as f:
            ocr_result = json.load(f)
    except Exception as e:
        print(e)
        return None
    if len(ocr_result) == 0:
        return ""

    # "<stem>.out.json" -> "<stem>.jpg"; basename works with any path separator.
    name_image = os.path.basename(path_json).replace("out.json", "jpg")
    final_ocr_result = [
        {
            "box": [
                box["points"][0][0],
                box["points"][0][1],
                box["points"][2][0],
                box["points"][2][1],
            ],
            "text": box["transcription"],
        }
        for box in ocr_result
    ]
    annotation_1_file = {"image_name": name_image, "ocr": final_ocr_result}

    with open(path_json_out, "w", encoding="utf-8") as fff:
        json.dump(annotation_1_file, fff, ensure_ascii=False)

In [11]:
# Convert every "<root>/<folder>/annotation/<file>" into
# "<root>/<folder>/annotation_convert/<file>".
for folder in os.listdir(root):
    path_folder = os.path.join(root, folder)
    path_annotation = os.path.join(path_folder, "annotation")
    # Skip stray files and folders that have no "annotation" directory rather
    # than failing in os.makedirs / os.listdir below.
    if not os.path.isdir(path_annotation):
        continue
    path_out_annotation = os.path.join(path_folder, "annotation_convert")
    os.makedirs(path_out_annotation, exist_ok=True)
    for json_file in os.listdir(path_annotation):
        path_json_file = os.path.join(path_annotation, json_file)
        path_json_out = os.path.join(path_out_annotation, json_file)
        convert_input_yolo_format(path_json_file, path_json_out)

# Convert the SER/RE results to YOLO format

In [11]:
# All three result folders live under the same "163_output" directory.
_OUTPUT_163 = "/mnt/28857F714F734EE8/projects/chatgpt/infordio-key_value_extraction/key-value-extractor/output/163_output"
root_out_ser, root_out_re, root_yolo = (
    f"{_OUTPUT_163}/{sub_dir}" for sub_dir in ("ser", "re", "yolo")
)
classes_path = "output/classes.txt"

# The YOLO folder is an output location, so create it up front.
os.makedirs(root_yolo, exist_ok=True)

In [12]:
# Load the class list (one name per line); a class id is its line index.
with open(classes_path, "r") as f:
    content = f.read().splitlines()
dict_classes = dict(zip(content, range(len(content))))

# "Single" classes: names with no "." that do not contain "k19".
dict_classes_sigle = {
    name: class_id
    for name, class_id in dict_classes.items()
    if not ("." in name or "k19" in name)
}
# "Multiple" classes: names containing a ".".
dict_classes_multiple = {
    name: class_id for name, class_id in dict_classes.items() if "." in name
}
# Reverse lookup (class id -> name) over the single classes only.
dict_idx_classes_sigle = {
    class_id: name for name, class_id in dict_classes_sigle.items()
}





In [13]:
# Show the names of the classes kept in dict_classes_sigle.
dict_classes_sigle.keys()

dict_keys(['other', 'k1', 'v1', 'k2', 'v2', 'k3', 'v3', 'k4', 'v4', 'k5', 'v5', 'k6', 'v6', 'k7', 'v7', 'k8', 'v8', 'k9', 'v9', 'k10', 'v10', 'k11', 'v11', 'k12', 'v12', 'k13', 'v13', 'k14', 'v14', 'k15', 'v15', 'k16', 'v16', 'k17', 'v17', 'k18', 'v18', 'header', 'k20', 'v20', 'k21', 'v21', 'k22', 'v22', 'k23', 'v23', 'k24', 'v24', 'k25', 'v25', 'k26', 'v26', 'k27', 'v27', 'k28', 'v28', 'k29', 'v29', 'k30', 'v30', 'k31', 'v31', 'k32', 'v32', 'k33', 'v33', 'k34', 'v34', 'k35', 'v35', 'k36', 'v36', 'k37', 'v37', 'k38', 'v38'])

In [14]:
def mapping_ser(path_json, path_image):
    """Turn one SER result file into YOLO label lines keyed by entity id.

    Each entry of the JSON list must provide ``"bbox"`` (indices 0/2 are
    x coordinates and 1/3 are y coordinates, in pixels), ``"pred"`` and
    ``"id"``.  Boxes are normalised by the size of the image at
    ``path_image``.  Entries predicted as ``"TITLE"`` get the ``header``
    class; every other entry gets ``other``.

    Args:
        path_json (str): path of the SER result JSON.
        path_image (str): path of the matching image, read only for its size.

    Returns:
        dict | None: ``{id: "<class> <x_center> <y_center> <width> <height>"}``,
        or None (after printing the reason) when the JSON or the image cannot
        be read.
    """
    try:
        with open(path_json, encoding="utf-8") as f:
            ocr_result = json.load(f)
    except Exception as e:
        print(e)
        return None

    image = cv2.imread(path_image)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising, which would otherwise fail on `.shape` below.
        print(f"Cannot read image: {path_image}")
        return None
    h, w = image.shape[:2]

    final_dict = {}
    for value in ocr_result:
        bbox = value["bbox"]
        x_center = (bbox[0] + bbox[2]) / (2 * w)
        y_center = (bbox[1] + bbox[3]) / (2 * h)
        w_box = (bbox[2] - bbox[0]) / w
        h_box = (bbox[3] - bbox[1]) / h
        if value["pred"] == "TITLE":
            label = dict_classes_sigle["header"]
        else:
            label = dict_classes_sigle["other"]

        final_dict[value["id"]] = f"{label} {x_center} {y_center} {w_box} {h_box}"

    return final_dict
    

In [15]:
def change_value_content(content, new_class_id):
    """Return the whitespace-separated line ``content`` with its first field
    replaced by ``new_class_id``; remaining fields are re-joined with single
    spaces."""
    fields = content.split()
    # Slice assignment also covers an empty line, where it inserts the id.
    fields[:1] = [str(new_class_id)]
    return " ".join(fields)
    

In [16]:
def mapping_re(path_json, result_ser):
    """Propagate key/value classes from RE pairs onto the SER label lines.

    The RE JSON must be a list of pairs; element 0 of a pair is the key entity
    and element 1 the value entity, each with an ``"id"`` present in
    ``result_ser``.  For every pair:

    * if the key line already carries a class other than ``other``, the value
      line receives that class name with ``"k"`` replaced by ``"v"``;
    * otherwise the next unused ``k<N>`` / ``v<N>`` classes (ascending N,
      starting at 1) are assigned to the key and value lines.

    Args:
        path_json (str): path of the RE result JSON.
        result_ser (dict): ``{id: label line}``; it is updated in place.

    Returns:
        dict | None: ``result_ser`` itself after the update, or None (after
        printing the reason) when the JSON cannot be read or ``result_ser``
        is None.
    """
    try:
        with open(path_json, encoding="utf-8") as f:
            ocr_result = json.load(f)
    except Exception as e:
        print(e)
        return None
    if result_ser is None:
        print(f"No SER result to update for {path_json}")
        return None

    # Numbers N for which a "k<N>" class exists, in ascending order.  Walking a
    # finite iterator (instead of counting upwards until a name matches) is
    # what guarantees termination once every class has been handed out.
    free_key_numbers = iter(sorted(
        int(name[1:])
        for name in dict_classes_sigle
        if name[:1] == "k"
        and name[1:].isdecimal()
        and int(name[1:]) >= 1
        and name == f"k{int(name[1:])}"
    ))
    # Class id that marks a line as "not assigned yet".
    other_id = dict_classes_sigle.get("other", 0)
    warned_exhausted = False

    for pair_value in ocr_result:
        key_id = pair_value[0]["id"]
        value_id = pair_value[1]["id"]
        key_class_id = int(result_ser[key_id].split()[0])

        if key_class_id != other_id:
            # Key already labelled: give the value the matching class.
            label_class = dict_idx_classes_sigle[key_class_id].replace("k", "v")
            idx_label = dict_classes_sigle[label_class]
            result_ser[value_id] = change_value_content(result_ser[value_id], idx_label)
            continue

        key_number = next(free_key_numbers, None)
        if key_number is None:
            # More new keys than available classes: leave the pair untouched.
            if not warned_exhausted:
                print(f"No free key/value class left for {path_json}; remaining new pairs keep their labels")
                warned_exhausted = True
            continue

        result_ser[key_id] = change_value_content(
            result_ser[key_id], dict_classes_sigle[f"k{key_number}"]
        )
        result_ser[value_id] = change_value_content(
            result_ser[value_id], dict_classes_sigle[f"v{key_number}"]
        )

    return result_ser
            
            
        
    

In [17]:
# Convert every SER result file; results are keyed by file name so the RE step
# can look them up.
ser_result = {}
for file in os.listdir(root_out_ser):
    # Match on the extension only; a substring test also accepts names that
    # merely contain "json".
    if not file.endswith(".json"):
        continue
    path_json = os.path.join(root_out_ser, file)
    # Swap only the extension: str.replace("json", "jpg") would also rewrite a
    # "json" occurring in a directory name or in the file stem.
    path_image = os.path.splitext(path_json)[0] + ".jpg"
    dict_ser = mapping_ser(path_json, path_image)
    ser_result[file] = dict_ser
    

In [18]:
# Apply the RE pairs to each SER result and write one YOLO label file per page.
i = 0  # number of label files written
for file in tqdm.tqdm(os.listdir(root_out_re)):
    if not file.endswith(".json"):
        continue

    path_json = os.path.join(root_out_re, file)
    # The SER step may have no entry for this file, or may have stored None
    # when its own conversion failed.
    dict_ser = ser_result.get(file)
    if dict_ser is None:
        print(f"Skipping {file}: no SER result available")
        continue

    result = mapping_re(path_json, dict_ser)
    if result is None:
        # mapping_re returns None when it could not read its input.
        continue

    # Swap only the extension; replace("json", "txt") would touch the stem too.
    path_out_yolo = os.path.join(root_yolo, os.path.splitext(file)[0] + ".txt")
    content = "\n".join(result.values())
    with open(path_out_yolo, "w") as f:
        f.write(content)
    i += 1

100%|██████████| 326/326 [00:00<00:00, 6660.09it/s]


# Convert the customer's data format to the inference format

In [7]:
import json
import os
import cv2
import tqdm
import shutil
    

In [4]:
# Every path below sits under the same dataset directory.
_DATASET_NO2 = "/home/infordio-ai/quan/code/information_extraction/PaddleOCR/train_data/dataset2/no2"
_SUBSET_163 = f"{_DATASET_NO2}/163_image"

# Inputs: the image list file, the image folder and the bbox JSON folder.
root_txt = f"{_SUBSET_163}/163_images_no2.txt"
root_image = f"{_DATASET_NO2}/imgs"
root_bbox = f"{_DATASET_NO2}/bbox"

# Outputs: converted annotations and the copied images.
root_out_annotations_folder = f"{_SUBSET_163}/annotations"
root_out_image_folder = f"{_SUBSET_163}/images"

In [8]:
def convert_bbox2inf(path_init_json, path_json_out):
    """Convert a bbox JSON (x/y/w/h boxes) into the ``{"image_name", "ocr"}`` layout.

    The input JSON must be a list of boxes with ``"x"``, ``"y"``, ``"w"``,
    ``"h"`` and ``"text"``.  Each box becomes ``[x, y, x + w, y + h]`` plus its
    text; boxes whose text is not a string (e.g. null) are dropped.

    Args:
        path_init_json (str): path of the bbox JSON to read.
        path_json_out (str): path of the converted JSON to write.

    Returns:
        None when the output file was written, or when the input could not be
        read/parsed (the error is printed).  "" when the input holds no boxes;
        nothing is written in that case.
    """
    try:
        # Read as UTF-8 explicitly: the output below is written as UTF-8 and
        # the platform's default encoding is not guaranteed to match.
        with open(path_init_json, encoding="utf-8") as f:
            ocr_result = json.load(f)
    except Exception as e:
        print(e)
        return None
    if len(ocr_result) == 0:
        return ""

    # "<stem>.out.json" -> "<stem>.jpg"; basename works with any path separator.
    name_image = os.path.basename(path_init_json).replace("out.json", "jpg")
    final_json = {"image_name": name_image, "ocr": []}
    for box in ocr_result:
        left, top = box["x"], box["y"]
        right, bottom = left + box["w"], top + box["h"]

        text = box["text"]
        if not isinstance(text, str):
            continue

        final_json["ocr"].append({"box": [left, top, right, bottom], "text": text})

    with open(path_json_out, "w", encoding="utf-8") as fff:
        json.dump(final_json, fff, ensure_ascii=False)

In [5]:
# Read the image list, one file name per line.  UTF-8 is requested explicitly
# because file names may contain non-ASCII characters, and blank lines are
# dropped so they are not later joined onto a folder path as if they were names.
with open(root_txt, "r", encoding="utf-8") as f:
    contents = [line for line in f.read().splitlines() if line.strip()]
    

In [13]:
# shutil.copy and the JSON writer both need their target folders to exist.
os.makedirs(root_out_image_folder, exist_ok=True)
os.makedirs(root_out_annotations_folder, exist_ok=True)

for image in contents:
    path_in_image = os.path.join(root_image, image)
    path_out_image = os.path.join(root_out_image_folder, image)
    shutil.copy(path_in_image, path_out_image)

    # Derive the JSON names from the stem; replace("jpg", ...) would also
    # rewrite a "jpg" occurring inside the stem itself.
    stem = os.path.splitext(image)[0]
    path_init_json = os.path.join(root_bbox, stem + ".out.json")
    path_out_json = os.path.join(root_out_annotations_folder, stem + ".json")

    convert_bbox2inf(path_init_json, path_out_json)
    