In [93]:
import os
from PIL import Image
import pandas as pd
from glob import glob
from tqdm import tqdm
import json
import numpy as np
from sklearn.model_selection import train_test_split

In [80]:
bbox_list=glob('../../data/total/bbox/*.json')
relation_list=[f.replace('/data/total/bbox', '/data/total/relation') for f in bbox_list]
df = pd.DataFrame(columns=['image','bounding box','label',"entity1","entity2","relation","english","korean"])
for i in tqdm(range(len(bbox_list))):
    with open(bbox_list[i], 'r') as f:

        json_data = json.load(f)

    image_name=json_data['images'][0]['file_name']
    width_ratio=256/json_data['images'][0]['width']
    height_ratio=256/json_data['images'][0]['height']
    annotation_count=len(json_data['annotations'])
    annotation_list=[]
    label_list=[]
    entity1_list=[]
    entity2_list=[]
    relation_list1=[]
    english_list=[]
    korean_list=[]
    for j in range(annotation_count):
        resize_list=[int(json_data['annotations'][j]['bbox'][0]*width_ratio),int(json_data['annotations'][j]['bbox'][1]*height_ratio),int(json_data['annotations'][j]['bbox'][2]*width_ratio),int(json_data['annotations'][j]['bbox'][3]*height_ratio)]
        annotation_list.append(resize_list)
        label_list.append(json_data['annotations'][j]['category_id'])

    with open(relation_list[i], 'r', encoding='utf-8-sig') as f:

        json_data = json.load(f)
        
    annotation_count=len(json_data['annotations'][0]['text'])    
    for k in range(annotation_count):
        entity1_list.append(json_data['annotations'][0]['text'][k]['entity1'])
        entity2_list.append(json_data['annotations'][0]['text'][k]['entity2'])
        relation_list1.append(json_data['annotations'][0]['text'][k]['relation'])
        english_list.append(json_data['annotations'][0]['text'][k]['english'])
        korean_list.append(json_data['annotations'][0]['text'][k]['korean'])
        'image','bounding box','label',"entity1","entity2","relation","english","korean"
    df.at[i,'image']=image_name
    df.at[i,'bounding box']=annotation_list
    df.at[i,'label']=label_list
    df.at[i,"entity1"]=entity1_list
    df.at[i,"entity2"]=entity2_list
    df.at[i,"relation"]=relation_list1
    df.at[i,"english"]=english_list
    df.at[i,"korean"]=korean_list
df.to_csv('../../data/total/total.csv', mode='w',encoding='cp949', index=False)

100%|██████████| 21047/21047 [03:41<00:00, 95.07it/s] 


In [106]:
df_shuffled = df.sample(frac=1, random_state=42)  
train_ratio = 0.8
valid_ratio = 0.1
train, valid, test = np.split(df_shuffled, [int(train_ratio*len(df_shuffled)), int((train_ratio+valid_ratio)*len(df_shuffled))])
print(f"학습 세트 크기: {len(train)}")
print(f"검증 세트 크기: {len(valid)}")
print(f"테스트 세트 크기: {len(test)}")
train.to_csv('../../data/dataset/train/label.csv', mode='w',encoding='cp949', index=False)
test.to_csv('../../data/dataset/test/label.csv', mode='w',encoding='cp949', index=False)
valid.to_csv('../../data/dataset/validation/label.csv', mode='w',encoding='cp949', index=False)


학습 세트 크기: 16837
검증 세트 크기: 2105
테스트 세트 크기: 2105


In [111]:
total_image_path='../../data/total/img/'
image_list=list(train['image'])
for i in tqdm(range(len(image_list))):
    img=Image.open(total_image_path+image_list[i]).resize((256,256))
    img.save('../../data/dataset/train/image/'+image_list[i])

image_list=list(test['image'])
for i in tqdm(range(len(image_list))):
    img=Image.open(total_image_path+image_list[i]).resize((256,256))
    img.save('../../data/dataset/test/image/'+image_list[i])

image_list=list(valid['image'])
for i in tqdm(range(len(image_list))):
    img=Image.open(total_image_path+image_list[i]).resize((256,256))
    img.save('../../data/dataset/validation/image/'+image_list[i])

100%|██████████| 16837/16837 [28:20<00:00,  9.90it/s] 
100%|██████████| 2105/2105 [03:31<00:00,  9.95it/s]
100%|██████████| 2105/2105 [03:10<00:00, 11.05it/s]


In [128]:
df_count=len(train)
bbox_count=0
caption_count=0
train=train.reset_index()
bbox_df=pd.DataFrame(columns=['image','x1','y1','x2','y2','label'])
caption_df=pd.DataFrame(columns=['image','english','korean'])
for i in tqdm(range(df_count)):
    anno_count=len(train.loc[i]['bounding box'])
    for j in range(anno_count):
        image_name=train.loc[i]['image']
        bbox_df.at[bbox_count,'image']=image_name
        bbox_df.at[bbox_count,'x1']=train.loc[i]['bounding box'][j][0]
        bbox_df.at[bbox_count,'y1']=train.loc[i]['bounding box'][j][1]
        bbox_df.at[bbox_count,'x2']=train.loc[i]['bounding box'][j][2]+train.loc[i]['bounding box'][j][0]
        bbox_df.at[bbox_count,'y2']=train.loc[i]['bounding box'][j][0]+train.loc[i]['bounding box'][j][1]
        bbox_df.at[bbox_count,'label']=train.loc[i]['bounding box'][j][0]+train.loc[i]['label'][j]
        bbox_count+=1

    cap_count=len(train.loc[i]['english'])
    for j in range(cap_count):
        image_name=train.loc[i]['image']
        caption_df.at[caption_count,'image']=image_name
        caption_df.at[caption_count,'english']=train.loc[i]['english'][j]
        caption_df.at[caption_count,'korean']=train.loc[i]['korean'][j]
        caption_count+=1

bbox_df.to_csv('../../data/dataset/train/bbox_label.csv', mode='w',encoding='cp949', index=False)
caption_df.to_csv('../../data/dataset/train/caption_label.csv', mode='w',encoding='cp949', index=False)

100%|██████████| 16837/16837 [07:37<00:00, 36.82it/s]


In [129]:
df_count=len(test)
bbox_count=0
caption_count=0
test=test.reset_index()
bbox_df=pd.DataFrame(columns=['image','x1','y1','x2','y2','label'])
caption_df=pd.DataFrame(columns=['image','english','korean'])
for i in tqdm(range(df_count)):
    anno_count=len(test.loc[i]['bounding box'])
    for j in range(anno_count):
        image_name=test.loc[i]['image']
        bbox_df.at[bbox_count,'image']=image_name
        bbox_df.at[bbox_count,'x1']=test.loc[i]['bounding box'][j][0]
        bbox_df.at[bbox_count,'y1']=test.loc[i]['bounding box'][j][1]
        bbox_df.at[bbox_count,'x2']=test.loc[i]['bounding box'][j][2]+test.loc[i]['bounding box'][j][0]
        bbox_df.at[bbox_count,'y2']=test.loc[i]['bounding box'][j][0]+test.loc[i]['bounding box'][j][1]
        bbox_df.at[bbox_count,'label']=test.loc[i]['bounding box'][j][0]+test.loc[i]['label'][j]
        bbox_count+=1

    cap_count=len(test.loc[i]['english'])
    for j in range(cap_count):
        image_name=test.loc[i]['image']
        caption_df.at[caption_count,'image']=image_name
        caption_df.at[caption_count,'english']=test.loc[i]['english'][j]
        caption_df.at[caption_count,'korean']=test.loc[i]['korean'][j]
        caption_count+=1

bbox_df.to_csv('../../data/dataset/test/bbox_label.csv', mode='w',encoding='cp949', index=False)
caption_df.to_csv('../../data/dataset/test/caption_label.csv', mode='w',encoding='cp949', index=False)

  0%|          | 0/2105 [00:00<?, ?it/s]

100%|██████████| 2105/2105 [00:16<00:00, 125.83it/s]


In [130]:
df_count=len(valid)
bbox_count=0
caption_count=0
valid=valid.reset_index()
bbox_df=pd.DataFrame(columns=['image','x1','y1','x2','y2','label'])
caption_df=pd.DataFrame(columns=['image','english','korean'])
for i in tqdm(range(df_count)):
    anno_count=len(valid.loc[i]['bounding box'])
    for j in range(anno_count):
        image_name=valid.loc[i]['image']
        bbox_df.at[bbox_count,'image']=image_name
        bbox_df.at[bbox_count,'x1']=valid.loc[i]['bounding box'][j][0]
        bbox_df.at[bbox_count,'y1']=valid.loc[i]['bounding box'][j][1]
        bbox_df.at[bbox_count,'x2']=valid.loc[i]['bounding box'][j][2]+valid.loc[i]['bounding box'][j][0]
        bbox_df.at[bbox_count,'y2']=valid.loc[i]['bounding box'][j][0]+valid.loc[i]['bounding box'][j][1]
        bbox_df.at[bbox_count,'label']=valid.loc[i]['bounding box'][j][0]+valid.loc[i]['label'][j]
        bbox_count+=1

    cap_count=len(valid.loc[i]['english'])
    for j in range(cap_count):
        image_name=valid.loc[i]['image']
        caption_df.at[caption_count,'image']=image_name
        caption_df.at[caption_count,'english']=valid.loc[i]['english'][j]
        caption_df.at[caption_count,'korean']=valid.loc[i]['korean'][j]
        caption_count+=1

bbox_df.to_csv('../../data/dataset/validation/bbox_label.csv', mode='w',encoding='cp949', index=False)
caption_df.to_csv('../../data/dataset/validation/caption_label.csv', mode='w',encoding='cp949', index=False)

100%|██████████| 2105/2105 [00:16<00:00, 124.39it/s]
