# Merge annotation documents

## Libraries

In [2]:
import os
import pandas as pd
from tqdm import tqdm

## Load all files

In [25]:
## Path of annotated data ##
path = "../Data/Annotated_Images/"

## List folder with annotated data ##
folders_list = os.listdir(path)
print("# of folders:",len(folders_list))


## Counter of read csv files ##
c_csv = 0

## DF to save all annotations ##
annotated_df = pd.DataFrame([])

## Loop ##
for folder in tqdm(folders_list):
    
    ## create name of annotated information ##
    annotation_name = f"annotations_{folder}.csv"
    
    ## path of each folder ##
    annotation_file = os.path.join(path, folder, annotation_name)   
    
    ## Load annotations ##
    annotation_df = pd.read_csv(annotation_file)
    ## concatenate annotations ##
    annotated_df = pd.concat([annotated_df, annotation_df], axis=0)
    ## Increase loaded files ##
    c_csv += 1
            

annotated_df.reset_index(drop=True, inplace=True)

print("# of read files:", c_csv, annotated_df.shape)
annotated_df.head()


# of folders: 500


100%|██████████| 500/500 [00:00<00:00, 2018.18it/s]

# of read files: 500 (2137, 7)





Unnamed: 0,image_name,docid,label,x,y,w,h
0,gpq38e00-page02_2,gpq38e00-page02_2_obj0.png,l,852,101,973,406
1,gpq38e00-page02_2,gpq38e00-page02_2_obj1.png,f,805,2170,442,320
2,dlk65e00,dlk65e00_obj0.png,l,880,135,816,384
3,dlk65e00,dlk65e00_obj1.png,t,330,1345,1705,176
4,dlk65e00,dlk65e00_obj2.png,t,470,1239,148,81


## Check the assigned labels

In [26]:
annotated_df["label"].value_counts()

label
t            1234
f             529
l             232
b             114
g              21
e               3
logo            1
signature       1
tt              1
ff              1
Name: count, dtype: int64

## Re-define the labels

In [27]:

object_database_file = "../Data/Objects_Database/Objects_Information.csv"



map_dict = {
    "other": ["t", "o", "tt"],
    "signature": ["f", "ff", "s", "signature"],
    "logo": ["l", "logo"],
    "table": ["b"],
    "graphic": ["g"],
    "empty": ["e"],
}


label_list = []
for i in range(len(annotated_df)):
    label = annotated_df.loc[i, "label"]
    
    for key in map_dict.keys():

        if(label in map_dict[key]):
            label_list.append(key)

print(len(annotated_df))
print(len(label_list))


annotated_df_2 = annotated_df.copy()

annotated_df_2["label"] = label_list

annotated_df_2.to_csv(object_database_file, index=False)
annotated_df_2["label"].value_counts()

2137
2137


label
other        1235
signature     531
logo          233
table         114
graphic        21
empty           3
Name: count, dtype: int64

# Split images to database folder

## Libraries

In [8]:
import os
import shutil
import pandas as pd

## Functions

In [4]:
def create_dir(path):
    dirs = path.split("/")
    # print(dirs)
    n_path = ""
    for i in range(len(dirs)):
        
        n_path = os.path.join(n_path, dirs[i])
        # print(n_path)
        a = os.path.isdir(n_path)   
        # print(a) 
        if(not(a)):
            os.mkdir(n_path) 

## Copy images

In [14]:
object_database_file = "../Data/Objects_Database/Objects_Information.csv"
objects_info_df = pd.read_csv(object_database_file)


src_path = "../Data/Annotated_Images/"
db_path = "../Data/Objects_Database"

object_interest = ["logo", "other", "signature"]

for object in object_interest:

    dst_path = os.path.join(db_path, object)

    create_dir(dst_path)

    objects_df = objects_info_df.loc[objects_info_df["label"]==object, :]
    samples_df = objects_df.sample(n=233, random_state=21, ignore_index=True)
    

    for i in range(samples_df.shape[0]):
        
        src_file = os.path.join(src_path, samples_df.loc[i, "image_name"], samples_df.loc[i, "docid"])
        dst_file = os.path.join(dst_path, samples_df.loc[i, "docid"]) 

        shutil.copy2(src_file, dst_file)



## Redefine file of selected objects

## Libraries

In [1]:
import os
import pandas as pd

## Redefine file

In [3]:
object_database_file = "../Data/Objects_Database/Objects_Information.csv"
selected_object_db_file = "../Data/Objects_Database/Selected_Objects_Information.csv"

objects_info_df = pd.read_csv(object_database_file)
objects_info_df.index = list(objects_info_df["docid"])

db_path = "../Data/Objects_Database"

object_interest = ["logo", "other", "signature"]

object_list = []

for object in object_interest:
    dst_path = os.path.join(db_path, object)

    obj_list = os.listdir(dst_path)
    object_list.extend(obj_list)



selected_objects_df = objects_info_df.loc[object_list, :]
print(selected_objects_df.shape)


selected_objects_df.to_csv(selected_object_db_file, index=False)

(639, 7)
