In [43]:
from pathlib import Path
import pickle
from xml.etree import ElementTree as ET
from collections import Counter

In [44]:
# Root of the VOC2007 split; every path used by this notebook hangs off it,
# so pointing the notebook at another copy of the dataset is a one-line change.
voc2007_root = Path("D:/ml/code/datasets/VOC07+12+test/VOCdevkit/VOC2007")

# Folder holding one annotation XML per image (input).
xml_dir = voc2007_root / "Annotations"
# Pickle with the sorted unique class ids of each image (output).
unique_ids_path = voc2007_root / "unique_ids.pkl"
# Pickle with the per-class object counts of each image (output).
ids_num_path = voc2007_root / "ids_num.pkl"

In [45]:
# Fail early if the annotation folder is missing. `is_dir()` (rather than
# `exists()`) also rejects a regular file at that path, which would otherwise
# make the later `glob("*.xml")` silently return nothing.
assert xml_dir.is_dir(), f"{xml_dir} is not an existing directory"

In [46]:
# Class name -> integer id. The id is simply the position of the name in the
# ordered tuple below (0 for "aeroplane" ... 19 for "tvmonitor").
cls2id = {
    class_name: class_id
    for class_id, class_name in enumerate(
        (
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        )
    )
}

# Read the data from the XML files

In [47]:
def get_xml_data(
    path: Path,
    # Quoted so the `|` union is not evaluated on Python versions older than 3.10.
    class_map: "dict[str, int] | None" = None,
) -> dict[str, list[int]]:
    """Collect the class ids of every annotated object, one list per XML file.

    Args:
        path (Path): directory containing the ``*.xml`` annotation files.
        class_map (dict[str, int] | None): class name -> class id table.
            Defaults to the notebook-level ``cls2id`` mapping.

    Returns:
        dict[str, list[int]]: image stem -> class ids in document order,
        one entry per ``<object>`` element (duplicates are kept):
            {
                img1: [id1],
                img2: [id1, id2],
                ....
            }
        Files are visited in sorted path order, so the key order is
        deterministic. A file without ``<object>`` elements maps to ``[]``.

    Raises:
        KeyError: if an object's class name is not present in the mapping.
    """
    mapping = cls2id if class_map is None else class_map
    data: dict[str, list[int]] = {}
    # glob() order depends on the filesystem; sort it for reproducible output.
    for xml in sorted(path.glob("*.xml")):
        # Binary mode lets the XML parser honour the encoding declared in the
        # document itself instead of forcing UTF-8 text decoding.
        with open(xml, "rb") as f:
            root = ET.parse(f).getroot()

        # Key by the image name recorded in the annotation; fall back to the
        # XML file's own stem when <filename> is missing or empty.
        filename = (root.findtext("filename") or "").strip()
        stem = Path(filename).stem if filename else xml.stem

        ids = []
        for obj in root.findall("object"):
            # Tolerate whitespace around the class name.
            name = (obj.findtext("name") or "").strip()
            if name not in mapping:
                raise KeyError(f"unknown class {name!r} in {xml}")
            ids.append(mapping[name])
        data[stem] = ids
    return data

In [48]:
# Parse every annotation file under xml_dir into {image stem: [class id, ...]}.
data = get_xml_data(xml_dir)
# Bare last expression so the notebook displays the parsed mapping.
data

{'000001': [11, 14],
 '000002': [18],
 '000003': [17, 8],
 '000004': [6, 6, 6, 6, 6, 6, 6],
 '000005': [8, 8, 8, 8, 8],
 '000006': [15, 10, 8, 8, 8, 8, 8, 8],
 '000007': [6],
 '000008': [8],
 '000009': [12, 14, 14, 14],
 '000010': [12, 14],
 '000011': [7],
 '000012': [6],
 '000013': [9],
 '000014': [5, 6, 6, 14, 14, 14, 6],
 '000015': [1],
 '000016': [1],
 '000017': [14, 12],
 '000018': [11],
 '000019': [7, 7],
 '000020': [6],
 '000021': [11, 14, 14, 14],
 '000022': [12, 14],
 '000023': [1, 1, 1, 14, 14, 14],
 '000024': [18],
 '000025': [9, 9, 9, 14, 14, 14, 9, 14, 14, 14, 14],
 '000026': [6],
 '000027': [14],
 '000028': [7],
 '000029': [11],
 '000030': [1, 14, 14],
 '000031': [18],
 '000032': [0, 0, 14, 14],
 '000033': [0, 0, 0],
 '000034': [18, 18],
 '000035': [14, 14, 14, 10],
 '000036': [11],
 '000037': [11],
 '000038': [1, 13, 14],
 '000039': [19],
 '000040': [2],
 '000041': [19, 14, 14],
 '000042': [18, 18],
 '000043': [14, 14, 14],
 '000044': [8, 7],
 '000045': [19],
 '000046': 

# Get the sorted unique class ids for each image

In [49]:
# For each image keep every class id once, in ascending order.
new_data = {
    stem: sorted(set(class_ids))
    for stem, class_ids in data.items()
}
new_data

{'000001': [11, 14],
 '000002': [18],
 '000003': [8, 17],
 '000004': [6],
 '000005': [8],
 '000006': [8, 10, 15],
 '000007': [6],
 '000008': [8],
 '000009': [12, 14],
 '000010': [12, 14],
 '000011': [7],
 '000012': [6],
 '000013': [9],
 '000014': [5, 6, 14],
 '000015': [1],
 '000016': [1],
 '000017': [12, 14],
 '000018': [11],
 '000019': [7],
 '000020': [6],
 '000021': [11, 14],
 '000022': [12, 14],
 '000023': [1, 14],
 '000024': [18],
 '000025': [9, 14],
 '000026': [6],
 '000027': [14],
 '000028': [7],
 '000029': [11],
 '000030': [1, 14],
 '000031': [18],
 '000032': [0, 14],
 '000033': [0],
 '000034': [18],
 '000035': [10, 14],
 '000036': [11],
 '000037': [11],
 '000038': [1, 13, 14],
 '000039': [19],
 '000040': [2],
 '000041': [14, 19],
 '000042': [18],
 '000043': [14],
 '000044': [7, 8],
 '000045': [19],
 '000046': [2],
 '000047': [6, 8],
 '000048': [2, 14],
 '000049': [7],
 '000050': [4, 8, 14],
 '000051': [13, 14],
 '000052': [15],
 '000053': [7],
 '000054': [5],
 '000055': [13, 1

In [50]:
# Persist the current `new_data` mapping to unique_ids_path.
# NOTE: `new_data` is rebound by a later cell, so run the cells in order.
with unique_ids_path.open("wb") as fh:
    pickle.dump(new_data, fh)

In [51]:
# Read the pickle back and print it as a quick round-trip check.
# The file was written by this notebook; do not unpickle untrusted files.
print(pickle.loads(unique_ids_path.read_bytes()))

{'000001': [11, 14], '000002': [18], '000003': [8, 17], '000004': [6], '000005': [8], '000006': [8, 10, 15], '000007': [6], '000008': [8], '000009': [12, 14], '000010': [12, 14], '000011': [7], '000012': [6], '000013': [9], '000014': [5, 6, 14], '000015': [1], '000016': [1], '000017': [12, 14], '000018': [11], '000019': [7], '000020': [6], '000021': [11, 14], '000022': [12, 14], '000023': [1, 14], '000024': [18], '000025': [9, 14], '000026': [6], '000027': [14], '000028': [7], '000029': [11], '000030': [1, 14], '000031': [18], '000032': [0, 14], '000033': [0], '000034': [18], '000035': [10, 14], '000036': [11], '000037': [11], '000038': [1, 13, 14], '000039': [19], '000040': [2], '000041': [14, 19], '000042': [18], '000043': [14], '000044': [7, 8], '000045': [19], '000046': [2], '000047': [6, 8], '000048': [2, 14], '000049': [7], '000050': [4, 8, 14], '000051': [13, 14], '000052': [15], '000053': [7], '000054': [5], '000055': [13, 14], '000056': [12], '000057': [2], '000058': [13, 14],

# Count the objects of each class id for each image

In [52]:
# For each image map class id -> number of objects of that class.
# Counter keeps first-seen order, and dict() turns it into a plain dict.
new_data = {
    stem: dict(Counter(class_ids))
    for stem, class_ids in data.items()
}
new_data

{'000001': {11: 1, 14: 1},
 '000002': {18: 1},
 '000003': {17: 1, 8: 1},
 '000004': {6: 7},
 '000005': {8: 5},
 '000006': {15: 1, 10: 1, 8: 6},
 '000007': {6: 1},
 '000008': {8: 1},
 '000009': {12: 1, 14: 3},
 '000010': {12: 1, 14: 1},
 '000011': {7: 1},
 '000012': {6: 1},
 '000013': {9: 1},
 '000014': {5: 1, 6: 3, 14: 3},
 '000015': {1: 1},
 '000016': {1: 1},
 '000017': {14: 1, 12: 1},
 '000018': {11: 1},
 '000019': {7: 2},
 '000020': {6: 1},
 '000021': {11: 1, 14: 3},
 '000022': {12: 1, 14: 1},
 '000023': {1: 3, 14: 3},
 '000024': {18: 1},
 '000025': {9: 4, 14: 7},
 '000026': {6: 1},
 '000027': {14: 1},
 '000028': {7: 1},
 '000029': {11: 1},
 '000030': {1: 1, 14: 2},
 '000031': {18: 1},
 '000032': {0: 2, 14: 2},
 '000033': {0: 3},
 '000034': {18: 2},
 '000035': {14: 3, 10: 1},
 '000036': {11: 1},
 '000037': {11: 1},
 '000038': {1: 1, 13: 1, 14: 1},
 '000039': {19: 1},
 '000040': {2: 1},
 '000041': {19: 1, 14: 2},
 '000042': {18: 2},
 '000043': {14: 3},
 '000044': {8: 1, 7: 1},
 '0000

In [53]:
# Persist the current `new_data` mapping to ids_num_path.
# NOTE: `new_data` is also assigned by an earlier cell, so run the cells in order.
with ids_num_path.open("wb") as fh:
    pickle.dump(new_data, fh)

In [54]:
# Read the pickle back and print it as a quick round-trip check.
# The file was written by this notebook; do not unpickle untrusted files.
print(pickle.loads(ids_num_path.read_bytes()))

{'000001': {11: 1, 14: 1}, '000002': {18: 1}, '000003': {17: 1, 8: 1}, '000004': {6: 7}, '000005': {8: 5}, '000006': {15: 1, 10: 1, 8: 6}, '000007': {6: 1}, '000008': {8: 1}, '000009': {12: 1, 14: 3}, '000010': {12: 1, 14: 1}, '000011': {7: 1}, '000012': {6: 1}, '000013': {9: 1}, '000014': {5: 1, 6: 3, 14: 3}, '000015': {1: 1}, '000016': {1: 1}, '000017': {14: 1, 12: 1}, '000018': {11: 1}, '000019': {7: 2}, '000020': {6: 1}, '000021': {11: 1, 14: 3}, '000022': {12: 1, 14: 1}, '000023': {1: 3, 14: 3}, '000024': {18: 1}, '000025': {9: 4, 14: 7}, '000026': {6: 1}, '000027': {14: 1}, '000028': {7: 1}, '000029': {11: 1}, '000030': {1: 1, 14: 2}, '000031': {18: 1}, '000032': {0: 2, 14: 2}, '000033': {0: 3}, '000034': {18: 2}, '000035': {14: 3, 10: 1}, '000036': {11: 1}, '000037': {11: 1}, '000038': {1: 1, 13: 1, 14: 1}, '000039': {19: 1}, '000040': {2: 1}, '000041': {19: 1, 14: 2}, '000042': {18: 2}, '000043': {14: 3}, '000044': {8: 1, 7: 1}, '000045': {19: 1}, '000046': {2: 1}, '000047': {6