In [1]:
from pathlib import Path
import pickle
from collections import Counter

In [2]:
# Root of the coco128 label tree. All three paths below derive from it, so
# pointing the notebook at another copy of the dataset is a one-line change.
labels_dir = Path("D:/ml/code/datasets/coco128/labels")
# Input: directory holding the label txt files.
txt_dir = labels_dir / "train2017"
# Outputs: pickle files written directly under the label root.
unique_ids_path = labels_dir / "unique_ids.pkl"
ids_num_path = labels_dir / "ids_num.pkl"

In [3]:
# Fail fast when the label directory is missing. is_dir() (rather than
# exists()) also rejects a path that points at a regular file, which would
# otherwise make the later glob silently return nothing.
assert txt_dir.is_dir(), f"{txt_dir} does not exist or is not a directory"

# 读取txt中的数据

In [4]:
def get_txts_data(path: Path) -> dict[str, list[str]]:
    """Read every ``*.txt`` file in a directory and return its non-blank lines.

    Args:
        path (Path): Directory containing the txt files.

    Returns:
        dict[str, list[str]]: Mapping from file stem to that file's lines.
            Each line is stripped of surrounding whitespace; empty and
            whitespace-only lines are dropped. Keys are inserted in sorted
            file-path order.
            {
                img1: [line1],
                img2: [line1, line2],
                ....
            }
    """
    data = {}
    # glob() yields files in filesystem-dependent order; sort for reproducibility.
    for txt in sorted(path.glob("*.txt")):
        # Explicit encoding: open()'s default depends on the platform locale.
        with open(txt, encoding="utf-8") as f:
            stripped = (line.strip() for line in f.read().splitlines())
            # Keep only lines that still have content after stripping, so
            # whitespace-only lines do not reach downstream int() parsing.
            data[txt.stem] = [line for line in stripped if line]
    return data

In [5]:
# Load all label files, then print the file count and the first entry as a sample.
data = get_txts_data(txt_dir)
print(len(data))
k = list(data)[0]
print(k, ":", data[k])

128
000000000009 : ['45 0.479492 0.688771 0.955609 0.5955', '45 0.736516 0.247188 0.498875 0.476417', '50 0.637063 0.732938 0.494125 0.510583', '45 0.339438 0.418896 0.678875 0.7815', '49 0.646836 0.132552 0.118047 0.096937', '49 0.773148 0.129802 0.090734 0.097229', '49 0.668297 0.226906 0.131281 0.146896', '49 0.642859 0.079219 0.148063 0.148062']


# get unique id list

In [6]:
def get_txt_unique_ids(data: dict[str, list[str]]) -> dict[str, list[int]]:
    """Collect the distinct class ids found in each image's lines.

    The class id is the first whitespace-separated field of a line.

    Args:
        data (dict[str, list[str]]): Return value of get_txts_data.

    Returns:
        dict[str, list[int]]: Mapping from file stem to its de-duplicated
            class ids in ascending order:
            {
                img1: [0],
                img2: [0, 1],
                ...
            }

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    unique_data = {}
    # key: filename
    # value: [line1, line2...]
    for stem, lines in data.items():
        # str.split() with no separator splits on any run of whitespace and
        # ignores leading/trailing whitespace, so tab- or multi-space-separated
        # lines parse too. A blank line yields no fields and is skipped.
        ids = {int(fields[0]) for fields in map(str.split, lines) if fields}
        unique_data[stem] = sorted(ids)
    return unique_data

In [7]:
# Build the unique-id mapping, then print its size and the first entry as a sample.
txt_unique_ids = get_txt_unique_ids(data)
print(len(txt_unique_ids))
k = list(txt_unique_ids)[0]
print(k, ":", txt_unique_ids[k])

128
000000000009 : [45, 49, 50]


In [8]:
# Persist the unique-id mapping as a pickle file.
with unique_ids_path.open("wb") as f:
    pickle.dump(txt_unique_ids, f)

In [9]:
# Read the pickle back and print it as a round-trip check.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source such as this notebook.
with unique_ids_path.open("rb") as f:
    print(pickle.load(f))

{'000000000009': [45, 49, 50], '000000000025': [23], '000000000030': [58, 75], '000000000034': [22], '000000000036': [0, 25], '000000000042': [16], '000000000049': [0, 17, 58], '000000000061': [0, 20], '000000000064': [2, 7, 11, 74], '000000000071': [2, 6, 7], '000000000072': [23], '000000000073': [3], '000000000074': [0, 1, 16], '000000000077': [0, 36], '000000000078': [74], '000000000081': [4], '000000000086': [0, 3, 26], '000000000089': [43, 68, 69, 73], '000000000092': [42, 55], '000000000094': [2, 7], '000000000109': [0, 13, 16], '000000000110': [0, 41, 42, 43, 53, 56, 60], '000000000113': [0, 41, 43, 55, 56, 60], '000000000127': [0, 13, 25, 26, 41, 43, 44, 55, 58, 60, 73], '000000000133': [59, 77], '000000000136': [0, 23], '000000000138': [45, 58, 69, 71, 72, 74, 75], '000000000142': [39, 46, 48, 60], '000000000143': [14], '000000000144': [23], '000000000149': [0, 2, 33], '000000000151': [0, 6, 11], '000000000154': [22], '000000000164': [39, 40, 41, 45, 56, 60, 68, 69, 72], '0000

# get id counter

In [10]:
def get_txt_ids_num(data: dict[str, list[str]]) -> dict[str, dict[int, int]]:
    """Count how many times each class id occurs in each image's lines.

    The class id is the first whitespace-separated field of a line.

    Args:
        data (dict[str, list[str]]): Return value of get_txts_data.

    Returns:
        dict[str, dict[int, int]]: Mapping from file stem to a plain dict of
            ``{class_id: occurrence_count}`` with keys in ascending order:
            {
                img1: {1: 3, 2: 1},
                img2: {1: 2, 3: 2, 4: 1},
                ...
            }

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    data_num = {}
    # key: filename
    # value: [line1, line2...]
    for stem, lines in data.items():
        # str.split() with no separator splits on any run of whitespace and
        # ignores leading/trailing whitespace, so tab- or multi-space-separated
        # lines parse too. A blank line yields no fields and is skipped.
        counts = Counter(int(fields[0]) for fields in map(str.split, lines) if fields)
        # Convert to a plain dict whose keys are inserted in ascending id order.
        data_num[stem] = dict(sorted(counts.items()))
    return data_num

In [11]:
# Build the per-image id counts, then print the size and the first entry as a sample.
txt_ids_num = get_txt_ids_num(data)
print(len(txt_ids_num))
k = list(txt_ids_num)[0]
print(k, ":", txt_ids_num[k])

128
000000000009 : {45: 3, 49: 4, 50: 1}


In [12]:
# Persist the per-image id counts as a pickle file.
with ids_num_path.open("wb") as f:
    pickle.dump(txt_ids_num, f)

In [13]:
# Read the pickle back and print it as a round-trip check.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source such as this notebook.
with ids_num_path.open("rb") as f:
    print(pickle.load(f))

{'000000000009': {45: 3, 49: 4, 50: 1}, '000000000025': {23: 2}, '000000000030': {58: 1, 75: 1}, '000000000034': {22: 1}, '000000000036': {0: 1, 25: 1}, '000000000042': {16: 1}, '000000000049': {0: 6, 17: 2, 58: 1}, '000000000061': {0: 3, 20: 2}, '000000000064': {2: 1, 7: 1, 11: 1, 74: 1}, '000000000071': {2: 13, 6: 1, 7: 2}, '000000000072': {23: 2}, '000000000073': {3: 2}, '000000000074': {0: 6, 1: 1, 16: 1}, '000000000077': {0: 5, 36: 3}, '000000000078': {74: 1}, '000000000081': {4: 1}, '000000000086': {0: 1, 3: 1, 26: 1}, '000000000089': {43: 5, 68: 1, 69: 1, 73: 3}, '000000000092': {42: 1, 55: 1}, '000000000094': {2: 1, 7: 1}, '000000000109': {0: 5, 13: 2, 16: 1}, '000000000110': {0: 10, 41: 4, 42: 1, 43: 1, 53: 1, 56: 4, 60: 3}, '000000000113': {0: 3, 41: 10, 43: 1, 55: 1, 56: 2, 60: 1}, '000000000127': {0: 1, 13: 4, 25: 3, 26: 1, 41: 1, 43: 1, 44: 1, 55: 1, 58: 1, 60: 2, 73: 1}, '000000000133': {59: 1, 77: 1}, '000000000136': {0: 2, 23: 2}, '000000000138': {45: 1, 58: 1, 69: 1, 7