In [4]:
!pip install datasets
!pip install tqdm



In [None]:
# Import Dataset
from datasets import load_dataset

data = load_dataset(
    "isek-ai/danbooru-tags-2024",
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 8603394/8603394 [00:10<00:00, 785024.41 examples/s] 


In [8]:
# Remove original characters, and incomplete data
data = data.filter(lambda x: x["copyright"] is not None and x["character"] is not None and x["general"] is not None and "original" not in x["copyright"])

Filter: 100%|██████████| 8603394/8603394 [02:17<00:00, 62373.35 examples/s]


In [11]:
# Create raw_characters.json (takes about 20 minutes)
from tqdm import tqdm
import json

characters = {}

for temp in tqdm(data):
    data_characters = temp["character"].split(", ")
    data_copyright = temp["copyright"].split(", ")
    data_general = temp["general"].split(", ")

    # process character data
    for character in data_characters:
        if character not in characters:
            characters[character] = {
                "count": 1,
                "image": "",
                "copyright": {},
                "tags": {}
            }
        else:
            characters[character]["count"] += 1
        
        # process copyright data
        for copyright in data_copyright:
            if copyright not in characters[character]["copyright"]:
                characters[character]["copyright"][copyright] = 1
            else:
                characters[character]["copyright"][copyright] += 1
        
        # process tags data
        if "solo" in data_general:
            for general in data_general:
                if general not in characters[character]["tags"]:
                    characters[character]["tags"][general] = 1
                else:
                    characters[character]["tags"][general] += 1

# Save the characters dictionary to a JSON file
with open("raw_characters.json", "w") as f:
    json.dump(characters, f, indent=4)

  0%|          | 0/7235553 [00:00<?, ?it/s]

100%|██████████| 7235553/7235553 [20:51<00:00, 5781.65it/s]


In [18]:
# Create characters.json (takes about 30 seconds)

from tqdm import tqdm
import json

# Filter out copyrights with image count less than {copyright_threshold}% of the total images
copyright_threshold = 60

# Filter out tags with image count less than {tag_threshold}% of the total images
tag_threshold = 10

# Filter out characters less than {character_threshold} images
character_threshold = 15

with open("raw_characters.json", "r") as f:
    characters = json.load(f) 

for character in tqdm(characters):
    count = characters[character]["count"]
    copyright_count = copyright_threshold * count / 100
    tag_count = tag_threshold * count / 100

    # Convert copyrights and tags to list
    characters[character]["copyright"] = {k: v for k, v in characters[character]["copyright"].items() if v >= copyright_count}
    characters[character]["tags"] = {k: v for k, v in characters[character]["tags"].items() if v >= tag_count}
    
    # Sort the copyrights and tags by count in ascending order in list format
    characters[character]["copyright"] = sorted(characters[character]["copyright"].items(), key=lambda x: x[1], reverse=True)
    characters[character]["tags"] = sorted(characters[character]["tags"].items(), key=lambda x: x[1], reverse=True)

# Convert the dictionary to a list of dictionaries
characters_list = []
for character, data in characters.items():
    if data["count"] < character_threshold:
        continue

    characters_list.append({
        "name": character,
        "count": data["count"],
        "copyright": data["copyright"],
        "tags": data["tags"]
    })

# Save the characters list to a JSON file
with open("characters.json", "w") as f:
    json.dump(characters_list, f, indent=4)

100%|██████████| 235726/235726 [00:03<00:00, 59108.04it/s]


In [19]:
# How many characters in characters.json?
len(characters_list)

52597

In [26]:
# Create copyrights.json

from tqdm import tqdm
import json

with open("characters.json", "r") as f:
    characters = json.load(f)

copyrights = {}

for index in tqdm(range(len(characters))):
    for copyright in characters[index]["copyright"]:
        if copyright[0] not in copyrights:
            copyrights[copyright[0]] = {
                "count": 0,
                "popularity": 0,
                "characters": []
            }
        
        copyrights[copyright[0]]["count"] += 1
        copyrights[copyright[0]]["characters"].append(index)
        copyrights[copyright[0]]["popularity"] = max(copyrights[copyright[0]]["popularity"], characters[index]["count"])
    
# Sort the copyrights by popularity in ascending order in list format
copyrights = sorted(copyrights.items(), key=lambda x: x[1]["popularity"], reverse=True)

with open("copyrights.json", "w") as f:
    json.dump(copyrights, f, indent=4)

100%|██████████| 52597/52597 [00:00<00:00, 706483.04it/s]
