In [None]:
import aiohttp
import asyncio
import nest_asyncio
import multiprocessing
import os
import random
from PIL import Image

import pandas as pd

In [None]:
# Danbooru API roots: the production site and the public test instance.
BASE_URL = "https://danbooru.donmai.us/"
TEST_URL = "https://testbooru.donmai.us/"

# SECURITY: credentials belong in the environment, not in the notebook.
# The literals are kept only as a backward-compatible fallback; because they
# have been stored in this file, the key should be rotated and the fallbacks
# removed once DANBOORU_USERNAME / DANBOORU_API_KEY are set.
USERNAME = os.environ.get('DANBOORU_USERNAME', 'Lighter_01')
API_KEY = os.environ.get('DANBOORU_API_KEY', '181TrypvXuZXX1WJZScybDaf')

# Root folder the downloaded images are written to (one sub-folder per character).
dataset_path = os.path.join('..', 'datasets', 'extended_dataset', 'train')

# Patch asyncio so asyncio.run() can be called from notebook cells.
nest_asyncio.apply()

In [None]:
# os.scandir yields entries in arbitrary order, but these names are later paired
# positionally with character_tags_list, so sort them for a stable order and
# ignore anything that is not a directory.
# NOTE(review): assumes character_tags_list is written in sorted folder-name
# order — confirm against the actual folder names.
characters_list = sorted(
    entry.name
    for entry in os.scandir(os.path.join('..', 'datasets', 'splitted', 'train'))
    if entry.is_dir()
)
characters_list

In [None]:
def create_dirs(characters_list):
    """Make sure every character has an output folder under ``dataset_path``.

    Args:
        characters_list: Iterable of folder names to create.

    Folders that already exist are left untouched.
    """
    targets = (os.path.join(dataset_path, name) for name in characters_list)
    for target in targets:
        os.makedirs(target, exist_ok=True)

# Create the per-character output folders before any download starts.
create_dirs(characters_list)

In [None]:
# Danbooru tag used to search for each character.
# The entries are paired with `characters_list` by position (see the zip that
# builds `characters_dict`), so the order here must match that list exactly.
character_tags_list = [
    'portgas_d._ace',
    'sakazuki_(akainu)',
    'brook_(one_piece)',
    'tony_tony_chopper',
    'crocodile_(one_piece)',
    'franky_(one_piece)',
    'jinbe_(one_piece)',
    'marshall_d._teach',
    'trafalgar_law',
    'monkey_d._luffy',
    'dracule_mihawk',
    'nami_(one_piece)',
    'silvers_rayleigh',
    'nico_robin',
    'sanji_(one_piece)',
    'shanks_(one_piece)',
    'usopp',
    'roronoa_zoro'
]

In [None]:
# Folder name -> Danbooru tag. The pairing is purely positional and zip() would
# silently drop unmatched entries, so fail loudly if the two lists diverge.
assert len(characters_list) == len(character_tags_list), (
    f"{len(characters_list)} character folders but "
    f"{len(character_tags_list)} tags; the lists must line up one-to-one"
)
characters_dict = dict(zip(characters_list, character_tags_list))
characters_dict

In [None]:
async def get_total_images(tags, session):
    """Return how many posts match ``tags`` according to ``counts/posts.json``.

    Args:
        tags: Search string sent as the ``tags`` query parameter.
        session: Object whose ``get(url, params=...)`` is an async context
            manager yielding a response with ``status`` and ``json()``
            (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        The integer found at ``counts.posts`` in the JSON payload, or 0 when
        the request does not return HTTP 200 or the count is missing or null.
    """
    url = BASE_URL + "counts/posts.json"
    params = {
        "tags": tags,
        "login": USERNAME,
        "api_key": API_KEY,
    }
    async with session.get(url, params=params) as response:
        if response.status != 200:
            print(f"Failed to fetch total count for {tags}: {response.status}")
            return 0
        data = await response.json()

    # dict.get's default only applies to a *missing* key: an explicit JSON null
    # would come back as None and break the page arithmetic done on the result.
    return (data.get("counts") or {}).get("posts") or 0

In [None]:
async def get_number_of_images_per_characters(characters_dict):
    """Look up the post count of every character, one request after another.

    Args:
        characters_dict: Mapping of character name -> search tag.

    Returns:
        Dict of character name -> count reported by ``get_total_images``.
    """
    totals = {}

    async with aiohttp.ClientSession() as session:
        for name, character_tag in characters_dict.items():
            print(f"Fetching total images for: {name}")
            # Same filter string the metadata fetch uses.
            # Presumably excludes explicit-rated posts and keeps single-character
            # posts — confirm against Danbooru's search syntax.
            query = f'{character_tag} -rating:e chartags:1'
            count = await get_total_images(query, session)
            totals[name] = count
            print(f"{name}: {count} images")

    return totals

In [None]:
# Character name -> number of matching posts; used below to derive page counts.
counter = asyncio.run(get_number_of_images_per_characters(characters_dict))

In [None]:
# One-row summary table with a column per character holding its post count.
# Building the frame from a single record avoids the transpose/promote-header
# sequence, which left object-dtype columns and a stray columns name behind.
characters_counter = pd.DataFrame([counter])
characters_counter

---

In [None]:
async def fetch_metadata(character, page, session, delay=0.5):
    """Fetch one page of post metadata for a character tag from ``posts.json``.

    Args:
        character: Tag to search for (combined with ``-rating:e chartags:1``).
        page: Page number sent as the ``page`` query parameter.
        session: Session object used to issue the GET request.
        delay: Seconds to sleep before the request is sent.

    Returns:
        The decoded JSON body on HTTP 200, otherwise an empty list.
    """
    query = {
        "page": page,
        "tags": f'{character} -rating:e chartags:1',
        "limit": 200,
        "login": USERNAME,
        "api_key": API_KEY
    }
    endpoint = f'{BASE_URL}posts.json'

    # Pause before every request.
    await asyncio.sleep(delay)

    async with session.get(endpoint, params=query) as response:
        if response.status == 200:
            return await response.json()
        print(f"Failed to fetch page {page} for {character}: {response.status}")
        return []

In [None]:
# Metadata is requested 200 posts per page, so round each count up to whole pages.
total_pages = {
    character: -(-post_count // 200)
    for character, post_count in counter.items()
}

async def fetch_all_metadata(character, pages):
    """Collect the PNG file URLs found on pages 1..``pages`` for a character.

    Args:
        character: Key into ``characters_dict``; the mapped tag is what gets searched.
        pages: Number of result pages to walk, starting at page 1.

    Returns:
        List of ``file_url`` values ending in ``.png``, in page order.
    """
    png_urls = []
    async with aiohttp.ClientSession() as session:
        page = 1
        while page <= pages:
            posts = await fetch_metadata(characters_dict[character], page, session)
            # Posts without a file_url, or with a non-PNG one, are skipped.
            png_urls.extend(
                post['file_url']
                for post in posts
                if 'file_url' in post and post['file_url'].endswith('.png')
            )
            page += 1
    return png_urls

In [None]:
async def get_urls(characters_list):
    """Gather the image URLs of all characters concurrently.

    Args:
        characters_list: Character names; each must be a key of ``total_pages``.

    Returns:
        Dict of character name -> list of URLs from ``fetch_all_metadata``.
    """
    pending = [
        fetch_all_metadata(name, total_pages[name])
        for name in characters_list
    ]
    # gather() keeps results in submission order, so they line up with the names.
    per_character = await asyncio.gather(*pending)
    return dict(zip(characters_list, per_character))

In [None]:
# Character name -> list of PNG URLs to download.
img_urls = asyncio.run(get_urls(characters_list))

In [None]:
# Per-character URL counts, followed by the grand total.
for character, url_list in img_urls.items():
    print(f'{character}: {len(url_list)}')

print(f'Total number of urls: {sum(map(len, img_urls.values()))}')

In [None]:
async def download_image(url, output_dir, session, delay=0.5):
    """Download ``url`` into ``output_dir``, named after the URL's last path segment.

    Args:
        url: Address of the image to fetch.
        output_dir: Existing directory the file is written into.
        session: Object whose ``get(url)`` is an async context manager yielding
            a response with ``status`` and ``read()``.
        delay: Seconds to sleep before the request is sent.

    Returns:
        None. Non-200 responses and any exception are reported with ``print``
        and otherwise ignored, so one bad URL does not abort a batch.
    """
    try:
        await asyncio.sleep(delay)
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Failed to download {url}: {response.status}")
                return
            # Read the whole body *before* opening the target file, so a transfer
            # that fails midway does not leave an empty or truncated image on disk.
            payload = await response.read()

        filename = os.path.join(output_dir, url.split("/")[-1])
        with open(filename, "wb") as f:
            f.write(payload)
    except Exception as e:
        print(f"Error downloading {url}: {e}")

In [None]:
async def download_all_images(character, url_list):
    """Download every URL in ``url_list`` into the character's dataset folder.

    Args:
        character: Folder name under ``dataset_path`` (created if missing).
        url_list: Image URLs to fetch through one shared client session.

    Returns:
        The list of ``download_image`` coroutine objects that were gathered;
        they have already been awaited by the time this returns.
    """
    output_dir = os.path.join(dataset_path, character)
    os.makedirs(output_dir, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        tasks = [
            download_image(img_url, output_dir, session)
            for img_url in url_list
        ]
        await asyncio.gather(*tasks)

    return tasks

In [None]:
async def download_for_all_characters(img_urls_dict):
    """Download the images of all characters concurrently.

    Note: a synchronous function with the same name is defined in a later
    cell; once that cell runs, this coroutine is no longer reachable by name.

    Args:
        img_urls_dict: Mapping of character name -> list of image URLs.
    """
    await asyncio.gather(*(
        download_all_images(name, links)
        for name, links in img_urls_dict.items()
    ))

In [None]:
def download_for_all_characters(img_urls_dict):
    """Download images one character at a time, each in its own ``asyncio.run``.

    Args:
        img_urls_dict: Mapping of character name -> list of image URLs.
    """
    for character in img_urls_dict:
        url_list = img_urls_dict[character]
        # Progress line: the character name and how many files are about to be fetched.
        print(character, len(url_list))
        asyncio.run(download_all_images(character, url_list))

In [None]:
# Disabled alternative: asyncio.run(download_for_all_characters(img_urls)).
# That form needs the coroutine variant of download_for_all_characters; the
# synchronous variant defined last is the one called here.
download_for_all_characters(img_urls)

In [None]:
def get_number_of_files_in_directories(directory_path):
    """Count the files in every non-empty directory below ``directory_path``.

    Args:
        directory_path: Root of the tree to walk (the root itself is included).

    Returns:
        Dict mapping a directory's own name (the last path component) to the
        number of files directly inside it. Directories without files are
        omitted; directories sharing a name have their counts added together.
    """
    counts = {}
    for dirpath, _dirnames, filenames in os.walk(directory_path):
        if not filenames:
            continue
        # Splitting on '\\' only works with Windows separators; basename handles
        # whatever separator the platform uses. normpath drops a trailing
        # separator so the root never maps to an empty name.
        character_name = os.path.basename(os.path.normpath(dirpath))
        counts[character_name] = counts.get(character_name, 0) + len(filenames)
    return counts

In [None]:
# Show how many files ended up in each character folder after the download.
get_number_of_files_in_directories(dataset_path)

In [None]:
def find_corrupted_files(dataset_root):
    """Walk ``dataset_root`` and list the files PIL cannot open and verify.

    Args:
        dataset_root: Directory tree to scan.

    Returns:
        Paths of files for which ``Image.open`` or ``verify()`` raised
        ``IOError`` or ``SyntaxError``, in walk order. Each one is also printed.
    """
    broken = []
    for folder, _subfolders, names in os.walk(dataset_root):
        for candidate in (os.path.join(folder, name) for name in names):
            try:
                with Image.open(candidate) as handle:
                    handle.verify()
            except (IOError, SyntaxError):
                broken.append(candidate)
                print(f"Corrupted file: {candidate}")
    return broken

In [None]:
# Paths of downloaded files that failed PIL verification.
corrupted_files = find_corrupted_files(dataset_path)

In [None]:
def remove_corrupted_files(corrupted_files):
    """Delete each listed file, reporting successes and failures with ``print``.

    Args:
        corrupted_files: Iterable of file paths to remove.

    A path that cannot be removed is reported and skipped; it never raises.
    """
    for path in corrupted_files:
        try:
            os.remove(path)
        except Exception as err:
            print(f"Failed to remove {path}: {err}")
        else:
            print(f"Removed corrupted file: {path}")

In [None]:
# NOTE(review): the [1:] slice skips the first entry of corrupted_files, so that
# file stays on disk — confirm this is intentional and not a leftover.
remove_corrupted_files(corrupted_files[1:])