<a href="https://colab.research.google.com/github/Linaqruf/Scraper/blob/main/Bandori_Wiki_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title ## **Install Image Browser**
from google.colab import drive
from IPython.display import clear_output
import os

# Colab form field: whether Google Drive should be mounted during setup.
mount_drive = True # @param {type:'boolean'}

# Directory layout; every path below is built from root_dir.
root_dir = "/content"
drive_dir = os.path.join(root_dir, "drive", "MyDrive")
repo_dir = os.path.join(root_dir, "infinite-image-browsing")

# The clone URL is kept as two separate string pieces, as in the original cell.
# NOTE(review): the split looks deliberate - confirm before merging the pieces.
repo_url = "".join((
    "https://github.com/zanllp/sd-we",
    "bui-infinite-image-browsing.git",
))

def clone_repo(url, dir, branch):
    if not os.path.exists(dir):
       !git clone -b {branch} {url} {dir}

def mount_to_drive(dir):
    """Mount Google Drive when ``mount_drive`` is set and ``dir`` is missing.

    The mount point passed to ``drive.mount`` is the parent directory of
    ``dir``.
    """
    if not mount_drive or os.path.exists(dir):
        return
    mount_point = os.path.dirname(dir)
    drive.mount(mount_point)

def install_dependencies():
    requirements = os.path.join(repo_dir, "requirements.txt")

    !apt install aria2
    !pip install --upgrade -r {requirements}

def main():
    """Prepare the runtime for the image browser.

    Steps, in order: mount Google Drive (subject to ``mount_drive``), clone
    the browser repository into ``repo_dir``, install its dependencies and
    finally clear the cell output.
    """
    mount_to_drive(drive_dir)
    os.chdir(root_dir)
    clone_repo(repo_url, repo_dir, "main")
    # The working directory is left at repo_dir when main() returns.
    os.chdir(repo_dir)
    install_dependencies()
    clear_output(wait=True)

main()


In [None]:
import requests
import os
import threading
import json
import traceback
from bs4 import BeautifulSoup

def get_rarity(rarity_url):
    """Map a rarity icon URL to its numeric level.

    Returns the lowest level from 1 to 5 whose ``Rarity<level>.png`` file name
    occurs in ``rarity_url``, or 0 when none of them does.
    """
    for level in range(1, 6):
        if f'Rarity{level}.png' in rarity_url:
            return level
    return 0

def download(directory, url):
    """Download ``url`` into ``directory`` unless the file already exists.

    The local file name is the last ``/``-separated component of ``url``.

    Args:
        directory: Destination folder; a trailing slash is optional.
        url: Address of the file to fetch.

    Returns:
        0 when the file was already present and nothing was fetched,
        otherwise None.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status. No file is created then.
    """
    target = os.path.join(directory, url.split('/')[-1])
    if os.path.exists(target):
        return 0

    # Fetch before opening the file: a failed request must not leave an empty
    # file behind, because the existence check above would then skip the
    # download on every later run.
    response = requests.get(url, timeout=30)
    # Refuse to store an HTTP error page under the image's file name.
    response.raise_for_status()
    with open(target, 'wb') as f:
        f.write(response.content)
    return None

def clean_art_url(art_url):
    """Return ``art_url`` cut off just before its first ``/revision/latest``.

    A URL that does not contain that marker is returned unchanged.
    """
    base, _marker, _rest = art_url.partition("/revision/latest")
    return base

base_url = 'https://bandori.fandom.com/wiki/Cards_'

max_index = 1800
step = 100

# One URL per block of `step` card numbers: "<base_url><first>-<last>", where
# <last> is capped at max_index.
urls = [
    f"{base_url}{first}-{min(first + step - 1, max_index)}"
    for first in range(1, max_index, step)
]

# Highest card number to scrape; the first row above it ends the scan of a page.
last_card_id = 1719

for url_index, url in enumerate(urls):
    print(f"Processing URL {url_index + 1}/{len(urls)}: {url}")
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'style': 'border:1px solid #AAA; border-collapse: collapse;'})
    if table is None:
        # Missing page or changed layout: skip it instead of failing on None.
        print(f"No card table found at {url}, skipping")
        continue

    # --- Parse every data row of the table (the first row is skipped). ---
    cards = []
    for row in table.find_all('tr')[1:]:
        header = row.find('th')
        if header is None:
            continue
        card_no = header.text.strip()
        try:
            # Named card_id rather than id, so the builtin id() is not
            # shadowed for the rest of the notebook session.
            card_id = int(card_no.split(' ')[-1])
        except ValueError:
            # The header does not end in a card number; not a card row.
            continue
        if card_id > last_card_id:
            break

        # The rarity icon sits in the cell without a width attribute.
        rarity = row.find('td', {'width': None})
        rarity_img = rarity.find('img') if rarity is not None else None
        rarity_url = None
        if rarity_img is not None:
            rarity_url = rarity_img.get('data-src') or rarity_img.get('src')

        link = row.find('a')
        name = link.get('title') if link is not None else None

        # The card art sits in the cell declared with width="50%".
        art = row.find('td', {'width': '50%'})
        art_img = art.find('img') if art is not None else None
        if art_img is None:
            continue
        art_url = art_img.get('data-src') or art_img.get('src')
        if not art_url:
            # An <img> without any source cannot be downloaded.
            continue
        art_url = clean_art_url(art_url)

        if rarity_url is None:
            continue

        cards.append({
            'id': card_id,
            'rarity': get_rarity(rarity_url),
            'name': name,
            'art_url': art_url
        })

    # --- Start one download thread per card of this page. ---
    download_threads = []
    cards_by_rarity = {}
    for card in cards:
        rarity_folder = f'bandori/art_url/rarity{card["rarity"]}/'
        os.makedirs(rarity_folder, exist_ok=True)
        worker = threading.Thread(target=download, args=(rarity_folder, card['art_url']))
        worker.start()
        download_threads.append(worker)
        cards_by_rarity.setdefault(card['rarity'], []).append(card)

    # --- Append the page's cards to the per-rarity metadata files. ---
    # Each file is read and written once per page rather than once per card;
    # the resulting file content is the same.
    for rarity_level, rarity_cards in cards_by_rarity.items():
        os.makedirs('bandori/metadata/', exist_ok=True)
        metadata_path = f'bandori/metadata/rarity{rarity_level}.json'
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        else:
            metadata = []
        metadata.extend(rarity_cards)
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4, ensure_ascii=False)

    # Wait for this page's downloads before requesting the next page: this
    # bounds the number of live threads to one page worth of cards and means
    # the cell only finishes once the downloads have ended.
    for worker in download_threads:
        worker.join()


In [None]:
# @title ## **1.4. Image Browser**
import os
import json
import portpicker
from IPython.display import clear_output
from threading import Thread
from google.colab.output import serve_kernel_port_as_iframe, serve_kernel_port_as_window

# Restore variables previously saved with IPython's %store magic.
%store -r

# Colab form field: passed as the height of the embedded browser frame.
window_height = 550 #@param {type:"slider", min:0, max:1000, step:1}
# Directory handed to the browser as its image folder.
# NOTE(review): the scraper cell writes under the relative path 'bandori/', and
# the install cell's main() leaves the working directory at repo_dir - confirm
# the images really end up in /content/bandori.
image_dir         = "/content/bandori"
# repo_dir is not defined in this cell; it has to be in the namespace already
# (set by the install cell, or restored by %store -r above).
main_app          = os.path.join(repo_dir, "app.py")
config_file       = os.path.join(repo_dir, "config.json")
# A currently unused local port chosen by portpicker for the app.
port              = portpicker.pick_unused_port()

# Content written to config_file and passed to the app via --sd_webui_config.
config = {
    "outdir_txt2img_samples": image_dir,
}

def write_file(filename, config):
    """Serialize ``config`` as JSON with a 4-space indent into ``filename``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, mode='w') as handle:
        json.dump(config, handle, indent=4)

def run_app():
    """Run the browser's ``app.py`` through the shell, discarding its output.

    Uses the module-level ``main_app``, ``port`` and ``config_file`` values.
    """
    # stdout and stderr are both redirected to /dev/null.
    !python {main_app} --port={port} --sd_webui_config={config_file} > /dev/null 2>&1

def launch():
    """Write the browser config, start the app in a thread and embed it.

    Changes the working directory to ``image_dir``, writes ``config`` to
    ``config_file``, runs ``run_app`` in a separate thread and serves ``port``
    as an iframe in the cell output.
    """
    os.chdir(root_dir)

    # image_dir must already exist, otherwise os.chdir raises.
    os.chdir(image_dir)
    write_file(config_file, config)

    # run_app gets its own thread so this function can go on to serve the port.
    thread = Thread(target=run_app)
    thread.start()

    # NOTE(review): the port is served right after the thread starts, without
    # waiting for the app to accept connections - confirm this is acceptable.
    serve_kernel_port_as_iframe(port, width='100%', height=window_height, cache_in_notebook=False)
    clear_output(wait=True)

launch()