<a href="https://colab.research.google.com/github/Linaqruf/Scraper/blob/main/Bandori_Wiki_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import os
import threading
import json
import traceback
from bs4 import BeautifulSoup

def get_rarity(rarity_url):
    """Return the rarity level encoded in a rarity icon URL.

    The URL is searched for 'Rarity1.png' through 'Rarity5.png', in that
    order, and the number of the first match is returned. If none of the
    names occur in the URL, 0 is returned.
    """
    for level in range(1, 6):
        icon_name = f'Rarity{level}.png'
        if icon_name in rarity_url:
            return level
    return 0

def download(directory, url):
    """Download *url* into *directory* unless the file already exists.

    The file name is the last '/'-separated segment of *url*. It is appended
    directly to *directory*, so *directory* is expected to end with a path
    separator.

    Returns:
        0 if the target file already exists (nothing is fetched),
        otherwise None once the file has been written.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    target = directory + url.split('/')[-1]
    if os.path.exists(target):
        return 0

    # Fetch before opening the file: a failed request must not leave an empty
    # file behind, because the existence check above would skip it on every
    # later run.
    response = requests.get(url, timeout=30)
    # Do not store an HTTP error page under an image file name.
    response.raise_for_status()

    with open(target, 'wb') as f:
        f.write(response.content)
    return None

def clean_art_url(art_url):
    """Return the part of *art_url* that precedes "/revision/latest".

    If the marker does not occur, the URL is returned unchanged.
    """
    prefix, _marker, _suffix = art_url.partition("/revision/latest")
    return prefix

# Common prefix of the card-list page URLs; a "<first>-<last>" range follows it.
base_url = 'https://bandori.fandom.com/wiki/Cards_'

max_index = 1800
step = 100

# One URL per block of `step` card numbers, starting at 1. The upper bound of
# each range is capped at max_index.
urls = [
    f"{base_url}{first}-{min(first + step - 1, max_index)}"
    for first in range(1, max_index, step)
]

# Highest card number to collect; the first row above it ends the scan of the
# current page.
last_card_id = 1719
metadata_dir = 'bandori/metadata/'

# For every card-list page: parse the card table, start one download thread
# per card image, and append the cards' metadata to a JSON file per rarity.
for url_index, url in enumerate(urls):
    print(f"Processing URL {url_index + 1}/{len(urls)}: {url}")
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'style': 'border:1px solid #AAA; border-collapse: collapse;'})
    if table is None:
        # Missing page or changed layout: skip it instead of failing on None.
        print(f"No card table found at {url}, skipping")
        continue

    cards = []
    # The first row is skipped (presumably the table header — confirm against
    # the page markup).
    for row in table.find_all('tr')[1:]:
        header = row.find('th')
        if header is None:
            continue
        card_no = header.text.strip()
        try:
            # The card number is the last space-separated token of the cell.
            card_id = int(card_no.split(' ')[-1])
        except ValueError:
            continue
        if card_id > last_card_id:
            break

        # Rarity icon: looked up in the first <td> without a width attribute.
        rarity_cell = row.find('td', {'width': None})
        rarity_img = rarity_cell.find('img') if rarity_cell else None
        if rarity_img is None:
            continue
        # Prefer 'data-src' and fall back to 'src' when it is absent or empty.
        rarity_url = rarity_img.get('data-src') or rarity_img.get('src')
        if rarity_url is None:
            continue

        # Card art: looked up in the <td> with width="50%".
        art_cell = row.find('td', {'width': '50%'})
        art_img = art_cell.find('img') if art_cell else None
        if art_img is None:
            continue
        art_url = art_img.get('data-src') or art_img.get('src')
        if not art_url:
            continue
        art_url = clean_art_url(art_url)

        link = row.find('a')
        # .get() so that an anchor without a title yields None, not KeyError.
        name = link.get('title') if link else None

        cards.append({
            'id': card_id,
            'rarity': get_rarity(rarity_url),
            'name': name,
            'art_url': art_url
        })

    # Group this page's cards by rarity so that each metadata file is read
    # and written once per page rather than once per card.
    cards_by_rarity = {}
    for card in cards:
        cards_by_rarity.setdefault(card['rarity'], []).append(card)

    if cards_by_rarity:
        os.makedirs(metadata_dir, exist_ok=True)

    for rarity, rarity_cards in cards_by_rarity.items():
        rarity_folder = f'bandori/art_url/rarity{rarity}/'
        os.makedirs(rarity_folder, exist_ok=True)
        for card in rarity_cards:
            # Each image is fetched in its own thread; the loop does not wait
            # for it to finish.
            threading.Thread(target=download, args=(rarity_folder, card['art_url'])).start()

        metadata_path = f'{metadata_dir}rarity{rarity}.json'
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        else:
            metadata = []
        # Append in page order to whatever the file already contains.
        metadata.extend(rarity_cards)
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4, ensure_ascii=False)


Processing URL 1/18: https://bandori.fandom.com/wiki/Cards_1-100
Processing URL 2/18: https://bandori.fandom.com/wiki/Cards_101-200
Processing URL 3/18: https://bandori.fandom.com/wiki/Cards_201-300
Processing URL 4/18: https://bandori.fandom.com/wiki/Cards_301-400
Processing URL 5/18: https://bandori.fandom.com/wiki/Cards_401-500
Processing URL 6/18: https://bandori.fandom.com/wiki/Cards_501-600
Processing URL 7/18: https://bandori.fandom.com/wiki/Cards_601-700
Processing URL 8/18: https://bandori.fandom.com/wiki/Cards_701-800
Processing URL 9/18: https://bandori.fandom.com/wiki/Cards_801-900
Processing URL 10/18: https://bandori.fandom.com/wiki/Cards_901-1000
Processing URL 11/18: https://bandori.fandom.com/wiki/Cards_1001-1100
Processing URL 12/18: https://bandori.fandom.com/wiki/Cards_1101-1200
Processing URL 13/18: https://bandori.fandom.com/wiki/Cards_1201-1300
Processing URL 14/18: https://bandori.fandom.com/wiki/Cards_1301-1400
Processing URL 15/18: https://bandori.fandom.com/w