## Import Libraries

In [1]:
import requests
import pandas as pd
import re
from PIL import Image
import uuid  # Import UUID module
import os

# Code

In [2]:
# https://api.mangadex.org/manga?limit=32&offset=32
# https://api.mangadex.org/manga?limit=32&offset=32+32
# https://api.mangadex.org/statistics/manga?manga[]=6f1f3a84-f2e3-4512-93bf-009fd12cfce6
# https://api.mangadex.org/manga/a15dc49f-2512-46f2-bc1e-201be8234ee5?includes[]=artist&includes[]=author&includes[]=cover_art
# https://mangadex.org/covers/a15dc49f-2512-46f2-bc1e-201be8234ee5/c20365e1-285a-4e93-b701-9fd03ff4ae19.jpg

In [3]:
# limit = 32
# offset = limit
# url = 'https://api.mangadex.org/manga?limit=' + str(limit) + '&offset=' + str(offset)
# response = requests.get(url)
# data = response.json()
# page_count = (round(data['total'] / 248))

# for i in range(1, page_count):
#     offset += limit
#     url = 'https://api.mangadex.org/manga?limit=' + str(limit) + '&offset=' + str(offset)

In [4]:
# Page size for the MangaDex listing endpoint. The initial offset is set to one
# full page, so this URL addresses the second page of results.
limit = 32
offset = limit
url = f'https://api.mangadex.org/manga?limit={limit}&offset={offset}'

# Prefix of the statistics endpoint; callers append a manga id to it.
url_stat = 'https://api.mangadex.org/statistics/manga?manga[]='

def _save_placeholder(save_path, size=(225, 225), color='gray'):
    """Write a solid-colour RGB placeholder image to ``save_path`` using PIL."""
    Image.new('RGB', size, color=color).save(save_path)


def download_image(img_url, save_path, timeout=30):
    """Download ``img_url`` to ``save_path``, or save a gray placeholder instead.

    A 225x225 gray placeholder is written when the URL is not an http(s)
    string, when the request fails, or when the server answers with an HTTP
    error status. Checking the status prevents an error page from being stored
    on disk under an image file name.

    Args:
        img_url: Address of the image. Anything that is not a string starting
            with ``http://`` or ``https://`` is treated as invalid.
        save_path: Destination file path for the image (or the placeholder).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        None. The result is the file written at ``save_path``.
    """
    if not isinstance(img_url, str) or not img_url.startswith(('http://', 'https://')):
        print(f"Invalid URL for {save_path}. Saving a gray placeholder image instead.")
        _save_placeholder(save_path)
        return

    try:
        response = requests.get(img_url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so the placeholder path is used.
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            file.write(response.content)

        print(f"Image downloaded for {save_path}")
    except Exception as e:
        # Deliberately best-effort: one bad cover must not abort the caller.
        print(f"Failed to download image for {save_path}: {e}")
        _save_placeholder(save_path)
            
def _first_localized(translations, languages):
    """Return the first value of ``translations`` whose key is in ``languages``, else None."""
    if not isinstance(translations, dict):
        return None
    return next((text for code, text in translations.items() if code in languages), None)


def _fetch_rating(manga_id, timeout):
    """Return the average rating from the statistics endpoint, or 0 if unavailable."""
    rating = 0
    response = requests.get(url_stat + manga_id, timeout=timeout)
    if response.status_code == 200:
        stats = response.json().get('statistics') or {}
        rating_info = (stats.get(manga_id) or {}).get('rating') or {}
        if 'average' in rating_info:
            rating = rating_info['average']
    return rating


def _fetch_credits(manga_id, timeout):
    """Return ``(author, artist, cover_url)`` for a manga; missing parts are ''."""
    author = artist = cover = ''
    detail_url = (
        f'https://api.mangadex.org/manga/{manga_id}'
        '?includes[]=artist&includes[]=author&includes[]=cover_art'
    )
    response = requests.get(detail_url, timeout=timeout)
    if response.status_code == 200:
        for relation in response.json()['data']['relationships']:
            kind = relation.get('type')
            # Relationships may come without expanded attributes; treat as empty.
            attrs = relation.get('attributes') or {}
            if kind == 'author' and 'name' in attrs:
                author = attrs['name']
            if kind == 'artist' and 'name' in attrs:
                artist = attrs['name']
            if kind == 'cover_art' and 'fileName' in attrs:
                cover = f"https://mangadex.org/covers/{manga_id}/{attrs['fileName']}"
    return author, artist, cover


def getdataurl(url, timeout=30):
    """Fetch one listing page and return one record per manga on it.

    For every entry in the page's ``data`` list this makes two further
    requests (statistics and manga detail), builds a flat record, and downloads
    the cover into ``image-mangadex/<uuid>.jpg`` via ``download_image``.

    Args:
        url: Listing URL to request.
        timeout: Seconds to wait on each HTTP request.

    Returns:
        A list of dicts with the keys ``id`` (a freshly generated UUID string,
        not the MangaDex id), ``title``, ``alt_title``, ``type``,
        ``description``, ``genre``, ``author``, ``artist``, ``rate``,
        ``image`` and ``released``. An empty list is returned when the listing
        request does not answer with status 200, so callers can always
        concatenate the result.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Request failed with status {response.status_code} for {url}")
        return []

    languages = ['en', 'ja', 'ko', 'ru', 'zh', 'ko-ro', 'zh-hk', 'es-la']
    tag_languages = ['en', 'ja', 'ko']
    records = []

    for comic in response.json()['data']:
        manga_id = comic['id']
        attrs = comic['attributes']

        title = _first_localized(attrs['title'], languages)
        description = _first_localized(attrs['description'], languages)

        alt_candidates = (
            _first_localized(alt, languages) for alt in (attrs.get('altTitles') or [])
        )
        alt_titles = ', '.join(text for text in alt_candidates if text)

        tag_candidates = (
            _first_localized(tag['attributes']['name'], tag_languages)
            for tag in (attrs.get('tags') or [])
        )
        genres = ', '.join(name for name in tag_candidates if name)

        # Keep the year only when it is present and a non-zero integer value.
        year = attrs['year']
        released = year if year and int(year) else '-'

        rate = _fetch_rating(manga_id, timeout)
        author, artist, img = _fetch_credits(manga_id, timeout)

        uuid_data = str(uuid.uuid4())

        records.append({
            'id': uuid_data,
            'title': title,
            'alt_title': alt_titles,
            'type': comic['type'],
            'description': description,
            'genre': genres,
            'author': author,
            'artist': artist,
            'rate': rate,
            'image': img,
            'released': released,
        })

        os.makedirs('image-mangadex', exist_ok=True)
        download_image(img, f'image-mangadex/{uuid_data}.jpg')

    return records

In [67]:
datas = []
limit = 32

# Scraping window, set by hand: offsets are walked in steps of `limit` up to
# `page_count` (a maximum offset, despite the name). A CSV checkpoint with all
# rows collected so far is written each time the row count passes `save_temp`.
indexNum = 0
page_count = 9968
save_temp = 1000

try:
    for start in range(indexNum, page_count, limit):
        print(f"Scraping from {start} Start... ")
        end = min(start + limit, page_count)

        print(f"Now you are in offset {end}")
        # NOTE(review): the request uses offset=end, not offset=start, so the
        # records at offset 0 are never fetched - confirm this is intended.
        url = 'https://api.mangadex.org/manga?limit=' + str(limit) + '&offset=' + str(end)
        getdata = getdataurl(url)
        # getdataurl yields nothing usable on a failed page; skip it safely.
        datas.extend(getdata or [])

        if len(datas) >= save_temp or end == page_count:
            os.makedirs('data-mangadex', exist_ok=True)
            pd.DataFrame(datas).to_csv(
                f'data-mangadex/mangadex-{end}.csv', index=False, encoding='utf-8'
            )

            save_temp += 1000
            print(f"Scraping to {end} End.")
finally:
    # Build the frame once, on every exit path, so that `df` holds everything
    # collected so far even when a request error interrupts the loop.
    df = pd.DataFrame(datas)

df

Scraping from 0 Start... 
Now you are in offset 32
Image downloaded for image/ce97a07e-5480-4622-b23d-8e09ea7cabc0.jpg
Image downloaded for image/ef8d2b45-f13f-4606-911f-d232bfb3b7a0.jpg
Image downloaded for image/01a43a5c-eb06-4055-a1a2-3547b1da037e.jpg
Image downloaded for image/0e8d4c93-8e28-4d89-9497-9fe828e44668.jpg
Image downloaded for image/f5c7c060-f737-4b0f-8dfe-c949322dd2e3.jpg
Image downloaded for image/799db417-1be9-497a-9933-c9ab61e7f1a7.jpg
Image downloaded for image/d99eb5a0-78cb-417b-92f5-53f0fbfb8d3e.jpg
Image downloaded for image/4e611639-e28c-49f0-9110-f289a2d17241.jpg
Image downloaded for image/5255643a-524e-41bf-98c2-adffb0cc482c.jpg
Image downloaded for image/f78b3218-a32b-49b0-b3b8-655a168fd603.jpg
Image downloaded for image/b4712d31-cc3c-4522-b999-87376b67a952.jpg
Image downloaded for image/84ffb920-3948-4c4b-90a1-233c331ca1e5.jpg
Image downloaded for image/fb981b54-b01e-4e91-8e48-09e27a3ae084.jpg
Image downloaded for image/40a4bbc2-9e60-4916-8bb5-37e9393b3cb4.j

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [None]:
# The target folder must exist before writing, otherwise to_csv raises OSError.
os.makedirs('data', exist_ok=True)
df.to_csv('data/mangadex.csv', index=False, encoding='utf-8')