## Import Libraries

In [1]:
import requests
import pandas as pd
import re
from PIL import Image
import uuid  # Import UUID module
import os

# Code

In [2]:
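# MangaDex endpoints used in this notebook (example URLs):
# Manga list, paginated with limit/offset:
# https://api.mangadex.org/manga?limit=32&offset=32
# https://api.mangadex.org/manga?limit=32&offset=32+32
# Rating statistics for one manga id:
# https://api.mangadex.org/statistics/manga?manga[]=6f1f3a84-f2e3-4512-93bf-009fd12cfce6
# Single manga with artist, author and cover_art relationships expanded:
# https://api.mangadex.org/manga/a15dc49f-2512-46f2-bc1e-201be8234ee5?includes[]=artist&includes[]=author&includes[]=cover_art
# Cover image file, addressed as covers/<manga id>/<cover fileName>:
# https://mangadex.org/covers/a15dc49f-2512-46f2-bc1e-201be8234ee5/c20365e1-285a-4e93-b701-9fd03ff4ae19.jpg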

In [3]:
# limit = 32
# offset = limit
# url = 'https://api.mangadex.org/manga?limit=' + str(limit) + '&offset=' + str(offset)
# response = requests.get(url)
# data = response.json()
# page_count = (round(data['total'] / 248))

# for i in range(1, page_count):
#     offset += limit
#     url = 'https://api.mangadex.org/manga?limit=' + str(limit) + '&offset=' + str(offset)

In [4]:
# Page size for the manga-list endpoint; the initial offset equals one page.
limit = offset = 32
url = f'https://api.mangadex.org/manga?limit={limit}&offset={offset}'
# Prefix of the statistics endpoint; a manga id is appended to it per request.
url_stat = 'https://api.mangadex.org/statistics/manga?manga[]='

def download_image(img_url, save_path):
    """Save the image at ``img_url`` to ``save_path``, or a gray placeholder.

    A 225x225 gray placeholder is written instead of the real image when
    ``img_url`` is not an http(s) URL, or when the download or the file
    write fails. The function never raises for a failed download; it
    reports the outcome with ``print``.

    Args:
        img_url: URL of the image to fetch. Anything that is not a string
            starting with ``http://`` or ``https://`` is treated as invalid.
        save_path: Destination file path for the image.
    """
    def _save_placeholder():
        # Create a 225x225 gray image using PIL as a placeholder.
        Image.new('RGB', (225, 225), color='gray').save(save_path)

    if not isinstance(img_url, str) or not img_url.startswith(('http://', 'https://')):
        print(f"Invalid URL for {save_path}. Saving a gray placeholder image instead.")
        _save_placeholder()
        return

    try:
        # A timeout keeps one stalled cover download from hanging the scrape.
        get_image = requests.get(img_url, timeout=30)
        # Without this check an HTTP error page (404, 429, ...) would be
        # written to disk as if it were the image.
        get_image.raise_for_status()
        with open(save_path, 'wb') as file:
            file.write(get_image.content)
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download image for {save_path}: {e}")
        # Save a gray image in case of failure.
        _save_placeholder()
    else:
        print(f"Image downloaded for {save_path}")
            
def getdataurl(url):
    """Fetch one page of the manga list and build one record per manga.

    For every manga on the page this issues two further requests: one to
    the statistics endpoint (``url_stat`` + manga id) for the average
    rating, and one to the single-manga endpoint with author, artist and
    cover_art included. The cover is then stored as ``image/<uuid>.jpg``
    through ``download_image``.

    Args:
        url: Manga-list URL to request.

    Returns:
        A list of dicts with the keys ``id`` (a freshly generated UUID, not
        the MangaDex id), ``title``, ``alt_title``, ``type``,
        ``description``, ``genre``, ``author``, ``artist``, ``rate``,
        ``image`` and ``released``. The list is empty when the list request
        does not return HTTP 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Callers concatenate the result onto a list, so a failed page must
        # yield an empty list rather than an implicit None.
        return []

    lan = ['en', 'ja', 'ko', 'ru', 'zh', 'ko-ro', 'zh-hk', 'es-la']
    tag_lan = ['en', 'ja', 'ko']
    url_manga = 'https://api.mangadex.org/manga'
    url_cover = 'https://mangadex.org/covers'

    def _first_localized(mapping, languages):
        """Return the first value whose language key is accepted, else None."""
        # Defensive: tolerate a payload field that is not a dict.
        if not isinstance(mapping, dict):
            return None
        return next((v for k, v in mapping.items() if k in languages), None)

    os.makedirs('image', exist_ok=True)

    records = []
    for comic in response.json()['data']:
        manga_id = comic['id']
        attributes = comic['attributes']

        title = _first_localized(attributes.get('title'), lan)
        description = _first_localized(attributes.get('description'), lan)

        # One entry per alternative-title dict that has an accepted language.
        alt_titles = ', '.join(
            alt_title
            for alt_title in (
                _first_localized(entry, lan)
                for entry in attributes.get('altTitles') or []
            )
            if alt_title
        )

        genres = ', '.join(
            tag_name
            for tag_name in (
                _first_localized(tag['attributes']['name'], tag_lan)
                for tag in attributes.get('tags') or []
            )
            if tag_name
        )

        year = attributes.get('year')
        try:
            released = year if year and int(year) else '-'
        except (TypeError, ValueError):
            # A year that cannot be read as a number counts as unknown.
            released = '-'

        # The rating stays 0 when the statistics request fails or the
        # payload has no entry for this manga.
        rate = 0
        response_rate = requests.get(url_stat + manga_id)
        if response_rate.status_code == 200:
            stats = (response_rate.json().get('statistics') or {}).get(manga_id) or {}
            rate = (stats.get('rating') or {}).get('average', 0)

        author = ''
        artist = ''
        img = ''
        response_manga = requests.get(
            f'{url_manga}/{manga_id}?includes[]=artist&includes[]=author&includes[]=cover_art'
        )
        if response_manga.status_code == 200:
            # When several relationships share a type, the last one wins.
            for relation in response_manga.json()['data']['relationships']:
                rel_type = relation.get('type')
                rel_attrs = relation.get('attributes') or {}
                if rel_type == 'author' and 'name' in rel_attrs:
                    author = rel_attrs['name']
                elif rel_type == 'artist' and 'name' in rel_attrs:
                    artist = rel_attrs['name']
                elif rel_type == 'cover_art' and 'fileName' in rel_attrs:
                    img = f"{url_cover}/{manga_id}/{rel_attrs['fileName']}"

        # The record id is a new UUID; it also names the cover file.
        uuid_data = str(uuid.uuid4())

        records.append({
            'id': uuid_data,
            'title': title,
            'alt_title': alt_titles,
            'type': comic['type'],
            'description': description,
            'genre': genres,
            'author': author,
            'artist': artist,
            'rate': rate,
            'image': img,
            'released': released,
        })

        download_image(img, f'image/{uuid_data}.jpg')

    return records

In [None]:
# Scrape the manga list page by page, writing a checkpoint CSV every 100 pages.
limit = 32

# The page range is set by hand so an interrupted run can be resumed:
# set indexNum to the first page that still has to be fetched.
indexNum = 0 # 901 # 701 # 601 # 501 # 401 #315 # 201 # 100 # 0
page_count = 1100 # 1100 # 901 # 701 # 601 # 501 # 401 # 315 # 201 # 100

os.makedirs('data', exist_ok=True)

# Rows stay in a plain list while scraping; a DataFrame is only built when
# writing, so the accumulator never changes type between chunks.
rows = []

for start in range(indexNum, page_count, 100):
    end = min(start + 100, page_count)

    # Fetch only this chunk's pages. The offset is derived from the page
    # index so page 0 starts at offset 0 and a resumed run continues where
    # it stopped.
    for page in range(start, end):
        offset = page * limit
        url = f'https://api.mangadex.org/manga?limit={limit}&offset={offset}'
        # NOTE(review): the API may reject very large offsets - confirm the
        # pagination limit. `or []` keeps a failed page from aborting the run.
        rows.extend(getdataurl(url) or [])

    # Checkpoint: everything collected so far in this run.
    pd.DataFrame(rows).to_csv(f'data/mangadex-{end}.csv', index=False, encoding='utf-8')

df = pd.DataFrame(rows)
df

In [6]:
# Write the complete scraped table to the final CSV file.
df.to_csv(
    'data/mangadex.csv',
    encoding='utf-8',
    index=False,
)