# Import Libraries

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from PIL import Image
import uuid  # Import UUID module
import os

# Scrape

## KomikCast

In [12]:
def _save_placeholder(save_path, size=(225, 225)):
    """Write a solid gray RGB image of ``size`` pixels to ``save_path``."""
    Image.new('RGB', size, color='gray').save(save_path)


def download_image(img_url, save_path, timeout=30):
    """Download ``img_url`` to ``save_path``, or save a gray placeholder instead.

    A 225x225 gray image is written to ``save_path`` whenever the real image
    cannot be obtained, so a file exists at ``save_path`` in either case.

    Args:
        img_url: Image address. Anything that is not a string starting with
            ``http://`` or ``https://`` (including ``None``) is treated as
            invalid and is never requested.
        save_path: Destination file path for the image bytes.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # isinstance guard: a missing ``src`` attribute reaches us as None, which
    # has no ``startswith`` and would otherwise raise AttributeError.
    if not isinstance(img_url, str) or not img_url.startswith(('http://', 'https://')):
        print(f"Invalid URL for {save_path}. Saving a gray placeholder image instead.")
        _save_placeholder(save_path)
        return

    try:
        response = requests.get(img_url, timeout=timeout)
        # Without this check an HTTP error page would be saved as the image.
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            file.write(response.content)

        print(f"Image downloaded for {save_path}")
    except Exception as e:
        # Deliberately broad: a failed cover download is best-effort and must
        # not abort the caller; the placeholder stands in for the image.
        print(f"Failed to download image for {save_path}: {e}")
        _save_placeholder(save_path)
            
def _node_text(node, default=''):
    """Return ``node.text``, or ``default`` when the lookup returned ``None``."""
    return node.text if node is not None else default


def getdataurl(url, timeout=30):
    """Scrape one listing page and the detail page of every comic on it.

    For each ``div.list-update_item`` on the listing page, the card fields
    are read, the comic's detail page is fetched for the remaining fields,
    and the cover is stored as ``image-komikcast/<uuid>.jpg`` through
    ``download_image``.

    Elements that are missing from a card or a detail page yield empty
    strings (``0.0`` for the rating, ``None`` for the image URL) instead of
    raising, so a single malformed entry does not abort the whole page.

    Args:
        url: Address of the listing page.
        timeout: Seconds passed to every ``requests.get`` call.

    Returns:
        A list with one dict per comic, with the keys ``id``, ``title``,
        ``alt_title``, ``type``, ``description``, ``genre``, ``author``,
        ``artist``, ``rate``, ``image`` and ``released``.
    """
    data = []
    response = requests.get(url, timeout=timeout)
    listing = BeautifulSoup(response.content, 'html.parser')
    comics = listing.find_all('div', class_='list-update_item')

    # Create the image directory once, and only when there is something to save.
    if comics:
        os.makedirs('image-komikcast', exist_ok=True)

    for comic in comics:
        # Fields available directly on the listing card.
        title = _node_text(comic.find('h3', class_='title'))
        img_tag = comic.find('img', class_='ts-post-image')
        img = img_tag.get('src') if img_tag is not None else None

        # Normalise a decimal comma (and an accidental "..") to a single dot
        # before validating; anything that is not a plain number becomes 0.0.
        raw_rate = _node_text(comic.find('div', class_='numscore'))
        raw_rate = raw_rate.replace(',', '.').replace('..', '.').strip()
        rate = float(raw_rate) if re.match(r'^\d+(\.\d+)?$', raw_rate) else 0.0

        comic_type = _node_text(comic.find('span', class_='type'))

        # Detail page; an empty document is parsed when the card has no link
        # so that the lookups below simply fall back to their defaults.
        link = comic.find('a')
        href = link.get('href') if link is not None else None
        detail_html = requests.get(href, timeout=timeout).content if href else ''
        detail = BeautifulSoup(detail_html, 'html.parser')

        raw_description = _node_text(
            detail.find('div', class_="komik_info-description-sinopsis")
        ).strip()
        # Keeps only ASCII letters, digits and whitespace in the synopsis.
        description = re.sub(r'[^a-zA-Z0-9\s]', '', raw_description)
        alt_title = _node_text(
            detail.find('span', class_='komik_info-content-native')
        ).strip()
        released = _node_text(
            detail.find('span', class_='komik_info-content-info-release')
        ).strip().replace('Released:', '').strip()
        author = _node_text(
            detail.find('span', class_='komik_info-content-info')
        ).strip().replace('Author:', '').strip()

        # Genres are the link texts inside the genre span, comma-joined.
        raw_genre = detail.find('span', class_='komik_info-content-genre')
        if raw_genre:
            genre = ', '.join(a.text.strip() for a in raw_genre.find_all('a'))
        else:
            genre = ""

        # The same UUID identifies the record and names its image file.
        uuid_data = str(uuid.uuid4())
        data.append({
            'id': uuid_data,
            'title': title,
            'alt_title': alt_title,
            'type': comic_type,
            'description': description,
            'genre': genre,
            'author': author,
            'artist': "-",
            'rate': rate,
            'image': img,
            'released': released,
        })

        # ``img or ''`` routes a missing URL to download_image's invalid-URL
        # branch (placeholder image) instead of passing it None.
        download_image(img or '', f'image-komikcast/{uuid_data}.jpg')

    return data

In [13]:
url = 'https://komikcast.cz/daftar-komik'

response = requests.get(url, timeout=30)

if response.status_code == 200:
    url += "/page"
    soup = BeautifulSoup(response.content, 'html.parser')  # Parse the listing page

    indexNum = 0        # first page index requested
    max_last_page = 400  # hard upper bound on the (exclusive) last page index
    batch_size = 100     # pages written per CSV file
    # NOTE(review): indexNum = 0 requests ".../page/0"; confirm the site serves
    # a distinct page there and not a duplicate of the first page.

    # Read the last page number from the pagination links (+1 because the
    # bound is exclusive), then cap it so pages that do not exist are never
    # requested. If the pagination cannot be parsed, fall back to the cap.
    try:
        detected_last_page = int((soup.find_all('a', class_='page-numbers')[-2]).text) + 1
    except (IndexError, ValueError):
        detected_last_page = max_last_page
    last_page = min(detected_last_page, max_last_page)
    print(f"Scraping pages {indexNum} to {last_page - 1}")

    os.makedirs('data-komikcast', exist_ok=True)

    # Scrape in batches and write one CSV per batch so that a failure late in
    # the run does not lose the batches already completed.
    for start in range(indexNum, last_page, batch_size):
        end = min(start + batch_size, last_page)
        json_data = []
        for i in range(start, end):
            url_page = url + "/" + str(i)
            json_data.extend(getdataurl(url_page))

        df = pd.DataFrame(json_data)
        df.to_csv(f'data-komikcast/komikcast-{end}.csv', index=False, encoding='utf-8')
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')

Image downloaded for image/6f9991cf-644d-4f1e-b404-7be92bfff761.jpg
Image downloaded for image/5daa6d61-c6ad-43cc-a69d-47434d44a8de.jpg
Image downloaded for image/941eebbc-12ca-407a-a4a2-1fa5e2a26c7e.jpg
Image downloaded for image/d448a9ba-4c4c-4944-b3eb-11a97d3ddef4.jpg
Image downloaded for image/9383729f-3864-478d-9c48-0f4e20bd2c96.jpg
Image downloaded for image/94a32e0c-2a2b-4a26-95e9-0e09469900ca.jpg
Image downloaded for image/f6dacef1-76a5-4b76-aada-1af44ae53bd6.jpg
Image downloaded for image/a2a8b3dc-d8b8-4984-bed6-0e4e5cec21b6.jpg
Image downloaded for image/c118c098-feea-4ce3-af6a-f3accea2002d.jpg
Image downloaded for image/9df8c1c5-7d4f-4173-a50b-cff7be84fa1f.jpg
Image downloaded for image/71b08d95-319e-48b1-b2c0-fd1391d0dce6.jpg
Image downloaded for image/2b2cac65-3153-4bbc-a0e8-09009acf0ea1.jpg
Image downloaded for image/cb488feb-4254-4e86-9fbd-4d0b3ce6f879.jpg
Image downloaded for image/ef4cf200-a4d8-4f3c-83c8-6e6253a8e699.jpg
Image downloaded for image/20b4e89c-de4d-4984-ab